diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..775bde89 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-11-13T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2411.08870v1","updated":"2024-11-13T18:50:13Z","published":"2024-11-13T18:50:13Z","title":"The Limited Impact of Medical Adaptation of Large Language and\n Vision-Language Models","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. In this paper, we compare ten\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting and supervised fine-tuning regimes for medical question-answering\n(QA). For instance, across all tasks and model pairs we consider in the 3-shot\nsetting, medical LLMs only outperform their base models in 22.7% of cases,\nreach a (statistical) tie in 36.8% of cases, and are significantly worse than\ntheir base models in the remaining 40.5% of cases. Our conclusions are based on\n(i) comparing each medical model head-to-head, directly against the\ncorresponding base model; (ii) optimizing the prompts for each model separately\nin zero-/few-shot prompting; and (iii) accounting for statistical uncertainty\nin comparisons. 
While these basic practices are not consistently adopted in the\nliterature, our ablations show that they substantially impact conclusions.\nMeanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs\ncan show performance improvements, but the benefits do not carry over to tasks\nbased on clinical notes. Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Pranav Mani","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.08870v1.pdf","comment":"Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes\n additional results on clinical note QA tasks and supervised fine-tuning\n evaluations"},{"id":"http://arxiv.org/abs/2411.08868v1","updated":"2024-11-13T18:49:35Z","published":"2024-11-13T18:49:35Z","title":"CamemBERT 2.0: A Smarter French Language Model Aged to Perfection","summary":" French language models, such as CamemBERT, have been widely adopted across\nindustries for natural language processing (NLP) tasks, with models like\nCamemBERT seeing over 4 million downloads per month. However, these models face\nchallenges due to temporal concept drift, where outdated training data leads to\na decline in performance, especially when encountering new topics and\nterminology. This issue emphasizes the need for updated models that reflect\ncurrent linguistic trends. In this paper, we introduce two new versions of the\nCamemBERT base model-CamemBERTav2 and CamemBERTv2-designed to address these\nchallenges. CamemBERTav2 is based on the DeBERTaV3 architecture and makes use\nof the Replaced Token Detection (RTD) objective for better contextual\nunderstanding, while CamemBERTv2 is built on RoBERTa, which uses the Masked\nLanguage Modeling (MLM) objective. 
Both models are trained on a significantly\nlarger and more recent dataset with longer context length and an updated\ntokenizer that enhances tokenization performance for French. We evaluate the\nperformance of these models on both general-domain NLP tasks and\ndomain-specific applications, such as medical field tasks, demonstrating their\nversatility and effectiveness across a range of use cases. Our results show\nthat these updated models vastly outperform their predecessors, making them\nvaluable tools for modern NLP systems. All our new models, as well as\nintermediate checkpoints, are made openly available on Huggingface.\n","authors":["Wissam Antoun","Francis Kulumba","Rian Touchent","Éric de la Clergerie","Benoît Sagot","Djamé Seddah"],"pdf_url":"https://arxiv.org/pdf/2411.08868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. (3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. 
(4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.02538v2","updated":"2024-11-13T18:04:44Z","published":"2024-11-04T19:17:17Z","title":"MILU: A Multi-task Indic Language Understanding Benchmark","summary":" Evaluating Large Language Models (LLMs) in low-resource and linguistically\ndiverse languages remains a significant challenge in NLP, particularly for\nlanguages using non-Latin scripts like those spoken in India. Existing\nbenchmarks predominantly focus on English, leaving substantial gaps in\nassessing LLM capabilities in these languages. We introduce MILU, a Multi task\nIndic Language Understanding Benchmark, a comprehensive evaluation benchmark\ndesigned to address this gap. 
MILU spans 8 domains and 42 subjects across 11\nIndic languages, reflecting both general and culturally specific knowledge.\nWith an India-centric design, incorporates material from regional and\nstate-level examinations, covering topics such as local history, arts,\nfestivals, and laws, alongside standard subjects like science and mathematics.\nWe evaluate over 45 LLMs, and find that current LLMs struggle with MILU, with\nGPT-4o achieving the highest average accuracy at 72 percent. Open multilingual\nmodels outperform language-specific fine-tuned models, which perform only\nslightly better than random baselines. Models also perform better in high\nresource languages as compared to low resource ones. Domain-wise analysis\nindicates that models perform poorly in culturally relevant areas like Arts and\nHumanities, Law and Governance compared to general fields like STEM. To the\nbest of our knowledge, MILU is the first of its kind benchmark focused on Indic\nlanguages, serving as a crucial step towards comprehensive cultural evaluation.\nAll code, benchmarks, and artifacts are publicly available to foster open\nresearch.\n","authors":["Sshubam Verma","Mohammed Safi Ur Rahman Khan","Vishwajeet Kumar","Rudra Murthy","Jaydeep Sen"],"pdf_url":"https://arxiv.org/pdf/2411.02538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18346v4","updated":"2024-11-13T17:17:43Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). Despite their impressive capabilities,\nMLLMs often suffer from over-reliance on unimodal biases (e.g., language bias\nand vision bias), leading to incorrect answers or hallucinations in complex\nmultimodal tasks. 
To investigate this issue, we propose a causal framework to\ninterpret the biases in Visual Question Answering (VQA) problems. Within this\nframework, we conduct an in-depth causal analysis to assess the causal effect\nof these biases on MLLM predictions. Based on the analysis, we introduce 1) a\nnovel MORE dataset with 12,000 challenging VQA instances requiring multi-hop\nreasoning and overcoming unimodal biases. 2) a causality-enhanced agent\nframework CAVE that guides models to comprehensively integrate information from\ndifferent modalities and mitigate biases. Our experiments show that MLLMs\nperform poorly on MORE, indicating strong unimodal biases and limited semantic\nunderstanding. However, when integrated with our CAVE, promising improvements\nin reasoning and bias mitigation can be seen. These findings provide important\ninsights for the development of more robust MLLMs and contribute to the broader\ngoal of advancing multimodal AI systems capable of deeper understanding and\nreasoning. Our project page is at https://github.com/OpenCausaLab/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08790v1","updated":"2024-11-13T17:16:48Z","published":"2024-11-13T17:16:48Z","title":"Can sparse autoencoders be used to decompose and interpret steering\n vectors?","summary":" Steering vectors are a promising approach to control the behaviour of large\nlanguage models. However, their underlying mechanisms remain poorly understood.\nWhile sparse autoencoders (SAEs) may offer a potential method to interpret\nsteering vectors, recent findings show that SAE-reconstructed vectors often\nlack the steering properties of the original vectors. 
This paper investigates\nwhy directly applying SAEs to steering vectors yields misleading\ndecompositions, identifying two reasons: (1) steering vectors fall outside the\ninput distribution for which SAEs are designed, and (2) steering vectors can\nhave meaningful negative projections in feature directions, which SAEs are not\ndesigned to accommodate. These limitations hinder the direct use of SAEs for\ninterpreting steering vectors.\n","authors":["Harry Mayne","Yushi Yang","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.08790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08785v1","updated":"2024-11-13T17:13:25Z","published":"2024-11-13T17:13:25Z","title":"Zero-shot Cross-lingual Transfer Learning with Multiple Source and\n Target Languages for Information Extraction: Language Selection and\n Adversarial Training","summary":" The majority of previous researches addressing multi-lingual IE are limited\nto zero-shot cross-lingual single-transfer (one-to-one) setting, with\nhigh-resource languages predominantly as source training data. As a result,\nthese works provide little understanding and benefit for the realistic goal of\ndeveloping a multi-lingual IE system that can generalize to as many languages\nas possible. Our study aims to fill this gap by providing a detailed analysis\non Cross-Lingual Multi-Transferability (many-to-many transfer learning), for\nthe recent IE corpora that cover a diverse set of languages. Specifically, we\nfirst determine the correlation between single-transfer performance and a wide\nrange of linguistic-based distances. From the obtained insights, a combined\nlanguage distance metric can be developed that is not only highly correlated\nbut also robust across different tasks and model scales. Next, we investigate\nthe more general zero-shot multi-lingual transfer settings where multiple\nlanguages are involved in the training and evaluation processes. 
Language\nclustering based on the newly defined distance can provide directions for\nachieving the optimal cost-performance trade-off in data (languages) selection\nproblem. Finally, a relational-transfer setting is proposed to further\nincorporate multi-lingual unlabeled data based on adversarial training using\nthe relation induced from the above linguistic distance.\n","authors":["Nghia Trung Ngo","Thien Huu Nguyen"],"pdf_url":"https://arxiv.org/pdf/2411.08785v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.03271v3","updated":"2024-11-13T17:10:20Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. 
In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08752v1","updated":"2024-11-13T16:30:41Z","published":"2024-11-13T16:30:41Z","title":"Multi-Perspective Stance Detection","summary":" Subjective NLP tasks usually rely on human annotations provided by multiple\nannotators, whose judgments may vary due to their diverse backgrounds and life\nexperiences. Traditional methods often aggregate multiple annotations into a\nsingle ground truth, disregarding the diversity in perspectives that arises\nfrom annotator disagreement. In this preliminary study, we examine the effect\nof including multiple annotations on model accuracy in classification. Our\nmethodology investigates the performance of perspective-aware classification\nmodels in stance detection task and further inspects if annotator disagreement\naffects the model confidence. The results show that multi-perspective approach\nyields better classification performance outperforming the baseline which uses\nthe single label. 
This entails that designing more inclusive perspective-aware\nAI models is not only an essential first step in implementing responsible and\nethical AI, but it can also achieve superior results than using the traditional\napproaches.\n","authors":["Benedetta Muscato","Praveen Bushipaka","Gizem Gezici","Lucia Passaro","Fosca Giannotti"],"pdf_url":"https://arxiv.org/pdf/2411.08752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07140v2","updated":"2024-11-13T16:27:43Z","published":"2024-11-11T17:10:56Z","title":"Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language\n Models","summary":" New LLM evaluation benchmarks are important to align with the rapid\ndevelopment of Large Language Models (LLMs). In this work, we present Chinese\nSimpleQA, the first comprehensive Chinese benchmark to evaluate the factuality\nability of language models to answer short questions, and Chinese SimpleQA\nmainly has five properties (i.e., Chinese, Diverse, High-quality, Static,\nEasy-to-evaluate). Specifically, first, we focus on the Chinese language over 6\nmajor topics with 99 diverse subtopics. Second, we conduct a comprehensive\nquality control process to achieve high-quality questions and answers, where\nthe reference answers are static and cannot be changed over time. Third,\nfollowing SimpleQA, the questions and answers are very short, and the grading\nprocess is easy-to-evaluate based on OpenAI API. Based on Chinese SimpleQA, we\nperform a comprehensive evaluation on the factuality abilities of existing\nLLMs. 
Finally, we hope that Chinese SimpleQA could guide the developers to\nbetter understand the Chinese factuality abilities of their models and\nfacilitate the growth of foundation models.\n","authors":["Yancheng He","Shilong Li","Jiaheng Liu","Yingshui Tan","Weixun Wang","Hui Huang","Xingyuan Bu","Hangyu Guo","Chengwei Hu","Boren Zheng","Zhuoran Lin","Xuepeng Liu","Dekai Sun","Shirong Lin","Zhicheng Zheng","Xiaoyong Zhu","Wenbo Su","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.07140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08745v1","updated":"2024-11-13T16:26:19Z","published":"2024-11-13T16:26:19Z","title":"Separating Tongue from Thought: Activation Patching Reveals\n Language-Agnostic Concept Representations in Transformers","summary":" A central question in multilingual language modeling is whether large\nlanguage models (LLMs) develop a universal concept representation, disentangled\nfrom specific languages. In this paper, we address this question by analyzing\nlatent representations (latents) during a word translation task in\ntransformer-based LLMs. We strategically extract latents from a source\ntranslation prompt and insert them into the forward pass on a target\ntranslation prompt. By doing so, we find that the output language is encoded in\nthe latent at an earlier layer than the concept to be translated. Building on\nthis insight, we conduct two key experiments. First, we demonstrate that we can\nchange the concept without changing the language and vice versa through\nactivation patching alone. Second, we show that patching with the mean over\nlatents across different languages does not impair and instead improves the\nmodels' performance in translating the concept. 
Our results provide evidence\nfor the existence of language-agnostic concept representations within the\ninvestigated models.\n","authors":["Clément Dumas","Chris Wendler","Veniamin Veselovsky","Giovanni Monea","Robert West"],"pdf_url":"https://arxiv.org/pdf/2411.08745v1.pdf","comment":"12 pages, 10 figures, previously published under the title \"How Do\n Llamas Process Multilingual Text? A Latent Exploration through Activation\n Patching\" at the ICML 2024 mechanistic interpretability workshop\n https://openreview.net/forum?id=0ku2hIm4BS"},{"id":"http://arxiv.org/abs/2411.08742v1","updated":"2024-11-13T16:20:20Z","published":"2024-11-13T16:20:20Z","title":"A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks\n with Large Language Models","summary":" With the rise of Speech Large Language Models (Speech LLMs), there has been\ngrowing interest in discrete speech tokens for their ability to integrate with\ntext-based tokens seamlessly. Compared to most studies that focus on continuous\nspeech features, although discrete-token based LLMs have shown promising\nresults on certain tasks, the performance gap between these two paradigms is\nrarely explored. In this paper, we present a fair and thorough comparison\nbetween discrete and continuous features across a variety of semantic-related\ntasks using a light-weight LLM (Qwen1.5-0.5B). Our findings reveal that\ncontinuous features generally outperform discrete tokens, particularly in tasks\nrequiring fine-grained semantic understanding. Moreover, this study goes beyond\nsurface-level comparison by identifying key factors behind the\nunder-performance of discrete tokens, such as limited token granularity and\ninefficient information retention. To enhance the performance of discrete\ntokens, we explore potential aspects based on our analysis. 
We hope our results\ncan offer new insights into the opportunities for advancing discrete speech\ntokens in Speech LLMs.\n","authors":["Dingdong Wang","Mingyu Cui","Dongchao Yang","Xueyuan Chen","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08742v1.pdf","comment":"5 tables, 4 figures"},{"id":"http://arxiv.org/abs/2411.08733v1","updated":"2024-11-13T16:15:38Z","published":"2024-11-13T16:15:38Z","title":"Dynamic Rewarding with Prompt Optimization Enables Tuning-free\n Self-Alignment of Language Models","summary":" Aligning Large Language Models (LLMs) traditionally relies on costly training\nand human preference annotations. Self-alignment seeks to reduce these expenses\nby enabling models to align themselves. To further lower costs and achieve\nalignment without any expensive tuning or annotations, we introduce a new\ntuning-free approach for self-alignment, Dynamic Rewarding with Prompt\nOptimization (\\ours). Our approach leverages a search-based optimization\nframework that allows LLMs to iteratively self-improve and craft the optimal\nalignment instructions, all without additional training or human intervention.\nThe core of \\ours is a dynamic rewarding mechanism, which identifies and\nrectifies model-specific alignment weaknesses, allowing LLMs to adapt\nefficiently to diverse alignment challenges. Empirical evaluations on eight\nrecent LLMs, both open- and closed-sourced, demonstrate that \\ours\nsignificantly enhances alignment performance, with base models outperforming\ntheir SFT/RLHF-tuned counterparts. Moreover, the prompts automatically\noptimized by \\ours surpass those curated by human experts, further validating\nthe effectiveness of our approach. Our findings highlight the great potential\nof current LLMs to achieve adaptive self-alignment through inference-time\noptimization, complementing tuning-based alignment methods.\n","authors":["Somanshu Singla","Zhen Wang","Tianyang Liu","Abdullah Ashfaq","Zhiting Hu","Eric P. 
Xing"],"pdf_url":"https://arxiv.org/pdf/2411.08733v1.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2411.08726v1","updated":"2024-11-13T16:08:40Z","published":"2024-11-13T16:08:40Z","title":"Analyst Reports and Stock Performance: Evidence from the Chinese Market","summary":" This article applies natural language processing (NLP) to extract and\nquantify textual information to predict stock performance. Using an extensive\ndataset of Chinese analyst reports and employing a customized BERT deep\nlearning model for Chinese text, this study categorizes the sentiment of the\nreports as positive, neutral, or negative. The findings underscore the\npredictive capacity of this sentiment indicator for stock volatility, excess\nreturns, and trading volume. Specifically, analyst reports with strong positive\nsentiment will increase excess return and intraday volatility, and vice versa,\nreports with strong negative sentiment also increase volatility and trading\nvolume, but decrease future excess return. The magnitude of this effect is\ngreater for positive sentiment reports than for negative sentiment reports.\nThis article contributes to the empirical literature on sentiment analysis and\nthe response of the stock market to news in the Chinese stock market.\n","authors":["Rui Liu","Jiayou Liang","Haolong Chen","Yujia Hu"],"pdf_url":"https://arxiv.org/pdf/2411.08726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08708v1","updated":"2024-11-13T15:50:38Z","published":"2024-11-13T15:50:38Z","title":"Are Triggers Needed for Document-Level Event Extraction?","summary":" Most existing work on event extraction has focused on sentence-level texts\nand presumes the identification of a trigger-span -- a word or phrase in the\ninput that evokes the occurrence of an event of interest. Event arguments are\nthen extracted with respect to the trigger. Indeed, triggers are treated as\nintegral to, and trigger detection as an essential component of, event\nextraction. 
In this paper, we provide the first investigation of the role of\ntriggers for the more difficult and much less studied task of document-level\nevent extraction. We analyze their usefulness in multiple end-to-end and\npipelined neural event extraction models for three document-level event\nextraction datasets, measuring performance using triggers of varying quality\n(human-annotated, LLM-generated, keyword-based, and random). Our research shows\nthat trigger effectiveness varies based on the extraction task's\ncharacteristics and data quality, with basic, automatically-generated triggers\nserving as a viable alternative to human-annotated ones. Furthermore, providing\ndetailed event descriptions to the extraction model helps maintain robust\nperformance even when trigger quality degrades. Perhaps surprisingly, we also\nfind that the mere existence of trigger input, even random ones, is important\nfor prompt-based LLM approaches to the task.\n","authors":["Shaden Shaar","Wayne Chen","Maitreyi Chatterjee","Barry Wang","Wenting Zhao","Claire Cardie"],"pdf_url":"https://arxiv.org/pdf/2411.08708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07879v4","updated":"2024-11-13T15:45:31Z","published":"2023-11-14T03:18:28Z","title":"Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting\n Volunteer Content Moderators","summary":" Extensive efforts in automated approaches for content moderation have been\nfocused on developing models to identify toxic, offensive, and hateful content\nwith the aim of lightening the load for moderators. Yet, it remains uncertain\nwhether improvements on those tasks have truly addressed moderators' needs in\naccomplishing their work. In this paper, we surface gaps between past research\nefforts that have aimed to provide automation for aspects of content moderation\nand the needs of volunteer content moderators, regarding identifying violations\nof various moderation rules. 
To do so, we conduct a model review on Hugging\nFace to reveal the availability of models to cover various moderation rules and\nguidelines from three exemplar forums. We further put state-of-the-art LLMs to\nthe test, evaluating how well these models perform in flagging violations of\nplatform rules from one particular forum. Finally, we conduct a user survey\nstudy with volunteer moderators to gain insight into their perspectives on\nuseful moderation models. Overall, we observe a non-trivial gap, as missing\ndeveloped models and LLMs exhibit moderate to low performance on a significant\nportion of the rules. Moderators' reports provide guides for future work on\ndeveloping moderation assistant models.\n","authors":["Yang Trista Cao","Lovely-Frances Domingo","Sarah Ann Gilbert","Michelle Mazurek","Katie Shilton","Hal Daumé III"],"pdf_url":"https://arxiv.org/pdf/2311.07879v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15339v3","updated":"2024-11-13T15:25:32Z","published":"2024-07-22T02:53:18Z","title":"Deep Learning for Economists","summary":" Deep learning provides powerful methods to impute structured information from\nlarge-scale, unstructured text and image datasets. For example, economists\nmight wish to detect the presence of economic activity in satellite images, or\nto measure the topics or entities mentioned in social media, the congressional\nrecord, or firm filings. This review introduces deep neural networks, covering\nmethods such as classifiers, regression models, generative AI, and embedding\nmodels. Applications include classification, document digitization, record\nlinkage, and methods for data exploration in massive scale text and image\ncorpora. When suitable methods are used, deep learning models can be cheap to\ntune and can scale affordably to problems involving millions or billions of\ndata points.. 
The review is accompanied by a companion website, EconDL, with\nuser-friendly demo notebooks, software resources, and a knowledge base that\nprovides technical details and additional applications.\n","authors":["Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2407.15339v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16187v3","updated":"2024-11-13T15:14:38Z","published":"2024-02-25T20:24:07Z","title":"No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design\n Choices","summary":" Advances in generative models have made it possible for AI-generated text,\ncode, and images to mirror human-generated content in many applications.\nWatermarking, a technique that aims to embed information in the output of a\nmodel to verify its source, is useful for mitigating the misuse of such\nAI-generated content. However, we show that common design choices in LLM\nwatermarking schemes make the resulting systems surprisingly susceptible to\nattack -- leading to fundamental trade-offs in robustness, utility, and\nusability. To navigate these trade-offs, we rigorously study a set of simple\nyet effective attacks on common watermarking systems, and propose guidelines\nand defenses for LLM watermarking in practice.\n","authors":["Qi Pang","Shengyuan Hu","Wenting Zheng","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2402.16187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08671v1","updated":"2024-11-13T15:04:02Z","published":"2024-11-13T15:04:02Z","title":"Theoretical Analysis of Byte-Pair Encoding","summary":" Byte-Pair Encoding (BPE) is a widely used method for subword tokenization,\nwith origins in grammar-based text compression. It is employed in a variety of\nlanguage processing tasks such as machine translation or large language model\n(LLM) pretraining, to create a token dictionary of a prescribed size. 
Most\nevaluations of BPE to date are empirical, and the reasons for its good\npractical performance are not well understood.\n In this paper we focus on the optimization problem underlying BPE: finding a\npair encoding that achieves optimal compression utility. We show that this\nproblem is APX-complete, indicating that it is unlikely to admit a\npolynomial-time approximation scheme. This answers, in a stronger form, a\nquestion recently raised by Zouhar et al.\n On the positive side, we show that BPE approximates the compression utility\nof the optimal pair encoding to a worst-case factor between $0.333$ and\n$0.625$. Our results aim to explain the ongoing success of BPE and are, to our\nknowledge, the first rigorous guarantees on its compression utility that hold\nfor all inputs.\n","authors":["László Kozma","Johannes Voderholzer"],"pdf_url":"https://arxiv.org/pdf/2411.08671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.15736v2","updated":"2024-11-13T14:05:18Z","published":"2024-03-23T06:03:36Z","title":"General LLMs as Instructors for Domain-Specific LLMs: A Sequential\n Fusion Method to Integrate Extraction and Editing","summary":" The substantial interest in updating Large Language Models (LLMs) without\nretraining from scratch is accompanied by several challenges. This is\nparticularly true when updating LLMs with datasets that necessitate\ndomain-expert reasoning across extensive texts, despite limited samples. We\ntermed the scenario as the Few-Shot Domain-Expert Reasoning for Updating LLMs\n(FDoR-UL). Traditional methods such as Low-Rank Adaptation (LoRA) and Retrieval\nAugmented Generation (RAG) are inadequate for addressing this critical issue,\nparticularly evident in our exploration of a specific medical dataset that\nepitomizes the distinct needs of FDoR-UL. To tackle this challenge, we\nintroduce a Sequential Fusion method to integrate knowledge from complex\ncontexts into LLMs. 
This method employs a two-stage framework: initially\nleveraging general LLMs to perform relation extraction for knowledge\nacquisition from complex texts, followed by updating domain-specific LLMs\nthrough Knowledge Editing (KE). Employing our method, domain-specific LLMs\nachieved a 71.7% accuracy (an average gain of 39.1%) in question-answering\ntasks. Furthermore, we expanded our evaluation to a novel economics-management\ndataset we developed, where our method achieved a 75.0% accuracy (an average\ngain of 45.0%). These findings underscore the effectiveness and flexibility of\nour approach in FDoR-UL across various domains.\n","authors":["Xin Zhang","Tianjie Ju","Huijia Liang","Ying Fu","Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2403.15736v2.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2411.08610v1","updated":"2024-11-13T13:53:10Z","published":"2024-11-13T13:53:10Z","title":"Dynamic Subset Tuning: Expanding the Operational Range of\n Parameter-Efficient Training for Large Language Models","summary":" We propose a novel parameter-efficient training (PET) method for large\nlanguage models that adapts models to downstream tasks by optimizing a small\nsubset of the existing model parameters. Unlike prior methods, this subset is\nnot fixed in location but rather which parameters are modified evolves over the\ncourse of training. This dynamic parameter selection can yield good performance\nwith many fewer parameters than extant methods. Our method enables a seamless\nscaling of the subset size across an arbitrary proportion of the total model\nsize, while popular PET approaches like prompt tuning and LoRA cover only a\nsmall part of this spectrum. 
We match or outperform prompt tuning and LoRA in\nmost cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given\nparameter budget across different model families and sizes.\n","authors":["Felix Stahlberg","Jared Lichtarge","Shankar Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08610v1.pdf","comment":"NeurIPS 2024 Workshop on Adaptive Foundation Models"},{"id":"http://arxiv.org/abs/2410.20513v2","updated":"2024-11-13T13:40:19Z","published":"2024-10-27T16:52:21Z","title":"Is Moral Self-correction An Innate Capability of Large Language Models?\n A Mechanistic Analysis to Self-correction","summary":" Though intensive attentions to the self-correction capability of Large\nLanguage Models (LLMs), the underlying mechanism of this capability is still\nunder-explored. In this paper, we aim to answer two fundamental questions for\nmoral self-correction: (1) how different components in self-correction, such as\nChain-of-Thought (CoT) reasoning, external feedback, and instructional prompts,\ninteract to enable moral self-correction; and (2) is the self-correction one of\nLLMs' innate capabilities? To answer the first question, we examine how\ndifferent self-correction components interact to intervene the embedded\nmorality within hidden states, therefore contributing to different performance.\nFor the second question, we (i) evaluate the robustness of moral\nself-correction by introducing natural language interventions of weak evidence\ninto prompts; (ii) propose a validation framework, self-distinguish, that\nrequires effective self-correction to enable LLMs to distinguish between\ndesirable and undesirable outputs. Our experimental results indicate that there\nis no universally optimal self-correction method for the tasks considered,\nalthough external feedback and CoT can contribute to additional performance\ngains. 
However, our mechanistic analysis reveals negative interactions among\ninstructional prompts, CoT, and external feedback, suggesting a conflict\nbetween internal knowledge and external feedback. The self-distinguish\nexperiments demonstrate that while LLMs can self-correct their responses, they\nare unable to reliably distinguish between desired and undesired outputs. With\nour empirical evidence, we can conclude that moral self-correction is not an\ninnate capability of LLMs acquired during pretraining.\n","authors":["Zimo Qi","Guangliang Liu","Kristen Marie Johnson","Lu Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.20513v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08599v1","updated":"2024-11-13T13:30:21Z","published":"2024-11-13T13:30:21Z","title":"XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL","summary":" To tackle the challenges of large language model performance in natural\nlanguage to SQL tasks, we introduce XiYan-SQL, an innovative framework that\nemploys a multi-generator ensemble strategy to improve candidate generation. We\nintroduce M-Schema, a semi-structured schema representation method designed to\nenhance the understanding of database structures. To enhance the quality and\ndiversity of generated candidate SQL queries, XiYan-SQL integrates the\nsignificant potential of in-context learning (ICL) with the precise control of\nsupervised fine-tuning. On one hand, we propose a series of training strategies\nto fine-tune models to generate high-quality candidates with diverse\npreferences. On the other hand, we implement the ICL approach with an example\nselection method based on named entity recognition to prevent overemphasis on\nentities. The refiner optimizes each candidate by correcting logical or\nsyntactical errors. 
To address the challenge of identifying the best candidate,\nwe fine-tune a selection model to distinguish nuances of candidate SQL queries.\nThe experimental results on multiple dialect datasets demonstrate the\nrobustness of XiYan-SQL in addressing challenges across different scenarios.\nOverall, our proposed XiYan-SQL achieves the state-of-the-art execution\naccuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on\nNL2GQL, and a competitive score of 72.23% on the Bird development benchmark.\nThe proposed framework not only enhances the quality and diversity of SQL\nqueries but also outperforms previous methods.\n","authors":["Yingqi Gao","Yifu Liu","Xiaoxia Li","Xiaorong Shi","Yin Zhu","Yiming Wang","Shiqi Li","Wei Li","Yuntao Hong","Zhiling Luo","Jinyang Gao","Liyu Mou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.08599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.17073v3","updated":"2024-11-13T12:46:54Z","published":"2024-09-25T16:32:35Z","title":"Enhancing Post-Hoc Attributions in Long Document Comprehension via\n Coarse Grained Answer Decomposition","summary":" Accurately attributing answer text to its source document is crucial for\ndeveloping a reliable question-answering system. However, attribution for long\ndocuments remains largely unexplored. Post-hoc attribution systems are designed\nto map answer text back to the source document, yet the granularity of this\nmapping has not been addressed. Furthermore, a critical question arises: What\nexactly should be attributed? This involves identifying the specific\ninformation units within an answer that require grounding. In this paper, we\npropose and investigate a novel approach to the factual decomposition of\ngenerated answers for attribution, employing template-based in-context\nlearning. To accomplish this, we utilize the question and integrate negative\nsampling during few-shot in-context learning for decomposition. 
This approach\nenhances the semantic understanding of both abstractive and extractive answers.\nWe examine the impact of answer decomposition by providing a thorough\nexamination of various attribution approaches, ranging from retrieval-based\ntechniques to LLM-based attributors.\n","authors":["Pritika Ramu","Koustava Goswami","Apoorv Saxena","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2409.17073v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v2","updated":"2024-11-13T12:37:09Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\nlots of attraction in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. 
Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hanwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2411.08553v1","updated":"2024-11-13T12:09:23Z","published":"2024-11-13T12:09:23Z","title":"CorrSynth -- A Correlated Sampling Method for Diverse Dataset Generation\n from LLMs","summary":" Large language models (LLMs) have demonstrated remarkable performance in\ndiverse tasks using zero-shot and few-shot prompting. Even though their\ncapabilities of data synthesis have been studied well in recent years, the\ngenerated data suffers from a lack of diversity, less adherence to the prompt,\nand potential biases that creep into the data from the generator model. In this\nwork, we tackle the challenge of generating datasets with high diversity, upon\nwhich a student model is trained for downstream tasks. Taking the route of\ndecoding-time guidance-based approaches, we propose CorrSynth, which generates\ndata that is more diverse and faithful to the input prompt using a correlated\nsampling strategy. Further, our method overcomes the complexity drawbacks of\nsome other guidance-based techniques like classifier-based guidance. With\nextensive experiments, we show the effectiveness of our approach and\nsubstantiate our claims. In particular, we perform intrinsic evaluation to show\nthe improvements in diversity. 
Our experiments show that CorrSynth improves\nboth student metrics and intrinsic metrics upon competitive baselines across\nfour datasets, showing the innate advantage of our method.\n","authors":["Suhas S Kowshik","Abhishek Divekar","Vijit Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08553v1.pdf","comment":"Published as a main conference paper at EMNLP 2024; First two authors\n contributed equally"},{"id":"http://arxiv.org/abs/2411.08534v1","updated":"2024-11-13T11:31:02Z","published":"2024-11-13T11:31:02Z","title":"Neural Topic Modeling with Large Language Models in the Loop","summary":" Topic modeling is a fundamental task in natural language processing, allowing\nthe discovery of latent thematic structures in text corpora. While Large\nLanguage Models (LLMs) have demonstrated promising capabilities in topic\ndiscovery, their direct application to topic modeling suffers from issues such\nas incomplete topic coverage, misalignment of topics, and inefficiency. To\naddress these limitations, we propose LLM-ITL, a novel LLM-in-the-loop\nframework that integrates LLMs with many existing Neural Topic Models (NTMs).\nIn LLM-ITL, global topics and document representations are learned through the\nNTM, while an LLM refines the topics via a confidence-weighted Optimal\nTransport (OT)-based alignment objective. This process enhances the\ninterpretability and coherence of the learned topics, while maintaining the\nefficiency of NTMs. 
Extensive experiments demonstrate that LLM-ITL can help\nNTMs significantly improve their topic interpretability while maintaining the\nquality of document representation.\n","authors":["Xiaohao Yang","He Zhao","Weijie Xu","Yuanyuan Qi","Jueqing Lu","Dinh Phung","Lan Du"],"pdf_url":"https://arxiv.org/pdf/2411.08534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07268v2","updated":"2024-11-13T11:28:07Z","published":"2024-11-09T15:59:59Z","title":"Target-driven Attack for Large Language Models","summary":" Current large language models (LLM) provide a strong foundation for\nlarge-scale user-oriented natural language tasks. Many users can easily inject\nadversarial text or instructions through the user interface, thus causing LLM\nmodel security challenges like the language model not giving the correct\nanswer. Although there is currently a large amount of research on black-box\nattacks, most of these black-box attacks use random and heuristic strategies.\nIt is unclear how these strategies relate to the success rate of attacks and\nthus effectively improve model robustness. To solve this problem, we propose\nour target-driven black-box attack method to maximize the KL divergence between\nthe conditional probabilities of the clean text and the attack text to redefine\nthe attack's goal. We transform the distance maximization problem into two\nconvex optimization problems based on the attack goal to solve the attack text\nand estimate the covariance. Furthermore, the projected gradient descent\nalgorithm solves the vector corresponding to the attack text. Our target-driven\nblack-box attack approach includes two attack strategies: token manipulation\nand misinformation attack. 
Experimental results on multiple Large Language\nModels and datasets demonstrate the effectiveness of our attack method.\n","authors":["Chong Zhang","Mingyu Jin","Dong Shu","Taowen Wang","Dongfang Liu","Xiaobo Jin"],"pdf_url":"https://arxiv.org/pdf/2411.07268v2.pdf","comment":"12 pages, 7 figures. This work is an extension of the\n arXiv:2404.07234 work. We propose new methods. 27th European Conference on\n Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2405.10040v3","updated":"2024-11-13T11:13:56Z","published":"2024-05-16T12:22:41Z","title":"SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation","summary":" It is often desirable to distill the capabilities of large language models\n(LLMs) into smaller student models due to compute and memory constraints. One\nway to do this for classification tasks is via dataset synthesis, which can be\naccomplished by generating examples of each label from the LLM. Prior\napproaches to synthesis use few-shot prompting, which relies on the LLM's\nparametric knowledge to generate usable examples. However, this leads to issues\nof repetition, bias towards popular entities, and stylistic differences from\nhuman text. In this work, we propose Synthesize by Retrieval and Refinement\n(SynthesizRR), which uses retrieval augmentation to introduce variety into the\ndataset synthesis process: as retrieved passages vary, the LLM is seeded with\ndifferent content to generate its examples. We empirically study the synthesis\nof six datasets, covering topic classification, sentiment analysis, tone\ndetection, and humor, requiring complex synthesis strategies. We find that\nSynthesizRR greatly improves lexical and semantic diversity, similarity to\nhuman-written text, and distillation performance, when compared to 32-shot\nprompting and four prior approaches. 
We release our code to perform all steps\nat https://github.com/amazon-science/synthesizrr\n","authors":["Abhishek Divekar","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2405.10040v3.pdf","comment":"Published as a main conference paper at EMNLP 2024. Code available at\n https://github.com/amazon-science/synthesizrr"},{"id":"http://arxiv.org/abs/2411.08516v1","updated":"2024-11-13T11:02:04Z","published":"2024-11-13T11:02:04Z","title":"Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale\n Table Understanding","summary":" The ubiquity and value of tables as semi-structured data across various\ndomains necessitate advanced methods for understanding their complexity and\nvast amounts of information. Despite the impressive capabilities of large\nlanguage models (LLMs) in advancing the natural language understanding\nfrontier, their application to large-scale tabular data presents significant\nchallenges, specifically regarding table size and complex intricate\nrelationships. Existing works have shown promise with small-scale tables but\noften flounder when tasked with the complex reasoning required by larger,\ninterconnected tables found in real-world scenarios. To address this gap, we\nintroduce \"Tree-of-Table\", a novel approach designed to enhance LLMs' reasoning\ncapabilities over large and complex tables. Our method employs Table\nCondensation and Decomposition to distill and reorganize relevant data into a\nmanageable format, followed by the construction of a hierarchical Table-Tree\nthat facilitates tree-structured reasoning. Through a meticulous Table-Tree\nExecution process, we systematically unravel the tree-structured reasoning\nchain to derive the solutions. 
Experiments across diverse datasets, including\nWikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new\nbenchmark with superior performance, showcasing remarkable efficiency and\ngeneralization capabilities in large-scale table reasoning.\n","authors":["Deyi Ji","Lanyun Zhu","Siqi Gao","Peng Xu","Hongtao Lu","Jieping Ye","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.13929v4","updated":"2024-11-13T10:57:21Z","published":"2024-05-22T18:58:58Z","title":"Vikhr: Constructing a State-of-the-art Bilingual Open-Source\n Instruction-Following Large Language Model for Russian","summary":" There has been a surge in developing various Large Language Models (LLMs).\nHowever, text generation for languages other than English often faces\nsignificant challenges, including poor generation quality and reduced\ncomputational performance due to the disproportionate representation of tokens\nin the model's vocabulary. In this work, we address these issues by developing\na pipeline for adapting English-oriented pre-trained models to other languages\nand constructing efficient bilingual LLMs. Using this pipeline, we construct\nVikhr, a state-of-the-art bilingual open-source instruction-following LLM\ndesigned specifically for the Russian language. \"Vikhr\" refers to the name of\nthe Mistral LLM series and means a \"strong gust of wind.\" Unlike previous\nRussian-language models that typically rely on LoRA adapters on top of\nEnglish-oriented models, sacrificing performance for lower training costs,\nVikhr features an adapted tokenizer vocabulary and undergoes continued\npre-training and instruction tuning of all weights. This not only enhances the\nmodel's performance but also significantly improves its computational and\ncontextual efficiency. 
The remarkable performance of Vikhr across various\nRussian-language benchmarks can also be attributed to our efforts in expanding\ninstruction datasets and corpora for continued pre-training. Vikhr not only\nsets a new state of the art among open-source LLMs for Russian but even\noutperforms some proprietary closed-source models on certain benchmarks. The\nmodel weights, instruction sets, and code are publicly available.\n","authors":["Aleksandr Nikolich","Konstantin Korolev","Sergei Bratchikov","Igor Kiselev","Artem Shelmanov"],"pdf_url":"https://arxiv.org/pdf/2405.13929v4.pdf","comment":"Accepted at WMRL @ EMNLP-2024"},{"id":"http://arxiv.org/abs/2411.08506v1","updated":"2024-11-13T10:43:31Z","published":"2024-11-13T10:43:31Z","title":"An Information Theoretic Approach to Operationalize Right to Data\n Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. 
Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v1.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2411.08504v1","updated":"2024-11-13T10:42:11Z","published":"2024-11-13T10:42:11Z","title":"Towards Objective and Unbiased Decision Assessments with LLM-Enhanced\n Hierarchical Attention Networks","summary":" How objective and unbiased are we while making decisions? This work\ninvestigates cognitive bias identification in high-stake decision making\nprocess by human experts, questioning its effectiveness in real-world settings,\nsuch as candidates assessments for university admission. We begin with a\nstatistical analysis assessing correlations among different decision points\namong in the current process, which discovers discrepancies that imply\ncognitive bias and inconsistency in decisions. This motivates our exploration\nof bias-aware AI-augmented workflow that surpass human judgment. We propose\nBGM-HAN, a hierarchical attention network enhanced by byte-pair encoding,\nmulti-head attention and gated residual connection. Using it as backbone model,\nwe further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which\nsimulate real-world decision-making. 
In our experiments, both the proposed\nmodel and the agentic workflow significantly improves on both human judgment\nand alternative models, validated with real-world data.\n","authors":["Junhua Liu","Kwan Hui Lim","Roy Ka-Wei Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08449v1","updated":"2024-11-13T09:11:56Z","published":"2024-11-13T09:11:56Z","title":"Towards Evaluating Large Language Models for Graph Query Generation","summary":" Large Language Models (LLMs) are revolutionizing the landscape of Generative\nArtificial Intelligence (GenAI), with innovative LLM-backed solutions emerging\nrapidly. However, when applied to database technologies, specifically query\ngeneration for graph databases and Knowledge Graphs (KGs), LLMs still face\nsignificant challenges. While research on LLM-driven query generation for\nStructured Query Language (SQL) exists, similar systems for graph databases\nremain underdeveloped. This paper presents a comparative study addressing the\nchallenge of generating Cypher queries a powerful language for interacting with\ngraph databases using open-access LLMs. We rigorously evaluate several LLM\nagents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a\nlocally deployed Llama 3.1 8B) using a designed few-shot learning prompt and\nRetrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT)\nreasoning. 
Our empirical analysis of query generation accuracy reveals that\nClaude Sonnet 3.5 outperforms its counterparts in this specific domain.\nFurther, we highlight promising future research directions to address the\nidentified limitations and advance LLM-driven query generation for graph\ndatabases.\n","authors":["Siraj Munir","Alessandro Aldini"],"pdf_url":"https://arxiv.org/pdf/2411.08449v1.pdf","comment":"Paper accepted and will be presented at CSCI2024 in December 2024,\n Later will be published at Springer LNCS"},{"id":"http://arxiv.org/abs/2404.17808v3","updated":"2024-11-13T08:51:04Z","published":"2024-04-27T07:12:07Z","title":"Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models\n with Simple and Effective Scaffold Token Removal","summary":" Byte Pair Encoding (BPE) serves as a foundation method for text tokenization\nin the Natural Language Processing (NLP) field. Despite its wide adoption, the\noriginal BPE algorithm harbors an inherent flaw: it inadvertently introduces a\nfrequency imbalance for tokens in the text corpus. Since BPE iteratively merges\nthe most frequent token pair in the text corpus to generate a new token and\nkeeps all generated tokens in the vocabulary, it unavoidably holds tokens that\nprimarily act as components of a longer token and appear infrequently on their\nown. We term such tokens as Scaffold Tokens. Due to their infrequent\noccurrences in the text corpus, Scaffold Tokens pose a learning imbalance\nissue. To address that issue, we propose Scaffold-BPE, which incorporates a\ndynamic scaffold token removal mechanism by parameter-free, computation-light,\nand easy-to-implement modifications to the original BPE method. This novel\napproach ensures the exclusion of low-frequency Scaffold Tokens from the token\nrepresentations for given texts, thereby mitigating the issue of frequency\nimbalance and facilitating model training. 
On extensive experiments across\nlanguage modeling and even machine translation, Scaffold-BPE consistently\noutperforms the original BPE, well demonstrating its effectiveness.\n","authors":["Haoran Lian","Yizhe Xiong","Jianwei Niu","Shasha Mo","Zhenpeng Su","Zijia Lin","Hui Chen","Peng Liu","Jungong Han","Guiguang Ding"],"pdf_url":"https://arxiv.org/pdf/2404.17808v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08432v1","updated":"2024-11-13T08:32:42Z","published":"2024-11-13T08:32:42Z","title":"One STEP at a time: Language Agents are Stepwise Planners","summary":" Language agents have shown promising adaptability in dynamic environments to\nperform complex tasks. However, despite the versatile knowledge embedded in\nlarge language models, these agents still fall short when it comes to tasks\nthat require planning. We introduce STEP, a novel framework designed to\nefficiently learn from previous experiences to enhance the planning\ncapabilities of language agents in future steps. Concretely, STEP functions\nthrough four interconnected components. First, the Planner takes on the task,\nbreaks it down into subtasks and provides relevant insights. Then the Executor\ngenerates action candidates, while the Evaluator ensures the actions align with\nlearned rules from previous experiences. Lastly, Memory stores experiences to\ninform future decisions. In the ScienceWorld benchmark, our results show that\nSTEP consistently outperforms state-of-the-art models, achieving an overall\nscore of 67.4 and successfully completing 12 out of 18 tasks. 
These findings\nhighlight STEP's potential as a framework for enhancing planning capabilities\nin language agents, paving the way for more sophisticated task-solving in\ndynamic environments.\n","authors":["Minh Nguyen","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2411.08432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08397v1","updated":"2024-11-13T07:32:58Z","published":"2024-11-13T07:32:58Z","title":"CLaSP: Learning Concepts for Time-Series Signals from Natural Language\n Supervision","summary":" This paper proposes a foundation model called \"CLaSP\" that can search time\nseries signals using natural language that describes the characteristics of the\nsignals as queries. Previous efforts to represent time series signal data in\nnatural language have had challenges in designing a conventional class of time\nseries signal characteristics, formulating their quantification, and creating a\ndictionary of synonyms. To overcome these limitations, the proposed method\nintroduces a neural network based on contrastive learning. This network is\nfirst trained using the datasets TRUCE and SUSHI, which consist of time series\nsignals and their corresponding natural language descriptions. Previous studies\nhave proposed vocabularies that data analysts use to describe signal\ncharacteristics, and SUSHI was designed to cover these terms. We believe that a\nneural network trained on these datasets will enable data analysts to search\nusing natural language vocabulary. Furthermore, our method does not require a\ndictionary of predefined synonyms, and it leverages common sense knowledge\nembedded in a large-scale language model (LLM). 
Experimental results\ndemonstrate that CLaSP enables natural language search of time series signal\ndata and can accurately learn the points at which signal data changes.\n","authors":["Aoi Ito","Kota Dohi","Yohei Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2411.08397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02288v2","updated":"2024-11-13T07:13:36Z","published":"2024-08-05T07:54:01Z","title":"Spin glass model of in-context learning","summary":" Large language models show a surprising in-context learning ability -- being\nable to use a prompt to form a prediction for a query, yet without additional\ntraining, in stark contrast to old-fashioned supervised learning. Providing a\nmechanistic interpretation and linking the empirical phenomenon to physics are\nthus challenging and remain unsolved. We study a simple yet expressive\ntransformer with linear attention and map this structure to a spin glass model\nwith real-valued spins, where the couplings and fields explain the intrinsic\ndisorder in data. The spin glass model explains how the weight parameters\ninteract with each other during pre-training, and further clarifies why an\nunseen function can be predicted by providing only a prompt yet without further\ntraining. Our theory reveals that for single-instance learning, increasing the\ntask diversity leads to the emergence of in-context learning, by allowing the\nBoltzmann distribution to converge to a unique correct solution of weight\nparameters. Therefore the pre-trained transformer displays a prediction power\nin a novel prompt setting. 
The proposed analytically tractable model thus\noffers a promising avenue for thinking about how to interpret many intriguing\nbut puzzling properties of large language models.\n","authors":["Yuhao Li","Ruoran Bai","Haiping Huang"],"pdf_url":"https://arxiv.org/pdf/2408.02288v2.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08384v1","updated":"2024-11-13T07:10:18Z","published":"2024-11-13T07:10:18Z","title":"Interpretable Syntactic Representations Enable Hierarchical Word Vectors","summary":" The distributed representations currently used are dense and uninterpretable,\nleading to interpretations that themselves are relative, overcomplete, and hard\nto interpret. We propose a method that transforms these word vectors into\nreduced syntactic representations. The resulting representations are compact\nand interpretable allowing better visualization and comparison of the word\nvectors and we successively demonstrate that the drawn interpretations are in\nline with human judgment. The syntactic representations are then used to create\nhierarchical word vectors using an incremental learning approach similar to the\nhierarchical aspect of human learning. As these representations are drawn from\npre-trained vectors, the generation process and learning approach are\ncomputationally efficient. 
Most importantly, we find out that syntactic\nrepresentations provide a plausible interpretation of the vectors and\nsubsequent hierarchical vectors outperform the original vectors in benchmark\ntests.\n","authors":["Biraj Silwal"],"pdf_url":"https://arxiv.org/pdf/2411.08384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07820v2","updated":"2024-11-13T05:43:58Z","published":"2024-11-12T14:12:45Z","title":"Query Optimization for Parametric Knowledge Refinement in\n Retrieval-Augmented Large Language Models","summary":" We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel\napproach designed to bridge the pre-retrieval information gap in\nRetrieval-Augmented Generation (RAG) systems through query optimization\ntailored to meet the specific knowledge requirements of Large Language Models\n(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR\nframework begins by extracting parametric knowledge from LLMs, followed by\nusing a specialized query optimizer for refining these queries. This process\nensures the retrieval of only the most pertinent information essential for\ngenerating accurate responses. Moreover, to enhance flexibility and reduce\ncomputational costs, we propose a trainable scheme for our pipeline that\nutilizes a smaller, tunable model as the query optimizer, which is refined\nthrough knowledge distillation from a larger teacher model. 
Our evaluations on\nvarious question-answering (QA) datasets and with different retrieval systems\nshow that ERRR consistently outperforms existing baselines, proving to be a\nversatile and cost-effective module for improving the utility and accuracy of\nRAG systems.\n","authors":["Youan Cong","Cheng Wang","Pritom Saha Akash","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2411.07820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08348v1","updated":"2024-11-13T05:40:24Z","published":"2024-11-13T05:40:24Z","title":"Refining Translations with LLMs: A Constraint-Aware Iterative Prompting\n Approach","summary":" Large language models (LLMs) have demonstrated remarkable proficiency in\nmachine translation (MT), even without specific training on the languages in\nquestion. However, translating rare words in low-resource or domain-specific\ncontexts remains challenging for LLMs. To address this issue, we propose a\nmulti-step prompt chain that enhances translation faithfulness by prioritizing\nkey terms crucial for semantic accuracy. Our method first identifies these\nkeywords and retrieves their translations from a bilingual dictionary,\nintegrating them into the LLM's context using Retrieval-Augmented Generation\n(RAG). We further mitigate potential output hallucinations caused by long\nprompts through an iterative self-checking mechanism, where the LLM refines its\ntranslations based on lexical and semantic constraints. 
Experiments using Llama\nand Qwen as base models on the FLORES-200 and WMT datasets demonstrate\nsignificant improvements over baselines, highlighting the effectiveness of our\napproach in enhancing translation faithfulness and robustness, particularly in\nlow-resource scenarios.\n","authors":["Shangfeng Chen","Xiayang Shi","Pu Li","Yinlin Li","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08347v1","updated":"2024-11-13T05:38:55Z","published":"2024-11-13T05:38:55Z","title":"A Chinese Multi-label Affective Computing Dataset Based on Social Media\n Network Users","summary":" Emotion and personality are central elements in understanding human\npsychological states. Emotions reflect an individual subjective experiences,\nwhile personality reveals relatively stable behavioral and cognitive patterns.\nExisting affective computing datasets often annotate emotion and personality\ntraits separately, lacking fine-grained labeling of micro-emotions and emotion\nintensity in both single-label and multi-label classifications. Chinese emotion\ndatasets are extremely scarce, and datasets capturing Chinese user personality\ntraits are even more limited. To address these gaps, this study collected data\nfrom the major social media platform Weibo, screening 11,338 valid users from\nover 50,000 individuals with diverse MBTI personality labels and acquiring\n566,900 posts along with the user MBTI personality tags. Using the EQN method,\nwe compiled a multi-label Chinese affective computing dataset that integrates\nthe same user's personality traits with six emotions and micro-emotions, each\nannotated with intensity levels. Validation results across multiple NLP\nclassification models demonstrate the dataset strong utility. 
This dataset is\ndesigned to advance machine recognition of complex human emotions and provide\ndata support for research in psychology, education, marketing, finance, and\npolitics.\n","authors":["Jingyi Zhou","Senlin Luo","Haofan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08344v1","updated":"2024-11-13T05:22:45Z","published":"2024-11-13T05:22:45Z","title":"Bangla Grammatical Error Detection Leveraging Transformer-based Token\n Classification","summary":" Bangla is the seventh most spoken language by a total number of speakers in\nthe world, and yet the development of an automated grammar checker in this\nlanguage is an understudied problem. Bangla grammatical error detection is a\ntask of detecting sub-strings of a Bangla text that contain grammatical,\npunctuation, or spelling errors, which is crucial for developing an automated\nBangla typing assistant. Our approach involves breaking down the task as a\ntoken classification problem and utilizing state-of-the-art transformer-based\nmodels. Finally, we combine the output of these models and apply rule-based\npost-processing to generate a more reliable and comprehensive result. Our\nsystem is evaluated on a dataset consisting of over 25,000 texts from various\nsources. Our best model achieves a Levenshtein distance score of 1.04. Finally,\nwe provide a detailed analysis of different components of our system.\n","authors":["Shayekh Bin Islam","Ridwanul Hasan Tanvir","Sihat Afnan"],"pdf_url":"https://arxiv.org/pdf/2411.08344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.17439v3","updated":"2024-11-13T04:57:08Z","published":"2024-10-22T21:30:58Z","title":"Evaluating AI-Generated Essays with GRE Analytical Writing Assessment","summary":" The recent revolutionary advance in generative AI enables the generation of\nrealistic and coherent texts by large language models (LLMs). 
Despite many\nexisting evaluation metrics on the quality of the generated texts, there is\nstill a lack of rigorous assessment of how well LLMs perform in complex and\ndemanding writing assessments. This study examines essays generated by ten\nleading LLMs for the analytical writing assessment of the Graduate Record Exam\n(GRE). We assessed these essays using both human raters and the e-rater\nautomated scoring engine as used in the GRE scoring pipeline. Notably, the\ntop-performing Gemini and GPT-4o received an average score of 4.78 and 4.67,\nrespectively, falling between \"generally thoughtful, well-developed analysis of\nthe issue and conveys meaning clearly\" and \"presents a competent analysis of\nthe issue and conveys meaning with acceptable clarity\" according to the GRE\nscoring guideline. We also evaluated the detection accuracy of these essays,\nwith detectors trained on essays generated by the same and different LLMs.\n","authors":["Yang Zhong","Jiangang Hao","Michael Fauss","Chen Li","Yuan Wang"],"pdf_url":"https://arxiv.org/pdf/2410.17439v3.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2410.15553v2","updated":"2024-11-13T04:26:13Z","published":"2024-10-21T00:59:47Z","title":"Multi-IF: Benchmarking LLMs on Multi-Turn and Multilingual Instructions\n Following","summary":" Large Language Models (LLMs) have demonstrated impressive capabilities in\nvarious tasks, including instruction following, which is crucial for aligning\nmodel outputs with user expectations. However, evaluating LLMs' ability to\nfollow instructions remains challenging due to the complexity and subjectivity\nof human language. Current benchmarks primarily focus on single-turn,\nmonolingual instructions, which do not adequately reflect the complexities of\nreal-world applications that require handling multi-turn and multilingual\ninteractions. 
To address this gap, we introduce Multi-IF, a new benchmark\ndesigned to assess LLMs' proficiency in following multi-turn and multilingual\ninstructions. Multi-IF, which utilizes a hybrid framework combining LLM and\nhuman annotators, expands upon the IFEval by incorporating multi-turn sequences\nand translating the English prompts into another 7 languages, resulting in a\ndataset of 4,501 multilingual conversations, where each has three turns. Our\nevaluation of 14 state-of-the-art LLMs on Multi-IF reveals that it presents a\nsignificantly more challenging task than existing benchmarks. All the models\ntested showed a higher rate of failure in executing instructions correctly with\neach additional turn. For example, o1-preview drops from 0.877 at the first\nturn to 0.707 at the third turn in terms of average accuracy over all\nlanguages. Moreover, languages with non-Latin scripts (Hindi, Russian, and\nChinese) generally exhibit higher error rates, suggesting potential limitations\nin the models' multilingual capabilities. We release Multi-IF prompts and the\nevaluation code base to encourage further research in this critical area.\n","authors":["Yun He","Di Jin","Chaoqi Wang","Chloe Bi","Karishma Mandyam","Hejia Zhang","Chen Zhu","Ning Li","Tengyu Xu","Hongjiang Lv","Shruti Bhosale","Chenguang Zhu","Karthik Abinav Sankararaman","Eryk Helenowski","Melanie Kambadur","Aditya Tayade","Hao Ma","Han Fang","Sinong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.15553v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08324v1","updated":"2024-11-13T04:20:20Z","published":"2024-11-13T04:20:20Z","title":"Are LLMs Prescient? A Continuous Evaluation using Daily News as the\n Oracle","summary":" Many existing evaluation benchmarks for Large Language Models (LLMs) quickly\nbecome outdated due to the emergence of new models and training data. 
These\nbenchmarks also fall short in assessing how LLM performance changes over time,\nas they consist of static questions without a temporal dimension. To address\nthese limitations, we propose using future event prediction as a continuous\nevaluation method to assess LLMs' temporal generalization and forecasting\nabilities. Our benchmark, Daily Oracle, automatically generates question-answer\n(QA) pairs from daily news, challenging LLMs to predict \"future\" event\noutcomes. Our findings reveal that as pre-training data becomes outdated, LLM\nperformance degrades over time. While Retrieval Augmented Generation (RAG) has\nthe potential to enhance prediction accuracy, the performance degradation\npattern persists, highlighting the need for continuous model updates.\n","authors":["Hui Dai","Ryan Teehan","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2411.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.11216v2","updated":"2024-11-13T04:16:21Z","published":"2024-10-15T03:02:03Z","title":"Experiences from Creating a Benchmark for Sentiment Classification for\n Varieties of English","summary":" Existing benchmarks often fail to account for linguistic diversity, like\nlanguage variants of English. In this paper, we share our experiences from our\nongoing project of building a sentiment classification benchmark for three\nvariants of English: Australian (en-AU), Indian (en-IN), and British (en-UK)\nEnglish. Using Google Places reviews, we explore the effects of various\nsampling techniques based on label semantics, review length, and sentiment\nproportion and report performances on three fine-tuned BERT-based models. Our\ninitial evaluation reveals significant performance variations influenced by\nsample characteristics, label semantics, and language variety, highlighting the\nneed for nuanced benchmark design. 
We offer actionable insights for researchers\nto create robust benchmarks, emphasising the importance of diverse sampling,\ncareful label definition, and comprehensive evaluation across linguistic\nvarieties.\n","authors":["Dipankar Srirag","Jordan Painter","Aditya Joshi","Diptesh Kanojia"],"pdf_url":"https://arxiv.org/pdf/2410.11216v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2411.07521v2","updated":"2024-11-13T04:03:54Z","published":"2024-11-12T03:37:53Z","title":"Fair Summarization: Bridging Quality and Diversity in Extractive\n Summaries","summary":" Fairness in multi-document summarization of user-generated content remains a\ncritical challenge in natural language processing (NLP). Existing summarization\nmethods often fail to ensure equitable representation across different social\ngroups, leading to biased outputs. In this paper, we introduce two novel\nmethods for fair extractive summarization: FairExtract, a clustering-based\napproach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints.\nWe evaluate these methods using Divsumm summarization dataset of White-aligned,\nHispanic, and African-American dialect tweets and compare them against relevant\nbaselines. The results obtained using a comprehensive set of summarization\nquality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well\nas a fairness metric F, demonstrate that FairExtract and FairGPT achieve\nsuperior fairness while maintaining competitive summarization quality.\nAdditionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that\nintegrate quality and fairness into a single evaluation framework, offering a\nmore nuanced understanding of the trade-offs between these objectives. 
This\nwork highlights the importance of fairness in summarization and sets a\nbenchmark for future research in fairness-aware NLP models.\n","authors":["Sina Bagheri Nezhad","Sayan Bandyapadhyay","Ameeta Agrawal"],"pdf_url":"https://arxiv.org/pdf/2411.07521v2.pdf","comment":"Accepted at Algorithmic Fairness through the Lens of Metrics and\n Evaluation Workshop @ NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08302v1","updated":"2024-11-13T02:45:21Z","published":"2024-11-13T02:45:21Z","title":"R3HF: Reward Redistribution for Enhancing Reinforcement Learning from\n Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) provides a paradigm for\naligning large language models (LLMs) with human preferences. This involves the\ninitial training of a reward model based on pairwise human feedback. The reward\nmodel is subsequently utilized in reinforcement learning to assess the scores\nof each generated sentence as a whole, further guiding the optimization of\nLLMs. However, current approaches have a significant shortcoming: \\emph{They\nallocate a single, sparse, and delayed reward to an entire sequence of output}.\nThis may overlook some significant individual contributions of each token\ntowards the desired outcome. To overcome this limitation, our paper proposes a\nnovel reward redistribution method called R3HF, which facilitates a more\nfine-grained, token-level reward allocation. Specifically, our method treats\nthe reward prediction task of the reward model as a regression problem. As a\nresult, the redistributed rewards are computed by evaluating the specific\ncontribution of each token to the reward model's output. This detailed approach\nimproves the model's understanding of language nuances, leading to more precise\nenhancements in its performance. 
Our method is crafted to integrate seamlessly\nwith most current techniques while incurring minimal computational costs.\nThrough comprehensive experiments across diverse datasets and tasks, we have\nverified the effectiveness and superiority of our approach.\n","authors":["Jiahui Li","Tai-wei Chang","Fengda Zhang","Kun Kuang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08302v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. 
The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08278v1","updated":"2024-11-13T01:33:05Z","published":"2024-11-13T01:33:05Z","title":"Knowledge Bases in Support of Large Language Models for Processing Web\n News","summary":" Large Language Models (LLMs) have received considerable interest in wide\napplications lately. During pre-training via massive datasets, such a model\nimplicitly memorizes the factual knowledge of trained datasets in its hidden\nparameters. However, knowledge held implicitly in parameters often makes its\nuse by downstream applications ineffective due to the lack of common-sense\nreasoning. In this article, we introduce a general framework that permits to\nbuild knowledge bases with an aid of LLMs, tailored for processing Web news.\nThe framework applies a rule-based News Information Extractor (NewsIE) to news\nitems for extracting their relational tuples, referred to as knowledge bases,\nwhich are then graph-convoluted with the implicit knowledge facts of news items\nobtained by LLMs, for their classification. It involves two lightweight\ncomponents: 1) NewsIE: for extracting the structural information of every news\nitem, in the form of relational tuples; 2) BERTGraph: for graph convoluting the\nimplicit knowledge facts with relational tuples extracted by NewsIE. 
We have\nevaluated our framework under different news-related datasets for news category\nclassification, with promising experimental results.\n","authors":["Yihe Zhang","Nabin Pakka","Nian-feng Tzeng"],"pdf_url":"https://arxiv.org/pdf/2411.08278v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.08275v1","updated":"2024-11-13T01:12:35Z","published":"2024-11-13T01:12:35Z","title":"A Large-Scale Study of Relevance Assessments with Large Language Models:\n An Initial Look","summary":" The application of large language models to provide relevance assessments\npresents exciting opportunities to advance information retrieval, natural\nlanguage processing, and beyond, but to date many unknowns remain. This paper\nreports on the results of a large-scale evaluation (the TREC 2024 RAG Track)\nwhere four different relevance assessment approaches were deployed in situ: the\n\"standard\" fully manual process that NIST has implemented for decades and three\ndifferent alternatives that take advantage of LLMs to different extents using\nthe open-source UMBRELA tool. This setup allows us to correlate system rankings\ninduced by the different approaches to characterize tradeoffs between cost and\nquality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system\nrankings induced by automatically generated relevance assessments from UMBRELA\ncorrelate highly with those induced by fully manual assessments across a\ndiverse set of 77 runs from 19 teams. Our results suggest that automatically\ngenerated UMBRELA judgments can replace fully manual judgments to accurately\ncapture run-level effectiveness. Surprisingly, we find that LLM assistance does\nnot appear to increase correlation with fully manual assessments, suggesting\nthat costs associated with human-in-the-loop processes do not bring obvious\ntangible benefits. Overall, human assessors appear to be stricter than UMBRELA\nin applying relevance criteria. 
Our work validates the use of LLMs in academic\nTREC-style evaluations and provides the foundation for future studies.\n","authors":["Shivani Upadhyay","Ronak Pradeep","Nandan Thakur","Daniel Campos","Nick Craswell","Ian Soboroff","Hoa Trang Dang","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.08275v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18164v2","updated":"2024-11-13T00:15:46Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. 
We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Constantin Adam","Abdulhamid Adebayo","Sungeun An","Yuan Chi Chang","Xuan-Hong Dang","Nirmit Desai","Michele Dolfi","Hajar Emami-Gohari","Revital Eres","Takuya Goto","Dhiraj Joshi","Yan Koyfman","Mohammad Nassar","Hima Patel","Paramesvaran Selvam","Yousaf Shah","Saptha Surendran","Daiki Tsuzuku","Petros Zerfos","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v2.pdf","comment":"10 pages, 7 figures"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.08034v2","updated":"2024-11-13T18:59:44Z","published":"2024-11-12T18:59:35Z","title":"Scaling Properties of Diffusion Models for Perceptual Tasks","summary":" In this paper, we argue that iterative computation with diffusion models\noffers a powerful paradigm for not only generation but also visual perception\ntasks. We unify tasks such as depth estimation, optical flow, and amodal\nsegmentation under the framework of image-to-image translation, and show how\ndiffusion models benefit from scaling training and test-time compute for these\nperceptual tasks. Through a careful analysis of these scaling properties, we\nformulate compute-optimal training and inference recipes to scale diffusion\nmodels for visual perception tasks. Our models achieve competitive performance\nto state-of-the-art methods using significantly less data and compute. 
To\naccess our code and models, see https://scaling-diffusion-perception.github.io .\n","authors":["Rahul Ravishankar","Zeeshan Patel","Jathushan Rajasegaran","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2411.08034v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08879v1","updated":"2024-11-13T18:56:39Z","published":"2024-11-13T18:56:39Z","title":"4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization","summary":" Novel view synthesis of dynamic scenes is becoming important in various\napplications, including augmented and virtual reality. We propose a novel 4D\nGaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded\nmonocular videos. To overcome the overfitting problem of existing work for\nthese real-world videos, we introduce an uncertainty-aware regularization that\nidentifies uncertain regions with few observations and selectively imposes\nadditional priors based on diffusion models and depth smoothness on such\nregions. This approach improves both the performance of novel view synthesis\nand the quality of training image reconstruction. We also identify the\ninitialization problem of 4DGS in fast-moving dynamic regions, where the\nStructure from Motion (SfM) algorithm fails to provide reliable 3D landmarks.\nTo initialize Gaussian primitives in such regions, we present a dynamic region\ndensification method using the estimated depth maps and scene flow. 
Our\nexperiments show that the proposed method improves the performance of 4DGS\nreconstruction from a video captured by a handheld monocular camera and also\nexhibits promising results in few-shot static scene reconstruction.\n","authors":["Mijeong Kim","Jongwoo Lim","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2411.08879v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08878v1","updated":"2024-11-13T18:55:10Z","published":"2024-11-13T18:55:10Z","title":"A Short Note on Evaluating RepNet for Temporal Repetition Counting in\n Videos","summary":" We discuss some consistent issues on how RepNet has been evaluated in various\npapers. As a way to mitigate these issues, we report RepNet performance results\non different datasets, and release evaluation code and the RepNet checkpoint to\nobtain these results. Code URL:\nhttps://github.com/google-research/google-research/blob/master/repnet/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Pierre Sermanet","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.08878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10172v3","updated":"2024-11-13T18:42:18Z","published":"2024-04-15T23:01:59Z","title":"Forensic Iris Image-Based Post-Mortem Interval Estimation","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup. One factor that may be useful in\nconditioning iris recognition methods is the tissue decomposition level, which\nis correlated with the post-mortem interval (PMI), \\ie the number of hours that\nhave elapsed since death. PMI, however, is not always available, and its\nprecise estimation remains one of the core challenges in forensic examination.\nThis paper presents the first known to us method of the PMI estimation directly\nfrom iris images captured after death. 
To assess the feasibility of the\niris-based PMI estimation, we designed models predicting the PMI from (a)\nnear-infrared (NIR), (b) visible (RGB), and (c) multispectral (RGB+NIR)\nforensic iris images. Models were evaluated following a 10-fold\ncross-validation, in (S1) sample-disjoint, (S2) subject-disjoint, and (S3)\ncross-dataset scenarios. We explore two data balancing techniques for S3:\nresampling-based balancing (S3-real), and synthetic data-supplemented balancing\n(S3-synthetic). We found that using the multispectral data offers a\nspectacularly low mean absolute error (MAE) of $\\approx 3.5$ hours in the\nscenario (S1), a bit worse MAE $\\approx 17.5$ hours in the scenario (S2), and\nMAE $\\approx 45.77$ hours in the scenario (S3). Additionally, supplementing the\ntraining set with synthetically-generated forensic iris images (S3-synthetic)\nsignificantly enhances the models' ability to generalize to new NIR, RGB and\nmultispectral data collected in a different lab. This suggests that if the\nenvironmental conditions are favorable (\\eg, bodies are kept in low\ntemperatures), forensic iris images provide features that are indicative of the\nPMI and can be automatically estimated.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2404.10172v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v4","updated":"2024-11-13T18:31:18Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. 
Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v4.pdf","comment":"Accepted by 2024 5th International Conference on Computer Vision,\n Image and Deep Learning"},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. (3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. 
(4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.08840v1","updated":"2024-11-13T18:19:51Z","published":"2024-11-13T18:19:51Z","title":"Multimodal Instruction Tuning with Hybrid State Space Models","summary":" Handling lengthy context is crucial for enhancing the recognition and\nunderstanding capabilities of multimodal large language models (MLLMs) in\napplications such as processing high-resolution images or high frame rate\nvideos. The rise in image resolution and frame rate substantially increases\ncomputational demands due to the increased number of input tokens. This\nchallenge is further exacerbated by the quadratic complexity with respect to\nsequence length of the self-attention mechanism. 
Most prior works either\npre-train models with long contexts, overlooking the efficiency problem, or\nattempt to reduce the context length via downsampling (e.g., identify the key\nimage patches or frames) to decrease the context length, which may result in\ninformation loss. To circumvent this issue while keeping the remarkable\neffectiveness of MLLMs, we propose a novel approach using a hybrid\ntransformer-MAMBA model to efficiently handle long contexts in multimodal\napplications. Our multimodal model can effectively process long context input\nexceeding 100k tokens, outperforming existing models across various benchmarks.\nRemarkably, our model enhances inference efficiency for high-resolution images\nand high-frame-rate videos by about 4 times compared to current models, with\nefficiency gains increasing as image resolution or video frames rise.\nFurthermore, our model is the first to be trained on low-resolution images or\nlow-frame-rate videos while being capable of inference on high-resolution\nimages and high-frame-rate videos, offering flexibility for inference in\ndiverse scenarios.\n","authors":["Jianing Zhou","Han Li","Shuai Zhang","Ning Xie","Ruijie Wang","Xiaohan Nie","Sheng Liu","Lingyun Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10259v4","updated":"2024-11-13T17:35:00Z","published":"2024-02-15T18:42:33Z","title":"GaussianObject: High-Quality 3D Object Reconstruction from Four Views\n with Gaussian Splatting","summary":" Reconstructing and rendering 3D objects from highly sparse views is of\ncritical importance for promoting applications of 3D vision techniques and\nimproving user experience. However, images from sparse views only contain very\nlimited 3D information, leading to two significant challenges: 1) Difficulty in\nbuilding multi-view consistency as images for matching are too few; 2)\nPartially omitted or highly compressed object information as view coverage is\ninsufficient. 
To tackle these challenges, we propose GaussianObject, a\nframework to represent and render the 3D object with Gaussian splatting that\nachieves high rendering quality with only 4 input images. We first introduce\ntechniques of visual hull and floater elimination, which explicitly inject\nstructure priors into the initial optimization process to help build multi-view\nconsistency, yielding a coarse 3D Gaussian representation. Then we construct a\nGaussian repair model based on diffusion models to supplement the omitted\nobject information, where Gaussians are further refined. We design a\nself-generating strategy to obtain image pairs for training the repair model.\nWe further design a COLMAP-free variant, where pre-given accurate camera poses\nare not required, which achieves competitive quality and facilitates wider\napplications. GaussianObject is evaluated on several challenging datasets,\nincluding MipNeRF360, OmniObject3D, OpenIllumination, and our-collected unposed\nimages, achieving superior performance from only four views and significantly\noutperforming previous SOTA methods. Our demo is available at\nhttps://gaussianobject.github.io/, and the code has been released at\nhttps://github.com/GaussianObject/GaussianObject.\n","authors":["Chen Yang","Sikuang Li","Jiemin Fang","Ruofan Liang","Lingxi Xie","Xiaopeng Zhang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2402.10259v4.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH Asia 2024). Project page:\n https://gaussianobject.github.io/ Code:\n https://github.com/chensjtu/GaussianObject"},{"id":"http://arxiv.org/abs/2403.18346v4","updated":"2024-11-13T17:17:43Z","published":"2024-03-27T08:38:49Z","title":"Quantifying and Mitigating Unimodal Biases in Multimodal Large Language\n Models: A Causal Perspective","summary":" Recent advancements in Large Language Models (LLMs) have facilitated the\ndevelopment of Multimodal LLMs (MLLMs). 
Despite their impressive capabilities,\nMLLMs often suffer from over-reliance on unimodal biases (e.g., language bias\nand vision bias), leading to incorrect answers or hallucinations in complex\nmultimodal tasks. To investigate this issue, we propose a causal framework to\ninterpret the biases in Visual Question Answering (VQA) problems. Within this\nframework, we conduct an in-depth causal analysis to assess the causal effect\nof these biases on MLLM predictions. Based on the analysis, we introduce 1) a\nnovel MORE dataset with 12,000 challenging VQA instances requiring multi-hop\nreasoning and overcoming unimodal biases. 2) a causality-enhanced agent\nframework CAVE that guides models to comprehensively integrate information from\ndifferent modalities and mitigate biases. Our experiments show that MLLMs\nperform poorly on MORE, indicating strong unimodal biases and limited semantic\nunderstanding. However, when integrated with our CAVE, promising improvements\nin reasoning and bias mitigation can be seen. These findings provide important\ninsights for the development of more robust MLLMs and contribute to the broader\ngoal of advancing multimodal AI systems capable of deeper understanding and\nreasoning. Our project page is at https://github.com/OpenCausaLab/MORE.\n","authors":["Meiqi Chen","Yixin Cao","Yan Zhang","Chaochao Lu"],"pdf_url":"https://arxiv.org/pdf/2403.18346v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.09733v3","updated":"2024-11-13T17:07:45Z","published":"2024-07-13T00:45:37Z","title":"Textured-GS: Gaussian Splatting with Spatially Defined Color and Opacity","summary":" In this paper, we introduce Textured-GS, an innovative method for rendering\nGaussian splatting that incorporates spatially defined color and opacity\nvariations using Spherical Harmonics (SH). 
This approach enables each Gaussian\nto exhibit a richer representation by accommodating varying colors and\nopacities across its surface, significantly enhancing rendering quality\ncompared to traditional methods. To demonstrate the merits of our approach, we\nhave adapted the Mini-Splatting architecture to integrate textured Gaussians\nwithout increasing the number of Gaussians. Our experiments across multiple\nreal-world datasets show that Textured-GS consistently outperforms both the\nbaseline Mini-Splatting and standard 3DGS in terms of visual fidelity. The\nresults highlight the potential of Textured-GS to advance Gaussian-based\nrendering technologies, promising more efficient and high-quality scene\nreconstructions. Our implementation is available at\nhttps://github.com/ZhentaoHuang/Textured-GS.\n","authors":["Zhentao Huang","Minglun Gong"],"pdf_url":"https://arxiv.org/pdf/2407.09733v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2411.08777v1","updated":"2024-11-13T17:02:46Z","published":"2024-11-13T17:02:46Z","title":"LUDO: Low-Latency Understanding of Highly Deformable Objects using Point\n Cloud Occupancy Functions","summary":" Accurately determining the shape and location of internal structures within\ndeformable objects is crucial for medical tasks that require precise targeting,\nsuch as robotic biopsies. We introduce LUDO, a method for accurate low-latency\nunderstanding of deformable objects. LUDO reconstructs objects in their\ndeformed state, including their internal structures, from a single-view point\ncloud observation in under 30 ms using occupancy networks. We demonstrate\nLUDO's abilities for autonomous targeting of internal regions of interest\n(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty\nestimates and explainability for its predictions, both of which are important\nin safety-critical applications such as surgical interventions. 
We evaluate\nLUDO in real-world robotic experiments, achieving a success rate of 98.9% for\npuncturing various ROIs inside highly deformable objects. LUDO demonstrates the\npotential to interact with deformable objects without the need for deformable\nregistration methods.\n","authors":["Pit Henrich","Franziska Mathis-Ullrich","Paul Maria Scheikl"],"pdf_url":"https://arxiv.org/pdf/2411.08777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08768v1","updated":"2024-11-13T16:53:29Z","published":"2024-11-13T16:53:29Z","title":"Sharingan: Extract User Action Sequence from Desktop Recordings","summary":" Video recordings of user activities, particularly desktop recordings, offer a\nrich source of data for understanding user behaviors and automating processes.\nHowever, despite advancements in Vision-Language Models (VLMs) and their\nincreasing use in video analysis, extracting user actions from desktop\nrecordings remains an underexplored area. This paper addresses this gap by\nproposing two novel VLM-based methods for user action extraction: the Direct\nFrame-Based Approach (DF), which inputs sampled frames directly into VLMs, and\nthe Differential Frame-Based Approach (DiffF), which incorporates explicit\nframe differences detected via computer vision techniques. We evaluate these\nmethods using a basic self-curated dataset and an advanced benchmark adapted\nfrom prior work. Our results show that the DF approach achieves an accuracy of\n70% to 80% in identifying user actions, with the extracted action sequences\nbeing re-playable though Robotic Process Automation. We find that while VLMs\nshow potential, incorporating explicit UI changes can degrade performance,\nmaking the DF approach more reliable. 
This work represents the first\napplication of VLMs for extracting user action sequences from desktop\nrecordings, contributing new methods, benchmarks, and insights for future\nresearch.\n","authors":["Yanting Chen","Yi Ren","Xiaoting Qin","Jue Zhang","Kehong Yuan","Lu Han","Qingwei Lin","Dongmei Zhang","Saravan Rajmohan","Qi Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12461v2","updated":"2024-11-13T16:49:14Z","published":"2023-11-21T09:15:24Z","title":"HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity\n Synthesis of MR Images with Structure Preservation","summary":" Synthesizing medical images while preserving their structural information is\ncrucial in medical research. In such scenarios, the preservation of anatomical\ncontent becomes especially important. Although recent advances have been made\nby incorporating instance-level information to guide translation, these methods\noverlook the spatial coherence of structural-level representation and the\nanatomical invariance of content during translation. To address these issues,\nwe introduce hierarchical granularity discrimination, which exploits various\nlevels of semantic information present in medical images. Our strategy utilizes\nthree levels of discrimination granularity: pixel-level discrimination using a\nBrain Memory Bank, structure-level discrimination on each brain structure with\na re-weighting strategy to focus on hard samples, and global-level\ndiscrimination to ensure anatomical consistency during translation. The image\ntranslation performance of our strategy has been evaluated on three independent\ndatasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed\nstate-of-the-art algorithms. 
Particularly, our model excels not only in\nsynthesizing normal structures but also in handling abnormal (pathological)\nstructures, such as brain tumors, despite the variations in contrast observed\nacross different imaging modalities due to their pathological characteristics.\nThe diagnostic value of synthesized MR images containing brain tumors has been\nevaluated by radiologists. This indicates that our model may offer an\nalternative solution in scenarios where specific MR modalities of patients are\nunavailable. Extensive experiments further demonstrate the versatility of our\nmethod, providing unique insights into medical image translation.\n","authors":["Ziqi Yu","Botao Zhao","Shengjie Zhang","Xiang Chen","Jianfeng Feng","Tingying Peng","Xiao-Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08756v1","updated":"2024-11-13T16:42:07Z","published":"2024-11-13T16:42:07Z","title":"Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation","summary":" In view of the fact that semi- and self-supervised learning share a\nfundamental principle, effectively modeling knowledge from unlabeled data,\nvarious semi-supervised semantic segmentation methods have integrated\nrepresentative self-supervised learning paradigms for further regularization.\nHowever, the potential of the state-of-the-art generative self-supervised\nparadigm, masked image modeling, has been scarcely studied. This paradigm\nlearns the knowledge through establishing connections between the masked and\nvisible parts of masked image, during the pixel reconstruction process. By\ninheriting and extending this insight, we successfully leverage masked image\nmodeling to boost semi-supervised semantic segmentation. Specifically, we\nintroduce a novel class-wise masked image modeling that independently\nreconstructs different image regions according to their respective classes. 
In\nthis way, the mask-induced connections are established within each class,\nmitigating the semantic confusion that arises from plainly reconstructing\nimages in basic masked image modeling. To strengthen these intra-class\nconnections, we further develop a feature aggregation strategy that minimizes\nthe distances between features corresponding to the masked and visible parts\nwithin the same class. Additionally, in semantic space, we explore the\napplication of masked image modeling to enhance regularization. Extensive\nexperiments conducted on well-known benchmarks demonstrate that our approach\nachieves state-of-the-art performance. The code will be available at\nhttps://github.com/haoxt/S4MIM.\n","authors":["Yangyang Li","Xuanting Hao","Ronghua Shang","Licheng Jiao"],"pdf_url":"https://arxiv.org/pdf/2411.08756v1.pdf","comment":"13 pages. This work has been submitted to the IEEE for possible\n publication"},{"id":"http://arxiv.org/abs/2411.08755v1","updated":"2024-11-13T16:33:27Z","published":"2024-11-13T16:33:27Z","title":"Weakly-Supervised Anomaly Detection in Surveillance Videos Based on\n Two-Stream I3D Convolution Network","summary":" The widespread implementation of urban surveillance systems has necessitated\nmore sophisticated techniques for anomaly detection to ensure enhanced public\nsafety. This paper presents a significant advancement in the field of anomaly\ndetection through the application of Two-Stream Inflated 3D (I3D) Convolutional\nNetworks. These networks substantially outperform traditional 3D Convolutional\nNetworks (C3D) by more effectively extracting spatial and temporal features\nfrom surveillance videos, thus improving the precision of anomaly detection.\nOur research advances the field by implementing a weakly supervised learning\nframework based on Multiple Instance Learning (MIL), which uniquely\nconceptualizes surveillance videos as collections of 'bags' that contain\ninstances (video clips). 
Each instance is innovatively processed through a\nranking mechanism that prioritizes clips based on their potential to display\nanomalies. This novel strategy not only enhances the accuracy and precision of\nanomaly detection but also significantly diminishes the dependency on extensive\nmanual annotations. Moreover, through meticulous optimization of model\nsettings, including the choice of optimizer, our approach not only establishes\nnew benchmarks in the performance of anomaly detection systems but also offers\na scalable and efficient solution for real-world surveillance applications.\nThis paper contributes significantly to the field of computer vision by\ndelivering a more adaptable, efficient, and context-aware anomaly detection\nsystem, which is poised to redefine practices in urban surveillance.\n","authors":["Sareh Soltani Nejad","Anwar Haque"],"pdf_url":"https://arxiv.org/pdf/2411.08755v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.08753v1","updated":"2024-11-13T16:31:08Z","published":"2024-11-13T16:31:08Z","title":"Which Viewpoint Shows it Best? Language for Weakly Supervising View\n Selection in Multi-view Videos","summary":" Given a multi-view video, which viewpoint is most informative for a human\nobserver? Existing methods rely on heuristics or expensive ``best-view\"\nsupervision to answer this question, limiting their applicability. We propose a\nweakly supervised approach that leverages language accompanying an\ninstructional multi-view video as a means to recover its most informative\nviewpoint(s). Our key hypothesis is that the more accurately an individual view\ncan predict a view-agnostic text summary, the more informative it is. 
To put\nthis into action, we propose a framework that uses the relative accuracy of\nview-dependent caption predictions as a proxy for best view pseudo-labels.\nThen, those pseudo-labels are used to train a view selector, together with an\nauxiliary camera pose predictor that enhances view-sensitivity. During\ninference, our model takes as input only a multi-view video -- no language or\ncamera poses -- and returns the best viewpoint to watch at each timestep. On\ntwo challenging datasets comprised of diverse multi-camera setups and how-to\nactivities, our model consistently outperforms state-of-the-art baselines, both\nwith quantitative metrics and human evaluation.\n","authors":["Sagnik Majumder","Tushar Nagarajan","Ziad Al-Halah","Reina Pradhan","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2411.08753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08715v1","updated":"2024-11-13T15:58:50Z","published":"2024-11-13T15:58:50Z","title":"Retrieval Augmented Recipe Generation","summary":" Given the potential applications of generating recipes from food images, this\narea has garnered significant attention from researchers in recent years.\nExisting works for recipe generation primarily utilize a two-stage training\nmethod, first generating ingredients and then obtaining instructions from both\nthe image and ingredients. Large Multi-modal Models (LMMs), which have achieved\nnotable success across a variety of vision and language tasks, shed light to\ngenerating both ingredients and instructions directly from images.\nNevertheless, LMMs still face the common issue of hallucinations during recipe\ngeneration, leading to suboptimal performance. To tackle this, we propose a\nretrieval augmented large multimodal model for recipe generation. 
We first\nintroduce Stochastic Diversified Retrieval Augmentation (SDRA) to retrieve\nrecipes semantically related to the image from an existing datastore as a\nsupplement, integrating them into the prompt to add diverse and rich context to\nthe input image. Additionally, Self-Consistency Ensemble Voting mechanism is\nproposed to determine the most confident prediction recipes as the final\noutput. It calculates the consistency among generated recipe candidates, which\nuse different retrieval recipes as context for generation. Extensive\nexperiments validate the effectiveness of our proposed method, which\ndemonstrates state-of-the-art (SOTA) performance in recipe generation tasks on\nthe Recipe1M dataset.\n","authors":["Guoshan Liu","Hailong Yin","Bin Zhu","Jingjing Chen","Chong-Wah Ngo","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.08715v1.pdf","comment":"ACCEPT on IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2025"},{"id":"http://arxiv.org/abs/2411.08712v1","updated":"2024-11-13T15:55:05Z","published":"2024-11-13T15:55:05Z","title":"High-resolution optical and acoustic remote sensing datasets of the Puck\n Lagoon, Southern Baltic","summary":" The very shallow marine basin of Puck Lagoon in the southern Baltic Sea, on\nthe Northern coast of Poland, hosts valuable benthic habitats and cultural\nheritage sites. These include, among others, protected Zostera marina meadows,\none of the Baltic's major medieval harbours, a ship graveyard, and likely other\nsubmerged features that are yet to be discovered. Prior to this project, no\ncomprehensive high-resolution remote sensing data were available for this area.\nThis article describes the first Digital Elevation Models (DEMs) derived from a\ncombination of airborne bathymetric LiDAR, multibeam echosounder, airborne\nphotogrammetry and satellite imagery. 
These datasets also include multibeam\nechosounder backscatter and LiDAR intensity, allowing determination of the\ncharacter and properties of the seafloor. Combined, these datasets are a vital\nresource for assessing and understanding seafloor morphology, benthic habitats,\ncultural heritage, and submerged landscapes. Given the significance of Puck\nLagoon's hydrographical, ecological, geological, and archaeological environs,\nthe high-resolution bathymetry, acquired by our project, can provide the\nfoundation for sustainable management and informed decision-making for this\narea of interest.\n","authors":["Łukasz Janowski","Dimitrios Skarlatos","Panagiotis Agrafiotis","Paweł Tysiąc","Andrzej Pydyn","Mateusz Popek","Anna M. Kotarba-Morley","Gottfried Mandlburger","Łukasz Gajewski","Mateusz Kołakowski","Alexandra Papadaki","Juliusz Gajewski"],"pdf_url":"https://arxiv.org/pdf/2411.08712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07364v3","updated":"2024-11-13T15:48:08Z","published":"2024-05-12T19:36:11Z","title":"BoQ: A Place is Worth a Bag of Learnable Queries","summary":" In visual place recognition, accurately identifying and matching images of\nlocations under varying environmental conditions and viewpoints remains a\nsignificant challenge. In this paper, we introduce a new technique, called\nBag-of-Queries (BoQ), which learns a set of global queries designed to capture\nuniversal place-specific attributes. Unlike existing methods that employ\nself-attention and generate the queries directly from the input features, BoQ\nemploys distinct learnable global queries, which probe the input features via\ncross-attention, ensuring consistent information aggregation. In addition, our\ntechnique provides an interpretable attention mechanism and integrates with\nboth CNN and Vision Transformer backbones. The performance of BoQ is\ndemonstrated through extensive experiments on 14 large-scale benchmarks. 
It\nconsistently outperforms current state-of-the-art techniques including NetVLAD,\nMixVPR and EigenPlaces. Moreover, as a global retrieval technique (one-stage),\nBoQ surpasses two-stage retrieval methods, such as Patch-NetVLAD, TransVPR and\nR2Former, all while being orders of magnitude faster and more efficient. The\ncode and model weights are publicly available at\nhttps://github.com/amaralibey/Bag-of-Queries.\n","authors":["Amar Ali-Bey","Brahim Chaib-draa","Philippe Giguère"],"pdf_url":"https://arxiv.org/pdf/2405.07364v3.pdf","comment":"Accepted at CVPR 2024"},{"id":"http://arxiv.org/abs/2411.08701v1","updated":"2024-11-13T15:42:28Z","published":"2024-11-13T15:42:28Z","title":"TRACE: Transformer-based Risk Assessment for Clinical Evaluation","summary":" We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation),\na novel method for clinical risk assessment based on clinical data, leveraging\nthe self-attention mechanism for enhanced feature interaction and result\ninterpretation. Our approach is able to handle different data modalities,\nincluding continuous, categorical and multiple-choice (checkbox) attributes.\nThe proposed architecture features a shared representation of the clinical data\nobtained by integrating specialized embeddings of each data modality, enabling\nthe detection of high-risk individuals using Transformer encoder layers. To\nassess the effectiveness of the proposed method, a strong baseline based on\nnon-negative multi-layer perceptrons (MLPs) is introduced. The proposed method\noutperforms various baselines widely used in the domain of clinical risk\nassessment, while effectively handling missing values. 
In terms of\nexplainability, our Transformer-based method offers easily interpretable\nresults via attention weights, further enhancing the clinicians'\ndecision-making process.\n","authors":["Dionysis Christopoulos","Sotiris Spanos","Valsamis Ntouskos","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2411.08701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.15339v3","updated":"2024-11-13T15:25:32Z","published":"2024-07-22T02:53:18Z","title":"Deep Learning for Economists","summary":" Deep learning provides powerful methods to impute structured information from\nlarge-scale, unstructured text and image datasets. For example, economists\nmight wish to detect the presence of economic activity in satellite images, or\nto measure the topics or entities mentioned in social media, the congressional\nrecord, or firm filings. This review introduces deep neural networks, covering\nmethods such as classifiers, regression models, generative AI, and embedding\nmodels. Applications include classification, document digitization, record\nlinkage, and methods for data exploration in massive scale text and image\ncorpora. When suitable methods are used, deep learning models can be cheap to\ntune and can scale affordably to problems involving millions or billions of\ndata points.. The review is accompanied by a companion website, EconDL, with\nuser-friendly demo notebooks, software resources, and a knowledge base that\nprovides technical details and additional applications.\n","authors":["Melissa Dell"],"pdf_url":"https://arxiv.org/pdf/2407.15339v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08666v1","updated":"2024-11-13T14:59:41Z","published":"2024-11-13T14:59:41Z","title":"A Survey on Vision Autoregressive Model","summary":" Autoregressive models have demonstrated great performance in natural language\nprocessing (NLP) with impressive scalability, adaptability and\ngeneralizability. 
Inspired by their notable success in NLP field,\nautoregressive models have been intensively investigated recently for computer\nvision, which perform next-token predictions by representing visual data as\nvisual tokens and enables autoregressive modelling for a wide range of vision\ntasks, ranging from visual generation and visual understanding to the very\nrecent multimodal generation that unifies visual generation and understanding\nwith a single autoregressive model. This paper provides a systematic review of\nvision autoregressive models, including the development of a taxonomy of\nexisting methods and highlighting their major contributions, strengths, and\nlimitations, covering various vision tasks such as image generation, video\ngeneration, image editing, motion generation, medical image analysis, 3D\ngeneration, robotic manipulation, unified multimodal generation, etc. Besides,\nwe investigate and analyze the latest advancements in autoregressive models,\nincluding thorough benchmarking and discussion of existing methods across\nvarious evaluation datasets. Finally, we outline key challenges and promising\ndirections for future research, offering a roadmap to guide further\nadvancements in vision autoregressive models.\n","authors":["Kai Jiang","Jiaxing Huang"],"pdf_url":"https://arxiv.org/pdf/2411.08666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08665v1","updated":"2024-11-13T14:59:00Z","published":"2024-11-13T14:59:00Z","title":"OSMLoc: Single Image-Based Visual Localization in OpenStreetMap with\n Geometric and Semantic Guidances","summary":" OpenStreetMap (OSM), an online and versatile source of volunteered geographic\ninformation (VGI), is widely used for human self-localization by matching\nnearby visual observations with vectorized map data. 
However, due to the\ndivergence in modalities and views, image-to-OSM (I2O) matching and\nlocalization remain challenging for robots, preventing the full utilization of\nVGI data in the unmanned ground vehicles and logistic industry. Inspired by the\nfact that the human brain relies on geometric and semantic understanding of\nsensory information for spatial localization tasks, we propose the OSMLoc in\nthis paper. OSMLoc is a brain-inspired single-image visual localization method\nwith semantic and geometric guidance to improve accuracy, robustness, and\ngeneralization ability. First, we equip the OSMLoc with the visual foundational\nmodel to extract powerful image features. Second, a geometry-guided depth\ndistribution adapter is proposed to bridge the monocular depth estimation and\ncamera-to-BEV transform. Thirdly, the semantic embeddings from the OSM data are\nutilized as auxiliary guidance for image-to-OSM feature matching. To validate\nthe proposed OSMLoc, we collect a worldwide cross-area and cross-condition (CC)\nbenchmark for extensive evaluation. Experiments on the MGL dataset, CC\nvalidation benchmark, and KITTI dataset have demonstrated the superiority of\nour method. Code, pre-trained models, CC validation benchmark, and additional\nresults are available on: https://github.com/WHU-USI3DV/OSMLoc\n","authors":["Youqi Liao","Xieyuanli Chen","Shuhao Kang","Jianping Li","Zhen Dong","Hongchao Fan","Bisheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08665v1.pdf","comment":"15 pages, technical report"},{"id":"http://arxiv.org/abs/2411.08663v1","updated":"2024-11-13T14:54:47Z","published":"2024-11-13T14:54:47Z","title":"Toward Human Understanding with Controllable Synthesis","summary":" Training methods to perform robust 3D human pose and shape (HPS) estimation\nrequires diverse training images with accurate ground truth. 
While BEDLAM\ndemonstrates the potential of traditional procedural graphics to generate such\ndata, the training images are clearly synthetic. In contrast, generative image\nmodels produce highly realistic images but without ground truth. Putting these\nmethods together seems straightforward: use a generative model with the body\nground truth as controlling signal. However, we find that, the more realistic\nthe generated images, the more they deviate from the ground truth, making them\ninappropriate for training and evaluation. Enhancements of realistic details,\nsuch as clothing and facial expressions, can lead to subtle yet significant\ndeviations from the ground truth, potentially misleading training models. We\nempirically verify that this misalignment causes the accuracy of HPS networks\nto decline when trained with generated images. To address this, we design a\ncontrollable synthesis method that effectively balances image realism with\nprecise ground truth. We use this to create the Generative BEDLAM (Gen-B)\ndataset, which improves the realism of the existing synthetic BEDLAM dataset\nwhile preserving ground truth accuracy. We perform extensive experiments, with\nvarious noise-conditioning strategies, to evaluate the tradeoff between visual\nrealism and HPS accuracy. We show, for the first time, that generative image\nmodels can be controlled by traditional graphics methods to produce training\ndata that increases the accuracy of HPS methods.\n","authors":["Hanz Cuevas-Velasquez","Priyanka Patel","Haiwen Feng","Michael Black"],"pdf_url":"https://arxiv.org/pdf/2411.08663v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08656v1","updated":"2024-11-13T14:46:41Z","published":"2024-11-13T14:46:41Z","title":"MikuDance: Animating Character Art with Mixed Motion Dynamics","summary":" We propose MikuDance, a diffusion-based pipeline incorporating mixed motion\ndynamics to animate stylized character art. 
MikuDance consists of two key\ntechniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the\nchallenges of high-dynamic motion and reference-guidance misalignment in\ncharacter art animation. Specifically, a Scene Motion Tracking strategy is\npresented to explicitly model the dynamic camera in pixel-wise space, enabling\nunified character-scene motion modeling. Building on this, the Mixed-Control\nDiffusion implicitly aligns the scale and body shape of diverse characters with\nmotion guidance, allowing flexible control of local character motion.\nSubsequently, a Motion-Adaptive Normalization module is incorporated to\neffectively inject global scene motion, paving the way for comprehensive\ncharacter art animation. Through extensive experiments, we demonstrate the\neffectiveness and generalizability of MikuDance across various character art\nand motion guidance, consistently producing high-quality animations with\nremarkable motion dynamics.\n","authors":["Jiaxu Zhang","Xianfang Zeng","Xin Chen","Wei Zuo","Gang Yu","Zhigang Tu"],"pdf_url":"https://arxiv.org/pdf/2411.08656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10534v2","updated":"2024-11-13T14:36:47Z","published":"2024-04-12T21:41:50Z","title":"Into the Fog: Evaluating Robustness of Multiple Object Tracking","summary":" State-of-the-art Multiple Object Tracking (MOT) approaches have shown\nremarkable performance when trained and evaluated on current benchmarks.\nHowever, these benchmarks primarily consist of clear weather scenarios,\noverlooking adverse atmospheric conditions such as fog, haze, smoke and dust.\nAs a result, the robustness of trackers against these challenging conditions\nremains underexplored. To address this gap, we introduce physics-based\nvolumetric fog simulation method for arbitrary MOT datasets, utilizing\nframe-by-frame monocular depth estimation and a fog formation optical model. 
We\nenhance our simulation by rendering both homogeneous and heterogeneous fog and\npropose to use the dark channel prior method to estimate atmospheric light,\nshowing promising results even in night and indoor scenes. We present the\nleading benchmark MOTChallenge (third release) augmented with fog (smoke for\nindoor scenes) of various intensities and conduct a comprehensive evaluation of\nMOT methods, revealing their limitations under fog and fog-like challenges.\n","authors":["Nadezda Kirillova","M. Jehanzeb Mirza","Horst Bischof","Horst Possegger"],"pdf_url":"https://arxiv.org/pdf/2404.10534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08642v1","updated":"2024-11-13T14:32:28Z","published":"2024-11-13T14:32:28Z","title":"Towards More Accurate Fake Detection on Images Generated from Advanced\n Generative and Neural Rendering Models","summary":" The remarkable progress in neural-network-driven visual data generation,\nespecially with neural rendering techniques like Neural Radiance Fields and 3D\nGaussian splatting, offers a powerful alternative to GANs and diffusion models.\nThese methods can produce high-fidelity images and lifelike avatars,\nhighlighting the need for robust detection methods. In response, an\nunsupervised training technique is proposed that enables the model to extract\ncomprehensive features from the Fourier spectrum magnitude, thereby overcoming\nthe challenges of reconstructing the spectrum due to its centrosymmetric\nproperties. By leveraging the spectral domain and dynamically combining it with\nspatial domain information, we create a robust multimodal detector that\ndemonstrates superior generalization capabilities in identifying challenging\nsynthetic images generated by the latest image synthesis techniques. 
To address\nthe absence of a 3D neural rendering-based fake image database, we develop a\ncomprehensive database that includes images generated by diverse neural\nrendering techniques, providing a robust foundation for evaluating and\nadvancing detection methods.\n","authors":["Chengdong Dong","Vijayakumar Bhagavatula","Zhenyu Zhou","Ajay Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08642v1.pdf","comment":"13 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2402.15322v3","updated":"2024-11-13T14:17:35Z","published":"2024-02-23T13:40:34Z","title":"Optimal Transport on the Lie Group of Roto-translations","summary":" The roto-translation group SE2 has been of active interest in image analysis\ndue to methods that lift the image data to multi-orientation representations\ndefined on this Lie group. This has led to impactful applications of\ncrossing-preserving flows for image de-noising, geodesic tracking, and\nroto-translation equivariant deep learning. In this paper, we develop a\ncomputational framework for optimal transportation over Lie groups, with a\nspecial focus on SE2. We make several theoretical contributions (generalizable\nto matrix Lie groups) such as the non-optimality of group actions as transport\nmaps, invariance and equivariance of optimal transport, and the quality of the\nentropic-regularized optimal transport plan using geodesic distance\napproximations. We develop a Sinkhorn like algorithm that can be efficiently\nimplemented using fast and accurate distance approximations of the Lie group\nand GPU-friendly group convolutions. We report valuable advancements in the\nexperiments on 1) image barycentric interpolation, 2) interpolation of planar\norientation fields, and 3) Wasserstein gradient flows on SE2. We observe that\nour framework of lifting images to SE2 and optimal transport with\nleft-invariant anisotropic metrics leads to equivariant transport along\ndominant contours and salient line structures in the image. 
This yields sharper\nand more meaningful interpolations compared to their counterparts on R^2\n","authors":["Daan Bon","Gautam Pai","Gijs Bellaard","Olga Mula","Remco Duits"],"pdf_url":"https://arxiv.org/pdf/2402.15322v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08629v1","updated":"2024-11-13T14:16:22Z","published":"2024-11-13T14:16:22Z","title":"Zero-shot capability of SAM-family models for bone segmentation in CT\n scans","summary":" The Segment Anything Model (SAM) and similar models build a family of\npromptable foundation models (FMs) for image and video segmentation. The object\nof interest is identified using prompts, such as bounding boxes or points. With\nthese FMs becoming part of medical image segmentation, extensive evaluation\nstudies are required to assess their strengths and weaknesses in clinical\nsetting. Since the performance is highly dependent on the chosen prompting\nstrategy, it is important to investigate different prompting techniques to\ndefine optimal guidelines that ensure effective use in medical image\nsegmentation. Currently, no dedicated evaluation studies exist specifically for\nbone segmentation in CT scans, leaving a gap in understanding the performance\nfor this task. Thus, we use non-iterative, ``optimal'' prompting strategies\ncomposed of bounding box, points and combinations to test the zero-shot\ncapability of SAM-family models for bone CT segmentation on three different\nskeletal regions. Our results show that the best settings depend on the model\ntype and size, dataset characteristics and objective to optimize. Overall, SAM\nand SAM2 prompted with a bounding box in combination with the center point for\nall the components of an object yield the best results across all tested\nsettings. As the results depend on multiple factors, we provide a guideline for\ninformed decision-making in 2D prompting with non-interactive, ''optimal''\nprompts.\n","authors":["Caroline Magg","Hoel Kervadec","Clara I. 
Sánchez"],"pdf_url":"https://arxiv.org/pdf/2411.08629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08645v3","updated":"2024-11-13T13:58:39Z","published":"2024-08-16T10:21:13Z","title":"Extracting polygonal footprints in off-nadir images with Segment\n Anything Model","summary":" Building Footprint Extraction (BFE) from off-nadir aerial images often\ninvolves roof segmentation and offset prediction to adjust roof boundaries to\nthe building footprint. However, this multi-stage approach typically produces\nlow-quality results, limiting its applicability in real-world data production.\nTo address this issue, we present OBMv2, an end-to-end and promptable model for\npolygonal footprint prediction. Unlike its predecessor OBM, OBMv2 introduces a\nnovel Self Offset Attention (SOFA) mechanism that improves performance across\ndiverse building types, from bungalows to skyscrapers, enabling end-to-end\nfootprint prediction without post-processing. Additionally, we propose a\nMulti-level Information System (MISS) to effectively leverage roof masks,\nbuilding masks, and offsets for accurate footprint prediction. We evaluate\nOBMv2 on the BONAI and OmniCity-view3 datasets and demonstrate its\ngeneralization on the Huizhou test set. The code will be available at\nhttps://github.com/likaiucas/OBMv2.\n","authors":["Kai Li","Yupeng Deng","Jingbo Chen","Yu Meng","Zhihao Xi","Junxian Ma","Chenhao Wang","Xiangyu Zhao"],"pdf_url":"https://arxiv.org/pdf/2408.08645v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08606v1","updated":"2024-11-13T13:46:15Z","published":"2024-11-13T13:46:15Z","title":"LG-Gaze: Learning Geometry-aware Continuous Prompts for Language-Guided\n Gaze Estimation","summary":" The ability of gaze estimation models to generalize is often significantly\nhindered by various factors unrelated to gaze, especially when the training\ndataset is limited. 
Current strategies aim to address this challenge through\ndifferent domain generalization techniques, yet they have had limited success\ndue to the risk of overfitting when solely relying on value labels for\nregression. Recent progress in pre-trained vision-language models has motivated\nus to capitalize on the abundant semantic information available. We propose a\nnovel approach in this paper, reframing the gaze estimation task as a\nvision-language alignment issue. Our proposed framework, named Language-Guided\nGaze Estimation (LG-Gaze), learns continuous and geometry-sensitive features\nfor gaze estimation benefit from the rich prior knowledges of vision-language\nmodels. Specifically, LG-Gaze aligns gaze features with continuous linguistic\nfeatures through our proposed multimodal contrastive regression loss, which\ncustomizes adaptive weights for different negative samples. Furthermore, to\nbetter adapt to the labels for gaze estimation task, we propose a\ngeometry-aware interpolation method to obtain more precise gaze embeddings.\nThrough extensive experiments, we validate the efficacy of our framework in\nfour different cross-domain evaluation tasks.\n","authors":["Pengwei Yin","Jingjing Wang","Guanzhong Zeng","Di Xie","Jiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.08606v1.pdf","comment":"Accepted to ECCV 2024"},{"id":"http://arxiv.org/abs/2411.08603v1","updated":"2024-11-13T13:40:27Z","published":"2024-11-13T13:40:27Z","title":"Generalized Pose Space Embeddings for Training In-the-Wild using\n Anaylis-by-Synthesis","summary":" Modern pose estimation models are trained on large, manually-labelled\ndatasets which are costly and may not cover the full extent of human poses and\nappearances in the real world. With advances in neural rendering,\nanalysis-by-synthesis and the ability to not only predict, but also render the\npose, is becoming an appealing framework, which could alleviate the need for\nlarge scale manual labelling efforts. 
While recent work have shown the\nfeasibility of this approach, the predictions admit many flips due to a\nsimplistic intermediate skeleton representation, resulting in low precision and\ninhibiting the acquisition of any downstream knowledge such as\nthree-dimensional positioning. We solve this problem with a more expressive\nintermediate skeleton representation capable of capturing the semantics of the\npose (left and right), which significantly reduces flips. To successfully train\nthis new representation, we extend the analysis-by-synthesis framework with a\ntraining protocol based on synthetic data. We show that our representation\nresults in less flips and more accurate predictions. Our approach outperforms\nprevious models trained with analysis-by-synthesis on standard benchmarks.\n","authors":["Dominik Borer","Jakob Buhmann","Martin Guay"],"pdf_url":"https://arxiv.org/pdf/2411.08603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08592v1","updated":"2024-11-13T13:19:51Z","published":"2024-11-13T13:19:51Z","title":"Slender Object Scene Segmentation in Remote Sensing Image Based on\n Learnable Morphological Skeleton with Segment Anything Model","summary":" Morphological methods play a crucial role in remote sensing image processing,\ndue to their ability to capture and preserve small structural details. However,\nmost of the existing deep learning models for semantic segmentation are based\non the encoder-decoder architecture including U-net and Segment Anything Model\n(SAM), where the downsampling process tends to discard fine details. In this\npaper, we propose a new approach that integrates learnable morphological\nskeleton prior into deep neural networks using the variational method. 
To\naddress the difficulty in backpropagation in neural networks caused by the\nnon-differentiability presented in classical morphological operations, we\nprovide a smooth representation of the morphological skeleton and design a\nvariational segmentation model integrating morphological skeleton prior by\nemploying operator splitting and dual methods. Then, we integrate this model\ninto the network architecture of SAM, which is achieved by adding a token to\nmask decoder and modifying the final sigmoid layer, ensuring the final\nsegmentation results preserve the skeleton structure as much as possible.\nExperimental results on remote sensing datasets, including buildings and roads,\ndemonstrate that our method outperforms the original SAM on slender object\nsegmentation and exhibits better generalization capability.\n","authors":["Jun Xie","Wenxiao Li","Faqiang Wang","Liqiang Zhang","Zhengyang Hou","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08579v1","updated":"2024-11-13T12:51:49Z","published":"2024-11-13T12:51:49Z","title":"NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied\n Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN), as a widely discussed research\ndirection in embodied intelligence, aims to enable embodied agents to navigate\nin complicated visual environments through natural language commands. Most\nexisting VLN methods focus on indoor ground robot scenarios. However, when\napplied to UAV VLN in outdoor urban scenes, it faces two significant\nchallenges. First, urban scenes contain numerous objects, which makes it\nchallenging to match fine-grained landmarks in images with complex textual\ndescriptions of these landmarks. Second, overall environmental information\nencompasses multiple modal dimensions, and the diversity of representations\nsignificantly increases the complexity of the encoding process. 
To address\nthese challenges, we propose NavAgent, the first urban UAV embodied navigation\nmodel driven by a large Vision-Language Model. NavAgent undertakes navigation\ntasks by synthesizing multi-scale environmental information, including\ntopological maps (global), panoramas (medium), and fine-grained landmarks\n(local). Specifically, we utilize GLIP to build a visual recognizer for\nlandmark capable of identifying and linguisticizing fine-grained landmarks.\nSubsequently, we develop dynamically growing scene topology map that integrate\nenvironmental information and employ Graph Convolutional Networks to encode\nglobal environmental data. In addition, to train the visual recognizer for\nlandmark, we develop NavAgent-Landmark2K, the first fine-grained landmark\ndataset for real urban street scenes. In experiments conducted on the Touchdown\nand Map2seq datasets, NavAgent outperforms strong baseline models. The code and\ndataset will be released to the community to facilitate the exploration and\ndevelopment of outdoor VLN.\n","authors":["Youzhi Liu","Fanglong Yao","Yuanchang Yue","Guangluan Xu","Xian Sun","Kun Fu"],"pdf_url":"https://arxiv.org/pdf/2411.08579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07743v3","updated":"2024-11-13T12:43:33Z","published":"2023-06-13T13:00:10Z","title":"V-LoL: A Diagnostic Dataset for Visual Logical Learning","summary":" Despite the successes of recent developments in visual AI, different\nshortcomings still exist; from missing exact logical reasoning, to abstract\ngeneralization abilities, to understanding complex and noisy scenes.\nUnfortunately, existing benchmarks, were not designed to capture more than a\nfew of these aspects. Whereas deep learning datasets focus on visually complex\ndata but simple visual reasoning tasks, inductive logic datasets involve\ncomplex logical learning tasks, however, lack the visual component. 
To address\nthis, we propose the diagnostic visual logical learning dataset, V-LoL, that\nseamlessly combines visual and logical challenges. Notably, we introduce the\nfirst instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic\nbenchmark in symbolic AI, the Michalski train problem. By incorporating\nintricate visual scenes and flexible logical reasoning tasks within a versatile\nframework, V-LoL-Train provides a platform for investigating a wide range of\nvisual logical learning challenges. We evaluate a variety of AI systems\nincluding traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our\nevaluations demonstrate that even SOTA AI faces difficulties in dealing with\nvisual logical learning challenges, highlighting unique advantages and\nlimitations of each methodology. Overall, V-LoL opens up new avenues for\nunderstanding and enhancing current abilities in visual logical learning for AI\nsystems.\n","authors":["Lukas Helff","Wolfgang Stammer","Hikaru Shindo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2306.07743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08569v1","updated":"2024-11-13T12:29:44Z","published":"2024-11-13T12:29:44Z","title":"UIFormer: A Unified Transformer-based Framework for Incremental Few-Shot\n Object Detection and Instance Segmentation","summary":" This paper introduces a novel framework for unified incremental few-shot\nobject detection (iFSOD) and instance segmentation (iFSIS) using the\nTransformer architecture. Our goal is to create an optimal solution for\nsituations where only a few examples of novel object classes are available,\nwith no access to training data for base or old classes, while maintaining high\nperformance across both base and novel classes. To achieve this, We extend\nMask-DINO into a two-stage incremental learning framework. 
Stage 1 focuses on\noptimizing the model using the base dataset, while Stage 2 involves fine-tuning\nthe model on novel classes. Besides, we incorporate a classifier selection\nstrategy that assigns appropriate classifiers to the encoder and decoder\naccording to their distinct functions. Empirical evidence indicates that this\napproach effectively mitigates the over-fitting on novel classes learning.\nFurthermore, we implement knowledge distillation to prevent catastrophic\nforgetting of base classes. Comprehensive evaluations on the COCO and LVIS\ndatasets for both iFSIS and iFSOD tasks demonstrate that our method\nsignificantly outperforms state-of-the-art approaches.\n","authors":["Chengyuan Zhang","Yilin Zhang","Lei Zhu","Deyin Liu","Lin Wu","Bo Li","Shichao Zhang","Mohammed Bennamoun","Farid Boussaid"],"pdf_url":"https://arxiv.org/pdf/2411.08569v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2410.10929v4","updated":"2024-11-13T12:27:38Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. 
From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v4.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.08567v1","updated":"2024-11-13T12:27:21Z","published":"2024-11-13T12:27:21Z","title":"Saliency Map-based Image Retrieval using Invariant Krawtchouk Moments","summary":" With the widespread adoption of digital devices equipped with cameras and the\nrapid development of Internet technology, numerous content-based image\nretrieval systems and novel image feature extraction techniques have emerged in\nrecent years. This paper introduces a saliency map-based image retrieval\napproach using invariant Krawtchouk moments (SM-IKM) to enhance retrieval speed\nand accuracy. The proposed method applies a global contrast-based salient\nregion detection algorithm to create a saliency map that effectively isolates\nthe foreground from the background. 
It then combines multiple orders of\ninvariant Krawtchouk moments (IKM) with local binary patterns (LBPs) and color\nhistograms to comprehensively represent the foreground and background.\nAdditionally, it incorporates LBPs derived from the saliency map to improve\ndiscriminative power, facilitating more precise image differentiation. A\nbag-of-visual-words (BoVW) model is employed to generate a codebook for\nclassification and discrimination. By using compact IKMs in the BoVW framework\nand integrating a range of region-based feature-including color histograms,\nLBPs, and saliency map-enhanced LBPs, our proposed SM-IKM achieves efficient\nand accurate image retrieval. xtensive experiments on publicly available\ndatasets, such as Caltech 101 and Wang, demonstrate that SM-IKM outperforms\nrecent state-of-the-art retrieval methods. The source code for SM-IKM is\navailable at github.com/arnejad/SMIKM.\n","authors":["Ashkan Nejad","Mohammad Reza Faraji","Xiaojun Qi"],"pdf_url":"https://arxiv.org/pdf/2411.08567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.18723v2","updated":"2024-11-13T12:02:29Z","published":"2024-10-24T13:28:40Z","title":"VoxelKeypointFusion: Generalizable Multi-View Multi-Person Pose\n Estimation","summary":" In the rapidly evolving field of computer vision, the task of accurately\nestimating the poses of multiple individuals from various viewpoints presents a\nformidable challenge, especially if the estimations should be reliable as well.\nThis work presents an extensive evaluation of the generalization capabilities\nof multi-view multi-person pose estimators to unseen datasets and presents a\nnew algorithm with strong performance in this task. It also studies the\nimprovements by additionally using depth information. 
Since the new approach\ncan not only generalize well to unseen datasets, but also to different\nkeypoints, the first multi-view multi-person whole-body estimator is presented.\nTo support further research on those topics, all of the work is publicly\naccessible.\n","authors":["Daniel Bermuth","Alexander Poeppel","Wolfgang Reif"],"pdf_url":"https://arxiv.org/pdf/2410.18723v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08545v1","updated":"2024-11-13T11:46:42Z","published":"2024-11-13T11:46:42Z","title":"APDDv2: Aesthetics of Paintings and Drawings Dataset with Artist Labeled\n Scores and Comments","summary":" Datasets play a pivotal role in training visual models, facilitating the\ndevelopment of abstract understandings of visual features through diverse image\nsamples and multidimensional attributes. However, in the realm of aesthetic\nevaluation of artistic images, datasets remain relatively scarce. Existing\npainting datasets are often characterized by limited scoring dimensions and\ninsufficient annotations, thereby constraining the advancement and application\nof automatic aesthetic evaluation methods in the domain of painting. To bridge\nthis gap, we introduce the Aesthetics Paintings and Drawings Dataset (APDD),\nthe first comprehensive collection of paintings encompassing 24 distinct\nartistic categories and 10 aesthetic attributes. Building upon the initial\nrelease of APDDv1, our ongoing research has identified opportunities for\nenhancement in data scale and annotation precision. Consequently, APDDv2 boasts\nan expanded image corpus and improved annotation quality, featuring detailed\nlanguage comments to better cater to the needs of both researchers and\npractitioners seeking high-quality painting datasets. Furthermore, we present\nan updated version of the Art Assessment Network for Specific Painting Styles,\ndenoted as ArtCLIP. 
Experimental validation demonstrates the superior\nperformance of this revised model in the realm of aesthetic evaluation,\nsurpassing its predecessor in accuracy and efficacy. The dataset and model are\navailable at https://github.com/BestiVictory/APDDv2.git.\n","authors":["Xin Jin","Qianqian Qiao","Yi Lu","Huaye Wang","Heng Huang","Shan Gao","Jianfei Liu","Rui Li"],"pdf_url":"https://arxiv.org/pdf/2411.08545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.01555v2","updated":"2024-11-13T11:44:10Z","published":"2024-02-02T16:47:18Z","title":"SLYKLatent: A Learning Framework for Gaze Estimation Using Deep Facial\n Feature Learning","summary":" In this research, we present SLYKLatent, a novel approach for enhancing gaze\nestimation by addressing appearance instability challenges in datasets due to\naleatoric uncertainties, covariant shifts, and test domain generalization.\nSLYKLatent utilizes Self-Supervised Learning for initial training with facial\nexpression datasets, followed by refinement with a patch-based tri-branch\nnetwork and an inverse explained variance-weighted training loss function. Our\nevaluation on benchmark datasets achieves a 10.9% improvement on Gaze360,\nsupersedes top MPIIFaceGaze results with 3.8%, and leads on a subset of\nETH-XGaze by 11.6%, surpassing existing methods by significant margins.\nAdaptability tests on RAF-DB and Affectnet show 86.4% and 60.9% accuracies,\nrespectively. Ablation studies confirm the effectiveness of SLYKLatent's novel\ncomponents.\n","authors":["Samuel Adebayo","Joost C. Dessing","Seán McLoone"],"pdf_url":"https://arxiv.org/pdf/2402.01555v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08537v1","updated":"2024-11-13T11:35:39Z","published":"2024-11-13T11:35:39Z","title":"MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal\n Lymphatic Vessel Segmentation","summary":" Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste\nproducts from the human brain. 
An impairment in their functionality has been\nassociated with aging as well as brain disorders like multiple sclerosis and\nAlzheimer's disease. However, MLVs have only recently been described for the\nfirst time in magnetic resonance imaging (MRI), and their ramified structure\nrenders manual segmentation particularly difficult. Further, as there is no\nconsistent notion of their appearance, human-annotated MLV structures contain a\nhigh inter-rater variability that most automatic segmentation methods cannot\ntake into account. In this work, we propose a new rater-aware training scheme\nfor the popular nnU-Net model, and we explore rater-based ensembling strategies\nfor accurate and consistent segmentation of MLVs. This enables us to boost\nnnU-Net's performance while obtaining explicit predictions in different\nannotation styles and a rater-based uncertainty estimation. Our final model,\nMLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to\nthe human reference standard. The model further matches the human inter-rater\nreliability and replicates age-related associations with MLV volume.\n","authors":["Fabian Bongratz","Markus Karmann","Adrian Holz","Moritz Bonhoeffer","Viktor Neumaier","Sarah Deli","Benita Schmitz-Koep","Claus Zimmer","Christian Sorg","Melissa Thalhammer","Dennis M Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2411.08537v1.pdf","comment":"ML4H 2024"},{"id":"http://arxiv.org/abs/2411.08531v1","updated":"2024-11-13T11:25:26Z","published":"2024-11-13T11:25:26Z","title":"Classification and Morphological Analysis of DLBCL Subtypes in\n H\\&E-Stained Slides","summary":" We address the challenge of automated classification of diffuse large B-cell\nlymphoma (DLBCL) into its two primary subtypes: activated B-cell-like (ABC) and\ngerminal center B-cell-like (GCB). 
Accurate classification between these\nsubtypes is essential for determining the appropriate therapeutic strategy,\ngiven their distinct molecular profiles and treatment responses. Our proposed\ndeep learning model demonstrates robust performance, achieving an average area\nunder the curve (AUC) of (87.4 pm 5.7)\\% during cross-validation. It shows a\nhigh positive predictive value (PPV), highlighting its potential for clinical\napplication, such as triaging for molecular testing. To gain biological\ninsights, we performed an analysis of morphological features of ABC and GCB\nsubtypes. We segmented cell nuclei using a pre-trained deep neural network and\ncompared the statistics of geometric and color features for ABC and GCB. We\nfound that the distributions of these features were not very different for the\ntwo subtypes, which suggests that the visual differences between them are more\nsubtle. These results underscore the potential of our method to assist in more\nprecise subtype classification and can contribute to improved treatment\nmanagement and outcomes for patients of DLBCL.\n","authors":["Ravi Kant Gupta","Mohit Jindal","Garima Jain","Epari Sridhar","Subhash Yadav","Hasmukh Jain","Tanuja Shet","Uma Sakhdeo","Manju Sengar","Lingaraj Nayak","Bhausaheb Bagal","Umesh Apkare","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08531v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08530v1","updated":"2024-11-13T11:24:12Z","published":"2024-11-13T11:24:12Z","title":"Efficient Whole Slide Image Classification through Fisher Vector\n Representation","summary":" The advancement of digital pathology, particularly through computational\nanalysis of whole slide images (WSI), is poised to significantly enhance\ndiagnostic precision and efficiency. However, the large size and complexity of\nWSIs make it difficult to analyze and classify them using computers. 
This study\nintroduces a novel method for WSI classification by automating the\nidentification and examination of the most informative patches, thus\neliminating the need to process the entire slide. Our method involves\ntwo-stages: firstly, it extracts only a few patches from the WSIs based on\ntheir pathological significance; and secondly, it employs Fisher vectors (FVs)\nfor representing features extracted from these patches, which is known for its\nrobustness in capturing fine-grained details. This approach not only\naccentuates key pathological features within the WSI representation but also\nsignificantly reduces computational overhead, thus making the process more\nefficient and scalable. We have rigorously evaluated the proposed method across\nmultiple datasets to benchmark its performance against comprehensive WSI\nanalysis and contemporary weakly-supervised learning methodologies. The\nempirical results indicate that our focused analysis of select patches,\ncombined with Fisher vector representation, not only aligns with, but at times\nsurpasses, the classification accuracy of standard practices. Moreover, this\nstrategy notably diminishes computational load and resource expenditure,\nthereby establishing an efficient and precise framework for WSI analysis in the\nrealm of digital pathology.\n","authors":["Ravi Kant Gupta","Dadi Dharani","Shambhavi Shanker","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19006v4","updated":"2024-11-13T10:56:14Z","published":"2024-06-27T08:45:31Z","title":"Snakes and Ladders: Two Steps Up for VideoMamba","summary":" Video understanding requires the extraction of rich spatio-temporal\nrepresentations, which transformer models achieve through self-attention.\nUnfortunately, self-attention poses a computational burden. In NLP, Mamba has\nsurfaced as an efficient alternative for transformers. 
However, Mamba's\nsuccesses do not trivially extend to vision tasks, including those in video\nanalysis. In this paper, we theoretically analyze the differences between\nself-attention and Mamba. We identify two limitations in Mamba's token\nprocessing: historical decay and element contradiction. We propose\nVideoMambaPro (VMP) that solves the identified limitations by adding masked\nbackward computation and elemental residual connections to a VideoMamba\nbackbone. Differently sized VideoMambaPro models surpass VideoMamba by 1.6-2.8%\nand 1.1-1.9% top-1 on Kinetics-400 and Something-Something V2, respectively.\nEven without extensive pre-training, our models present an increasingly\nattractive and efficient alternative to current transformer models. Moreover,\nour two solutions are orthogonal to recent advances in Vision Mamba models, and\nare likely to provide further improvements in future models.\n","authors":["Hui Lu","Albert Ali Salah","Ronald Poppe"],"pdf_url":"https://arxiv.org/pdf/2406.19006v4.pdf","comment":"New updated experiment results"},{"id":"http://arxiv.org/abs/2411.08508v1","updated":"2024-11-13T10:43:39Z","published":"2024-11-13T10:43:39Z","title":"BillBoard Splatting (BBSplat): Learnable Textured Primitives for Novel\n View Synthesis","summary":" We present billboard Splatting (BBSplat) - a novel approach for 3D scene\nrepresentation based on textured geometric primitives. BBSplat represents the\nscene as a set of optimizable textured planar primitives with learnable RGB\ntextures and alpha-maps to control their shape. BBSplat primitives can be used\nin any Gaussian Splatting pipeline as drop-in replacements for Gaussians. Our\nmethod's qualitative and quantitative improvements over 3D and 2D Gaussians are\nmost noticeable when fewer primitives are used, when BBSplat achieves over 1200\nFPS. 
Our novel regularization term encourages textures to have a sparser\nstructure, unlocking an efficient compression that leads to a reduction in\nstorage space of the model. Our experiments show the efficiency of BBSplat on\nstandard datasets of real indoor and outdoor scenes such as Tanks&Temples, DTU,\nand Mip-NeRF-360. We demonstrate improvements on PSNR, SSIM, and LPIPS metrics\ncompared to the state-of-the-art, especially for the case when fewer primitives\nare used, which, on the other hand, leads to up to 2 times inference speed\nimprovement for the same rendering quality.\n","authors":["David Svitov","Pietro Morerio","Lourdes Agapito","Alessio Del Bue"],"pdf_url":"https://arxiv.org/pdf/2411.08508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07940v2","updated":"2024-11-13T10:29:51Z","published":"2024-11-12T17:09:20Z","title":"Automatic dataset shift identification to support root cause analysis of\n AI performance drift","summary":" Shifts in data distribution can substantially harm the performance of\nclinical AI models. Hence, various methods have been developed to detect the\npresence of such shifts at deployment time. However, root causes of dataset\nshifts are varied, and the choice of shift mitigation strategies is highly\ndependent on the precise type of shift encountered at test time. As such,\ndetecting test-time dataset shift is not sufficient: precisely identifying\nwhich type of shift has occurred is critical. In this work, we propose the\nfirst unsupervised dataset shift identification framework, effectively\ndistinguishing between prevalence shift (caused by a change in the label\ndistribution), covariate shift (caused by a change in input characteristics)\nand mixed shifts (simultaneous prevalence and covariate shifts). 
We discuss the\nimportance of self-supervised encoders for detecting subtle covariate shifts\nand propose a novel shift detector leveraging both self-supervised encoders and\ntask model outputs for improved shift detection. We report promising results\nfor the proposed shift identification framework across three different imaging\nmodalities (chest radiography, digital mammography, and retinal fundus images)\non five types of real-world dataset shifts, using four large publicly available\ndatasets.\n","authors":["Mélanie Roschewitz","Raghav Mehta","Charles Jones","Ben Glocker"],"pdf_url":"https://arxiv.org/pdf/2411.07940v2.pdf","comment":"Code available at\n https://github.com/biomedia-mira/shift_identification"},{"id":"http://arxiv.org/abs/2312.06978v4","updated":"2024-11-13T10:29:01Z","published":"2023-12-12T04:38:30Z","title":"CLASS-M: Adaptive stain separation-based contrastive learning with\n pseudo-labeling for histopathological image classification","summary":" Histopathological image classification is an important task in medical image\nanalysis. Recent approaches generally rely on weakly supervised learning due to\nthe ease of acquiring case-level labels from pathology reports. However,\npatch-level classification is preferable in applications where only a limited\nnumber of cases are available or when local prediction accuracy is critical. On\nthe other hand, acquiring extensive datasets with localized labels for training\nis not feasible. In this paper, we propose a semi-supervised patch-level\nhistopathological image classification model, named CLASS-M, that does not\nrequire extensively labeled datasets. CLASS-M is formed by two main parts: a\ncontrastive learning module that uses separated Hematoxylin and Eosin images\ngenerated through an adaptive stain separation process, and a module with\npseudo-labels using MixUp. We compare our model with other state-of-the-art\nmodels on two clear cell renal cell carcinoma datasets. 
We demonstrate that our\nCLASS-M model has the best performance on both datasets. Our code is available\nat github.com/BzhangURU/Paper_CLASS-M/tree/main\n","authors":["Bodong Zhang","Hamid Manoochehri","Man Minh Ho","Fahimeh Fooladgar","Yosep Chong","Beatrice S. Knudsen","Deepika Sirohi","Tolga Tasdizen"],"pdf_url":"https://arxiv.org/pdf/2312.06978v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08490v1","updated":"2024-11-13T10:15:27Z","published":"2024-11-13T10:15:27Z","title":"Impact of Iris Pigmentation on Performance Bias in Visible Iris\n Verification Systems: A Comparative Study","summary":" Iris recognition technology plays a critical role in biometric identification\nsystems, but their performance can be affected by variations in iris\npigmentation. In this work, we investigate the impact of iris pigmentation on\nthe efficacy of biometric recognition systems, focusing on a comparative\nanalysis of blue and dark irises. Data sets were collected using multiple\ndevices, including P1, P2, and P3 smartphones [4], to assess the robustness of\nthe systems in different capture environments [19]. Both traditional machine\nlearning techniques and deep learning models were used, namely Open-Iris,\nViT-b, and ResNet50, to evaluate performance metrics such as Equal Error Rate\n(EER) and True Match Rate (TMR). Our results indicate that iris recognition\nsystems generally exhibit higher accuracy for blue irises compared to dark\nirises. Furthermore, we examined the generalization capabilities of these\nsystems across different iris colors and devices, finding that while training\non diverse datasets enhances recognition performance, the degree of improvement\nis contingent on the specific model and device used. Our analysis also\nidentifies inherent biases in recognition performance related to iris color and\ncross-device variability. 
These findings underscore the need for more inclusive\ndataset collection and model refinement to reduce bias and promote equitable\nbiometric recognition across varying iris pigmentation and device\nconfigurations.\n","authors":["Geetanjali Sharma","Abhishek Tandon","Gaurav Jaswal","Aditya Nigam","Raghavendra Ramachandra"],"pdf_url":"https://arxiv.org/pdf/2411.08490v1.pdf","comment":"14 pages, 5 figures, 5 Tables"},{"id":"http://arxiv.org/abs/2411.08488v1","updated":"2024-11-13T10:13:23Z","published":"2024-11-13T10:13:23Z","title":"UNSCT-HRNet: Modeling Anatomical Uncertainty for Landmark Detection in\n Total Hip Arthroplasty","summary":" Total hip arthroplasty (THA) relies on accurate landmark detection from\nradiographic images, but unstructured data caused by irregular patient postures\nor occluded anatomical markers pose significant challenges for existing\nmethods. To address this, we propose UNSCT-HRNet (Unstructured CT -\nHigh-Resolution Net), a deep learning-based framework that integrates a Spatial\nRelationship Fusion (SRF) module and an Uncertainty Estimation (UE) module. The\nSRF module, utilizing coordinate convolution and polarized attention, enhances\nthe model's ability to capture complex spatial relationships. Meanwhile, the UE\nmodule which based on entropy ensures predictions are anatomically relevant.\nFor unstructured data, the proposed method can predict landmarks without\nrelying on the fixed number of points, which shows higher accuracy and better\nrobustness comparing with the existing methods. Our UNSCT-HRNet demonstrates\nover a 60% improvement across multiple metrics in unstructured data. The\nexperimental results also reveal that our approach maintains good performance\non the structured dataset. 
Overall, the proposed UNSCT-HRNet has the potential\nto be used as a new reliable, automated solution for THA surgical planning and\npostoperative monitoring.\n","authors":["Jiaxin Wan","Lin Liu","Haoran Wang","Liangwei Li","Wei Li","Shuheng Kou","Runtian Li","Jiayi Tang","Juanxiu Liu","Jing Zhang","Xiaohui Du","Ruqian Hao"],"pdf_url":"https://arxiv.org/pdf/2411.08488v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08482v1","updated":"2024-11-13T10:01:33Z","published":"2024-11-13T10:01:33Z","title":"Methodology for a Statistical Analysis of Influencing Factors on 3D\n Object Detection Performance","summary":" In autonomous driving, object detection is an essential task to perceive the\nenvironment by localizing and classifying objects. Most object detection\nalgorithms rely on deep learning for their superior performance. However, their\nblack box nature makes it challenging to ensure safety. In this paper, we\npropose a first-of-its-kind methodology for statistical analysis of the\ninfluence of various factors related to the objects to detect or the\nenvironment on the detection performance of both LiDAR- and camera-based 3D\nobject detectors. We perform a univariate analysis between each of the factors\nand the detection error in order to compare the strength of influence. To\nbetter identify potential sources of detection errors, we also analyze the\nperformance in dependency of the influencing factors and examine the\ninterdependencies between the different influencing factors. 
Recognizing the\nfactors that influence detection performance helps identify robustness issues\nin the trained object detector and supports the safety approval of object\ndetection systems.\n","authors":["Anton Kuznietsov","Dirk Schweickard","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2411.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.17804v3","updated":"2024-11-13T09:50:48Z","published":"2024-06-22T15:24:33Z","title":"A Review of Electromagnetic Elimination Methods for low-field portable\n MRI scanner","summary":" This paper analyzes conventional and deep learning methods for eliminating\nelectromagnetic interference (EMI) in MRI systems. We compare traditional\nanalytical and adaptive techniques with advanced deep learning approaches. Key\nstrengths and limitations of each method are highlighted. Recent advancements\nin active EMI elimination, such as external EMI receiver coils, are discussed\nalongside deep learning methods, which show superior EMI suppression by\nleveraging neural networks trained on MRI data. While deep learning improves\nEMI elimination and diagnostic capabilities, it introduces security and safety\nconcerns, particularly in commercial applications. 
A balanced approach,\nintegrating conventional reliability with deep learning's advanced\ncapabilities, is proposed for more effective EMI suppression in MRI systems.\n","authors":["Wanyu Bian","Panfeng Li","Mengyao Zheng","Chihang Wang","Anying Li","Ying Li","Haowei Ni","Zixuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.17804v3.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2411.08472v1","updated":"2024-11-13T09:46:08Z","published":"2024-11-13T09:46:08Z","title":"A survey on Graph Deep Representation Learning for Facial Expression\n Recognition","summary":" This comprehensive review delves deeply into the various methodologies\napplied to facial expression recognition (FER) through the lens of graph\nrepresentation learning (GRL). Initially, we introduce the task of FER and the\nconcepts of graph representation and GRL. Afterward, we discuss some of the\nmost prevalent and valuable databases for this task. We explore promising\napproaches for graph representation in FER, including graph diffusion,\nspatio-temporal graphs, and multi-stream architectures. Finally, we identify\nfuture research opportunities and provide concluding remarks.\n","authors":["Théo Gueuret","Akrem Sellami","Chaabane Djeraba"],"pdf_url":"https://arxiv.org/pdf/2411.08472v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08470v1","updated":"2024-11-13T09:42:12Z","published":"2024-11-13T09:42:12Z","title":"HyperFace: Generating Synthetic Face Recognition Datasets by Exploring\n Face Embedding Hypersphere","summary":" Face recognition datasets are often collected by crawling Internet and\nwithout individuals' consents, raising ethical and privacy concerns. Generating\nsynthetic datasets for training face recognition models has emerged as a\npromising alternative. 
However, the generation of synthetic datasets remains\nchallenging as it entails adequate inter-class and intra-class variations.\nWhile advances in generative models have made it easier to increase intra-class\nvariations in face datasets (such as pose, illumination, etc.), generating\nsufficient inter-class variation is still a difficult task. In this paper, we\nformulate the dataset generation as a packing problem on the embedding space\n(represented on a hypersphere) of a face recognition model and propose a new\nsynthetic dataset generation approach, called HyperFace. We formalize our\npacking problem as an optimization problem and solve it with a gradient\ndescent-based approach. Then, we use a conditional face generator model to\nsynthesize face images from the optimized embeddings. We use our generated\ndatasets to train face recognition models and evaluate the trained models on\nseveral benchmarking real datasets. Our experimental results show that models\ntrained with HyperFace achieve state-of-the-art performance in training face\nrecognition using synthetic datasets.\n","authors":["Hatef Otroshi Shahreza","Sébastien Marcel"],"pdf_url":"https://arxiv.org/pdf/2411.08470v1.pdf","comment":"Accepted in NeurIPS 2024 Safe Generative AI Workshop"},{"id":"http://arxiv.org/abs/2406.16439v4","updated":"2024-11-13T09:41:00Z","published":"2024-06-24T08:30:03Z","title":"Exploring Test-Time Adaptation for Object Detection in Continually\n Changing Environments","summary":" Real-world application models are commonly deployed in dynamic environments,\nwhere the target domain distribution undergoes temporal changes. 
Continual\nTest-Time Adaptation (CTTA) has recently emerged as a promising technique to\ngradually adapt a source-trained model to continually changing target domains.\nDespite recent advancements in addressing CTTA, two critical issues remain: 1)\nFixed thresholds for pseudo-labeling in existing methodologies lead to\nlow-quality pseudo-labels, as model confidence varies across categories and\ndomains; 2) Stochastic parameter restoration methods for mitigating\ncatastrophic forgetting fail to preserve critical information effectively, due\nto their intrinsic randomness. To tackle these challenges for detection models\nin CTTA scenarios, we present AMROD, featuring three core components. Firstly,\nthe object-level contrastive learning module extracts object-level features for\ncontrastive learning to refine the feature representation in the target domain.\nSecondly, the adaptive monitoring module dynamically skips unnecessary\nadaptation and updates the category-specific threshold based on predicted\nconfidence scores to enable efficiency and improve the quality of\npseudo-labels. Lastly, the adaptive randomized restoration mechanism\nselectively reset inactive parameters with higher possibilities, ensuring the\nretention of essential knowledge. We demonstrate the effectiveness of AMROD on\nfour CTTA object detection tasks, where AMROD outperforms existing methods,\nespecially achieving a 3.2 mAP improvement and a 20% increase in efficiency on\nthe Cityscapes-to-Cityscapes-C CTTA task. 
The code will be released.\n","authors":["Shilei Cao","Yan Liu","Juepeng Zheng","Weijia Li","Runmin Dong","Haohuan Fu"],"pdf_url":"https://arxiv.org/pdf/2406.16439v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08466v1","updated":"2024-11-13T09:37:24Z","published":"2024-11-13T09:37:24Z","title":"Can MLLMs Guide Weakly-Supervised Temporal Action Localization Tasks?","summary":" Recent breakthroughs in Multimodal Large Language Models (MLLMs) have gained\nsignificant recognition within the deep learning community, where the fusion of\nthe Video Foundation Models (VFMs) and Large Language Models(LLMs) has proven\ninstrumental in constructing robust video understanding systems, effectively\nsurmounting constraints associated with predefined visual tasks. These\nsophisticated MLLMs exhibit remarkable proficiency in comprehending videos,\nswiftly attaining unprecedented performance levels across diverse benchmarks.\nHowever, their operation demands substantial memory and computational\nresources, underscoring the continued importance of traditional models in video\ncomprehension tasks. In this paper, we introduce a novel learning paradigm\ntermed MLLM4WTAL. This paradigm harnesses the potential of MLLM to offer\ntemporal action key semantics and complete semantic priors for conventional\nWeakly-supervised Temporal Action Localization (WTAL) methods. MLLM4WTAL\nfacilitates the enhancement of WTAL by leveraging MLLM guidance. It achieves\nthis by integrating two distinct modules: Key Semantic Matching (KSM) and\nComplete Semantic Reconstruction (CSR). These modules work in tandem to\neffectively address prevalent issues like incomplete and over-complete outcomes\ncommon in WTAL methods. 
Rigorous experiments are conducted to validate the\nefficacy of our proposed approach in augmenting the performance of various\nheterogeneous WTAL models.\n","authors":["Quan Zhang","Yuxin Qi"],"pdf_url":"https://arxiv.org/pdf/2411.08466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06098v2","updated":"2024-11-13T09:31:14Z","published":"2024-11-09T07:19:56Z","title":"LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning","summary":" Deep long-tailed recognition has been widely studied to address the issue of\nimbalanced data distributions in real-world scenarios. However, there has been\ninsufficient focus on the design of neural architectures, despite empirical\nevidence suggesting that architecture can significantly impact performance. In\nthis paper, we attempt to mitigate long-tailed issues through architectural\nimprovements. To simplify the design process, we utilize Differential\nArchitecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS\nmethods struggle to perform well in long-tailed scenarios. To tackle this\nchallenge, we introduce Long-Tailed Differential Architecture Search\n(LT-DARTS). Specifically, we conduct extensive experiments to explore\narchitectural components that demonstrate better performance on long-tailed\ndata and propose a new search space based on our observations. This ensures\nthat the architecture obtained through our search process incorporates superior\ncomponents. Additionally, we propose replacing the learnable linear classifier\nwith an Equiangular Tight Frame (ETF) classifier to further enhance our method.\nThis classifier effectively alleviates the biased search process and prevents\nperformance collapse. 
Extensive experimental evaluations demonstrate that our\napproach consistently improves upon existing methods from an orthogonal\nperspective and achieves state-of-the-art results with simple enhancements.\n","authors":["Yuhan Pan","Yanan Sun","Wei Gong"],"pdf_url":"https://arxiv.org/pdf/2411.06098v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08460v1","updated":"2024-11-13T09:31:06Z","published":"2024-11-13T09:31:06Z","title":"Trap-MID: Trapdoor-based Defense against Model Inversion Attacks","summary":" Model Inversion (MI) attacks pose a significant threat to the privacy of Deep\nNeural Networks by recovering training data distribution from well-trained\nmodels. While existing defenses often rely on regularization techniques to\nreduce information leakage, they remain vulnerable to recent attacks. In this\npaper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to\nmislead MI attacks. A trapdoor is integrated into the model to predict a\nspecific label when the input is injected with the corresponding trigger.\nConsequently, this trapdoor information serves as the \"shortcut\" for MI\nattacks, leading them to extract trapdoor triggers rather than private data. We\nprovide theoretical insights into the impacts of trapdoor's effectiveness and\nnaturalness on deceiving MI attacks. In addition, empirical experiments\ndemonstrate the state-of-the-art defense performance of Trap-MID against\nvarious MI attacks without the requirements for extra data or large\ncomputational overhead. 
Our source code is publicly available at\nhttps://github.com/ntuaislab/Trap-MID.\n","authors":["Zhen-Ting Liu","Shang-Tse Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08460v1.pdf","comment":"Accepted by Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2411.00393v4","updated":"2024-11-13T09:27:41Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08453v1","updated":"2024-11-13T09:16:21Z","published":"2024-11-13T09:16:21Z","title":"Biomass phenotyping of oilseed rape through UAV multi-view oblique\n imaging with 3DGS and SAM model","summary":" Biomass estimation of oilseed rape is crucial for optimizing crop\nproductivity and breeding strategies. 
While UAV-based imaging has advanced\nhigh-throughput phenotyping, current methods often rely on orthophoto images,\nwhich struggle with overlapping leaves and incomplete structural information in\ncomplex field environments. This study integrates 3D Gaussian Splatting (3DGS)\nwith the Segment Anything Model (SAM) for precise 3D reconstruction and biomass\nestimation of oilseed rape. UAV multi-view oblique images from 36 angles were\nused to perform 3D reconstruction, with the SAM module enhancing point cloud\nsegmentation. The segmented point clouds were then converted into point cloud\nvolumes, which were fitted to ground-measured biomass using linear regression.\nThe results showed that 3DGS (7k and 30k iterations) provided high accuracy,\nwith peak signal-to-noise ratios (PSNR) of 27.43 and 29.53 and training times\nof 7 and 49 minutes, respectively. This performance exceeded that of structure\nfrom motion (SfM) and mipmap Neural Radiance Fields (Mip-NeRF), demonstrating\nsuperior efficiency. The SAM module achieved high segmentation accuracy, with a\nmean intersection over union (mIoU) of 0.961 and an F1-score of 0.980.\nAdditionally, a comparison of biomass extraction models found the point cloud\nvolume model to be the most accurate, with an determination coefficient (R2) of\n0.976, root mean square error (RMSE) of 2.92 g/plant, and mean absolute\npercentage error (MAPE) of 6.81%, outperforming both the plot crop volume and\nindividual crop volume models. 
This study highlights the potential of combining\n3DGS with multi-view UAV imaging for improved biomass phenotyping.\n","authors":["Yutao Shen","Hongyu Zhou","Xin Yang","Xuqi Lu","Ziyue Guo","Lixi Jiang","Yong He","Haiyan Cen"],"pdf_url":"https://arxiv.org/pdf/2411.08453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08451v1","updated":"2024-11-13T09:14:35Z","published":"2024-11-13T09:14:35Z","title":"AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference\n Understanding","summary":" Embodied reference understanding is crucial for intelligent agents to predict\nreferents based on human intention through gesture signals and language\ndescriptions. This paper introduces the Attention-Dynamic DINO, a novel\nframework designed to mitigate misinterpretations of pointing gestures across\nvarious interaction contexts. Our approach integrates visual and textual\nfeatures to simultaneously predict the target object's bounding box and the\nattention source in pointing gestures. Leveraging the distance-aware nature of\nnonverbal communication in visual perspective taking, we extend the virtual\ntouch line mechanism and propose an attention-dynamic touch line to represent\nreferring gesture based on interactive distances. The combination of this\ndistance-aware approach and independent prediction of the attention source,\nenhances the alignment between objects and the gesture represented line.\nExtensive experiments on the YouRefIt dataset demonstrate the efficacy of our\ngesture information understanding method in significantly improving task\nperformance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and,\nnotably, surpasses human performance at the 0.75 IoU threshold, marking a first\nin this domain. 
Comparative experiments with distance-unaware understanding\nmethods from previous research further validate the superiority of the\nAttention-Dynamic Touch Line across diverse contexts.\n","authors":["Hao Guo","Wei Fan","Baichun Wei","Jianfei Zhu","Jin Tian","Chunzhi Yi","Feng Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.08451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.06645v4","updated":"2024-11-13T09:14:12Z","published":"2024-10-09T07:57:47Z","title":"Continual Learning in the Frequency Domain","summary":" Continual learning (CL) is designed to learn new tasks while preserving\nexisting knowledge. Replaying samples from earlier tasks has proven to be an\neffective method to mitigate the forgetting of previously acquired knowledge.\nHowever, the current research on the training efficiency of rehearsal-based\nmethods is insufficient, which limits the practical application of CL systems\nin resource-limited scenarios. The human visual system (HVS) exhibits varying\nsensitivities to different frequency components, enabling the efficient\nelimination of visually redundant information. Inspired by HVS, we propose a\nnovel framework called Continual Learning in the Frequency Domain (CLFD). To\nour knowledge, this is the first study to utilize frequency domain features to\nenhance the performance and efficiency of CL training on edge devices. For the\ninput features of the feature extractor, CLFD employs wavelet transform to map\nthe original input image into the frequency domain, thereby effectively\nreducing the size of input feature maps. Regarding the output features of the\nfeature extractor, CLFD selectively utilizes output features for distinct\nclasses for classification, thereby balancing the reusability and interference\nof output features based on the frequency domain similarity of the classes\nacross various tasks. 
Optimizing only the input and output features of the\nfeature extractor allows for seamless integration of CLFD with various\nrehearsal-based methods. Extensive experiments conducted in both cloud and edge\nenvironments demonstrate that CLFD consistently improves the performance of\nstate-of-the-art (SOTA) methods in both precision and training efficiency.\nSpecifically, CLFD can increase the accuracy of the SOTA CL method by up to\n6.83% and reduce the training time by 2.6$\\times$.\n","authors":["Ruiqi Liu","Boyu Diao","Libo Huang","Zijia An","Zhulin An","Yongjun Xu"],"pdf_url":"https://arxiv.org/pdf/2410.06645v4.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2410.23828v2","updated":"2024-11-13T09:06:18Z","published":"2024-10-31T11:20:13Z","title":"Show Me What and Where has Changed? Question Answering and Grounding for\n Remote Sensing Change Detection","summary":" Remote sensing change detection aims to perceive changes occurring on the\nEarth's surface from remote sensing data in different periods, and feed these\nchanges back to humans. However, most existing methods only focus on detecting\nchange regions, lacking the capability to interact with users to identify\nchanges that the users expect. In this paper, we introduce a new task named\nChange Detection Question Answering and Grounding (CDQAG), which extends the\ntraditional change detection task by providing interpretable textual answers\nand intuitive visual evidence. To this end, we construct the first CDQAG\nbenchmark dataset, termed QAG-360K, comprising over 360K triplets of questions,\ntextual answers, and corresponding high-quality visual masks. 
It encompasses 10\nessential land-cover categories and 8 comprehensive question types, which\nprovides a valuable and diverse dataset for remote sensing applications.\nFurthermore, we present VisTA, a simple yet effective baseline method that\nunifies the tasks of question answering and grounding by delivering both visual\nand textual answers. Our method achieves state-of-the-art results on both the\nclassic change detection-based visual question answering (CDVQA) and the\nproposed CDQAG datasets. Extensive qualitative and quantitative experimental\nresults provide useful insights for developing better CDQAG models, and we hope\nthat our work can inspire further research in this important yet underexplored\nresearch field. The proposed benchmark dataset and method are available at\nhttps://github.com/like413/VisTA.\n","authors":["Ke Li","Fuyu Dong","Di Wang","Shaofeng Li","Quan Wang","Xinbo Gao","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2410.23828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.21991v4","updated":"2024-11-13T08:59:31Z","published":"2024-10-29T12:22:07Z","title":"From Explicit Rules to Implicit Reasoning in an Interpretable Violence\n Monitoring System","summary":" Recently, research based on pre-trained models has demonstrated outstanding\nperformance in violence surveillance tasks. However, most of them were\nblack-box systems which faced challenges regarding explainability during\ntraining and inference processes. An important question is how to incorporate\nexplicit knowledge into these implicit models, thereby designing expert-driven\nand interpretable violence surveillance systems. This paper proposes a new\nparadigm for weakly supervised violence monitoring (WSVM) called Rule base\nViolence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure\nwith different designs for images and text. 
One of the branches is called the\nimplicit branch, which uses only visual features for coarse-grained binary\nclassification. In this branch, image feature extraction is divided into two\nchannels: one responsible for extracting scene frames and the other focusing on\nextracting actions. The other branch is called the explicit branch, which\nutilizes language-image alignment to perform fine-grained classification. For\nthe language channel design in the explicit branch, the proposed RuleCLIP uses\nthe state-of-the-art YOLO-World model to detect objects in video frames, and\nassociation rules are identified through data mining methods as descriptions of\nthe video. Leveraging the dual-branch architecture, RuleVM achieves\ninterpretable coarse-grained and fine-grained violence surveillance. Extensive\nexperiments were conducted on two commonly used benchmarks, and the results\nshow that RuleCLIP achieved the best performance in both coarse-grained and\nfine-grained monitoring, significantly outperforming existing state-of-the-art\nmethods. Moreover, interpretability experiments uncovered some interesting\nrules, such as the observation that as the number of people increases, the risk\nlevel of violent behavior also rises.\n","authors":["Wen-Dong Jiang","Chih-Yung Chang","Ssu-Chi Kuai","Diptendu Sinha Roy"],"pdf_url":"https://arxiv.org/pdf/2410.21991v4.pdf","comment":"12 pages,7 figures IEEE TSMCA (Under review)"},{"id":"http://arxiv.org/abs/2411.08443v1","updated":"2024-11-13T08:56:35Z","published":"2024-11-13T08:56:35Z","title":"Machine Unlearning on Pre-trained Models by Residual Feature Alignment\n Using LoRA","summary":" Machine unlearning is new emerged technology that removes a subset of the\ntraining data from a trained model without affecting the model performance on\nthe remaining data. This topic is becoming increasingly important in protecting\nuser privacy and eliminating harmful or outdated data. 
The key challenge lies\nin effectively and efficiently unlearning specific information without\ncompromising the model's utility on the retained data. For the pre-trained\nmodels, fine-tuning is an important way to achieve the unlearning target.\nPrevious work typically fine-tuned the entire model's parameters, which incurs\nsignificant computation costs. In addition, the fine-tuning process may cause\nshifts in the intermediate layer features, affecting the model's overall\nutility. In this work, we propose a novel and efficient machine unlearning\nmethod on pre-trained models. We term the method as Residual Feature Alignment\nUnlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose\nthe model's intermediate features into pre-trained features and residual\nfeatures. By adjusting the residual features, we align the unlearned model with\nthe pre-trained model at the intermediate feature level to achieve both\nunlearning and remaining targets. The method aims to learn the zero residuals\non the retained set and shifted residuals on the unlearning set. Extensive\nexperiments on numerous datasets validate the effectiveness of our approach.\n","authors":["Laiqiao Qin","Tianqing Zhu","Linlin Wang","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.09828v4","updated":"2024-11-13T08:34:48Z","published":"2024-05-16T06:05:08Z","title":"*: Improving the 3D detector by introducing Voxel2Pillar feature\n encoding and extracting multi-scale features","summary":" The multi-line LiDAR is widely used in autonomous vehicles, so point\ncloud-based 3D detectors are essential for autonomous driving. Extracting rich\nmulti-scale features is crucial for point cloud-based 3D detectors in\nautonomous driving due to significant differences in the size of different\ntypes of objects. 
However, because of the real-time requirements, large-size\nconvolution kernels are rarely used to extract large-scale features in the\nbackbone. Current 3D detectors commonly use feature pyramid networks to obtain\nlarge-scale features; however, some objects containing fewer point clouds are\nfurther lost during down-sampling, resulting in degraded performance. Since\npillar-based schemes require much less computation than voxel-based schemes,\nthey are more suitable for constructing real-time 3D detectors. Hence, we\npropose the *, a pillar-based scheme. We redesigned the feature encoding, the\nbackbone, and the neck of the 3D detector. We propose the Voxel2Pillar feature\nencoding, which uses a sparse convolution constructor to construct pillars with\nricher point cloud features, especially height features. The Voxel2Pillar adds\nmore learnable parameters to the feature encoding, enabling the initial pillars\nto have higher performance ability. We extract multi-scale and large-scale\nfeatures in the proposed fully sparse backbone, which does not utilize\nlarge-size convolutional kernels; the backbone consists of the proposed\nmulti-scale feature extraction module. The neck consists of the proposed sparse\nConvNeXt, whose simple structure significantly improves the performance. 
We\nvalidate the effectiveness of the proposed * on the Waymo Open Dataset, and the\nobject detection accuracy for vehicles, pedestrians, and cyclists is improved.\nWe also verify the effectiveness of each proposed module in detail through\nablation studies.\n","authors":["Xusheng Li","Chengliang Wang","Shumao Wang","Zhuo Zeng","Ji Liu"],"pdf_url":"https://arxiv.org/pdf/2405.09828v4.pdf","comment":"Due to experimental data errors, it needs to be withdrawn"},{"id":"http://arxiv.org/abs/2411.04919v2","updated":"2024-11-13T08:32:27Z","published":"2024-11-07T17:56:16Z","title":"Stem-OB: Generalizable Visual Imitation Learning with Stem-Like\n Convergent Observation through Diffusion Inversion","summary":" Visual imitation learning methods demonstrate strong performance, yet they\nlack generalization when faced with visual input perturbations, including\nvariations in lighting and textures, impeding their real-world application. We\npropose Stem-OB that utilizes pretrained image diffusion models to suppress\nlow-level visual differences while maintaining high-level scene structures.\nThis image inversion process is akin to transforming the observation into a\nshared representation, from which other observations stem, with extraneous\ndetails removed. Stem-OB contrasts with data-augmentation approaches as it is\nrobust to various unspecified appearance changes without the need for\nadditional training. Our method is a simple yet highly effective plug-and-play\nsolution. Empirical results confirm the effectiveness of our approach in\nsimulated tasks and show an exceptionally significant improvement in real-world\napplications, with an average increase of 22.2% in success rates compared to\nthe best baseline. 
See https://hukz18.github.io/Stem-Ob/ for more info.\n","authors":["Kaizhe Hu","Zihang Rui","Yao He","Yuyao Liu","Pu Hua","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2411.04919v2.pdf","comment":"Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/"},{"id":"http://arxiv.org/abs/2411.07501v2","updated":"2024-11-13T08:30:52Z","published":"2024-11-12T02:57:15Z","title":"LAuReL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v2.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.08424v1","updated":"2024-11-13T08:17:52Z","published":"2024-11-13T08:17:52Z","title":"A Heterogeneous Graph Neural Network Fusing Functional and Structural\n Connectivity for MCI Diagnosis","summary":" Brain connectivity alternations associated with brain disorders have been\nwidely reported in resting-state functional imaging (rs-fMRI) and diffusion\ntensor imaging (DTI). While many dual-modal fusion methods based on graph\nneural networks (GNNs) have been proposed, they generally follow homogenous\nfusion ways ignoring rich heterogeneity of dual-modal information. To address\nthis issue, we propose a novel method that integrates functional and structural\nconnectivity based on heterogeneous graph neural networks (HGNNs) to better\nleverage the rich heterogeneity in dual-modal images. We firstly use blood\noxygen level dependency and whiter matter structure information provided by\nrs-fMRI and DTI to establish homo-meta-path, capturing node relationships\nwithin the same modality. At the same time, we propose to establish\nhetero-meta-path based on structure-function coupling and brain community\nsearching to capture relations among cross-modal nodes. Secondly, we further\nintroduce a heterogeneous graph pooling strategy that automatically balances\nhomo- and hetero-meta-path, effectively leveraging heterogeneous information\nand preventing feature confusion after pooling. 
Thirdly, based on the\nflexibility of heterogeneous graphs, we propose a heterogeneous graph data\naugmentation approach that can conveniently address the sample imbalance issue\ncommonly seen in clinical diagnosis. We evaluate our method on ADNI-3 dataset\nfor mild cognitive impairment (MCI) diagnosis. Experimental results indicate\nthe proposed method is effective and superior to other algorithms, with a mean\nclassification accuracy of 93.3%.\n","authors":["Feiyu Yin","Yu Lei","Siyuan Dai","Wenwen Zeng","Guoqing Wu","Liang Zhan","Jinhua Yu"],"pdf_url":"https://arxiv.org/pdf/2411.08424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07579v2","updated":"2024-11-13T08:00:57Z","published":"2024-11-12T06:29:48Z","title":"Projecting Gaussian Ellipsoids While Avoiding Affine Projection\n Approximation","summary":" Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its\nreal-time rendering speed and state-of-the-art rendering quality. However,\nduring the rendering process, the use of the Jacobian of the affine\napproximation of the projection transformation leads to inevitable errors,\nresulting in blurriness, artifacts and a lack of scene consistency in the final\nrendered images. To address this issue, we introduce an ellipsoid-based\nprojection method to calculate the projection of Gaussian ellipsoid on the\nimage plane, witch is the primitive of 3D Gaussian Splatting. As our proposed\nellipsoid-based projection method cannot handle Gaussian ellipsoids with camera\norigins inside them or parts lying below $z=0$ plane in the camera space, we\ndesigned a pre-filtering strategy. 
Experiments over multiple widely adopted\nbenchmark datasets show that using our ellipsoid-based projection method can\nenhance the rendering quality of 3D Gaussian Splatting and its extensions.\n","authors":["Han Qi","Tao Cai","Xiyue Han"],"pdf_url":"https://arxiv.org/pdf/2411.07579v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08410v1","updated":"2024-11-13T07:57:19Z","published":"2024-11-13T07:57:19Z","title":"The VLLM Safety Paradox: Dual Ease in Jailbreak Attack and Defense","summary":" The vulnerability of Vision Large Language Models (VLLMs) to jailbreak\nattacks appears as no surprise. However, recent defense mechanisms against\nthese attacks have reached near-saturation performance on benchmarks, often\nwith minimal effort. This simultaneous high performance in both attack and\ndefense presents a perplexing paradox. Resolving it is critical for advancing\nthe development of trustworthy models. To address this research gap, we first\ninvestigate why VLLMs are prone to these attacks. We then make a key\nobservation: existing defense mechanisms suffer from an \\textbf{over-prudence}\nproblem, resulting in unexpected abstention even in the presence of benign\ninputs. Additionally, we find that the two representative evaluation methods\nfor jailbreak often exhibit chance agreement. This limitation makes it\npotentially misleading when evaluating attack strategies or defense mechanisms.\nBeyond these empirical observations, our another contribution in this work is\nto repurpose the guardrails of LLMs on the shelf, as an effective alternative\ndetector prior to VLLM response. 
We believe these findings offer useful\ninsights to rethink the foundational development of VLLM safety with respect to\nbenchmark datasets, evaluation methods, and defense strategies.\n","authors":["Yangyang Guo","Fangkai Jiao","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2411.08410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08402v1","updated":"2024-11-13T07:41:47Z","published":"2024-11-13T07:41:47Z","title":"V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with\n Denoising Diffusion","summary":" Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D\nobject detection using LiDAR and camera data. However, these methods suffer\nfrom performance degradation in adverse weather conditions. The weatherrobust\n4D radar provides Doppler and additional geometric information, raising the\npossibility of addressing this challenge. To this end, we present V2X-R, the\nfirst simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R\ncontains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point\nclouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes.\nSubsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for\n3D object detection and implement it with various fusion strategies. To achieve\nweather-robust detection, we additionally propose a Multi-modal Denoising\nDiffusion (MDD) module in our fusion pipeline. MDD utilizes weather-robust 4D\nradar feature as a condition to prompt the diffusion model to denoise noisy\nLiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline\ndemonstrates superior performance in the V2X-R dataset. Over and above this,\nour MDD module further improved the performance of basic fusion model by up to\n5.73%/6.70% in foggy/snowy conditions with barely disrupting normal\nperformance. 
The dataset and code will be publicly available at:\nhttps://github.com/ylwhxht/V2X-R.\n","authors":["Xun Huang","Jinlong Wang","Qiming Xia","Siheng Chen","Bisheng Yang","Cheng Wang","Chenglu Wen"],"pdf_url":"https://arxiv.org/pdf/2411.08402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08395v1","updated":"2024-11-13T07:27:56Z","published":"2024-11-13T07:27:56Z","title":"MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion\n Prompt for Ultrasound Needle Tracking","summary":" Ultrasound (US)-guided needle insertion is widely employed in percutaneous\ninterventions. However, providing feedback on the needle tip position via US\nimage presents challenges due to noise, artifacts, and the thin imaging plane\nof US, which degrades needle features and leads to intermittent tip visibility.\nIn this paper, a Mamba-based US needle tracker MambaXCTrack utilizing\nstructured state space models cross-correlation (SSMX-Corr) and implicit motion\nprompt is proposed, which is the first application of Mamba in US needle\ntracking. The SSMX-Corr enhances cross-correlation by long-range modeling and\nglobal searching of distant semantic features between template and search maps,\nbenefiting the tracking under noise and artifacts by implicitly learning\npotential distant semantic cues. By combining with cross-map interleaved scan\n(CIS), local pixel-wise interaction with positional inductive bias can also be\nintroduced to SSMX-Corr. The implicit low-level motion descriptor is proposed\nas a non-visual prompt to enhance tracking robustness, addressing the\nintermittent tip visibility problem. 
Extensive experiments on a dataset with\nmotorized needle insertion in both phantom and tissue samples demonstrate that\nthe proposed tracker outperforms other state-of-the-art trackers while ablation\nstudies further highlight the effectiveness of each proposed tracking module.\n","authors":["Yuelin Zhang","Qingpeng Ding","Long Lei","Jiwei Shan","Wenxuan Xie","Tianyi Zhang","Wanquan Yan","Raymond Shing-Yan Tang","Shing Shin Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.08395v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2411.07265v2","updated":"2024-11-13T07:26:33Z","published":"2024-11-09T13:13:49Z","title":"ViTOC: Vision Transformer and Object-aware Captioner","summary":" This paper presents ViTOC (Vision Transformer and Object-aware Captioner), a\nnovel vision-language model for image captioning that addresses the challenges\nof accuracy and diversity in generated descriptions. Unlike conventional\napproaches, ViTOC employs a dual-path architecture based on Vision Transformer\nand object detector, effectively fusing global visual features and local object\ninformation through learnable vectors. The model introduces an innovative\nobject-aware prompting strategy that significantly enhances its capability in\nhandling long-tail data. Experiments on the standard COCO dataset demonstrate\nthat ViTOC outperforms baseline models across all evaluation metrics.\nAdditionally, we propose a reference-free evaluation method based on CLIP to\nfurther validate the model's effectiveness. 
By utilizing pretrained visual\nmodel parameters, ViTOC achieves efficient end-to-end training.\n","authors":["Feiyang Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07265v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08380v1","updated":"2024-11-13T07:05:40Z","published":"2024-11-13T07:05:40Z","title":"EgoVid-5M: A Large-Scale Video-Action Dataset for Egocentric Video\n Generation","summary":" Video generation has emerged as a promising tool for world simulation,\nleveraging visual data to replicate real-world environments. Within this\ncontext, egocentric video generation, which centers on the human perspective,\nholds significant potential for enhancing applications in virtual reality,\naugmented reality, and gaming. However, the generation of egocentric videos\npresents substantial challenges due to the dynamic nature of egocentric\nviewpoints, the intricate diversity of actions, and the complex variety of\nscenes encountered. Existing datasets are inadequate for addressing these\nchallenges effectively. To bridge this gap, we present EgoVid-5M, the first\nhigh-quality dataset specifically curated for egocentric video generation.\nEgoVid-5M encompasses 5 million egocentric video clips and is enriched with\ndetailed action annotations, including fine-grained kinematic control and\nhigh-level textual descriptions. To ensure the integrity and usability of the\ndataset, we implement a sophisticated data cleaning pipeline designed to\nmaintain frame consistency, action coherence, and motion smoothness under\negocentric conditions. Furthermore, we introduce EgoDreamer, which is capable\nof generating egocentric videos driven simultaneously by action descriptions\nand kinematic control signals. 
The EgoVid-5M dataset, associated action\nannotations, and all data cleansing metadata will be released for the\nadvancement of research in egocentric video generation.\n","authors":["Xiaofeng Wang","Kang Zhao","Feng Liu","Jiayu Wang","Guosheng Zhao","Xiaoyi Bao","Zheng Zhu","Yingya Zhang","Xingang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08380v1.pdf","comment":"Project Page: https://egovid.github.io/"},{"id":"http://arxiv.org/abs/2411.08371v1","updated":"2024-11-13T06:42:03Z","published":"2024-11-13T06:42:03Z","title":"Multiscale Graph Construction Using Non-local Cluster Features","summary":" This paper presents a multiscale graph construction method using both graph\nand signal features. Multiscale graph is a hierarchical representation of the\ngraph, where a node at each level indicates a cluster in a finer resolution. To\nobtain the hierarchical clusters, existing methods often use graph clustering;\nhowever, they may ignore signal variations. As a result, these methods could\nfail to detect the clusters having similar features on nodes. In this paper, we\nconsider graph and node-wise features simultaneously for multiscale clustering\nof a graph. With given clusters of the graph, the clusters are merged\nhierarchically in three steps: 1) Feature vectors in the clusters are\nextracted. 2) Similarities among cluster features are calculated using optimal\ntransport. 3) A variable $k$-nearest neighbor graph (V$k$NNG) is constructed\nand graph spectral clustering is applied to the V$k$NNG to obtain clusters at a\ncoarser scale. Additionally, the multiscale graph in this paper has\n\\textit{non-local} characteristics: Nodes with similar features are merged even\nif they are spatially separated. 
In experiments on multiscale image and point\ncloud segmentation, we demonstrate the effectiveness of the proposed method.\n","authors":["Reina Kaneko","Hayate Kojima","Kenta Yanagiya","Junya Hara","Hiroshi Higashi","Yuichi Tanaka"],"pdf_url":"https://arxiv.org/pdf/2411.08371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.18054v3","updated":"2024-11-13T06:25:23Z","published":"2024-06-26T04:12:34Z","title":"Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image\n Translation","summary":" The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology\nare Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides\noffer high quality histopathological images but require a labor-intensive\nacquisition process. In contrast, FF slides can be prepared quickly, but the\nimage quality is relatively poor. Our task is to translate FF images into FFPE\nstyle, thereby improving the image quality for diagnostic purposes. In this\npaper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological\nimage translation using a pre-trained diffusion model. Specifically, we utilize\na one-step diffusion model as the generator, which we fine-tune using LoRA\nadapters within an adversarial learning framework. To enable the model to\neffectively capture both global structural patterns and local details, we\nintroduce a multi-scale feature fusion module that leverages two VAE encoders\nto extract features at different image resolutions, performing feature fusion\nbefore inputting them into the UNet. Additionally, a pre-trained\nvision-language model for histopathology serves as the backbone for the\ndiscriminator, enhancing model performance. Our FF-to-FFPE translation\nexperiments on the TCGA-NSCLC dataset demonstrate that the proposed approach\noutperforms existing methods. 
The code and models are released at\nhttps://github.com/QilaiZhang/Diffusion-FFPE.\n","authors":["Qilai Zhang","Jiawen Li","Peiran Liao","Jiali Hu","Tian Guan","Anjia Han","Yonghong He"],"pdf_url":"https://arxiv.org/pdf/2406.18054v3.pdf","comment":"Accepted at IEEE BIBM 2024"},{"id":"http://arxiv.org/abs/2411.04493v2","updated":"2024-11-13T05:52:23Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08347v1","updated":"2024-11-13T05:38:55Z","published":"2024-11-13T05:38:55Z","title":"A Chinese Multi-label Affective Computing Dataset Based on Social Media\n Network Users","summary":" Emotion and personality are central elements in understanding human\npsychological states. 
Emotions reflect an individual subjective experiences,\nwhile personality reveals relatively stable behavioral and cognitive patterns.\nExisting affective computing datasets often annotate emotion and personality\ntraits separately, lacking fine-grained labeling of micro-emotions and emotion\nintensity in both single-label and multi-label classifications. Chinese emotion\ndatasets are extremely scarce, and datasets capturing Chinese user personality\ntraits are even more limited. To address these gaps, this study collected data\nfrom the major social media platform Weibo, screening 11,338 valid users from\nover 50,000 individuals with diverse MBTI personality labels and acquiring\n566,900 posts along with the user MBTI personality tags. Using the EQN method,\nwe compiled a multi-label Chinese affective computing dataset that integrates\nthe same user's personality traits with six emotions and micro-emotions, each\nannotated with intensity levels. Validation results across multiple NLP\nclassification models demonstrate the dataset strong utility. This dataset is\ndesigned to advance machine recognition of complex human emotions and provide\ndata support for research in psychology, education, marketing, finance, and\npolitics.\n","authors":["Jingyi Zhou","Senlin Luo","Haofan Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08347v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.18066v2","updated":"2024-11-13T05:15:53Z","published":"2024-02-28T05:52:25Z","title":"Six-Point Method for Multi-Camera Systems with Reduced Solution Space","summary":" Relative pose estimation using point correspondences (PC) is a widely used\ntechnique. A minimal configuration of six PCs is required for two views of\ngeneralized cameras. 
In this paper, we present several minimal solvers that use\nsix PCs to compute the 6DOF relative pose of multi-camera systems, including a\nminimal solver for the generalized camera and two minimal solvers for the\npractical configuration of two-camera rigs. The equation construction is based\non the decoupling of rotation and translation. Rotation is represented by\nCayley or quaternion parametrization, and translation can be eliminated by\nusing the hidden variable technique. Ray bundle constraints are found and\nproven when a subset of PCs relate the same cameras across two views. This is\nthe key to reducing the number of solutions and generating numerically stable\nsolvers. Moreover, all configurations of six-point problems for multi-camera\nsystems are enumerated. Extensive experiments demonstrate the superior accuracy\nand efficiency of our solvers compared to state-of-the-art six-point methods.\nThe code is available at https://github.com/jizhaox/relpose-6pt\n","authors":["Banglei Guan","Ji Zhao","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2402.18066v2.pdf","comment":"Accepted to the European Conference on Computer Vision (ECCV), 2024,\n for an oral presentation"},{"id":"http://arxiv.org/abs/2411.08340v1","updated":"2024-11-13T05:09:28Z","published":"2024-11-13T05:09:28Z","title":"DyConfidMatch: Dynamic Thresholding and Re-sampling for 3D\n Semi-supervised Learning","summary":" Semi-supervised learning (SSL) leverages limited labeled and abundant\nunlabeled data but often faces challenges with data imbalance, especially in 3D\ncontexts. This study investigates class-level confidence as an indicator of\nlearning status in 3D SSL, proposing a novel method that utilizes dynamic\nthresholding to better use unlabeled data, particularly from underrepresented\nclasses. A re-sampling strategy is also introduced to mitigate bias towards\nwell-represented classes, ensuring equitable class representation. 
Through\nextensive experiments in 3D SSL, our method surpasses state-of-the-art\ncounterparts in classification and detection tasks, highlighting its\neffectiveness in tackling data imbalance. This approach presents a significant\nadvancement in SSL for 3D datasets, providing a robust solution for data\nimbalance issues.\n","authors":["Zhimin Chen","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2411.08340v1.pdf","comment":"Accepted by Pattern Recognition Journal"},{"id":"http://arxiv.org/abs/2411.08335v1","updated":"2024-11-13T04:49:32Z","published":"2024-11-13T04:49:32Z","title":"DEEGITS: Deep Learning based Framework for Measuring Heterogenous\n Traffic State in Challenging Traffic Scenarios","summary":" This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State\nMeasurement), a comprehensive framework that leverages state-of-the-art\nconvolutional neural network (CNN) techniques to accurately and rapidly detect\nvehicles and pedestrians, as well as to measure traffic states in challenging\nscenarios (i.e., congestion, occlusion). In this study, we enhance the training\ndataset through data fusion, enabling simultaneous detection of vehicles and\npedestrians. Image preprocessing and augmentation are subsequently performed to\nimprove the quality and quantity of the dataset. Transfer learning is applied\non the YOLOv8 pretrained model to increase the model's capability to identify a\ndiverse array of vehicles. Optimal hyperparameters are obtained using the Grid\nSearch algorithm, with the Stochastic Gradient Descent (SGD) optimizer\noutperforming other optimizers under these settings. Extensive experimentation\nand evaluation demonstrate substantial accuracy within the detection framework,\nwith the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5\non the test set, surpassing previous benchmarks on similar datasets. 
The\nDeepSORT multi-object tracking algorithm is incorporated to track detected\nvehicles and pedestrians in this study. Finally, the framework is tested to\nmeasure heterogeneous traffic states in mixed traffic conditions. Two locations\nwith differing traffic compositions and congestion levels are selected: one\nmotorized-dominant location with moderate density and one\nnon-motorized-dominant location with higher density. Errors are statistically\ninsignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91\nto 0.97 for heterogeneous traffic flow and speed measurements, respectively.\n","authors":["Muttahirul Islam","Nazmul Haque","Md. Hadiuzzaman"],"pdf_url":"https://arxiv.org/pdf/2411.08335v1.pdf","comment":"Submitted for presentation at the 103 rd Annual Meeting of\n Transportation Research Board and publication in Transportation Research\n Record: Journal of Transportation Research Board"},{"id":"http://arxiv.org/abs/2411.08334v1","updated":"2024-11-13T04:32:58Z","published":"2024-11-13T04:32:58Z","title":"Enhancing Multimodal Query Representation via Visual Dialogues for\n End-to-End Knowledge Retrieval","summary":" Existing multimodal retrieval systems often rely on disjointed models for\nimage comprehension, such as object detectors and caption generators, leading\nto cumbersome implementations and training processes. To overcome this\nlimitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a\ntext retriever with the ability to understand multimodal queries via dynamic\nmodality interaction. Ret-XKnow leverages a partial convolution mechanism to\nfocus on visual information relevant to the given textual query, thereby\nenhancing multimodal query representations. To effectively learn multimodal\ninteraction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset\nautomatically constructed from visual dialogue datasets. 
Our dataset\nconstruction process ensures that the dialogues are transformed into suitable\ninformation retrieval tasks using a text retriever. We demonstrate that our\napproach not only significantly improves retrieval performance in zero-shot\nsettings but also achieves substantial improvements in fine-tuning scenarios.\nOur code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow.\n","authors":["Yeong-Joon Ju","Ho-Joong Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08333v1","updated":"2024-11-13T04:29:34Z","published":"2024-11-13T04:29:34Z","title":"SASE: A Searching Architecture for Squeeze and Excitation Operations","summary":" In the past few years, channel-wise and spatial-wise attention blocks have\nbeen widely adopted as supplementary modules in deep neural networks, enhancing\nnetwork representational abilities while introducing low complexity. Most\nattention modules follow a squeeze-and-excitation paradigm. However, to design\nsuch attention modules, requires a substantial amount of experiments and\ncomputational resources. Neural Architecture Search (NAS), meanwhile, is able\nto automate the design of neural networks and spares the numerous experiments\nrequired for an optimal architecture. This motivates us to design a search\narchitecture that can automatically find near-optimal attention modules through\nNAS. We propose SASE, a Searching Architecture for Squeeze and Excitation\noperations, to form a plug-and-play attention block by searching within certain\nsearch space. The search space is separated into 4 different sets, each\ncorresponds to the squeeze or excitation operation along the channel or spatial\ndimension. Additionally, the search sets include not only existing attention\nblocks but also other operations that have not been utilized in attention\nmechanisms before. 
To the best of our knowledge, SASE is the first attempt to\nsubdivide the attention search space and search for architectures beyond\ncurrently known attention modules. The searched attention module is tested with\nextensive experiments across a range of visual tasks. Experimental results\nindicate that visual backbone networks (ResNet-50/101) using the SASE attention\nmodule achieved the best performance compared to those using the current\nstate-of-the-art attention modules. Codes are included in the supplementary\nmaterial, and they will be made public later.\n","authors":["Hanming Wang","Yunlong Li","Zijun Wu","Huifen Wang","Yuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08333v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08328v1","updated":"2024-11-13T04:20:45Z","published":"2024-11-13T04:20:45Z","title":"Motion Control for Enhanced Complex Action Video Generation","summary":" Existing text-to-video (T2V) models often struggle with generating videos\nwith sufficiently pronounced or complex actions. A key limitation lies in the\ntext prompt's inability to precisely convey intricate motion details. To\naddress this, we propose a novel framework, MVideo, designed to produce\nlong-duration videos with precise, fluid actions. MVideo overcomes the\nlimitations of text prompts by incorporating mask sequences as an additional\nmotion condition input, providing a clearer, more accurate representation of\nintended actions. Leveraging foundational vision models such as GroundingDINO\nand SAM2, MVideo automatically generates mask sequences, enhancing both\nefficiency and robustness. Our results demonstrate that, after training, MVideo\neffectively aligns text prompts with motion conditions to produce videos that\nsimultaneously meet both criteria. This dual control mechanism allows for more\ndynamic video generation by enabling alterations to either the text prompt or\nmotion condition independently, or both in tandem. 
Furthermore, MVideo supports\nmotion condition editing and composition, facilitating the generation of videos\nwith more complex actions. MVideo thus advances T2V motion generation, setting\na strong benchmark for improved action depiction in current video diffusion\nmodels. Our project page is available at https://mvideo-v1.github.io/.\n","authors":["Qiang Zhou","Shaofeng Zhang","Nianzu Yang","Ye Qian","Hao Li"],"pdf_url":"https://arxiv.org/pdf/2411.08328v1.pdf","comment":"Project page: https://mvideo-v1.github.io/"},{"id":"http://arxiv.org/abs/2411.07976v2","updated":"2024-11-13T03:56:10Z","published":"2024-11-12T17:55:39Z","title":"DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring","summary":" Coronary artery disease (CAD), one of the most common cause of mortality in\nthe world. Coronary artery calcium (CAC) scoring using computed tomography (CT)\nis key for risk assessment to prevent coronary disease. Previous studies on\nrisk assessment and calcification detection in CT scans primarily use\napproaches based on UNET architecture, frequently implemented on pre-built\nmodels. However, these models are limited by the availability of annotated CT\nscans containing CAC and suffering from imbalanced dataset, decreasing\nperformance of CAC segmentation and scoring. In this study, we extend this\napproach by incorporating the self-supervised learning (SSL) technique of DINO\n(self-distillation with no labels) to eliminate limitations of scarce annotated\ndata in CT scans. The DINO model's ability to train without requiring CAC area\nannotations enhances its robustness in generating distinct features. The DINO\nmodel is trained on to focus specifically on calcified areas by using labels,\naiming to generate features that effectively capture and highlight key\ncharacteristics. 
The label-guided DINO (DINO-LG) enhances classification by\ndistinguishing CT slices that contain calcification from those that do not,\nperforming 57% better than the standard DINO model in this task. CAC scoring\nand segmentation tasks are performed by a basic U-NET architecture, fed\nspecifically with CT slices containing calcified areas as identified by the\nDINO-LG model. This targeted identification performed by DINO-LG model improves\nCAC segmentation performance by approximately 10% and significant increase in\nCAC scoring accuracy.\n","authors":["Mahmut S. Gokmen","Cody Bumgardner","Caner Ozcan"],"pdf_url":"https://arxiv.org/pdf/2411.07976v2.pdf","comment":"Developed by Center for Applied Artificial Intelligence (CAAI),\n University of Kentucky"},{"id":"http://arxiv.org/abs/2411.06106v2","updated":"2024-11-13T03:19:47Z","published":"2024-11-09T08:00:50Z","title":"Personalize to generalize: Towards a universal medical multi-modality\n generalization through personalization","summary":" The differences among medical imaging modalities, driven by distinct\nunderlying principles, pose significant challenges for generalization in\nmulti-modal medical tasks. Beyond modality gaps, individual variations, such as\ndifferences in organ size and metabolic rate, further impede a model's ability\nto generalize effectively across both modalities and diverse populations.\nDespite the importance of personalization, existing approaches to multi-modal\ngeneralization often neglect individual differences, focusing solely on common\nanatomical features. This limitation may result in weakened generalization in\nvarious medical tasks. In this paper, we unveil that personalization is\ncritical for multi-modal generalization. 
Specifically, we propose an approach\nto achieve personalized generalization through approximating the underlying\npersonalized invariant representation ${X}_h$ across various modalities by\nleveraging individual-level constraints and a learnable biological prior. We\nvalidate the feasibility and benefits of learning a personalized ${X}_h$,\nshowing that this representation is highly generalizable and transferable\nacross various multi-modal medical tasks. Extensive experimental results\nconsistently show that the additionally incorporated personalization\nsignificantly improves performance and generalization across diverse scenarios,\nconfirming its effectiveness.\n","authors":["Zhaorui Tan","Xi Yang","Tan Pan","Tianyi Liu","Chen Jiang","Xin Guo","Qiufeng Wang","Anh Nguyen","Yuan Qi","Kaizhu Huang","Yuan Cheng"],"pdf_url":"https://arxiv.org/pdf/2411.06106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08305v1","updated":"2024-11-13T03:03:30Z","published":"2024-11-13T03:03:30Z","title":"Robust Divergence Learning for Missing-Modality Segmentation","summary":" Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary\ninformation for analyzing brain tumor subregions. While methods using four\ncommon MRI modalities for automatic segmentation have shown success, they often\nface challenges with missing modalities due to image quality issues,\ninconsistent protocols, allergic reactions, or cost factors. Thus, developing a\nsegmentation paradigm that handles missing modalities is clinically valuable. A\nnovel single-modality parallel processing network framework based on H\\\"older\ndivergence and mutual information is introduced. Each modality is independently\ninput into a shared network backbone for parallel processing, preserving unique\ninformation. Additionally, a dynamic sharing framework is introduced that\nadjusts network parameters based on modality availability. 
A H\\\"older\ndivergence and mutual information-based loss functions are used for evaluating\ndiscrepancies between predictions and labels. Extensive testing on the BraTS\n2018 and BraTS 2020 datasets demonstrates that our method outperforms existing\ntechniques in handling missing modalities and validates each component's\neffectiveness.\n","authors":["Runze Cheng","Zhongao Sun","Ye Zhang","Chun Li"],"pdf_url":"https://arxiv.org/pdf/2411.08305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v2","updated":"2024-11-13T02:39:12Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. 
Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at:\n\\url{https://github.com/chikap421/mseg_vcuq}\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v2.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.08293v1","updated":"2024-11-13T02:18:03Z","published":"2024-11-13T02:18:03Z","title":"Choix d'un espace de représentation image adapté à la détection\n de réseaux routiers","summary":" These last years, algorithms allowing to decompose an image into its\nstructures and textures components have emerged. In this paper, we present an\napplication of this type of decomposition to the problem road network detection\nin aerial or satelite imagery. 
The algorithmic procedure involves the image\ndecomposition (using a unique property), an alignment detection step based on\nthe Gestalt theory, and a refinement step using statistical active contours.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08293v1.pdf","comment":"in French language"},{"id":"http://arxiv.org/abs/2411.08292v1","updated":"2024-11-13T02:17:57Z","published":"2024-11-13T02:17:57Z","title":"Noisy image decomposition: a new structure, texture and noise model\n based on local adaptivity","summary":" These last few years, image decomposition algorithms have been proposed to\nsplit an image into two parts: the structures and the textures. These\nalgorithms are not adapted to the case of noisy images because the textures are\ncorrupted by noise. In this paper, we propose a new model which decomposes an\nimage into three parts (structures, textures and noise) based on a local\nregularization scheme. We compare our results with the recent work of Aujol and\nChambolle. We finish by giving another model which combines the advantages of\nthe two previous ones.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08292v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2411.05265"},{"id":"http://arxiv.org/abs/2411.08291v1","updated":"2024-11-13T02:17:52Z","published":"2024-11-13T02:17:52Z","title":"Restoration algorithms and system performance evaluation for active\n imagers","summary":" This paper deals with two fields related to active imaging system. First, we\nbegin to explore image processing algorithms to restore the artefacts like\nspeckle, scintillation and image dancing caused by atmospheric turbulence.\nNext, we examine how to evaluate the performance of this kind of systems. To do\nthis task, we propose a modified version of the german TRM3 metric which\npermits to get MTF-like measures. 
We use the database acquired during NATO-TG40\nfield trials to make our tests.\n","authors":["Jerome Gilles"],"pdf_url":"https://arxiv.org/pdf/2411.08291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03644v2","updated":"2024-11-13T01:45:31Z","published":"2024-09-05T16:02:11Z","title":"RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in\n Generated Images","summary":" In recent years, diffusion models have revolutionized visual generation,\noutperforming traditional frameworks like Generative Adversarial Networks\n(GANs). However, generating images of humans with realistic semantic parts,\nsuch as hands and faces, remains a significant challenge due to their intricate\nstructural complexity. To address this issue, we propose a novel\npost-processing solution named RealisHuman. The RealisHuman framework operates\nin two stages. First, it generates realistic human parts, such as hands or\nfaces, using the original malformed parts as references, ensuring consistent\ndetails with the original image. Second, it seamlessly integrates the rectified\nhuman parts back into their corresponding positions by repainting the\nsurrounding areas to ensure smooth and realistic blending. The RealisHuman\nframework significantly enhances the realism of human generation, as\ndemonstrated by notable improvements in both qualitative and quantitative\nmetrics. Code is available at https://github.com/Wangbenzhi/RealisHuman.\n","authors":["Benzhi Wang","Jingkai Zhou","Jingqi Bai","Yang Yang","Weihua Chen","Fan Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2409.03644v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. 
At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08279v1","updated":"2024-11-13T01:38:06Z","published":"2024-11-13T01:38:06Z","title":"MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields\n Representation","summary":" Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and\n3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in\nSimultaneous Localization and Mapping (SLAM) for photo-realistic rendering,\nparticularly when using high-quality video sequences as input. However,\nexisting methods struggle with motion-blurred frames, which are common in\nreal-world scenarios like low-light or long-exposure conditions. 
This often\nresults in a significant reduction in both camera localization accuracy and map\nreconstruction quality. To address this challenge, we propose a dense visual\nSLAM pipeline (i.e. MBA-SLAM) to handle severe motion-blurred inputs. Our\napproach integrates an efficient motion blur-aware tracker with either neural\nradiance fields or Gaussian Splatting based mapper. By accurately modeling the\nphysical image formation process of motion-blurred images, our method\nsimultaneously learns 3D scene representation and estimates the cameras' local\ntrajectory during exposure time, enabling proactive compensation for motion\nblur caused by camera movement. In our experiments, we demonstrate that\nMBA-SLAM surpasses previous state-of-the-art methods in both camera\nlocalization and map reconstruction, showcasing superior performance across a\nrange of datasets, including synthetic and real datasets featuring sharp images\nas well as those affected by motion blur, highlighting the versatility and\nrobustness of our approach. Code is available at\nhttps://github.com/WU-CVGL/MBA-SLAM.\n","authors":["Peng Wang","Lingzhe Zhao","Yin Zhang","Shiyu Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08272v1","updated":"2024-11-13T00:49:05Z","published":"2024-11-13T00:49:05Z","title":"LBONet: Supervised Spectral Descriptors for Shape Analysis","summary":" The Laplace-Beltrami operator has established itself in the field of\nnon-rigid shape analysis due to its many useful properties such as being\ninvariant under isometric transformation, having a countable eigensystem\nforming an orthonormal basis, and fully characterizing geodesic distances of\nthe manifold. However, this invariancy only applies under isometric\ndeformations, which leads to a performance breakdown in many real-world\napplications. 
In recent years emphasis has been placed upon extracting optimal\nfeatures using deep learning methods, however spectral signatures play a\ncrucial role and still add value. In this paper we take a step back, revisiting\nthe LBO and proposing a supervised way to learn several operators on a\nmanifold. Depending on the task, by applying these functions, we can train the\nLBO eigenbasis to be more task-specific. The optimization of the LBO leads to\nenormous improvements to established descriptors such as the heat kernel\nsignature in various tasks such as retrieval, classification, segmentation, and\ncorrespondence, proving the adaption of the LBO eigenbasis to both global and\nhighly local learning settings.\n","authors":["Oguzhan Yigit","Richard C. Wilson"],"pdf_url":"https://arxiv.org/pdf/2411.08272v1.pdf","comment":"14 pages, 13 figure"},{"id":"http://arxiv.org/abs/2404.09995v2","updated":"2024-11-13T00:41:01Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. 
During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v2.pdf","comment":"Accepted to ECCV 2024. Project page:\n https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2406.08164v3","updated":"2024-11-13T00:15:20Z","published":"2024-06-12T12:54:27Z","title":"ConMe: Rethinking Evaluation of Compositional Reasoning for Modern VLMs","summary":" Compositional Reasoning (CR) entails grasping the significance of attributes,\nrelations, and word order. Recent Vision-Language Models (VLMs), comprising a\nvisual encoder and a Large Language Model (LLM) decoder, have demonstrated\nremarkable proficiency in such reasoning tasks. This prompts a crucial\nquestion: have VLMs effectively tackled the CR challenge? We conjecture that\nexisting CR benchmarks may not adequately push the boundaries of modern VLMs\ndue to the reliance on an LLM-only negative text generation pipeline.\nConsequently, the negatives produced either appear as outliers from the natural\nlanguage distribution learned by VLMs' LLM decoders or as improbable within the\ncorresponding image context. To address these limitations, we introduce ConMe\n-- a compositional reasoning benchmark and a novel data generation pipeline\nleveraging VLMs to produce `hard CR Q&A'. Through a new concept of VLMs\nconversing with each other to collaboratively expose their weaknesses, our\npipeline autonomously generates, evaluates, and selects challenging\ncompositional reasoning questions, establishing a robust CR benchmark, also\nsubsequently validated manually. 
Our benchmark provokes a noteworthy, up to\n33%, decrease in CR performance compared to preceding benchmarks, reinstating\nthe CR challenge even for state-of-the-art VLMs.\n","authors":["Irene Huang","Wei Lin","M. Jehanzeb Mirza","Jacob A. Hansen","Sivan Doveh","Victor Ion Butoi","Roei Herzig","Assaf Arbelle","Hilde Kuehne","Trevor Darrell","Chuang Gan","Aude Oliva","Rogerio Feris","Leonid Karlinsky"],"pdf_url":"https://arxiv.org/pdf/2406.08164v3.pdf","comment":"NeurIPS 2024 Camera Ready"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.18659v2","updated":"2024-11-13T15:59:02Z","published":"2024-09-27T11:43:19Z","title":"Explainable Enrichment-Driven GrAph Reasoner (EDGAR) for Large Knowledge\n Graphs with Applications in Drug Repurposing","summary":" Knowledge graphs (KGs) represent connections and relationships between\nreal-world entities. We propose a link prediction framework for KGs named\nEnrichment-Driven GrAph Reasoner (EDGAR), which infers new edges by mining\nentity-local rules. This approach leverages enrichment analysis, a\nwell-established statistical method used to identify mechanisms common to sets\nof differentially expressed genes. EDGAR's inference results are inherently\nexplainable and rankable, with p-values indicating the statistical significance\nof each enrichment-based rule.\n We demonstrate the framework's effectiveness on a large-scale biomedical KG,\nROBOKOP, focusing on drug repurposing for Alzheimer disease (AD) as a case\nstudy. Initially, we extracted 14 known drugs from the KG and identified 20\ncontextual biomarkers through enrichment analysis, revealing functional\npathways relevant to shared drug efficacy for AD. Subsequently, using the top\n1000 enrichment results, our system identified 1246 additional drug candidates\nfor AD treatment. The top 10 candidates were validated using evidence from\nmedical literature.\n EDGAR is deployed within ROBOKOP, complete with a web user interface. 
This is\nthe first study to apply enrichment analysis to large graph completion and drug\nrepurposing.\n","authors":["Olawumi Olasunkanmi","Evan Morris","Yaphet Kebede","Harlin Lee","Stanley Ahalt","Alexander Tropsha","Chris Bizon"],"pdf_url":"https://arxiv.org/pdf/2409.18659v2.pdf","comment":"10 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2411.08700v1","updated":"2024-11-13T15:42:13Z","published":"2024-11-13T15:42:13Z","title":"Rethinking negative sampling in content-based news recommendation","summary":" News recommender systems are hindered by the brief lifespan of articles, as\nthey undergo rapid relevance decay. Recent studies have demonstrated the\npotential of content-based neural techniques in tackling this problem. However,\nthese models often involve complex neural architectures and often lack\nconsideration for negative examples. In this study, we posit that the careful\nsampling of negative examples has a big impact on the model's outcome. We\ndevise a negative sampling technique that not only improves the accuracy of the\nmodel but also facilitates the decentralization of the recommendation system.\nThe experimental results obtained using the MIND dataset demonstrate that the\naccuracy of the method under consideration can compete with that of\nState-of-the-Art models. The utilization of the sampling technique is essential\nin reducing model complexity and accelerating the training process, while\nmaintaining a high level of accuracy. 
Finally, we discuss how decentralized\nmodels can help improve privacy and scalability.\n","authors":["Miguel Ângelo Rebelo","João Vinagre","Ivo Pereira","Álvaro Figueira"],"pdf_url":"https://arxiv.org/pdf/2411.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08696v1","updated":"2024-11-13T15:34:52Z","published":"2024-11-13T15:34:52Z","title":"Scholarly Wikidata: Population and Exploration of Conference Data in\n Wikidata using LLMs","summary":" Several initiatives have been undertaken to conceptually model the domain of\nscholarly data using ontologies and to create respective Knowledge Graphs. Yet,\nthe full potential seems unleashed, as automated means for automatic population\nof said ontologies are lacking, and respective initiatives from the Semantic\nWeb community are not necessarily connected: we propose to make scholarly data\nmore sustainably accessible by leveraging Wikidata's infrastructure and\nautomating its population in a sustainable manner through LLMs by tapping into\nunstructured sources like conference Web sites and proceedings texts as well as\nalready existing structured conference datasets. While an initial analysis\nshows that Semantic Web conferences are only minimally represented in Wikidata,\nwe argue that our methodology can help to populate, evolve and maintain\nscholarly data as a community within Wikidata. Our main contributions include\n(a) an analysis of ontologies for representing scholarly data to identify gaps\nand relevant entities/properties in Wikidata, (b) semi-automated extraction --\nrequiring (minimal) manual validation -- of conference metadata (e.g.,\nacceptance rates, organizer roles, programme committee members, best paper\nawards, keynotes, and sponsors) from websites and proceedings texts using LLMs.\nFinally, we discuss (c) extensions to visualization tools in the Wikidata\ncontext for data exploration of the generated scholarly data. 
Our study focuses\non data from 105 Semantic Web-related conferences and extends/adds more than\n6000 entities in Wikidata. It is important to note that the method can be more\ngenerally applicable beyond Semantic Web-related conferences for enhancing\nWikidata's utility as a comprehensive scholarly resource.\n Source Repository: https://github.com/scholarly-wikidata/\n DOI: https://doi.org/10.5281/zenodo.10989709\n License: Creative Commons CC0 (Data), MIT (Code)\n","authors":["Nandana Mihindukulasooriya","Sanju Tiwari","Daniil Dobriy","Finn Årup Nielsen","Tek Raj Chhetri","Axel Polleres"],"pdf_url":"https://arxiv.org/pdf/2411.08696v1.pdf","comment":"17 pages, accepted at EKAW-24"},{"id":"http://arxiv.org/abs/2411.08562v1","updated":"2024-11-13T12:19:46Z","published":"2024-11-13T12:19:46Z","title":"Neural Corrective Machine Unranking","summary":" Machine unlearning in neural information retrieval (IR) systems requires\nremoving specific data whilst maintaining model performance. Applying existing\nmachine unlearning methods to IR may compromise retrieval effectiveness or\ninadvertently expose unlearning actions due to the removal of particular items\nfrom the retrieved results presented to users. We formalise corrective\nunranking, which extends machine unlearning in (neural) IR context by\nintegrating substitute documents to preserve ranking integrity, and propose a\nnovel teacher-student framework, Corrective unRanking Distillation (CuRD), for\nthis task. CuRD (1) facilitates forgetting by adjusting the (trained) neural IR\nmodel such that its output relevance scores of to-be-forgotten samples mimic\nthose of low-ranking, non-retrievable samples; (2) enables correction by\nfine-tuning the relevance scores for the substitute samples to match those of\ncorresponding to-be-forgotten samples closely; (3) seeks to preserve\nperformance on samples that are not targeted for forgetting. 
We evaluate CuRD\non four neural IR models (BERTcat, BERTdot, ColBERT, PARADE) using MS MARCO and\nTREC CAR datasets. Experiments with forget set sizes from 1 % and 20 % of the\ntraining dataset demonstrate that CuRD outperforms seven state-of-the-art\nbaselines in terms of forgetting and correction while maintaining model\nretention and generalisation capabilities.\n","authors":["Jingrui Hou","Axel Finke","Georgina Cosma"],"pdf_url":"https://arxiv.org/pdf/2411.08562v1.pdf","comment":"submitted to Information Sciences"},{"id":"http://arxiv.org/abs/2411.03364v2","updated":"2024-11-13T08:30:59Z","published":"2024-11-05T06:54:38Z","title":"DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural\n Networks","summary":" Graph has become increasingly integral to the advancement of recommendation\nsystems, particularly with the fast development of graph neural network(GNN).\nBy exploring the virtue of rich node features and link information, GNN is\ndesigned to provide personalized and accurate suggestions. Meanwhile, the\nprivacy leakage of GNN in such contexts has also captured special attention.\nPrior work has revealed that a malicious user can utilize auxiliary knowledge\nto extract sensitive link data of the target graph, integral to recommendation\nsystems, via the decision made by the target GNN model. This poses a\nsignificant risk to the integrity and confidentiality of data used in\nrecommendation system. Though important, previous works on GNN's privacy\nleakage are still challenged in three aspects, i.e., limited stealing attack\nscenarios, sub-optimal attack performance, and adaptation against defense. To\naddress these issues, we propose a diffusion model based link stealing attack,\nnamed DM4Steal. It differs previous work from three critical aspects. 
(i)\nGenerality: aiming at six attack scenarios with limited auxiliary knowledge, we\npropose a novel training strategy for diffusion models so that DM4Steal is\ntransferable to diverse attack scenarios. (ii) Effectiveness: benefiting from\nthe retention of semantic structure in the diffusion model during the training\nprocess, DM4Steal is capable to learn the precise topology of the target graph\nthrough the GNN decision process. (iii) Adaptation: when GNN is defensive\n(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling\nthe score model multiple times to keep performance degradation to a minimum,\nthus DM4Steal implements successful adaptive attack on defensive GNN.\n","authors":["Jinyin Chen","Haonan Ma","Haibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.03364v2.pdf","comment":"We found that there were critical problems in our paper, and we\n needed to redo the experiment, which was incomplete"},{"id":"http://arxiv.org/abs/2411.07820v2","updated":"2024-11-13T05:43:58Z","published":"2024-11-12T14:12:45Z","title":"Query Optimization for Parametric Knowledge Refinement in\n Retrieval-Augmented Large Language Models","summary":" We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel\napproach designed to bridge the pre-retrieval information gap in\nRetrieval-Augmented Generation (RAG) systems through query optimization\ntailored to meet the specific knowledge requirements of Large Language Models\n(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR\nframework begins by extracting parametric knowledge from LLMs, followed by\nusing a specialized query optimizer for refining these queries. This process\nensures the retrieval of only the most pertinent information essential for\ngenerating accurate responses. 
Moreover, to enhance flexibility and reduce\ncomputational costs, we propose a trainable scheme for our pipeline that\nutilizes a smaller, tunable model as the query optimizer, which is refined\nthrough knowledge distillation from a larger teacher model. Our evaluations on\nvarious question-answering (QA) datasets and with different retrieval systems\nshow that ERRR consistently outperforms existing baselines, proving to be a\nversatile and cost-effective module for improving the utility and accuracy of\nRAG systems.\n","authors":["Youan Cong","Cheng Wang","Pritom Saha Akash","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2411.07820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07508v2","updated":"2024-11-13T05:05:56Z","published":"2024-11-12T03:05:03Z","title":"Feature Interaction Fusion Self-Distillation Network For CTR Prediction","summary":" Click-Through Rate (CTR) prediction plays a vital role in recommender\nsystems, online advertising, and search engines. Most of the current approaches\nmodel feature interactions through stacked or parallel structures, with some\nemploying knowledge distillation for model compression. However, we observe\nsome limitations with these approaches: (1) In parallel structure models, the\nexplicit and implicit components are executed independently and simultaneously,\nwhich leads to insufficient information sharing within the feature set. (2) The\nintroduction of knowledge distillation technology brings about the problems of\ncomplex teacher-student framework design and low knowledge transfer efficiency.\n(3) The dataset and the process of constructing high-order feature interactions\ncontain significant noise, which limits the model's effectiveness. To address\nthese limitations, we propose FSDNet, a CTR prediction framework incorporating\na plug-and-play fusion self-distillation module. 
Specifically, FSDNet forms\nconnections between explicit and implicit feature interactions at each layer,\nenhancing the sharing of information between different features. The deepest\nfusion layer is then used as the teacher model, utilizing self-distillation to\nguide the training of shallow layers. Empirical evaluation across four\nbenchmark datasets validates the framework's efficacy and generalization\ncapabilities. The code is available on\nhttps://anonymous.4open.science/r/FSDNet.\n","authors":["Lei Sang","Qiuze Ru","Honghao Li","Yiwen Zhang","Qian Cao","Xindong Wu"],"pdf_url":"https://arxiv.org/pdf/2411.07508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08334v1","updated":"2024-11-13T04:32:58Z","published":"2024-11-13T04:32:58Z","title":"Enhancing Multimodal Query Representation via Visual Dialogues for\n End-to-End Knowledge Retrieval","summary":" Existing multimodal retrieval systems often rely on disjointed models for\nimage comprehension, such as object detectors and caption generators, leading\nto cumbersome implementations and training processes. To overcome this\nlimitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a\ntext retriever with the ability to understand multimodal queries via dynamic\nmodality interaction. Ret-XKnow leverages a partial convolution mechanism to\nfocus on visual information relevant to the given textual query, thereby\nenhancing multimodal query representations. To effectively learn multimodal\ninteraction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset\nautomatically constructed from visual dialogue datasets. Our dataset\nconstruction process ensures that the dialogues are transformed into suitable\ninformation retrieval tasks using a text retriever. 
We demonstrate that our\napproach not only significantly improves retrieval performance in zero-shot\nsettings but also achieves substantial improvements in fine-tuning scenarios.\nOur code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow.\n","authors":["Yeong-Joon Ju","Ho-Joong Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08275v1","updated":"2024-11-13T01:12:35Z","published":"2024-11-13T01:12:35Z","title":"A Large-Scale Study of Relevance Assessments with Large Language Models:\n An Initial Look","summary":" The application of large language models to provide relevance assessments\npresents exciting opportunities to advance information retrieval, natural\nlanguage processing, and beyond, but to date many unknowns remain. This paper\nreports on the results of a large-scale evaluation (the TREC 2024 RAG Track)\nwhere four different relevance assessment approaches were deployed in situ: the\n\"standard\" fully manual process that NIST has implemented for decades and three\ndifferent alternatives that take advantage of LLMs to different extents using\nthe open-source UMBRELA tool. This setup allows us to correlate system rankings\ninduced by the different approaches to characterize tradeoffs between cost and\nquality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system\nrankings induced by automatically generated relevance assessments from UMBRELA\ncorrelate highly with those induced by fully manual assessments across a\ndiverse set of 77 runs from 19 teams. Our results suggest that automatically\ngenerated UMBRELA judgments can replace fully manual judgments to accurately\ncapture run-level effectiveness. Surprisingly, we find that LLM assistance does\nnot appear to increase correlation with fully manual assessments, suggesting\nthat costs associated with human-in-the-loop processes do not bring obvious\ntangible benefits. 
Overall, human assessors appear to be stricter than UMBRELA\nin applying relevance criteria. Our work validates the use of LLMs in academic\nTREC-style evaluations and provides the foundation for future studies.\n","authors":["Shivani Upadhyay","Ronak Pradeep","Nandan Thakur","Daniel Campos","Nick Craswell","Ian Soboroff","Hoa Trang Dang","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.08275v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.08878v1","updated":"2024-11-13T18:55:10Z","published":"2024-11-13T18:55:10Z","title":"A Short Note on Evaluating RepNet for Temporal Repetition Counting in\n Videos","summary":" We discuss some consistent issues on how RepNet has been evaluated in various\npapers. As a way to mitigate these issues, we report RepNet performance results\non different datasets, and release evaluation code and the RepNet checkpoint to\nobtain these results. Code URL:\nhttps://github.com/google-research/google-research/blob/master/repnet/\n","authors":["Debidatta Dwibedi","Yusuf Aytar","Jonathan Tompson","Pierre Sermanet","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2411.08878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08870v1","updated":"2024-11-13T18:50:13Z","published":"2024-11-13T18:50:13Z","title":"The Limited Impact of Medical Adaptation of Large Language and\n Vision-Language Models","summary":" Several recent works seek to develop foundation models specifically for\nmedical applications, adapting general-purpose large language models (LLMs) and\nvision-language models (VLMs) via continued pretraining on publicly available\nbiomedical corpora. These works typically claim that such domain-adaptive\npretraining (DAPT) improves performance on downstream medical tasks, such as\nanswering medical licensing exam questions. 
In this paper, we compare ten\npublic \"medical\" LLMs and two VLMs against their corresponding base models,\narriving at a different conclusion: all medical VLMs and nearly all medical\nLLMs fail to consistently improve over their base models in the zero-/few-shot\nprompting and supervised fine-tuning regimes for medical question-answering\n(QA). For instance, across all tasks and model pairs we consider in the 3-shot\nsetting, medical LLMs only outperform their base models in 22.7% of cases,\nreach a (statistical) tie in 36.8% of cases, and are significantly worse than\ntheir base models in the remaining 40.5% of cases. Our conclusions are based on\n(i) comparing each medical model head-to-head, directly against the\ncorresponding base model; (ii) optimizing the prompts for each model separately\nin zero-/few-shot prompting; and (iii) accounting for statistical uncertainty\nin comparisons. While these basic practices are not consistently adopted in the\nliterature, our ablations show that they substantially impact conclusions.\nMeanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs\ncan show performance improvements, but the benefits do not carry over to tasks\nbased on clinical notes. Our findings suggest that state-of-the-art\ngeneral-domain models may already exhibit strong medical knowledge and\nreasoning capabilities, and offer recommendations to strengthen the conclusions\nof future studies.\n","authors":["Daniel P. Jeong","Pranav Mani","Saurabh Garg","Zachary C. Lipton","Michael Oberst"],"pdf_url":"https://arxiv.org/pdf/2411.08870v1.pdf","comment":"Extended version of EMNLP 2024 paper arXiv:2411.04118. 
Includes\n additional results on clinical note QA tasks and supervised fine-tuning\n evaluations"},{"id":"http://arxiv.org/abs/2411.08867v1","updated":"2024-11-13T18:48:51Z","published":"2024-11-13T18:48:51Z","title":"Unsupervised Parameter-free Outlier Detection using HDBSCAN* Outlier\n Profiles","summary":" In machine learning and data mining, outliers are data points that\nsignificantly differ from the dataset and often introduce irrelevant\ninformation that can induce bias in its statistics and models. Therefore,\nunsupervised methods are crucial to detect outliers if there is limited or no\ninformation about them. Global-Local Outlier Scores based on Hierarchies\n(GLOSH) is an unsupervised outlier detection method within HDBSCAN*, a\nstate-of-the-art hierarchical clustering method. GLOSH estimates outlier scores\nfor each data point by comparing its density to the highest density of the\nregion they reside in the HDBSCAN* hierarchy. GLOSH may be sensitive to\nHDBSCAN*'s minpts parameter that influences density estimation. With limited\nknowledge about the data, choosing an appropriate minpts value beforehand is\nchallenging as one or some minpts values may better represent the underlying\ncluster structure than others. Additionally, in the process of searching for\n``potential outliers'', one has to define the number of outliers n a dataset\nhas, which may be impractical and is often unknown. In this paper, we propose\nan unsupervised strategy to find the ``best'' minpts value, leveraging the\nrange of GLOSH scores across minpts values to identify the value for which\nGLOSH scores can best identify outliers from the rest of the dataset. Moreover,\nwe propose an unsupervised strategy to estimate a threshold for classifying\npoints into inliers and (potential) outliers without the need to pre-define any\nvalue. 
Our experiments show that our strategies can automatically find the\nminpts value and threshold that yield the best or near best outlier detection\nresults using GLOSH.\n","authors":["Kushankur Ghosh","Murilo Coelho Naldi","Jörg Sander","Euijin Choo"],"pdf_url":"https://arxiv.org/pdf/2411.08867v1.pdf","comment":"Accepted at IEEE International Conference on Big Data, IEEE BigData\n 2024"},{"id":"http://arxiv.org/abs/2411.08862v1","updated":"2024-11-13T18:44:30Z","published":"2024-11-13T18:44:30Z","title":"LLMStinger: Jailbreaking LLMs using RL fine-tuned LLMs","summary":" We introduce LLMStinger, a novel approach that leverages Large Language\nModels (LLMs) to automatically generate adversarial suffixes for jailbreak\nattacks. Unlike traditional methods, which require complex prompt engineering\nor white-box access, LLMStinger uses a reinforcement learning (RL) loop to\nfine-tune an attacker LLM, generating new suffixes based on existing attacks\nfor harmful questions from the HarmBench benchmark. Our method significantly\noutperforms existing red-teaming approaches (we compared against 15 of the\nlatest methods), achieving a +57.2% improvement in Attack Success Rate (ASR) on\nLLaMA2-7B-chat and a +50.3% ASR increase on Claude 2, both models known for\ntheir extensive safety measures. Additionally, we achieved a 94.97% ASR on\nGPT-3.5 and 99.4% on Gemma-2B-it, demonstrating the robustness and adaptability\nof LLMStinger across open and closed-source models.\n","authors":["Piyush Jha","Arnav Arora","Vijay Ganesh"],"pdf_url":"https://arxiv.org/pdf/2411.08862v1.pdf","comment":"Accepted at AAAI 2025"},{"id":"http://arxiv.org/abs/2411.08861v1","updated":"2024-11-13T18:42:34Z","published":"2024-11-13T18:42:34Z","title":"Interaction Testing in Variation Analysis","summary":" Relationships of cause and effect are of prime importance for explaining\nscientific phenomena. 
Often, rather than just understanding the effects of\ncauses, researchers also wish to understand how a cause $X$ affects an outcome\n$Y$ mechanistically -- i.e., what are the causal pathways that are activated\nbetween $X$ and $Y$. For analyzing such questions, a range of methods has been\ndeveloped over decades under the rubric of causal mediation analysis.\nTraditional mediation analysis focuses on decomposing the average treatment\neffect (ATE) into direct and indirect effects, and therefore focuses on the ATE\nas the central quantity. This corresponds to providing explanations for\nassociations in the interventional regime, such as when the treatment $X$ is\nrandomized. Commonly, however, it is of interest to explain associations in the\nobservational regime, and not just in the interventional regime. In this paper,\nwe introduce \\text{variation analysis}, an extension of mediation analysis that\nfocuses on the total variation (TV) measure between $X$ and $Y$, written as\n$\\mathrm{E}[Y \\mid X=x_1] - \\mathrm{E}[Y \\mid X=x_0]$. The TV measure\nencompasses both causal and confounded effects, as opposed to the ATE which\nonly encompasses causal (direct and mediated) variations. In this way, the TV\nmeasure is suitable for providing explanations in the natural regime and\nanswering questions such as ``why is $X$ associated with $Y$?''. Our focus is\non decomposing the TV measure, in a way that explicitly includes direct,\nindirect, and confounded variations. Furthermore, we also decompose the TV\nmeasure to include interaction terms between these different pathways.\nSubsequently, interaction testing is introduced, involving hypothesis tests to\ndetermine if interaction terms are significantly different from zero. 
If\ninteractions are not significant, more parsimonious decompositions of the TV\nmeasure can be used.\n","authors":["Drago Plecko"],"pdf_url":"https://arxiv.org/pdf/2411.08861v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13880v4","updated":"2024-11-13T18:31:18Z","published":"2024-04-22T05:07:02Z","title":"Regional Style and Color Transfer","summary":" This paper presents a novel contribution to the field of regional style\ntransfer. Existing methods often suffer from the drawback of applying style\nhomogeneously across the entire image, leading to stylistic inconsistencies or\nforeground object twisted when applied to image with foreground elements such\nas person figures. To address this limitation, we propose a new approach that\nleverages a segmentation network to precisely isolate foreground objects within\nthe input image. Subsequently, style transfer is applied exclusively to the\nbackground region. The isolated foreground objects are then carefully\nreintegrated into the style-transferred background. To enhance the visual\ncoherence between foreground and background, a color transfer step is employed\non the foreground elements prior to their rein-corporation. Finally, we utilize\nfeathering techniques to achieve a seamless amalgamation of foreground and\nbackground, resulting in a visually unified and aesthetically pleasing final\ncomposition. 
Extensive evaluations demonstrate that our proposed approach\nyields significantly more natural stylistic transformations compared to\nconventional methods.\n","authors":["Zhicheng Ding","Panfeng Li","Qikai Yang","Siyang Li","Qingtian Gong"],"pdf_url":"https://arxiv.org/pdf/2404.13880v4.pdf","comment":"Accepted by 2024 5th International Conference on Computer Vision,\n Image and Deep Learning"},{"id":"http://arxiv.org/abs/2411.08849v1","updated":"2024-11-13T18:29:58Z","published":"2024-11-13T18:29:58Z","title":"Oblique Bayesian additive regression trees","summary":" Current implementations of Bayesian Additive Regression Trees (BART) are\nbased on axis-aligned decision rules that recursively partition the feature\nspace using a single feature at a time. Several authors have demonstrated that\noblique trees, whose decision rules are based on linear combinations of\nfeatures, can sometimes yield better predictions than axis-aligned trees and\nexhibit excellent theoretical properties. We develop an oblique version of BART\nthat leverages a data-adaptive decision rule prior that recursively partitions\nthe feature space along random hyperplanes. Using several synthetic and\nreal-world benchmark datasets, we systematically compared our oblique BART\nimplementation to axis-aligned BART and other tree ensemble methods, finding\nthat oblique BART was competitive with -- and sometimes much better than --\nthose methods.\n","authors":["Paul-Hieu V. Nguyen","Ryan Yee","Sameer K. 
Deshpande"],"pdf_url":"https://arxiv.org/pdf/2411.08849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06438v2","updated":"2024-11-13T18:21:22Z","published":"2024-07-08T22:40:15Z","title":"A Single Transformer for Scalable Vision-Language Modeling","summary":" We present SOLO, a single transformer for Scalable visiOn-Language mOdeling.\nCurrent large vision-language models (LVLMs) such as LLaVA mostly employ\nheterogeneous architectures that connect pre-trained visual encoders with large\nlanguage models (LLMs) to facilitate visual recognition and complex reasoning.\nAlthough achieving remarkable performance with relatively lightweight training,\nwe identify four primary scalability limitations: (1) The visual capacity is\nconstrained by pre-trained visual encoders, which are typically an order of\nmagnitude smaller than LLMs. (2) The heterogeneous architecture complicates the\nuse of established hardware and software infrastructure. (3) Study of scaling\nlaws on such architecture must consider three separate components - visual\nencoder, connector, and LLMs, which complicates the analysis. (4) The use of\nexisting visual encoders typically requires following a pre-defined\nspecification of image inputs pre-processing, for example, by reshaping inputs\nto fixed-resolution square images, which presents difficulties in processing\nand training on high-resolution images or those with unusual aspect ratio. A\nunified single Transformer architecture, like SOLO, effectively addresses these\nscalability concerns in LVLMs; however, its limited adoption in the modern\ncontext likely stems from the absence of reliable training recipes that balance\nboth modalities and ensure stable training for billion-scale models. In this\npaper, we introduce the first open-source training recipe for developing SOLO,\nan open-source 7B LVLM using moderate academic resources. 
The training recipe\ninvolves initializing from LLMs, sequential pre-training on ImageNet and\nweb-scale data, and instruction fine-tuning on our curated high-quality\ndatasets. On extensive evaluation, SOLO demonstrates performance comparable to\nLLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning.\n","authors":["Yangyi Chen","Xingyao Wang","Hao Peng","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2407.06438v2.pdf","comment":"Accepted to TMLR"},{"id":"http://arxiv.org/abs/2411.08832v1","updated":"2024-11-13T18:12:15Z","published":"2024-11-13T18:12:15Z","title":"Offline Adaptation of Quadruped Locomotion using Diffusion Models","summary":" We present a diffusion-based approach to quadrupedal locomotion that\nsimultaneously addresses the limitations of learning and interpolating between\nmultiple skills and of (modes) offline adapting to new locomotion behaviours\nafter training. This is the first framework to apply classifier-free guided\ndiffusion to quadruped locomotion and demonstrate its efficacy by extracting\ngoal-conditioned behaviour from an originally unlabelled dataset. We show that\nthese capabilities are compatible with a multi-skill policy and can be applied\nwith little modification and minimal compute overhead, i.e., running entirely\non the robots onboard CPU. We verify the validity of our approach with hardware\nexperiments on the ANYmal quadruped platform.\n","authors":["Reece O'Mahoney","Alexander L. Mitchell","Wanming Yu","Ingmar Posner","Ioannis Havoutis"],"pdf_url":"https://arxiv.org/pdf/2411.08832v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08821v1","updated":"2024-11-13T17:59:44Z","published":"2024-11-13T17:59:44Z","title":"Model agnostic local variable importance for locally dependent\n relationships","summary":" Global variable importance measures are commonly used to interpret machine\nlearning model results. 
Local variable importance techniques assess how\nvariables contribute to individual observations rather than the entire dataset.\nCurrent methods typically fail to accurately reflect locally dependent\nrelationships between variables and instead focus on marginal importance\nvalues. Additionally, they are not natively adapted for multi-class\nclassification problems. We propose a new model-agnostic method for calculating\nlocal variable importance, CLIQUE, that captures locally dependent\nrelationships, contains improvements over permutation-based methods, and can be\ndirectly applied to multi-class classification problems. Simulated and\nreal-world examples show that CLIQUE emphasizes locally dependent information\nand properly reduces bias in regions where variables do not affect the\nresponse.\n","authors":["Kelvyn K. Bladen","Adele Cutler","D. Richard Cutler","Kevin R. Moon"],"pdf_url":"https://arxiv.org/pdf/2411.08821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08814v1","updated":"2024-11-13T17:53:23Z","published":"2024-11-13T17:53:23Z","title":"Process-aware Human Activity Recognition","summary":" Humans naturally follow distinct patterns when conducting their daily\nactivities, which are driven by established practices and processes, such as\nproduction workflows, social norms and daily routines. Human activity\nrecognition (HAR) algorithms usually use neural networks or machine learning\ntechniques to analyse inherent relationships within the data. However, these\napproaches often overlook the contextual information in which the data are\ngenerated, potentially limiting their effectiveness. We propose a novel\napproach that incorporates process information from context to enhance the HAR\nperformance. Specifically, we align probabilistic events generated by machine\nlearning models with process models derived from contextual information. This\nalignment adaptively weighs these two sources of information to optimise HAR\naccuracy. 
Our experiments demonstrate that our approach achieves better\naccuracy and Macro F1-score compared to baseline models.\n","authors":["Jiawei Zheng","Petros Papapanagiotou","Jacques D. Fleuriot","Jane Hillston"],"pdf_url":"https://arxiv.org/pdf/2411.08814v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.01600v3","updated":"2024-11-13T17:41:43Z","published":"2024-08-02T23:11:42Z","title":"Physics-Informed Geometry-Aware Neural Operator","summary":" Engineering design problems often involve solving parametric Partial\nDifferential Equations (PDEs) under variable PDE parameters and domain\ngeometry. Recently, neural operators have shown promise in learning PDE\noperators and quickly predicting the PDE solutions. However, training these\nneural operators typically requires large datasets, the acquisition of which\ncan be prohibitively expensive. To overcome this, physics-informed training\noffers an alternative way of building neural operators, eliminating the high\ncomputational costs associated with Finite Element generation of training data.\nNevertheless, current physics-informed neural operators struggle with\nlimitations, either in handling varying domain geometries or varying PDE\nparameters. In this research, we introduce a novel method, the Physics-Informed\nGeometry-Aware Neural Operator (PI-GANO), designed to simultaneously generalize\nacross both PDE parameters and domain geometries. We adopt a geometry encoder\nto capture the domain geometry features, and design a novel pipeline to\nintegrate this component within the existing DCON architecture. Numerical\nresults demonstrate the accuracy and efficiency of the proposed method. 
All the\ncodes and data related to this work are available on GitHub:\nhttps://github.com/WeihengZ/Physics-informed-Neural-Foundation-Operator.\n","authors":["Weiheng Zhong","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2408.01600v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2404.13646"},{"id":"http://arxiv.org/abs/2411.08804v1","updated":"2024-11-13T17:38:07Z","published":"2024-11-13T17:38:07Z","title":"FinRobot: AI Agent for Equity Research and Valuation with Large Language\n Models","summary":" As financial markets grow increasingly complex, there is a rising need for\nautomated tools that can effectively assist human analysts in equity research,\nparticularly within sell-side research. While Generative AI (GenAI) has\nattracted significant attention in this field, existing AI solutions often fall\nshort due to their narrow focus on technical factors and limited capacity for\ndiscretionary judgment. These limitations hinder their ability to adapt to new\ndata in real-time and accurately assess risks, which diminishes their practical\nvalue for investors.\n This paper presents FinRobot, the first AI agent framework specifically\ndesigned for equity research. FinRobot employs a multi-agent Chain of Thought\n(CoT) system, integrating both quantitative and qualitative analyses to emulate\nthe comprehensive reasoning of a human analyst. The system is structured around\nthree specialized agents: the Data-CoT Agent, which aggregates diverse data\nsources for robust financial integration; the Concept-CoT Agent, which mimics\nan analysts reasoning to generate actionable insights; and the Thesis-CoT\nAgent, which synthesizes these insights into a coherent investment thesis and\nreport. FinRobot provides thorough company analysis supported by precise\nnumerical data, industry-appropriate valuation metrics, and realistic risk\nassessments. 
Its dynamically updatable data pipeline ensures that research\nremains timely and relevant, adapting seamlessly to new financial information.\nUnlike existing automated research tools, such as CapitalCube and Wright\nReports, FinRobot delivers insights comparable to those produced by major\nbrokerage firms and fundamental research vendors. We open-source FinRobot at\n\\url{https://github. com/AI4Finance-Foundation/FinRobot}.\n","authors":["Tianyu Zhou","Pinqiao Wang","Yilin Wu","Hongyang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.08804v1.pdf","comment":"The 1st Workshop on LLMs and Generative AI for Finance, ICAIF 2024"},{"id":"http://arxiv.org/abs/2410.16527v2","updated":"2024-11-13T17:30:33Z","published":"2024-10-21T21:36:03Z","title":"Insights and Current Gaps in Open-Source LLM Vulnerability Scanners: A\n Comparative Analysis","summary":" This report presents a comparative analysis of open-source vulnerability\nscanners for conversational large language models (LLMs). As LLMs become\nintegral to various applications, they also present potential attack surfaces,\nexposed to security risks such as information leakage and jailbreak attacks.\nOur study evaluates prominent scanners - Garak, Giskard, PyRIT, and\nCyberSecEval - that adapt red-teaming practices to expose these\nvulnerabilities. We detail the distinctive features and practical use of these\nscanners, outline unifying principles of their design and perform quantitative\nevaluations to compare them. These evaluations uncover significant reliability\nissues in detecting successful attacks, highlighting a fundamental gap for\nfuture development. Additionally, we contribute a preliminary labelled dataset,\nwhich serves as an initial step to bridge this gap. 
Based on the above, we\nprovide strategic recommendations to assist organizations choose the most\nsuitable scanner for their red-teaming needs, accounting for customizability,\ntest suite comprehensiveness, and industry-specific use cases.\n","authors":["Jonathan Brokman","Omer Hofman","Oren Rachmil","Inderjeet Singh","Rathina Sabapathy Aishvariya Priya","Vikas Pahuja","Amit Giloni","Roman Vainshtein","Hisashi Kojima"],"pdf_url":"https://arxiv.org/pdf/2410.16527v2.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2410.24164v3","updated":"2024-11-13T17:30:10Z","published":"2024-10-31T17:22:30Z","title":"$π_0$: A Vision-Language-Action Flow Model for General Robot Control","summary":" Robot learning holds tremendous promise to unlock the full potential of\nflexible, general, and dexterous robot systems, as well as to address some of\nthe deepest questions in artificial intelligence. However, bringing robot\nlearning to the level of generality required for effective real-world systems\nfaces major obstacles in terms of data, generalization, and robustness. In this\npaper, we discuss how generalist robot policies (i.e., robot foundation models)\ncan address these challenges, and how we can design effective generalist robot\npolicies for complex and highly dexterous tasks. We propose a novel flow\nmatching architecture built on top of a pre-trained vision-language model (VLM)\nto inherit Internet-scale semantic knowledge. We then discuss how this model\ncan be trained on a large and diverse dataset from multiple dexterous robot\nplatforms, including single-arm robots, dual-arm robots, and mobile\nmanipulators. 
We evaluate our model in terms of its ability to perform tasks in\nzero shot after pre-training, follow language instructions from people and from\na high-level VLM policy, and its ability to acquire new skills via fine-tuning.\nOur results cover a wide variety of tasks, such as laundry folding, table\ncleaning, and assembling boxes.\n","authors":["Kevin Black","Noah Brown","Danny Driess","Adnan Esmail","Michael Equi","Chelsea Finn","Niccolo Fusai","Lachy Groom","Karol Hausman","Brian Ichter","Szymon Jakubczak","Tim Jones","Liyiming Ke","Sergey Levine","Adrian Li-Bell","Mohith Mothukuri","Suraj Nair","Karl Pertsch","Lucy Xiaoyang Shi","James Tanner","Quan Vuong","Anna Walling","Haohuan Wang","Ury Zhilinsky"],"pdf_url":"https://arxiv.org/pdf/2410.24164v3.pdf","comment":"See project website for videos:\n https://physicalintelligence.company/blog/pi0"},{"id":"http://arxiv.org/abs/2411.08800v1","updated":"2024-11-13T17:27:32Z","published":"2024-11-13T17:27:32Z","title":"Deep Learning Accelerated Quantum Transport Simulations in\n Nanoelectronics: From Break Junctions to Field-Effect Transistors","summary":" Quantum transport calculations are essential for understanding and designing\nnanoelectronic devices, yet the trade-off between accuracy and computational\nefficiency has long limited their practical applications. We present a general\nframework that combines the deep learning tight-binding Hamiltonian (DeePTB)\napproach with the non-equilibrium Green's Function (NEGF) method, enabling\nefficient quantum transport calculations while maintaining first-principles\naccuracy. 
We demonstrate the capabilities of the DeePTB-NEGF framework through\ntwo representative applications: comprehensive simulation of break junction\nsystems, where conductance histograms show good agreement with experimental\nmeasurements in both metallic contact and single-molecule junction cases; and\nsimulation of carbon nanotube field effect transistors through self-consistent\nNEGF-Poisson calculations, capturing essential physics including the\nelectrostatic potential and transfer characteristic curves under finite bias\nconditions. This framework bridges the gap between first-principles accuracy\nand computational efficiency, providing a powerful tool for high-throughput\nquantum transport simulations across different scales in nanoelectronics.\n","authors":["Jijie Zou","Zhanghao Zhouyin","Dongying Lin","Linfeng Zhang","Shimin Hou","Qiangqiang Gu"],"pdf_url":"https://arxiv.org/pdf/2411.08800v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2404.13646v4","updated":"2024-11-13T17:26:36Z","published":"2024-04-21T12:41:30Z","title":"Physics-informed Discretization-independent Deep Compositional Operator\n Network","summary":" Solving parametric Partial Differential Equations (PDEs) for a broad range of\nparameters is a critical challenge in scientific computing. To this end, neural\noperators, which \\textcolor{black}{predicts the PDE solution with variable PDE\nparameter inputs}, have been successfully used. However, the training of neural\noperators typically demands large training datasets, the acquisition of which\ncan be prohibitively expensive. To address this challenge, physics-informed\ntraining can offer a cost-effective strategy. However, current physics-informed\nneural operators face limitations, either in handling irregular domain shapes\nor in in generalizing to various discrete representations of PDE parameters. 
In\nthis research, we introduce a novel physics-informed model architecture which\ncan generalize to various discrete representations of PDE parameters and\nirregular domain shapes. Particularly, inspired by deep operator neural\nnetworks, our model involves a discretization-independent learning of parameter\nembedding repeatedly, and this parameter embedding is integrated with the\nresponse embeddings through multiple compositional layers, for more\nexpressivity. Numerical results demonstrate the accuracy and efficiency of the\nproposed method. All the codes and data related to this work are available on\nGitHub: https://github.com/WeihengZ/PI-DCON.\n","authors":["Weiheng Zhong","Hadi Meidani"],"pdf_url":"https://arxiv.org/pdf/2404.13646v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08798v1","updated":"2024-11-13T17:25:25Z","published":"2024-11-13T17:25:25Z","title":"Learning Gaussian Multi-Index Models with Gradient Flow: Time Complexity\n and Directional Convergence","summary":" This work focuses on the gradient flow dynamics of a neural network model\nthat uses correlation loss to approximate a multi-index function on\nhigh-dimensional standard Gaussian data. Specifically, the multi-index function\nwe consider is a sum of neurons $f^*(x) \\!=\\! \\sum_{j=1}^k \\! \\sigma^*(v_j^T\nx)$ where $v_1, \\dots, v_k$ are unit vectors, and $\\sigma^*$ lacks the first\nand second Hermite polynomials in its Hermite expansion. It is known that, for\nthe single-index case ($k\\!=\\!1$), overcoming the search phase requires\npolynomial time complexity. We first generalize this result to multi-index\nfunctions characterized by vectors in arbitrary directions. After the search\nphase, it is not clear whether the network neurons converge to the index\nvectors, or get stuck at a sub-optimal solution. When the index vectors are\northogonal, we give a complete characterization of the fixed points and prove\nthat neurons converge to the nearest index vectors. 
Therefore, using $n \\!\n\\asymp \\! k \\log k$ neurons ensures finding the full set of index vectors with\ngradient flow with high probability over random initialization. When $ v_i^T\nv_j \\!=\\! \\beta \\! \\geq \\! 0$ for all $i \\neq j$, we prove the existence of a\nsharp threshold $\\beta_c \\!=\\! c/(c+k)$ at which the fixed point that computes\nthe average of the index vectors transitions from a saddle point to a minimum.\nNumerical simulations show that using a correlation loss and a mild\noverparameterization suffices to learn all of the index vectors when they are\nnearly orthogonal, however, the correlation loss fails when the dot product\nbetween the index vectors exceeds a certain threshold.\n","authors":["Berfin Simsek","Amire Bendjeddou","Daniel Hsu"],"pdf_url":"https://arxiv.org/pdf/2411.08798v1.pdf","comment":"21 pages, 6 figures, under review by AISTATS 2025"},{"id":"http://arxiv.org/abs/2411.08791v1","updated":"2024-11-13T17:17:16Z","published":"2024-11-13T17:17:16Z","title":"Locally Private Sampling with Public Data","summary":" Local differential privacy (LDP) is increasingly employed in\nprivacy-preserving machine learning to protect user data before sharing it with\nan untrusted aggregator. Most LDP methods assume that users possess only a\nsingle data record, which is a significant limitation since users often gather\nextensive datasets (e.g., images, text, time-series data) and frequently have\naccess to public datasets. To address this limitation, we propose a locally\nprivate sampling framework that leverages both the private and public datasets\nof each user. Specifically, we assume each user has two distributions: $p$ and\n$q$ that represent their private dataset and the public dataset, respectively.\nThe objective is to design a mechanism that generates a private sample\napproximating $p$ while simultaneously preserving $q$. 
We frame this objective\nas a minimax optimization problem using $f$-divergence as the utility measure.\nWe fully characterize the minimax optimal mechanisms for general\n$f$-divergences provided that $p$ and $q$ are discrete distributions.\nRemarkably, we demonstrate that this optimal mechanism is universal across all\n$f$-divergences. Experiments validate the effectiveness of our minimax optimal\nsampler compared to the state-of-the-art locally private sampler.\n","authors":["Behnoosh Zamanlooy","Mario Diaz","Shahab Asoodeh"],"pdf_url":"https://arxiv.org/pdf/2411.08791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08790v1","updated":"2024-11-13T17:16:48Z","published":"2024-11-13T17:16:48Z","title":"Can sparse autoencoders be used to decompose and interpret steering\n vectors?","summary":" Steering vectors are a promising approach to control the behaviour of large\nlanguage models. However, their underlying mechanisms remain poorly understood.\nWhile sparse autoencoders (SAEs) may offer a potential method to interpret\nsteering vectors, recent findings show that SAE-reconstructed vectors often\nlack the steering properties of the original vectors. This paper investigates\nwhy directly applying SAEs to steering vectors yields misleading\ndecompositions, identifying two reasons: (1) steering vectors fall outside the\ninput distribution for which SAEs are designed, and (2) steering vectors can\nhave meaningful negative projections in feature directions, which SAEs are not\ndesigned to accommodate. 
These limitations hinder the direct use of SAEs for\ninterpreting steering vectors.\n","authors":["Harry Mayne","Yushi Yang","Adam Mahdi"],"pdf_url":"https://arxiv.org/pdf/2411.08790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.19552v2","updated":"2024-11-13T17:12:34Z","published":"2024-09-29T04:41:10Z","title":"A Universal Deep Learning Framework for Materials X-ray Absorption\n Spectra","summary":" X-ray absorption spectroscopy (XAS) is a powerful characterization technique\nfor probing the local chemical environment of absorbing atoms. However,\nanalyzing XAS data presents significant challenges, often requiring extensive,\ncomputationally intensive simulations, as well as significant domain expertise.\nThese limitations hinder the development of fast, robust XAS analysis pipelines\nthat are essential in high-throughput studies and for autonomous\nexperimentation. We address these challenges with OmniXAS, a framework that\ncontains a suite of transfer learning approaches for XAS prediction, each\ncontributing to improved accuracy and efficiency, as demonstrated on K-edge\nspectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS\nframework is built upon three distinct strategies. First, we use M3GNet to\nderive latent representations of the local chemical environment of absorption\nsites as input for XAS prediction, achieving up to order-of-magnitude\nimprovements over conventional featurization techniques. Second, we employ a\nhierarchical transfer learning strategy, training a universal multi-task model\nacross elements before fine-tuning for element-specific predictions. Models\nbased on this cascaded approach after element-wise fine-tuning outperform\nelement-specific models by up to 69%. Third, we implement cross-fidelity\ntransfer learning, adapting a universal model to predict spectra generated by\nsimulation of a different fidelity with a higher computational cost. 
This\napproach improves prediction accuracy by up to 11% over models trained on the\ntarget fidelity alone. Our approach boosts the throughput of XAS modeling by\norders of magnitude versus first-principles simulations and is extendable to\nXAS prediction for a broader range of elements. This transfer learning\nframework is generalizable to enhance deep-learning models that target other\nproperties in materials research.\n","authors":["Shubha R. Kharel","Fanchen Meng","Xiaohui Qu","Matthew R. Carbone","Deyu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.19552v2.pdf","comment":"Main manuscript: 22 pages, 11 figures. Supplemental material (12\n pages, 6 figures) available as a separate file in arXiv ancillary files\n (additional downloadable files)"},{"id":"http://arxiv.org/abs/2402.03271v3","updated":"2024-11-13T17:10:20Z","published":"2024-02-05T18:28:44Z","title":"Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information\n Seeking in Large Language Models","summary":" In the face of uncertainty, the ability to *seek information* is of\nfundamental importance. In many practical applications, such as medical\ndiagnosis and troubleshooting, the information needed to solve the task is not\ninitially given and has to be actively sought by asking follow-up questions\n(for example, a doctor asking a patient for more details about their symptoms).\nIn this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to\naugment large language models with the ability to actively seek information by\nasking effective questions. UoT combines 1) an *uncertainty-aware simulation\napproach* which enables the model to simulate possible future scenarios and how\nlikely they are to occur, 2) *uncertainty-based rewards* motivated by\ninformation gain which incentivizes the model to seek information, and 3) a\n*reward propagation scheme* to select the optimal question to ask in a way that\nmaximizes the expected reward. 
In experiments on medical diagnosis,\ntroubleshooting, and the `20 Questions` game, UoT achieves an average\nperformance improvement of 38.1% in the rate of successful task completion\nacross multiple LLMs compared with direct prompting and also improves\nefficiency (i.e., the number of questions needed to complete the task). Our\ncode has been released [here](https://github.com/zhiyuanhubj/UoT)\n","authors":["Zhiyuan Hu","Chumin Liu","Xidong Feng","Yilun Zhao","See-Kiong Ng","Anh Tuan Luu","Junxian He","Pang Wei Koh","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2402.03271v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.09322v2","updated":"2024-11-13T17:08:34Z","published":"2024-06-13T17:00:30Z","title":"Active Inference Meeting Energy-Efficient Control of Parallel and\n Identical Machines","summary":" We investigate the application of active inference in developing\nenergy-efficient control agents for manufacturing systems. Active inference,\nrooted in neuroscience, provides a unified probabilistic framework integrating\nperception, learning, and action, with inherent uncertainty quantification\nelements. Our study explores deep active inference, an emerging field that\ncombines deep learning with the active inference decision-making framework.\nLeveraging a deep active inference agent, we focus on controlling parallel and\nidentical machine workstations to enhance energy efficiency. We address\nchallenges posed by the problem's stochastic nature and delayed policy response\nby introducing tailored enhancements to existing agent architectures.\nSpecifically, we introduce multi-step transition and hybrid horizon methods to\nmitigate the need for complex planning. 
Our experimental results demonstrate\nthe effectiveness of these enhancements and highlight the potential of the\nactive inference-based approach.\n","authors":["Yavar Taheri Yeganeh","Mohsen Jafari","Andrea Matta"],"pdf_url":"https://arxiv.org/pdf/2406.09322v2.pdf","comment":"Accepted at the 10th International Conference on Machine Learning,\n Optimization, and Data Science"},{"id":"http://arxiv.org/abs/2411.08773v1","updated":"2024-11-13T16:58:51Z","published":"2024-11-13T16:58:51Z","title":"Optimal Oblivious Subspace Embeddings with Near-optimal Sparsity","summary":" An oblivious subspace embedding is a random $m\\times n$ matrix $\\Pi$ such\nthat, for any $d$-dimensional subspace, with high probability $\\Pi$ preserves\nthe norms of all vectors in that subspace within a $1\\pm\\epsilon$ factor. In\nthis work, we give an oblivious subspace embedding with the optimal dimension\n$m=\\Theta(d/\\epsilon^2)$ that has a near-optimal sparsity of $\\tilde\nO(1/\\epsilon)$ non-zero entries per column of $\\Pi$. This is the first result\nto nearly match the conjecture of Nelson and Nguyen [FOCS 2013] in terms of the\nbest sparsity attainable by an optimal oblivious subspace embedding, improving\non a prior bound of $\\tilde O(1/\\epsilon^6)$ non-zeros per column [Chenakkod et\nal., STOC 2024]. We further extend our approach to the non-oblivious setting,\nproposing a new family of Leverage Score Sparsified embeddings with Independent\nColumns, which yield faster runtimes for matrix approximation and regression\ntasks.\n In our analysis, we develop a new method which uses a decoupling argument\ntogether with the cumulant method for bounding the edge universality error of\nisotropic random matrices. 
To achieve near-optimal sparsity, we combine this\ngeneral-purpose approach with new traces inequalities that leverage the\nspecific structure of our subspace embedding construction.\n","authors":["Shabarish Chenakkod","Michał Dereziński","Xiaoyu Dong"],"pdf_url":"https://arxiv.org/pdf/2411.08773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08766v1","updated":"2024-11-13T16:52:30Z","published":"2024-11-13T16:52:30Z","title":"Mapping Methane -- The Impact of Dairy Farm Practices on Emissions\n Through Satellite Data and Machine Learning","summary":" This study investigates the correlation between dairy farm characteristics\nand methane concentrations as derived from satellite observations in Eastern\nCanada. Utilizing data from 11 dairy farms collected between January 2020 and\nDecember 2022, we integrated Sentinel-5P satellite methane data with critical\nfarm-level attributes, including herd genetics, feeding practices, and\nmanagement strategies. Initial analyses revealed significant correlations with\nmethane concentrations, leading to the application of Variance Inflation Factor\n(VIF) and Principal Component Analysis (PCA) to address multicollinearity and\nenhance model stability. Subsequently, machine learning models - specifically\nRandom Forest and Neural Networks - were employed to evaluate feature\nimportance and predict methane emissions. Our findings indicate a strong\nnegative correlation between the Estimated Breeding Value (EBV) for protein\npercentage and methane concentrations, suggesting that genetic selection for\nhigher milk protein content could be an effective strategy for emissions\nreduction. The integration of atmospheric transport models with satellite data\nfurther refined our emission estimates, significantly enhancing accuracy and\nspatial resolution. 
This research underscores the potential of advanced\nsatellite monitoring, machine learning techniques, and atmospheric modeling in\nimproving methane emission assessments within the dairy sector. It emphasizes\nthe critical role of farm-specific characteristics in developing effective\nmitigation strategies. Future investigations should focus on expanding the\ndataset and incorporating inversion modeling for more precise emission\nquantification. Balancing ecological impacts with economic viability will be\nessential for fostering sustainable dairy farming practices.\n","authors":["Hanqing Bi","Suresh Neethirajan"],"pdf_url":"https://arxiv.org/pdf/2411.08766v1.pdf","comment":"16 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.08764v1","updated":"2024-11-13T16:49:56Z","published":"2024-11-13T16:49:56Z","title":"Flow reconstruction in time-varying geometries using graph neural\n networks","summary":" The paper presents a Graph Attention Convolutional Network (GACN) for flow\nreconstruction from very sparse data in time-varying geometries. The model\nincorporates a feature propagation algorithm as a preprocessing step to handle\nextremely sparse inputs, leveraging information from neighboring nodes to\ninitialize missing features. In addition, a binary indicator is introduced as a\nvalidity mask to distinguish between the original and propagated data points,\nenabling more effective learning from sparse inputs. Trained on a unique data\nset of Direct Numerical Simulations (DNS) of a motored engine at a technically\nrelevant operating condition, the GACN shows robust performance across\ndifferent resolutions and domain sizes and can effectively handle unstructured\ndata and variable input sizes. The model is tested on previously unseen DNS\ndata as well as on an experimental data set from Particle Image Velocimetry\n(PIV) measurements that were not considered during training. 
A comparative\nanalysis shows that the GACN consistently outperforms both a conventional\nConvolutional Neural Network (CNN) and cubic interpolation methods on the DNS\nand PIV test sets by achieving lower reconstruction errors and better capturing\nfine-scale turbulent structures. In particular, the GACN effectively\nreconstructs flow fields from domains up to 14 times larger than those observed\nduring training, with the performance advantage increasing for larger domains.\n","authors":["Bogdan A. Danciu","Vito A. Pagone","Benjamin Böhm","Marius Schmidt","Christos E. Frouzakis"],"pdf_url":"https://arxiv.org/pdf/2411.08764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08760v1","updated":"2024-11-13T16:47:34Z","published":"2024-11-13T16:47:34Z","title":"Energy Dissipation Preserving Physics Informed Neural Network for\n Allen-Cahn Equations","summary":" This paper investigates a numerical solution of Allen-Cahn equation with\nconstant and degenerate mobility, with polynomial and logarithmic energy\nfunctionals, with deterministic and random initial functions, and with\nadvective term in one, two, and three spatial dimensions, based on the\nphysics-informed neural network (PINN). To improve the learning capacity of the\nPINN, we incorporate the energy dissipation property of the Allen-Cahn equation\nas a penalty term into the loss function of the network. To facilitate the\nlearning process of random initials, we employ a continuous analogue of the\ninitial random condition by utilizing the Fourier series expansion. Adaptive\nmethods from traditional numerical analysis are also integrated to enhance the\neffectiveness of the proposed PINN. 
Numerical results indicate a consistent\ndecrease in the discrete energy, while also revealing phenomena such as phase\nseparation and metastability.\n","authors":["Mustafa Kütük","Hamdullah Yücel"],"pdf_url":"https://arxiv.org/pdf/2411.08760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.13150v2","updated":"2024-11-13T16:46:23Z","published":"2024-03-19T20:58:38Z","title":"On Training Survival Models with Scoring Rules","summary":" Scoring rules are an established way of comparing predictive performances\nacross model classes. In the context of survival analysis, they require\nadaptation in order to accommodate censoring. This work investigates using\nscoring rules for model training rather than evaluation. Doing so, we establish\na general framework for training survival models that is model agnostic and can\nlearn event time distributions parametrically or non-parametrically. In\naddition, our framework is not restricted to any specific scoring rule. While\nwe focus on neural network-based implementations, we also provide\nproof-of-concept implementations using gradient boosting, generalized additive\nmodels, and trees. Empirical comparisons on synthetic and real-world data\nindicate that scoring rules can be successfully incorporated into model\ntraining and yield competitive predictive performance with established\ntime-to-event models.\n","authors":["Philipp Kopper","David Rügamer","Raphael Sonabend","Bernd Bischl","Andreas Bender"],"pdf_url":"https://arxiv.org/pdf/2403.13150v2.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2407.02538v2","updated":"2024-11-13T16:44:07Z","published":"2024-07-01T23:24:05Z","title":"CGRclust: Chaos Game Representation for Twin Contrastive Clustering of\n Unlabelled DNA Sequences","summary":" This study proposes CGRclust, a novel combination of unsupervised twin\ncontrastive clustering of Chaos Game Representations (CGR) of DNA sequences,\nwith convolutional neural networks (CNNs). 
To the best of our knowledge,\nCGRclust is the first method to use unsupervised learning for image\nclassification (herein applied to two-dimensional CGR images) for clustering\ndatasets of DNA sequences. CGRclust overcomes the limitations of traditional\nsequence classification methods by leveraging unsupervised twin contrastive\nlearning to detect distinctive sequence patterns, without requiring DNA\nsequence alignment or biological/taxonomic labels. CGRclust accurately\nclustered twenty-five diverse datasets, with sequence lengths ranging from 664\nbp to 100 kbp, including mitochondrial genomes of fish, fungi, and protists, as\nwell as viral whole genome assemblies and synthetic DNA sequences. Compared\nwith three recent clustering methods for DNA sequences (DeLUCS, iDeLUCS, and\nMeShClust v3.0.), CGRclust is the only method that surpasses 81.70% accuracy\nacross all four taxonomic levels tested for mitochondrial DNA genomes of fish.\nMoreover, CGRclust also consistently demonstrates superior performance across\nall the viral genomic datasets. The high clustering accuracy of CGRclust on\nthese twenty-five datasets, which vary significantly in terms of sequence\nlength, number of genomes, number of clusters, and level of taxonomy,\ndemonstrates its robustness, scalability, and versatility.\n","authors":["Fatemeh Alipour","Kathleen A. Hill","Lila Kari"],"pdf_url":"https://arxiv.org/pdf/2407.02538v2.pdf","comment":"28 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.08758v1","updated":"2024-11-13T16:42:59Z","published":"2024-11-13T16:42:59Z","title":"ScaleNet: Scale Invariance Learning in Directed Graphs","summary":" Graph Neural Networks (GNNs) have advanced relational data analysis but lack\ninvariance learning techniques common in image classification. In node\nclassification with GNNs, it is actually the ego-graph of the center node that\nis classified. 
This research extends the scale invariance concept to node\nclassification by drawing an analogy to image processing: just as scale\ninvariance being used in image classification to capture multi-scale features,\nwe propose the concept of ``scaled ego-graphs''. Scaled ego-graphs generalize\ntraditional ego-graphs by replacing undirected single-edges with\n``scaled-edges'', which are ordered sequences of multiple directed edges. We\nempirically assess the performance of the proposed scale invariance in graphs\non seven benchmark datasets, across both homophilic and heterophilic\nstructures. Our scale-invariance-based graph learning outperforms inception\nmodels derived from random walks by being simpler, faster, and more accurate.\nThe scale invariance explains inception models' success on homophilic graphs\nand limitations on heterophilic graphs. To ensure applicability of inception\nmodel to heterophilic graphs as well, we further present ScaleNet, an\narchitecture that leverages multi-scaled features. ScaleNet achieves\nstate-of-the-art results on five out of seven datasets (four homophilic and one\nheterophilic) and matches top performance on the remaining two, demonstrating\nits excellent applicability. This represents a significant advance in graph\nlearning, offering a unified framework that enhances node classification across\nvarious graph types. 
Our code is available at\nhttps://github.com/Qin87/ScaleNet/tree/July25.\n","authors":["Qin Jiang","Chengjia Wang","Michael Lones","Wei Pang"],"pdf_url":"https://arxiv.org/pdf/2411.08758v1.pdf","comment":"Scale invariance in node classification is demonstrated and applied\n in graph transformation to develop ScaleNet, which achieves state-of-the-art\n performance on both homophilic and heterophilic directed graphs"},{"id":"http://arxiv.org/abs/2310.10545v3","updated":"2024-11-13T16:42:52Z","published":"2023-10-16T16:14:43Z","title":"Optimal vintage factor analysis with deflation varimax","summary":" Vintage factor analysis is one important type of factor analysis that aims to\nfirst find a low-dimensional representation of the original data, and then to\nseek a rotation such that the rotated low-dimensional representation is\nscientifically meaningful. The most widely used vintage factor analysis is the\nPrincipal Component Analysis (PCA) followed by the varimax rotation. Despite\nits popularity, little theoretical guarantee can be provided to date mainly\nbecause varimax rotation requires to solve a non-convex optimization over the\nset of orthogonal matrices.\n In this paper, we propose a deflation varimax procedure that solves each row\nof an orthogonal matrix sequentially. In addition to its net computational gain\nand flexibility, we are able to fully establish theoretical guarantees for the\nproposed procedure in a broader context. Adopting this new deflation varimax as\nthe second step after PCA, we further analyze this two step procedure under a\ngeneral class of factor models. Our results show that it estimates the factor\nloading matrix in the minimax optimal rate when the signal-to-noise-ratio (SNR)\nis moderate or large. In the low SNR regime, we offer possible improvement over\nusing PCA and the deflation varimax when the additive noise under the factor\nmodel is structured. The modified procedure is shown to be minimax optimal in\nall SNR regimes. 
Our theory is valid for finite sample and allows the number of\nthe latent factors to grow with the sample size as well as the ambient\ndimension to grow with, or even exceed, the sample size. Extensive simulation\nand real data analysis further corroborate our theoretical findings.\n","authors":["Xin Bing","Dian Jin","Yuqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.10545v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.03679v6","updated":"2024-11-13T16:42:22Z","published":"2024-06-06T01:49:29Z","title":"On the Effects of Data Scale on UI Control Agents","summary":" Autonomous agents that control computer interfaces to accomplish human tasks\nare emerging. Leveraging LLMs to power such agents has been of special\ninterest, but unless fine-tuned on human-collected task demonstrations,\nperformance is still relatively low. In this work we study whether fine-tuning\nalone is a viable approach for building real-world computer control agents. In\nparticularly, we investigate how performance measured on both high and\nlow-level tasks in domain and out of domain scales as more training data is\ncollected. To this end we collect and release a new dataset, AndroidControl,\nconsisting of 15,283 demonstrations of everyday tasks with Android apps.\nCompared to existing datasets, each AndroidControl task instance includes both\nhigh and low-level human-generated instructions, allowing us to explore the\nlevel of task complexity an agent can handle. Moreover, AndroidControl is the\nmost diverse computer control dataset to date, including 14,548 unique tasks\nover 833 Android apps, thus allowing us to conduct in-depth analysis of the\nmodel performance in and out of the domain of the training data. Using the\ndataset, we find that when tested in domain fine-tuned models outperform zero\nand few-shot baselines and scale in such a way that robust performance might\nfeasibly be obtained simply by collecting more data. 
Out of domain, performance\nscales significantly more slowly and suggests that in particular for high-level\ntasks, fine-tuning on more data alone may be insufficient for achieving robust\nout-of-domain performance.\n","authors":["Wei Li","William Bishop","Alice Li","Chris Rawles","Folawiyo Campbell-Ajala","Divya Tyamagundlu","Oriana Riva"],"pdf_url":"https://arxiv.org/pdf/2406.03679v6.pdf","comment":"NeurIPS 2024 (Datasets and Benchmarks)"},{"id":"http://arxiv.org/abs/2404.10420v3","updated":"2024-11-13T16:42:16Z","published":"2024-04-16T09:37:41Z","title":"AudioProtoPNet: An interpretable deep learning model for bird sound\n classification","summary":" Deep learning models have significantly advanced acoustic bird monitoring by\nbeing able to recognize numerous bird species based on their vocalizations.\nHowever, traditional deep learning models are black boxes that provide no\ninsight into their underlying computations, limiting their usefulness to\nornithologists and machine learning engineers. Explainable models could\nfacilitate debugging, knowledge discovery, trust, and interdisciplinary\ncollaboration. This study introduces AudioProtoPNet, an adaptation of the\nPrototypical Part Network (ProtoPNet) for multi-label bird sound\nclassification. It is an inherently interpretable model that uses a ConvNeXt\nbackbone to extract embeddings, with the classification layer replaced by a\nprototype learning classifier trained on these embeddings. The classifier\nlearns prototypical patterns of each bird species' vocalizations from\nspectrograms of training instances. During inference, audio recordings are\nclassified by comparing them to the learned prototypes in the embedding space,\nproviding explanations for the model's decisions and insights into the most\ninformative embeddings of each bird species. The model was trained on the\nBirdSet training dataset, which consists of 9,734 bird species and over 6,800\nhours of recordings. 
Its performance was evaluated on the seven test datasets\nof BirdSet, covering different geographical regions. AudioProtoPNet\noutperformed the state-of-the-art model Perch, achieving an average AUROC of\n0.90 and a cmAP of 0.42, with relative improvements of 7.1% and 16.7% over\nPerch, respectively. These results demonstrate that even for the challenging\ntask of multi-label bird sound classification, it is possible to develop\npowerful yet inherently interpretable deep learning models that provide\nvaluable insights for ornithologists and machine learning engineers.\n","authors":["René Heinrich","Lukas Rauch","Bernhard Sick","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2404.10420v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2411.08755v1","updated":"2024-11-13T16:33:27Z","published":"2024-11-13T16:33:27Z","title":"Weakly-Supervised Anomaly Detection in Surveillance Videos Based on\n Two-Stream I3D Convolution Network","summary":" The widespread implementation of urban surveillance systems has necessitated\nmore sophisticated techniques for anomaly detection to ensure enhanced public\nsafety. This paper presents a significant advancement in the field of anomaly\ndetection through the application of Two-Stream Inflated 3D (I3D) Convolutional\nNetworks. These networks substantially outperform traditional 3D Convolutional\nNetworks (C3D) by more effectively extracting spatial and temporal features\nfrom surveillance videos, thus improving the precision of anomaly detection.\nOur research advances the field by implementing a weakly supervised learning\nframework based on Multiple Instance Learning (MIL), which uniquely\nconceptualizes surveillance videos as collections of 'bags' that contain\ninstances (video clips). Each instance is innovatively processed through a\nranking mechanism that prioritizes clips based on their potential to display\nanomalies. 
This novel strategy not only enhances the accuracy and precision of\nanomaly detection but also significantly diminishes the dependency on extensive\nmanual annotations. Moreover, through meticulous optimization of model\nsettings, including the choice of optimizer, our approach not only establishes\nnew benchmarks in the performance of anomaly detection systems but also offers\na scalable and efficient solution for real-world surveillance applications.\nThis paper contributes significantly to the field of computer vision by\ndelivering a more adaptable, efficient, and context-aware anomaly detection\nsystem, which is poised to redefine practices in urban surveillance.\n","authors":["Sareh Soltani Nejad","Anwar Haque"],"pdf_url":"https://arxiv.org/pdf/2411.08755v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2411.08750v1","updated":"2024-11-13T16:29:33Z","published":"2024-11-13T16:29:33Z","title":"Optimal Transport-Based Displacement Interpolation with Data\n Augmentation for Reduced Order Modeling of Nonlinear Dynamical Systems","summary":" We present a novel reduced-order Model (ROM) that leverages optimal transport\n(OT) theory and displacement interpolation to enhance the representation of\nnonlinear dynamics in complex systems. While traditional ROM techniques face\nchallenges in this scenario, especially when data (i.e., observational\nsnapshots) is limited, our method addresses these issues by introducing a data\naugmentation strategy based on OT principles. The proposed framework generates\ninterpolated solutions tracing geodesic paths in the space of probability\ndistributions, enriching the training dataset for the ROM. A key feature of our\napproach is its ability to provide a continuous representation of the\nsolution's dynamics by exploiting a virtual-to-real time mapping. This enables\nthe reconstruction of solutions at finer temporal scales than those provided by\nthe original data. 
To further improve prediction accuracy, we employ Gaussian\nProcess Regression to learn the residual and correct the representation between\nthe interpolated snapshots and the physical solution. We demonstrate the\neffectiveness of our methodology with atmospheric mesoscale benchmarks\ncharacterized by highly nonlinear, advection-dominated dynamics. Our results\nshow improved accuracy and efficiency in predicting complex system behaviors,\nindicating the potential of this approach for a wide range of applications in\ncomputational physics and engineering.\n","authors":["Moaad Khamlich","Federico Pichi","Michele Girfoglio","Annalisa Quaini","Gianluigi Rozza"],"pdf_url":"https://arxiv.org/pdf/2411.08750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08739v1","updated":"2024-11-13T16:18:57Z","published":"2024-11-13T16:18:57Z","title":"Bayesian Comparisons Between Representations","summary":" Which neural networks are similar is a fundamental question for both machine\nlearning and neuroscience. Our novel method compares representations based on\nBayesian statistics about linear readouts from the representations. Concretely,\nwe suggest to use the total variation distance or Jensen-Shannon distance\nbetween prior predictive distributions to compare representations. The prior\npredictive distribution is a full description of the inductive bias and\ngeneralization of a model in Bayesian statistics, making it a great basis for\ncomparisons. As Jensen-Shannon distance and total variation distance are\nmetrics our dissimilarity measures are pseudo-metrics for representations. For\na linear readout, our metrics just depend on the linear kernel matrix of the\nrepresentations. Thus, our metrics connects linear read-out based comparisons\nto kernel based metrics like centered kernel alignment and representational\nsimilarity analysis. We apply our new metrics to deep neural networks trained\non ImageNet-1k. 
Our new metrics can be computed efficiently including a\nstochastic gradient without dimensionality reductions of the representations.\nIt broadly agrees with existing metrics, but is more stringent. It varies less\nacross different random image samples, and it measures how well two\nrepresentations could be distinguished based on a linear read out. Thus our\nmetric nicely extends our toolkit for comparing representations.\n","authors":["Heiko H. Schütt"],"pdf_url":"https://arxiv.org/pdf/2411.08739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08734v1","updated":"2024-11-13T16:16:22Z","published":"2024-11-13T16:16:22Z","title":"Recommender systems and reinforcement learning for building control and\n occupant interaction: A text-mining driven review of scientific literature","summary":" The indoor environment greatly affects health and well-being; enhancing\nhealth and reducing energy use in these settings is a key research focus. With\nadvancing Information and Communication Technology (ICT), recommendation\nsystems and reinforcement learning have emerged as promising methods to induce\nbehavioral changes that improve indoor environments and building energy\nefficiency. This study employs text-mining and Natural Language Processing\n(NLP) to examine these approaches in building control and occupant interaction.\nAnalyzing approximately 27,000 articles from the ScienceDirect database, we\nfound extensive use of recommendation systems and reinforcement learning for\nspace optimization, location recommendations, and personalized control\nsuggestions. Despite broad applications, their use in optimizing indoor\nenvironments and energy efficiency is limited. Traditional recommendation\nalgorithms are commonly used, but optimizing indoor conditions and energy\nefficiency often requires advanced machine learning techniques like\nreinforcement and deep learning. 
This review highlights the potential for\nexpanding recommender systems and reinforcement learning applications in\nbuildings and indoor environments. Areas for innovation include predictive\nmaintenance, building-related product recommendations, and optimizing\nenvironments for specific needs like sleep and productivity enhancements based\non user feedback.\n","authors":["Wenhao Zhang","Matias Quintana","Clayton Miller"],"pdf_url":"https://arxiv.org/pdf/2411.08734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12763v3","updated":"2024-11-13T16:07:47Z","published":"2024-06-18T16:30:51Z","title":"Implicit Bias of Mirror Flow on Separable Data","summary":" We examine the continuous-time counterpart of mirror descent, namely mirror\nflow, on classification problems which are linearly separable. Such problems\nare minimised `at infinity' and have many possible solutions; we study which\nsolution is preferred by the algorithm depending on the mirror potential. For\nexponential tailed losses and under mild assumptions on the potential, we show\nthat the iterates converge in direction towards a $\\phi_\\infty$-maximum margin\nclassifier. The function $\\phi_\\infty$ is the \\textit{horizon function} of the\nmirror potential and characterises its shape `at infinity'. When the potential\nis separable, a simple formula allows to compute this function. We analyse\nseveral examples of potentials and provide numerical experiments highlighting\nour results.\n","authors":["Scott Pesme","Radu-Alexandru Dragomir","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2406.12763v3.pdf","comment":"Neurips camera ready. 
Minor changes from the previous versions.\n Mainly added full iterate trajectories (Figure 4)"},{"id":"http://arxiv.org/abs/2307.05284v4","updated":"2024-11-13T15:53:37Z","published":"2023-07-11T14:25:10Z","title":"Rethinking Distribution Shifts: Empirical Analysis and Inductive\n Modeling for Tabular Data","summary":" Different distribution shifts require different interventions, and algorithms\nmust be grounded in the specific shifts they address. However, methodological\ndevelopment for robust algorithms typically relies on structural assumptions\nthat lack empirical validation. Advocating for an empirically grounded\ndata-driven approach to research, we build an empirical testbed comprising\nnatural shifts across 5 tabular datasets and 60,000 method configurations\nencompassing imbalanced learning and distributionally robust optimization (DRO)\nmethods. We find $Y|X$-shifts are most prevalent on our testbed, in stark\ncontrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The\nperformance of robust algorithms varies significantly over shift types, and is\nno better than that of vanilla methods. To understand why, we conduct an\nin-depth empirical analysis of DRO methods and find that although often\nneglected by researchers, implementation details -- such as the choice of\nunderlying model class (e.g., XGBoost) and hyperparameter selection -- have a\nbigger impact on performance than the ambiguity set or its radius. 
To further\nbridge that gap between methodological research and practice, we design case\nstudies that illustrate how such a data-driven, inductive understanding of\ndistribution shifts can enhance both data-centric and algorithmic\ninterventions.\n","authors":["Jiashuo Liu","Tianyu Wang","Peng Cui","Hongseok Namkoong"],"pdf_url":"https://arxiv.org/pdf/2307.05284v4.pdf","comment":"Conference version appeared in NeurIPS 2023, previously titled \"On\n the Need for a Language Describing Distribution Shifts: Illustrations on\n Tabular Datasets\""},{"id":"http://arxiv.org/abs/2411.08706v1","updated":"2024-11-13T15:50:32Z","published":"2024-11-13T15:50:32Z","title":"Searching Latent Program Spaces","summary":" Program synthesis methods aim to automatically generate programs restricted\nto a language that can explain a given specification of input-output pairs.\nWhile purely symbolic approaches suffer from a combinatorial search space,\nrecent methods leverage neural networks to learn distributions over program\nstructures to narrow this search space significantly, enabling more efficient\nsearch. However, for challenging problems, it remains difficult to train models\nto perform program synthesis in one shot, making test-time search essential.\nMost neural methods lack structured search mechanisms during inference, relying\ninstead on stochastic sampling or gradient updates, which can be inefficient.\nIn this work, we propose the Latent Program Network (LPN), a general algorithm\nfor program induction that learns a distribution over latent programs in a\ncontinuous space, enabling efficient search and test-time adaptation. We\nexplore how to train these networks to optimize for test-time computation and\ndemonstrate the use of gradient-based search both during training and at test\ntime. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates\nperformance by generalizing programs to new inputs rather than explaining the\nunderlying specification. 
We show that LPN can generalize beyond its training\ndistribution and adapt to unseen tasks by utilizing test-time computation,\noutperforming algorithms without test-time adaptation mechanisms.\n","authors":["Clément Bonnet","Matthew V Macfarlane"],"pdf_url":"https://arxiv.org/pdf/2411.08706v1.pdf","comment":"Code available at https://github.com/clement-bonnet/lpn"},{"id":"http://arxiv.org/abs/2408.00838v2","updated":"2024-11-13T15:48:34Z","published":"2024-08-01T18:00:05Z","title":"Calibrating Bayesian Generative Machine Learning for Bayesiamplification","summary":" Recently, combinations of generative and Bayesian machine learning have been\nintroduced in particle physics for both fast detector simulation and inference\ntasks. These neural networks aim to quantify the uncertainty on the generated\ndistribution originating from limited training statistics. The interpretation\nof a distribution-wide uncertainty however remains ill-defined. We show a clear\nscheme for quantifying the calibration of Bayesian generative machine learning\nmodels. For a Continuous Normalizing Flow applied to a low-dimensional toy\nexample, we evaluate the calibration of Bayesian uncertainties from either a\nmean-field Gaussian weight posterior, or Monte Carlo sampling network weights,\nto gauge their behaviour on unsteady distribution edges. 
Well calibrated\nuncertainties can then be used to roughly estimate the number of uncorrelated\ntruth samples that are equivalent to the generated sample and clearly indicate\ndata amplification for smooth features of the distribution.\n","authors":["Sebastian Bieringer","Sascha Diefenbacher","Gregor Kasieczka","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2408.00838v2.pdf","comment":"15 pages, 6 figures, updated references, fixed typo"},{"id":"http://arxiv.org/abs/2411.08703v1","updated":"2024-11-13T15:45:46Z","published":"2024-11-13T15:45:46Z","title":"MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics\n Classification","summary":" The distinct characteristics of multiomics data, including complex\ninteractions within and across biological layers and disease heterogeneity\n(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop\nnovel designs to address unique challenges in multiomics prediction. In this\npaper, we propose the multi-view knowledge transfer learning (MVKTrans)\nframework, which transfers intra- and inter-omics knowledge in an adaptive\nmanner by reviewing data heterogeneity and suppressing bias transfer, thereby\nenhancing classification performance. Specifically, we design a graph\ncontrastive module that is trained on unlabeled data to effectively learn and\ntransfer the underlying intra-omics patterns to the supervised task. This\nunsupervised pretraining promotes learning general and unbiased representations\nfor each modality, regardless of the downstream tasks. In light of the varying\ndiscriminative capacities of modalities across different diseases and/or\nsamples, we introduce an adaptive and bi-directional cross-omics distillation\nmodule. This module automatically identifies richer modalities and facilitates\ndynamic knowledge transfer from more informative to less informative omics,\nthereby enabling a more robust and generalized integration. 
Extensive\nexperiments on four real biomedical datasets demonstrate the superior\nperformance and robustness of MVKTrans compared to the state-of-the-art. Code\nand data are available at https://github.com/Yaolab-fantastic/MVKTrans.\n","authors":["Shan Cong","Zhiling Sang","Hongwei Liu","Haoran Luo","Xin Wang","Hong Liang","Jie Hao","Xiaohui Yao"],"pdf_url":"https://arxiv.org/pdf/2411.08703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08701v1","updated":"2024-11-13T15:42:28Z","published":"2024-11-13T15:42:28Z","title":"TRACE: Transformer-based Risk Assessment for Clinical Evaluation","summary":" We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation),\na novel method for clinical risk assessment based on clinical data, leveraging\nthe self-attention mechanism for enhanced feature interaction and result\ninterpretation. Our approach is able to handle different data modalities,\nincluding continuous, categorical and multiple-choice (checkbox) attributes.\nThe proposed architecture features a shared representation of the clinical data\nobtained by integrating specialized embeddings of each data modality, enabling\nthe detection of high-risk individuals using Transformer encoder layers. To\nassess the effectiveness of the proposed method, a strong baseline based on\nnon-negative multi-layer perceptrons (MLPs) is introduced. The proposed method\noutperforms various baselines widely used in the domain of clinical risk\nassessment, while effectively handling missing values. 
In terms of\nexplainability, our Transformer-based method offers easily interpretable\nresults via attention weights, further enhancing the clinicians'\ndecision-making process.\n","authors":["Dionysis Christopoulos","Sotiris Spanos","Valsamis Ntouskos","Konstantinos Karantzalos"],"pdf_url":"https://arxiv.org/pdf/2411.08701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08700v1","updated":"2024-11-13T15:42:13Z","published":"2024-11-13T15:42:13Z","title":"Rethinking negative sampling in content-based news recommendation","summary":" News recommender systems are hindered by the brief lifespan of articles, as\nthey undergo rapid relevance decay. Recent studies have demonstrated the\npotential of content-based neural techniques in tackling this problem. However,\nthese models often involve complex neural architectures and often lack\nconsideration for negative examples. In this study, we posit that the careful\nsampling of negative examples has a big impact on the model's outcome. We\ndevise a negative sampling technique that not only improves the accuracy of the\nmodel but also facilitates the decentralization of the recommendation system.\nThe experimental results obtained using the MIND dataset demonstrate that the\naccuracy of the method under consideration can compete with that of\nState-of-the-Art models. The utilization of the sampling technique is essential\nin reducing model complexity and accelerating the training process, while\nmaintaining a high level of accuracy. 
Finally, we discuss how decentralized\nmodels can help improve privacy and scalability.\n","authors":["Miguel Ângelo Rebelo","João Vinagre","Ivo Pereira","Álvaro Figueira"],"pdf_url":"https://arxiv.org/pdf/2411.08700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08699v1","updated":"2024-11-13T15:42:09Z","published":"2024-11-13T15:42:09Z","title":"FedSub: Introducing class-aware Subnetworks Fusion to Enhance\n Personalized Federated Learning in Ubiquitous Systems","summary":" Personalized Federated Learning is essential in AI-driven ubiquitous systems,\nsupporting the distributed development of models able to adapt to diverse and\nevolving user behaviors while safeguarding privacy. Despite addressing\nheterogeneous user data distributions in collaborative model training, existing\nmethods often face limitations balancing personalization and generalization,\noversimplifying user similarities, or relying heavily on global models. In this\npaper, we propose FedSub, a novel federated approach designed to enhance\npersonalization through the use of class-aware prototypes and model\nsubnetworks. Prototypes serve as compact representations of user data,\nclustered on the server to identify similarities based on specific label\npatterns. Concurrently, subnetworks -- model components necessary to process\neach class -- are extracted locally and fused by the server according to these\nclusters, producing highly tailored model updates for each user. This\nfine-grained, class-specific aggregation of clients' models allows FedSub to\ncapture the unique characteristics of individual user data patterns. The\neffectiveness of FedSub is validated in three real-world scenarios\ncharacterized by high data heterogeneity, derived from human activity\nrecognition and mobile health applications. 
Experimental evaluations\ndemonstrate FedSub's performance improvements with respect to the\nstate-of-the-art and significant advancements in personalization for ubiquitous\nsystems based on personal mobile and wearable devices.\n","authors":["Mattia Giovanni Campana","Franca Delmastro"],"pdf_url":"https://arxiv.org/pdf/2411.08699v1.pdf","comment":"Submitted to Proceedings of the ACM on Interactive, Mobile, Wearable\n and Ubiquitous Technologies (IMWUT)"},{"id":"http://arxiv.org/abs/2405.15732v2","updated":"2024-11-13T15:30:50Z","published":"2024-05-24T17:20:18Z","title":"Neural Persistence Dynamics","summary":" We consider the problem of learning the dynamics in the topology of\ntime-evolving point clouds, the prevalent spatiotemporal model for systems\nexhibiting collective behavior, such as swarms of insects and birds or\nparticles in physics. In such systems, patterns emerge from (local)\ninteractions among self-propelled entities. While several well-understood\ngoverning equations for motion and interaction exist, they are notoriously\ndifficult to fit to data, as most prior work requires knowledge about\nindividual motion trajectories, i.e., a requirement that is challenging to\nsatisfy with an increasing number of entities. To evade such confounding\nfactors, we investigate collective behavior from a $\\textit{topological\nperspective}$, but instead of summarizing entire observation sequences (as done\npreviously), we propose learning a latent dynamical model from topological\nfeatures $\\textit{per time point}$. The latter is then used to formulate a\ndownstream regression task to predict the parametrization of some a priori\nspecified governing equation. We implement this idea based on a latent ODE\nlearned from vectorized (static) persistence diagrams and show that a\ncombination of recent stability results for persistent homology justifies this\nmodeling choice. 
Various (ablation) experiments not only demonstrate the\nrelevance of each model component but provide compelling empirical evidence\nthat our proposed model - $\\textit{Neural Persistence Dynamics}$ -\nsubstantially outperforms the state-of-the-art across a diverse set of\nparameter regression tasks.\n","authors":["Sebastian Zeng","Florian Graf","Martin Uray","Stefan Huber","Roland Kwitt"],"pdf_url":"https://arxiv.org/pdf/2405.15732v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08687v1","updated":"2024-11-13T15:22:33Z","published":"2024-11-13T15:22:33Z","title":"Measuring similarity between embedding spaces using induced neighborhood\n graphs","summary":" Deep Learning techniques have excelled at generating embedding spaces that\ncapture semantic similarities between items. Often these representations are\npaired, enabling experiments with analogies (pairs within the same domain) and\ncross-modality (pairs across domains). These experiments are based on specific\nassumptions about the geometry of embedding spaces, which allow finding paired\nitems by extrapolating the positional relationships between embedding pairs in\nthe training dataset, allowing for tasks such as finding new analogies, and\nmultimodal zero-shot classification. In this work, we propose a metric to\nevaluate the similarity between paired item representations. Our proposal is\nbuilt from the structural similarity between the nearest-neighbors induced\ngraphs of each representation, and can be configured to compare spaces based on\ndifferent distance metrics and on different neighborhood sizes. We demonstrate\nthat our proposal can be used to identify similar structures at different\nscales, which is hard to achieve with kernel methods such as Centered Kernel\nAlignment (CKA). We further illustrate our method with two case studies: an\nanalogy task using GloVe embeddings, and zero-shot classification in the\nCIFAR-100 dataset using CLIP embeddings. 
Our results show that accuracy in both\nanalogy and zero-shot classification tasks correlates with the embedding\nsimilarity. These findings can help explain performance differences in these\ntasks, and may lead to improved design of paired-embedding models in the\nfuture.\n","authors":["Tiago F. Tavares","Fabio Ayres","Paris Smaragdis"],"pdf_url":"https://arxiv.org/pdf/2411.08687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.09604v2","updated":"2024-11-13T15:17:20Z","published":"2024-08-18T22:11:24Z","title":"Circuit design in biology and machine learning. I. Random networks and\n dimensional reduction","summary":" A biological circuit is a neural or biochemical cascade, taking inputs and\nproducing outputs. How have biological circuits learned to solve environmental\nchallenges over the history of life? The answer certainly follows Dobzhansky's\nfamous quote that ``nothing in biology makes sense except in the light of\nevolution.'' But that quote leaves out the mechanistic basis by which natural\nselection's trial-and-error learning happens, which is exactly what we have to\nunderstand. How does the learning process that designs biological circuits\nactually work? How much insight can we gain about the form and function of\nbiological circuits by studying the processes that have made those circuits?\nBecause life's circuits must often solve the same problems as those faced by\nmachine learning, such as environmental tracking, homeostatic control,\ndimensional reduction, or classification, we can begin by considering how\nmachine learning designs computational circuits to solve problems. We can then\nask: How much insight do those computational circuits provide about the design\nof biological circuits? How much does biology differ from computers in the\nparticular circuit designs that it uses to solve problems? 
This article steps\nthrough two classic machine learning models to set the foundation for analyzing\nbroad questions about the design of biological circuits. One insight is the\nsurprising power of randomly connected networks. Another is the central role of\ninternal models of the environment embedded within biological circuits,\nillustrated by a model of dimensional reduction and trend prediction. Overall,\nmany challenges in biology have machine learning analogs, suggesting hypotheses\nabout how biology's circuits are designed.\n","authors":["Steven A. Frank"],"pdf_url":"https://arxiv.org/pdf/2408.09604v2.pdf","comment":"Added background info in two text boxes and new figure, edited\n throughout"},{"id":"http://arxiv.org/abs/2402.16187v3","updated":"2024-11-13T15:14:38Z","published":"2024-02-25T20:24:07Z","title":"No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design\n Choices","summary":" Advances in generative models have made it possible for AI-generated text,\ncode, and images to mirror human-generated content in many applications.\nWatermarking, a technique that aims to embed information in the output of a\nmodel to verify its source, is useful for mitigating the misuse of such\nAI-generated content. However, we show that common design choices in LLM\nwatermarking schemes make the resulting systems surprisingly susceptible to\nattack -- leading to fundamental trade-offs in robustness, utility, and\nusability. 
To navigate these trade-offs, we rigorously study a set of simple\nyet effective attacks on common watermarking systems, and propose guidelines\nand defenses for LLM watermarking in practice.\n","authors":["Qi Pang","Shengyuan Hu","Wenting Zheng","Virginia Smith"],"pdf_url":"https://arxiv.org/pdf/2402.16187v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08664v1","updated":"2024-11-13T14:55:08Z","published":"2024-11-13T14:55:08Z","title":"UniMat: Unifying Materials Embeddings through Multi-modal Learning","summary":" Materials science datasets are inherently heterogeneous and are available in\ndifferent modalities such as characterization spectra, atomic structures,\nmicroscopic images, and text-based synthesis conditions. The advancements in\nmulti-modal learning, particularly in vision and language models, have opened\nnew avenues for integrating data in different forms. In this work, we evaluate\ncommon techniques in multi-modal learning (alignment and fusion) in unifying\nsome of the most important modalities in materials science: atomic structure,\nX-ray diffraction patterns (XRD), and composition. We show that structure graph\nmodality can be enhanced by aligning with XRD patterns. Additionally, we show\nthat aligning and fusing more experimentally accessible data formats, such as\nXRD patterns and compositions, can create more robust joint embeddings than\nindividual modalities across various tasks. This lays the groundwork for future\nstudies aiming to exploit the full potential of multi-modal data in materials\nscience, facilitating more informed decision-making in materials design and\ndiscovery.\n","authors":["Janghoon Ock","Joseph Montoya","Daniel Schweigert","Linda Hung","Santosh K. 
Suram","Weike Ye"],"pdf_url":"https://arxiv.org/pdf/2411.08664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.13224v4","updated":"2024-11-13T14:54:18Z","published":"2024-02-20T18:37:11Z","title":"Controlling Large Electric Vehicle Charging Stations via User Behavior\n Modeling and Stochastic Programming","summary":" This paper introduces an Electric Vehicle Charging Station (EVCS) model that\nincorporates real-world constraints, such as slot power limitations, contract\nthreshold overruns penalties, or early disconnections of electric vehicles\n(EVs). We propose a formulation of the problem of EVCS control under\nuncertainty, and implement two Multi-Stage Stochastic Programming approaches\nthat leverage user-provided information, namely, Model Predictive Control and\nTwo-Stage Stochastic Programming. The model addresses uncertainties in charging\nsession start and end times, as well as in energy demand. A user's behavior\nmodel based on a sojourn-time-dependent stochastic process enhances cost\nreduction while maintaining customer satisfaction. The benefits of the two\nproposed methods are showcased against two baselines over a 22-day simulation\nusing a real-world dataset. The two-stage approach demonstrates robustness\nagainst early disconnections by considering a wider range of uncertainty\nscenarios for optimization. The algorithm prioritizing user satisfaction over\nelectricity cost achieves a 20% and 36% improvement in two user satisfaction\nmetrics compared to an industry-standard baseline. 
Additionally, the algorithm\nstriking the best balance between cost and user satisfaction exhibits a mere 3%\nrelative cost increase compared to the theoretically optimal baseline - for\nwhich the nonanticipativity constraint is relaxed - while attaining 94% and 84%\nof the user satisfaction performance in the two used satisfaction metrics.\n","authors":["Alban Puech","Tristan Rigaut","William Templier","Maud Tournoud"],"pdf_url":"https://arxiv.org/pdf/2402.13224v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08652v1","updated":"2024-11-13T14:42:32Z","published":"2024-11-13T14:42:32Z","title":"Accelerating Quasi-Static Time Series Simulations with Foundation Models","summary":" Quasi-static time series (QSTS) simulations have great potential for\nevaluating the grid's ability to accommodate the large-scale integration of\ndistributed energy resources. However, as grids expand and operate closer to\ntheir limits, iterative power flow solvers, central to QSTS simulations, become\ncomputationally prohibitive and face increasing convergence issues. Neural\npower flow solvers provide a promising alternative, speeding up power flow\ncomputations by 3 to 4 orders of magnitude, though they are costly to train. In\nthis paper, we envision how recently introduced grid foundation models could\nimprove the economic viability of neural power flow solvers. Conceptually,\nthese models amortize training costs by serving as a foundation for a range of\ngrid operation and planning tasks beyond power flow solving, with only minimal\nfine-tuning required. We call for collaboration between the AI and power grid\ncommunities to develop and open-source these models, enabling all operators,\neven those with limited resources, to benefit from AI without building\nsolutions from scratch.\n","authors":["Alban Puech","François Mirallès","Jonas Weiss","Vincent Mai","Alexandre Blondin Massé","Martin de Montigny","Thomas Brunschwiler","Hendrik F. 
Hamann"],"pdf_url":"https://arxiv.org/pdf/2411.08652v1.pdf","comment":"Equal contributors: A.P. and F.M.; Lead contact: A.P"},{"id":"http://arxiv.org/abs/2306.16028v2","updated":"2024-11-13T14:41:20Z","published":"2023-06-28T08:55:56Z","title":"Exponential separations between classical and quantum learners","summary":" Despite significant effort, the quantum machine learning community has only\ndemonstrated quantum learning advantages for artificial cryptography-inspired\ndatasets when dealing with classical data. In this paper we address the\nchallenge of finding learning problems where quantum learning algorithms can\nachieve a provable exponential speedup over classical learning algorithms. We\nreflect on computational learning theory concepts related to this question and\ndiscuss how subtle differences in definitions can result in significantly\ndifferent requirements and tasks for the learner to meet and solve. We examine\nexisting learning problems with provable quantum speedups and find that they\nlargely rely on the classical hardness of evaluating the function that\ngenerates the data, rather than identifying it. To address this, we present two\nnew learning separations where the classical difficulty primarily lies in\nidentifying the function generating the data. Furthermore, we explore\ncomputational hardness assumptions that can be leveraged to prove quantum\nspeedups in scenarios where data is quantum-generated, which implies likely\nquantum advantages in a plethora of more natural settings (e.g., in condensed\nmatter and high energy physics). 
We also discuss the limitations of the\nclassical shadow paradigm in the context of learning separations, and how\nphysically-motivated settings such as characterizing phases of matter and\nHamiltonian learning fit in the computational learning framework.\n","authors":["Casper Gyurik","Vedran Dunjko"],"pdf_url":"https://arxiv.org/pdf/2306.16028v2.pdf","comment":"this article supersedes arXiv:2208.06339"},{"id":"http://arxiv.org/abs/2411.08651v1","updated":"2024-11-13T14:40:51Z","published":"2024-11-13T14:40:51Z","title":"Estimating unknown parameters in differential equations with a\n reinforcement learning based PSO method","summary":" Differential equations offer a foundational yet powerful framework for\nmodeling interactions within complex dynamic systems and are widely applied\nacross numerous scientific fields. One common challenge in this area is\nestimating the unknown parameters of these dynamic relationships. However,\ntraditional numerical optimization methods rely on the selection of initial\nparameter values, making them prone to local optima. Meanwhile, deep learning\nand Bayesian methods require training models on specific differential\nequations, resulting in poor versatility. This paper reformulates the parameter\nestimation problem of differential equations as an optimization problem by\nintroducing the concept of particles from the particle swarm optimization\nalgorithm. Building on reinforcement learning-based particle swarm optimization\n(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown\nparameters of differential equations. We compared its performance on three\ntypical ordinary differential equations with the state-of-the-art methods,\nincluding the RLLPSO algorithm, traditional numerical methods, deep learning\napproaches, and Bayesian methods. 
The experimental results demonstrate that our\nDERLPSO consistently outperforms other methods in terms of performance,\nachieving an average Mean Square Error of 1.13e-05, which reduces the error by\napproximately 4 orders of magnitude compared to other methods. Apart from\nordinary differential equations, our DERLPSO also show great promise for\nestimating unknown parameters of partial differential equations. The DERLPSO\nmethod proposed in this paper has high accuracy, is independent of initial\nparameter values, and possesses strong versatility and stability. This work\nprovides new insights into unknown parameter estimation for differential\nequations.\n","authors":["Wenkui Sun","Xiaoya Fan","Lijuan Jia","Tinyi Chu","Shing-Tung Yau","Rongling Wu","Zhong Wang"],"pdf_url":"https://arxiv.org/pdf/2411.08651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07444v2","updated":"2024-11-13T14:39:42Z","published":"2023-11-13T16:18:58Z","title":"On the Robustness of Neural Collapse and the Neural Collapse of\n Robustness","summary":" Neural Collapse refers to the curious phenomenon in the end of training of a\nneural network, where feature vectors and classification weights converge to a\nvery simple geometrical arrangement (a simplex). While it has been observed\nempirically in various cases and has been theoretically motivated, its\nconnection with crucial properties of neural networks, like their\ngeneralization and robustness, remains unclear. In this work, we study the\nstability properties of these simplices. We find that the simplex structure\ndisappears under small adversarial attacks, and that perturbed examples \"leap\"\nbetween simplex vertices. 
We further analyze the geometry of networks that are\noptimized to be robust against adversarial perturbations of the input, and find\nthat Neural Collapse is a pervasive phenomenon in these cases as well, with\nclean and perturbed representations forming aligned simplices, and giving rise\nto a robust simple nearest-neighbor classifier. By studying the propagation of\nthe amount of collapse inside the network, we identify novel properties of both\nrobust and non-robust machine learning models, and show that earlier, unlike\nlater layers maintain reliable simplices on perturbed data. Our code is\navailable at https://github.com/JingtongSu/robust_neural_collapse .\n","authors":["Jingtong Su","Ya Shi Zhang","Nikolaos Tsilivis","Julia Kempe"],"pdf_url":"https://arxiv.org/pdf/2311.07444v2.pdf","comment":"Transactions on Machine Learning Research, 2024"},{"id":"http://arxiv.org/abs/2411.08640v1","updated":"2024-11-13T14:31:52Z","published":"2024-11-13T14:31:52Z","title":"Towards Secure Intelligent O-RAN Architecture: Vulnerabilities, Threats\n and Promising Technical Solutions using LLMs","summary":" The evolution of wireless communication systems will be fundamentally\nimpacted by an open radio access network (O-RAN), a new concept defining an\nintelligent architecture with enhanced flexibility, openness, and the ability\nto slice services more efficiently. For all its promises, and like any\ntechnological advancement, O-RAN is not without risks that need to be carefully\nassessed and properly addressed to accelerate its wide adoption in future\nmobile networks. In this paper, we present an in-depth security analysis of the\nO-RAN architecture, discussing the potential threats that may arise in the\ndifferent O-RAN architecture layers and their impact on the Confidentiality,\nIntegrity, and Availability (CIA) triad. 
We also promote the potential of zero\ntrust, Moving Target Defense (MTD), blockchain, and large language models(LLM)\ntechnologies in fortifying O-RAN's security posture. Furthermore, we\nnumerically demonstrate the effectiveness of MTD in empowering robust deep\nreinforcement learning methods for dynamic network slice admission control in\nthe O-RAN architecture. Moreover, we examine the effect of explainable AI (XAI)\nbased on LLMs in securing the system.\n","authors":["Mojdeh Karbalaee Motalleb","Chafika Benzaid","Tarik Taleb","Marcos Katz","Vahid Shah-Mansouri","JaeSeung Song"],"pdf_url":"https://arxiv.org/pdf/2411.08640v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.08638v1","updated":"2024-11-13T14:26:04Z","published":"2024-11-13T14:26:04Z","title":"Gaussian Mixture Models Based Augmentation Enhances GNN Generalization","summary":" Graph Neural Networks (GNNs) have shown great promise in tasks like node and\ngraph classification, but they often struggle to generalize, particularly to\nunseen or out-of-distribution (OOD) data. These challenges are exacerbated when\ntraining data is limited in size or diversity. To address these issues, we\nintroduce a theoretical framework using Rademacher complexity to compute a\nregret bound on the generalization error and then characterize the effect of\ndata augmentation. This framework informs the design of GMM-GDA, an efficient\ngraph data augmentation (GDA) algorithm leveraging the capability of Gaussian\nMixture Models (GMMs) to approximate any distribution. Our approach not only\noutperforms existing augmentation techniques in terms of generalization but\nalso offers improved time complexity, making it highly suitable for real-world\napplications.\n","authors":["Yassine Abbahaddou","Fragkiskos D. Malliaros","Johannes F. 
Lutzeyer","Amine Mohamed Aboussalah","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2411.08638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08637v1","updated":"2024-11-13T14:24:47Z","published":"2024-11-13T14:24:47Z","title":"Robot See, Robot Do: Imitation Reward for Noisy Financial Environments","summary":" The sequential nature of decision-making in financial asset trading aligns\nnaturally with the reinforcement learning (RL) framework, making RL a common\napproach in this domain. However, the low signal-to-noise ratio in financial\nmarkets results in noisy estimates of environment components, including the\nreward function, which hinders effective policy learning by RL agents. Given\nthe critical importance of reward function design in RL problems, this paper\nintroduces a novel and more robust reward function by leveraging imitation\nlearning, where a trend labeling algorithm acts as an expert. We integrate\nimitation (expert's) feedback with reinforcement (agent's) feedback in a\nmodel-free RL algorithm, effectively embedding the imitation learning problem\nwithin the RL paradigm to handle the stochasticity of reward signals. Empirical\nresults demonstrate that this novel approach improves financial performance\nmetrics compared to traditional benchmarks and RL agents trained solely using\nreinforcement feedback.\n","authors":["Sven Goluža","Tomislav Kovačević","Stjepan Begušić","Zvonko Kostanjčar"],"pdf_url":"https://arxiv.org/pdf/2411.08637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08631v1","updated":"2024-11-13T14:17:26Z","published":"2024-11-13T14:17:26Z","title":"Deep Generative Demand Learning for Newsvendor and Pricing","summary":" We consider data-driven inventory and pricing decisions in the feature-based\nnewsvendor problem, where demand is influenced by both price and contextual\nfeatures and is modeled without any structural assumptions. 
The unknown demand\ndistribution results in a challenging conditional stochastic optimization\nproblem, further complicated by decision-dependent uncertainty and the\nintegration of features. Inspired by recent advances in deep generative\nlearning, we propose a novel approach leveraging conditional deep generative\nmodels (cDGMs) to address these challenges. cDGMs learn the demand distribution\nand generate probabilistic demand forecasts conditioned on price and features.\nThis generative approach enables accurate profit estimation and supports the\ndesign of algorithms for two key objectives: (1) optimizing inventory for\narbitrary prices, and (2) jointly determining optimal pricing and inventory\nlevels. We provide theoretical guarantees for our approach, including the\nconsistency of profit estimation and convergence of our decisions to the\noptimal solution. Extensive simulations-ranging from simple to complex\nscenarios, including one involving textual features-and a real-world case study\ndemonstrate the effectiveness of our approach. Our method opens a new paradigm\nin management science and operations research, is adaptable to extensions of\nthe newsvendor and pricing problems, and holds potential for solving other\nconditional stochastic optimization problems.\n","authors":["Shijin Gong","Huihang Liu","Xinyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.08631v1.pdf","comment":"30 pages, 6 figures"},{"id":"http://arxiv.org/abs/2403.16336v2","updated":"2024-11-13T14:13:58Z","published":"2024-03-25T00:21:34Z","title":"Predictive Inference in Multi-environment Scenarios","summary":" We address the challenge of constructing valid confidence intervals and sets\nin problems of prediction across multiple environments. We investigate two\ntypes of coverage suitable for these problems, extending the jackknife and\nsplit-conformal methods to show how to obtain distribution-free coverage in\nsuch non-traditional, potentially hierarchical data-generating scenarios. 
We\ndemonstrate a novel resizing method to adapt to problem difficulty, which\napplies both to existing approaches for predictive inference and the methods we\ndevelop; this reduces prediction set sizes using limited information from the\ntest environment, a key to the methods' practical performance, which we\nevaluate through neurochemical sensing and species classification datasets. Our\ncontributions also include extensions for settings with non-real-valued\nresponses, a theory of consistency for predictive inference in these general\nproblems, and insights on the limits of conditional coverage.\n","authors":["John C. Duchi","Suyash Gupta","Kuanhao Jiang","Pragya Sur"],"pdf_url":"https://arxiv.org/pdf/2403.16336v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08610v1","updated":"2024-11-13T13:53:10Z","published":"2024-11-13T13:53:10Z","title":"Dynamic Subset Tuning: Expanding the Operational Range of\n Parameter-Efficient Training for Large Language Models","summary":" We propose a novel parameter-efficient training (PET) method for large\nlanguage models that adapts models to downstream tasks by optimizing a small\nsubset of the existing model parameters. Unlike prior methods, this subset is\nnot fixed in location but rather which parameters are modified evolves over the\ncourse of training. This dynamic parameter selection can yield good performance\nwith many fewer parameters than extant methods. Our method enables a seamless\nscaling of the subset size across an arbitrary proportion of the total model\nsize, while popular PET approaches like prompt tuning and LoRA cover only a\nsmall part of this spectrum. 
We match or outperform prompt tuning and LoRA in\nmost cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given\nparameter budget across different model families and sizes.\n","authors":["Felix Stahlberg","Jared Lichtarge","Shankar Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.08610v1.pdf","comment":"NeurIPS 2024 Workshop on Adaptive Foundation Models"},{"id":"http://arxiv.org/abs/2411.08013v2","updated":"2024-11-13T13:36:05Z","published":"2024-11-12T18:43:27Z","title":"Investigating the Effectiveness of Explainability Methods in Parkinson's\n Detection from Speech","summary":" Speech impairments in Parkinson's disease (PD) provide significant early\nindicators for diagnosis. While models for speech-based PD detection have shown\nstrong performance, their interpretability remains underexplored. This study\nsystematically evaluates several explainability methods to identify PD-specific\nspeech features, aiming to support the development of accurate, interpretable\nmodels for clinical decision-making in PD diagnosis and monitoring. Our\nmethodology involves (i) obtaining attributions and saliency maps using\nmainstream interpretability techniques, (ii) quantitatively evaluating the\nfaithfulness of these maps and their combinations obtained via union and\nintersection through a range of established metrics, and (iii) assessing the\ninformation conveyed by the saliency maps for PD detection from an auxiliary\nclassifier. 
Our results reveal that, while explanations are aligned with the\nclassifier, they often fail to provide valuable information for domain experts.\n","authors":["Eleonora Mancini","Francesco Paissan","Paolo Torroni","Mirco Ravanelli","Cem Subakan"],"pdf_url":"https://arxiv.org/pdf/2411.08013v2.pdf","comment":"The first two authors contributed equally to this research: author\n order is alphabetical"},{"id":"http://arxiv.org/abs/2411.08599v1","updated":"2024-11-13T13:30:21Z","published":"2024-11-13T13:30:21Z","title":"XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL","summary":" To tackle the challenges of large language model performance in natural\nlanguage to SQL tasks, we introduce XiYan-SQL, an innovative framework that\nemploys a multi-generator ensemble strategy to improve candidate generation. We\nintroduce M-Schema, a semi-structured schema representation method designed to\nenhance the understanding of database structures. To enhance the quality and\ndiversity of generated candidate SQL queries, XiYan-SQL integrates the\nsignificant potential of in-context learning (ICL) with the precise control of\nsupervised fine-tuning. On one hand, we propose a series of training strategies\nto fine-tune models to generate high-quality candidates with diverse\npreferences. On the other hand, we implement the ICL approach with an example\nselection method based on named entity recognition to prevent overemphasis on\nentities. The refiner optimizes each candidate by correcting logical or\nsyntactical errors. 
To address the challenge of identifying the best candidate,\nwe fine-tune a selection model to distinguish nuances of candidate SQL queries.\nThe experimental results on multiple dialect datasets demonstrate the\nrobustness of XiYan-SQL in addressing challenges across different scenarios.\nOverall, our proposed XiYan-SQL achieves the state-of-the-art execution\naccuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on\nNL2GQL, and a competitive score of 72.23% on the Bird development benchmark.\nThe proposed framework not only enhances the quality and diversity of SQL\nqueries but also outperforms previous methods.\n","authors":["Yingqi Gao","Yifu Liu","Xiaoxia Li","Xiaorong Shi","Yin Zhu","Yiming Wang","Shiqi Li","Wei Li","Yuntao Hong","Zhiling Luo","Jinyang Gao","Liyu Mou","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2411.08599v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.13178v2","updated":"2024-11-13T13:14:19Z","published":"2024-10-17T02:58:57Z","title":"GeSubNet: Gene Interaction Inference for Disease Subtype Network\n Generation","summary":" Retrieving gene functional networks from knowledge databases presents a\nchallenge due to the mismatch between disease networks and subtype-specific\nvariations. Current solutions, including statistical and deep learning methods,\noften fail to effectively integrate gene interaction knowledge from databases\nor explicitly learn subtype-specific interactions. To address this mismatch, we\npropose GeSubNet, which learns a unified representation capable of predicting\ngene interactions while distinguishing between different disease subtypes.\nGraphs generated by such representations can be considered subtype-specific\nnetworks. GeSubNet is a multi-step representation learning framework with three\nmodules: First, a deep generative model learns distinct disease subtypes from\npatient gene expression profiles. 
Second, a graph neural network captures\nrepresentations of prior gene networks from knowledge databases, ensuring\naccurate physical gene interactions. Finally, we integrate these two\nrepresentations using an inference loss that leverages graph generation\ncapabilities, conditioned on the patient separation loss, to refine\nsubtype-specific information in the learned representation. GeSubNet\nconsistently outperforms traditional methods, with average improvements of\n30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged\nover four cancer datasets. Particularly, we conduct a biological simulation\nexperiment to assess how the behavior of selected genes from over 11,000\ncandidates affects subtypes or patient distributions. The results show that the\ngenerated network has the potential to identify subtype-specific genes with an\n83% likelihood of impacting patient distribution shifts. The GeSubNet resource\nis available: https://anonymous.4open.science/r/GeSubNet/\n","authors":["Ziwei Yang","Zheng Chen","Xin Liu","Rikuto Kotoge","Peng Chen","Yasuko Matsubara","Yasushi Sakurai","Jimeng Sun"],"pdf_url":"https://arxiv.org/pdf/2410.13178v2.pdf","comment":"Under review as a conference paper at ICLR 2025"},{"id":"http://arxiv.org/abs/2411.08590v1","updated":"2024-11-13T13:13:07Z","published":"2024-11-13T13:13:07Z","title":"Hopfield-Fenchel-Young Networks: A Unified Framework for Associative\n Memory Retrieval","summary":" Associative memory models, such as Hopfield networks and their modern\nvariants, have garnered renewed interest due to advancements in memory capacity\nand connections with self-attention in transformers. In this work, we introduce\na unified framework-Hopfield-Fenchel-Young networks-which generalizes these\nmodels to a broader family of energy functions. 
Our energies are formulated as\nthe difference between two Fenchel-Young losses: one, parameterized by a\ngeneralized entropy, defines the Hopfield scoring mechanism, while the other\napplies a post-transformation to the Hopfield output. By utilizing Tsallis and\nnorm entropies, we derive end-to-end differentiable update rules that enable\nsparse transformations, uncovering new connections between loss margins,\nsparsity, and exact retrieval of single memory patterns. We further extend this\nframework to structured Hopfield networks using the SparseMAP transformation,\nallowing the retrieval of pattern associations rather than a single pattern.\nOur framework unifies and extends traditional and modern Hopfield networks and\nprovides an energy minimization perspective for widely used\npost-transformations like $\\ell_2$-normalization and layer normalization-all\nthrough suitable choices of Fenchel-Young losses and by using convex analysis\nas a building block. Finally, we validate our Hopfield-Fenchel-Young networks\non diverse memory recall tasks, including free and sequential recall.\nExperiments on simulated data, image retrieval, multiple instance learning, and\ntext rationalization demonstrate the effectiveness of our approach.\n","authors":["Saul Santos","Vlad Niculae","Daniel McNamee","André F. T. Martins"],"pdf_url":"https://arxiv.org/pdf/2411.08590v1.pdf","comment":"49 pages, 14 figures. arXiv admin note: text overlap with\n arXiv:2402.13725"},{"id":"http://arxiv.org/abs/2411.08587v1","updated":"2024-11-13T13:11:49Z","published":"2024-11-13T13:11:49Z","title":"DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning\n Methods","summary":" Assessing the quality of aleatoric uncertainty estimates from uncertainty\nquantification (UQ) deep learning methods is important in scientific contexts,\nwhere uncertainty is physically meaningful and important to characterize and\ninterpret exactly. 
We systematically compare aleatoric uncertainty measured by\ntwo UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER).\nOur method focuses on both zero-dimensional (0D) and two-dimensional (2D) data,\nto explore how the UQ methods function for different data dimensionalities. We\ninvestigate uncertainty injected on the input and output variables and include\na method to propagate uncertainty in the case of input uncertainty so that we\ncan compare the predicted aleatoric uncertainty to the known values. We\nexperiment with three levels of noise. The aleatoric uncertainty predicted\nacross all models and experiments scales with the injected noise level.\nHowever, the predicted uncertainty is miscalibrated to $\\rm{std}(\\sigma_{\\rm\nal})$ with the true uncertainty for half of the DE experiments and almost all\nof the DER experiments. The predicted uncertainty is the least accurate for\nboth UQ methods for the 2D input uncertainty experiment and the high-noise\nlevel. While these results do not apply to more complex data, they highlight\nthat further research on post-facto calibration for these methods would be\nbeneficial, particularly for high-noise and high-dimensional settings.\n","authors":["Rebecca Nevin","Aleksandra Ćiprijanović","Brian D. Nord"],"pdf_url":"https://arxiv.org/pdf/2411.08587v1.pdf","comment":"Accepted to the Machine Learning for Physical Sciences workshop at\n NeurIPS 2024; 11 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2411.08582v1","updated":"2024-11-13T13:01:44Z","published":"2024-11-13T13:01:44Z","title":"Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors","summary":" The application of machine learning (ML) algorithms in the intelligent\ndiagnosis of three-phase engines has the potential to significantly enhance\ndiagnostic performance and accuracy. 
Traditional methods largely rely on\nsignature analysis, which, despite being a standard practice, can benefit from\nthe integration of advanced ML techniques. In our study, we innovate by\ncombining state of the art algorithms with a novel unsupervised anomaly\ngeneration methodology that takes into account physics model of the engine.\nThis hybrid approach leverages the strengths of both supervised ML and\nunsupervised signature analysis, achieving superior diagnostic accuracy and\nreliability along with a wide industrial application. Our experimental results\ndemonstrate that this method significantly outperforms existing ML and non-ML\nstate-of-the-art approaches while retaining the practical advantages of an\nunsupervised methodology. The findings highlight the potential of our approach\nto significantly contribute to the field of engine diagnostics, offering a\nrobust and efficient solution for real-world applications.\n","authors":["Stepan Svirin","Artem Ryzhikov","Saraa Ali","Denis Derkach"],"pdf_url":"https://arxiv.org/pdf/2411.08582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16561v2","updated":"2024-11-13T13:01:19Z","published":"2024-10-21T22:40:42Z","title":"Gradient Normalization Provably Benefits Nonconvex SGD under\n Heavy-Tailed Noise","summary":" This paper investigates the roles of gradient normalization and clipping in\nensuring the convergence of Stochastic Gradient Descent (SGD) under\nheavy-tailed noise. While existing approaches consider gradient clipping\nindispensable for SGD convergence, we theoretically demonstrate that gradient\nnormalization alone without clipping is sufficient to ensure convergence.\nFurthermore, we establish that combining gradient normalization with clipping\noffers significantly improved convergence rates compared to using either\ntechnique in isolation, particularly as gradient noise diminishes. 
With these\nresults, our work provides the first theoretical evidence demonstrating the\nbenefits of gradient normalization in SGD under heavy-tailed noise. Finally, we\nintroduce an accelerated SGD variant that incorporates both gradient\nnormalization and clipping, further enhancing convergence rates under\nheavy-tailed noise.\n","authors":["Tao Sun","Xinwang Liu","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2410.16561v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07743v3","updated":"2024-11-13T12:43:33Z","published":"2023-06-13T13:00:10Z","title":"V-LoL: A Diagnostic Dataset for Visual Logical Learning","summary":" Despite the successes of recent developments in visual AI, different\nshortcomings still exist; from missing exact logical reasoning, to abstract\ngeneralization abilities, to understanding complex and noisy scenes.\nUnfortunately, existing benchmarks, were not designed to capture more than a\nfew of these aspects. Whereas deep learning datasets focus on visually complex\ndata but simple visual reasoning tasks, inductive logic datasets involve\ncomplex logical learning tasks, however, lack the visual component. To address\nthis, we propose the diagnostic visual logical learning dataset, V-LoL, that\nseamlessly combines visual and logical challenges. Notably, we introduce the\nfirst instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic\nbenchmark in symbolic AI, the Michalski train problem. By incorporating\nintricate visual scenes and flexible logical reasoning tasks within a versatile\nframework, V-LoL-Train provides a platform for investigating a wide range of\nvisual logical learning challenges. We evaluate a variety of AI systems\nincluding traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our\nevaluations demonstrate that even SOTA AI faces difficulties in dealing with\nvisual logical learning challenges, highlighting unique advantages and\nlimitations of each methodology. 
Overall, V-LoL opens up new avenues for\nunderstanding and enhancing current abilities in visual logical learning for AI\nsystems.\n","authors":["Lukas Helff","Wolfgang Stammer","Hikaru Shindo","Devendra Singh Dhami","Kristian Kersting"],"pdf_url":"https://arxiv.org/pdf/2306.07743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.02549v2","updated":"2024-11-13T12:37:09Z","published":"2024-02-04T15:52:59Z","title":"Are Large Language Models Table-based Fact-Checkers?","summary":" Table-based Fact Verification (TFV) aims to extract the entailment relation\nbetween statements and structured tables. Existing TFV methods based on\nsmall-scaled models suffer from insufficient labeled data and weak zero-shot\nability. Recently, the appearance of Large Language Models (LLMs) has gained\nlots of attraction in research fields. They have shown powerful zero-shot and\nin-context learning abilities on several NLP tasks, but their potential on TFV\nis still unknown. In this work, we implement a preliminary study about whether\nLLMs are table-based fact-checkers. In detail, we design diverse prompts to\nexplore how the in-context learning can help LLMs in TFV, i.e., zero-shot and\nfew-shot TFV capability. Besides, we carefully design and construct TFV\ninstructions to study the performance gain brought by the instruction tuning of\nLLMs. Experimental results demonstrate that LLMs can achieve acceptable results\non zero-shot and few-shot TFV with prompt engineering, while instruction-tuning\ncan stimulate the TFV capability significantly. We also make some valuable\nfindings about the format of zero-shot prompts and the number of in-context\nexamples. 
Finally, we analyze some possible directions to promote the accuracy\nof TFV via LLMs, which is beneficial to further research of table reasoning.\n","authors":["Hanwen Zhang","Qingyi Si","Peng Fu","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2402.02549v2.pdf","comment":"CSCWD 2024"},{"id":"http://arxiv.org/abs/2410.10929v4","updated":"2024-11-13T12:27:38Z","published":"2024-10-14T16:35:27Z","title":"ASTM :Autonomous Smart Traffic Management System Using Artificial\n Intelligence CNN and LSTM","summary":" In the modern world, the development of Artificial Intelligence (AI) has\ncontributed to improvements in various areas, including automation, computer\nvision, fraud detection, and more. AI can be leveraged to enhance the\nefficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce\ntraffic congestion rates. This paper presents an Autonomous Smart Traffic\nManagement (STM) system that uses AI to improve traffic flow rates. The system\nemploys the YOLO V5 Convolutional Neural Network to detect vehicles in traffic\nmanagement images. Additionally, it predicts the number of vehicles for the\nnext 12 hours using a Recurrent Neural Network with Long Short-Term Memory\n(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the\ntraffic cycle length based on these vehicle predictions, aided by AI. From the\nresults of the RNN-LSTM model for predicting vehicle numbers over the next 12\nhours, we observe that the model predicts traffic with a Mean Squared Error\n(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles.\nAfter simulating the STM system in the CARLA simulation environment, we found\nthat the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per\nminute) is 50\\% higher than the rate without STM (around 15 vehicles per\nminute). 
Additionally, the Traffic Management Vehicle Pass Delay with STM (5\nseconds per vehicle) is 70\\% lower than without STM (around 12 seconds per\nvehicle). These results demonstrate that the STM system using AI can increase\ntraffic flow by 50\\% and reduce vehicle pass delays by 70\\%.\n","authors":["Christofel Rio Goenawan"],"pdf_url":"https://arxiv.org/pdf/2410.10929v4.pdf","comment":"In process to IEEE Intelligent Vehicle Symposium 2025"},{"id":"http://arxiv.org/abs/2411.08566v1","updated":"2024-11-13T12:26:08Z","published":"2024-11-13T12:26:08Z","title":"Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space\n Exploration by Reinforcement Learning Agent","summary":" Grasping by a robot in unstructured environments is deemed a critical\nchallenge because of the requirement for effective adaptation to a wide\nvariation in object geometries, material properties, and other environmental\nfactors. In this paper, we propose a novel framework for robotic grasping based\non the idea of compressing high-dimensional target and gripper features in a\ncommon latent space using a set of autoencoders. Our approach simplifies\ngrasping by using three autoencoders dedicated to the target, the gripper, and\na third one that fuses their latent representations. This allows the RL agent\nto achieve higher learning rates at the initial stages of exploration of a new\nenvironment, as well as at non-zero shot grasp attempts. The agent explores the\nlatent space of the third autoencoder for better quality grasp without explicit\nreconstruction of objects. By implementing the PoWER algorithm into the RL\ntraining process, updates on the agent's policy will be made through the\nperturbation in the reward-weighted latent space. The successful exploration\nefficiently constrains both position and pose integrity for feasible executions\nof grasps. 
We evaluate our system on a diverse set of objects, demonstrating\nthe high success rate in grasping with minimum computational overhead. We found\nthat approach enhances the adaptation of the RL agent by more than 35 \\% in\nsimulation experiments.\n","authors":["Leonidas Askianakis"],"pdf_url":"https://arxiv.org/pdf/2411.08566v1.pdf","comment":"Submitted for review at IEEE ICRA 2025"},{"id":"http://arxiv.org/abs/2411.08557v1","updated":"2024-11-13T12:13:15Z","published":"2024-11-13T12:13:15Z","title":"Learning Locally Adaptive Metrics that Enhance Structural Representation\n with $\\texttt{LAMINAR}$","summary":" We present $\\texttt{LAMINAR}$, a novel unsupervised machine learning pipeline\ndesigned to enhance the representation of structure within data via producing a\nmore-informative distance metric. Analysis methods in the physical sciences\noften rely on standard metrics to define geometric relationships in data, which\nmay fail to capture the underlying structure of complex data sets.\n$\\texttt{LAMINAR}$ addresses this by using a continuous-normalising-flow and\ninverse-transform-sampling to define a Riemannian manifold in the data space\nwithout the need for the user to specify a metric over the data a-priori. The\nresult is a locally-adaptive-metric that produces structurally-informative\ndensity-based distances. We demonstrate the utility of $\\texttt{LAMINAR}$ by\ncomparing its output to the Euclidean metric for structured data sets.\n","authors":["Christian Kleiber","William H. Oliver","Tobias Buck"],"pdf_url":"https://arxiv.org/pdf/2411.08557v1.pdf","comment":"Accepted to the NeurIPS 2024 Machine Learning and the Physical\n Sciences workshop. 
6 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.08552v1","updated":"2024-11-13T12:03:39Z","published":"2024-11-13T12:03:39Z","title":"Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with\n Variational Quantum Circuits","summary":" Quantum Machine Learning (QML) offers tremendous potential but is currently\nlimited by the availability of qubits. We introduce an innovative approach that\nutilizes pre-trained neural networks to enhance Variational Quantum Circuits\n(VQC). This technique effectively separates approximation error from qubit\ncount and removes the need for restrictive conditions, making QML more viable\nfor real-world applications. Our method significantly improves parameter\noptimization for VQC while delivering notable gains in representation and\ngeneralization capabilities, as evidenced by rigorous theoretical analysis and\nextensive empirical testing on quantum dot classification tasks. Moreover, our\nresults extend to applications such as human genome analysis, demonstrating the\nbroad applicability of our approach. 
By addressing the constraints of current\nquantum hardware, our work paves the way for a new era of advanced QML\napplications, unlocking the full potential of quantum computing in fields such\nas machine learning, materials science, medicine, mimetics, and various\ninterdisciplinary areas.\n","authors":["Jun Qi","Chao-Han Yang","Samuel Yen-Chi Chen","Pin-Yu Chen","Hector Zenil","Jesper Tegner"],"pdf_url":"https://arxiv.org/pdf/2411.08552v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2411.08550v1","updated":"2024-11-13T11:59:40Z","published":"2024-11-13T11:59:40Z","title":"Graph Neural Networks in Supply Chain Analytics and Optimization:\n Concepts, Perspectives, Dataset and Benchmarks","summary":" Graph Neural Networks (GNNs) have recently gained traction in transportation,\nbioinformatics, language and image processing, but research on their\napplication to supply chain management remains limited. Supply chains are\ninherently graph-like, making them ideal for GNN methodologies, which can\noptimize and solve complex problems. The barriers include a lack of proper\nconceptual foundations, familiarity with graph applications in SCM, and\nreal-world benchmark datasets for GNN-based supply chain research. To address\nthis, we discuss and connect supply chains with graph structures for effective\nGNN application, providing detailed formulations, examples, mathematical\ndefinitions, and task guidelines. Additionally, we present a multi-perspective\nreal-world benchmark dataset from a leading FMCG company in Bangladesh,\nfocusing on supply chain planning. We discuss various supply chain tasks using\nGNNs and benchmark several state-of-the-art models on homogeneous and\nheterogeneous graphs across six supply chain analytics tasks. 
Our analysis\nshows that GNN-based models consistently outperform statistical Machine\nLearning and other Deep Learning models by around 10-30% in regression, 10-30%\nin classification and detection tasks, and 15-40% in anomaly detection tasks on\ndesignated metrics. With this work, we lay the groundwork for solving supply\nchain problems using GNNs, supported by conceptual discussions, methodological\ninsights, and a comprehensive dataset.\n","authors":["Azmine Toushik Wasi","MD Shafikul Islam","Adipto Raihan Akib","Mahathir Mohammad Bappy"],"pdf_url":"https://arxiv.org/pdf/2411.08550v1.pdf","comment":"27 Pages. Extended journal version of SupplyGraph (arXiv:2401.15299).\n In Review"},{"id":"http://arxiv.org/abs/2411.08537v1","updated":"2024-11-13T11:35:39Z","published":"2024-11-13T11:35:39Z","title":"MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal\n Lymphatic Vessel Segmentation","summary":" Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste\nproducts from the human brain. An impairment in their functionality has been\nassociated with aging as well as brain disorders like multiple sclerosis and\nAlzheimer's disease. However, MLVs have only recently been described for the\nfirst time in magnetic resonance imaging (MRI), and their ramified structure\nrenders manual segmentation particularly difficult. Further, as there is no\nconsistent notion of their appearance, human-annotated MLV structures contain a\nhigh inter-rater variability that most automatic segmentation methods cannot\ntake into account. In this work, we propose a new rater-aware training scheme\nfor the popular nnU-Net model, and we explore rater-based ensembling strategies\nfor accurate and consistent segmentation of MLVs. This enables us to boost\nnnU-Net's performance while obtaining explicit predictions in different\nannotation styles and a rater-based uncertainty estimation. 
Our final model,\nMLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to\nthe human reference standard. The model further matches the human inter-rater\nreliability and replicates age-related associations with MLV volume.\n","authors":["Fabian Bongratz","Markus Karmann","Adrian Holz","Moritz Bonhoeffer","Viktor Neumaier","Sarah Deli","Benita Schmitz-Koep","Claus Zimmer","Christian Sorg","Melissa Thalhammer","Dennis M Hedderich","Christian Wachinger"],"pdf_url":"https://arxiv.org/pdf/2411.08537v1.pdf","comment":"ML4H 2024"},{"id":"http://arxiv.org/abs/2411.08530v1","updated":"2024-11-13T11:24:12Z","published":"2024-11-13T11:24:12Z","title":"Efficient Whole Slide Image Classification through Fisher Vector\n Representation","summary":" The advancement of digital pathology, particularly through computational\nanalysis of whole slide images (WSI), is poised to significantly enhance\ndiagnostic precision and efficiency. However, the large size and complexity of\nWSIs make it difficult to analyze and classify them using computers. This study\nintroduces a novel method for WSI classification by automating the\nidentification and examination of the most informative patches, thus\neliminating the need to process the entire slide. Our method involves\ntwo-stages: firstly, it extracts only a few patches from the WSIs based on\ntheir pathological significance; and secondly, it employs Fisher vectors (FVs)\nfor representing features extracted from these patches, which is known for its\nrobustness in capturing fine-grained details. This approach not only\naccentuates key pathological features within the WSI representation but also\nsignificantly reduces computational overhead, thus making the process more\nefficient and scalable. We have rigorously evaluated the proposed method across\nmultiple datasets to benchmark its performance against comprehensive WSI\nanalysis and contemporary weakly-supervised learning methodologies. 
The\nempirical results indicate that our focused analysis of select patches,\ncombined with Fisher vector representation, not only aligns with, but at times\nsurpasses, the classification accuracy of standard practices. Moreover, this\nstrategy notably diminishes computational load and resource expenditure,\nthereby establishing an efficient and precise framework for WSI analysis in the\nrealm of digital pathology.\n","authors":["Ravi Kant Gupta","Dadi Dharani","Shambhavi Shanker","Amit Sethi"],"pdf_url":"https://arxiv.org/pdf/2411.08530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.10040v3","updated":"2024-11-13T11:13:56Z","published":"2024-05-16T12:22:41Z","title":"SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation","summary":" It is often desirable to distill the capabilities of large language models\n(LLMs) into smaller student models due to compute and memory constraints. One\nway to do this for classification tasks is via dataset synthesis, which can be\naccomplished by generating examples of each label from the LLM. Prior\napproaches to synthesis use few-shot prompting, which relies on the LLM's\nparametric knowledge to generate usable examples. However, this leads to issues\nof repetition, bias towards popular entities, and stylistic differences from\nhuman text. In this work, we propose Synthesize by Retrieval and Refinement\n(SynthesizRR), which uses retrieval augmentation to introduce variety into the\ndataset synthesis process: as retrieved passages vary, the LLM is seeded with\ndifferent content to generate its examples. We empirically study the synthesis\nof six datasets, covering topic classification, sentiment analysis, tone\ndetection, and humor, requiring complex synthesis strategies. We find that\nSynthesizRR greatly improves lexical and semantic diversity, similarity to\nhuman-written text, and distillation performance, when compared to 32-shot\nprompting and four prior approaches. 
We release our code to perform all steps\nat https://github.com/amazon-science/synthesizrr\n","authors":["Abhishek Divekar","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2405.10040v3.pdf","comment":"Published as a main conference paper at EMNLP 2024. Code available at\n https://github.com/amazon-science/synthesizrr"},{"id":"http://arxiv.org/abs/2411.08521v1","updated":"2024-11-13T11:08:28Z","published":"2024-11-13T11:08:28Z","title":"SAD-TIME: a Spatiotemporal-fused network for depression detection with\n Automated multi-scale Depth-wise and TIME-interval-related common feature\n extractor","summary":" Background and Objective: Depression is a severe mental disorder, and\naccurate diagnosis is pivotal to the cure and rehabilitation of people with\ndepression. However, the current questionnaire-based diagnostic methods could\nbring subjective biases and may be denied by subjects. In search of a more\nobjective means of diagnosis, researchers have begun to experiment with deep\nlearning-based methods for identifying depressive disorders in recent years.\nMethods: In this study, a novel Spatiotemporal-fused network with Automated\nmulti-scale Depth-wise and TIME-interval-related common feature extractor\n(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common\nfeatures extractor (CFE), a spatial sector (SpS), a modified temporal sector\n(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale\ndepth-wise 1D-convolutional neural network and a time-interval embedding\ngenerator, where the unique information of each channel is preserved. The SpS\nfuses the functional connectivity with the distance-based connectivity\ncontaining spatial position of EEG electrodes. A multi-head-attention graph\nconvolutional network is also applied in the SpS to fuse the features from\ndifferent EEG channels. 
The TeS is based on long short-term memory and graph\ntransformer networks, where the temporal information of different time-windows\nis fused. Moreover, the DAL is used after the SpS to obtain the\ndomain-invariant feature. Results: Experimental results under tenfold\ncross-validation show that the proposed SAD-TIME method achieves 92.00% and\n94.00% depression classification accuracies on two datasets, respectively, in\ncross-subject mode. Conclusion: SAD-TIME is a robust depression detection\nmodel, where the automatedly-generated features, the SpS and the TeS assist the\nclassification performance with the fusion of the innate spatiotemporal\ninformation in the EEG signals.\n","authors":["Han-Guang Wang","Hui-Rang Hou","Li-Cheng Jin","Chen-Yang Xu","Zhong-Yi Zhang","Qing-Hao Meng"],"pdf_url":"https://arxiv.org/pdf/2411.08521v1.pdf","comment":"21pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.15166v2","updated":"2024-11-13T11:05:04Z","published":"2024-09-23T16:20:21Z","title":"Harmonic Path Integral Diffusion","summary":" In this manuscript, we present a novel approach for sampling from a\ncontinuous multivariate probability distribution, which may either be\nexplicitly known (up to a normalization factor) or represented via empirical\nsamples. Our method constructs a time-dependent bridge from a delta function\ncentered at the origin of the state space at $t=0$, optimally transforming it\ninto the target distribution at $t=1$. We formulate this as a Stochastic\nOptimal Control problem of the Path Integral Control type, with a cost function\ncomprising (in its basic form) a quadratic control term, a quadratic state\nterm, and a terminal constraint. This framework, which we refer to as Harmonic\nPath Integral Diffusion (H-PID), leverages an analytical solution through a\nmapping to an auxiliary quantum harmonic oscillator in imaginary time.\n The H-PID framework results in a set of efficient sampling algorithms,\nwithout the incorporation of Neural Networks. 
The algorithms are validated on\ntwo standard use cases: a mixture of Gaussians over a grid and images from\nCIFAR-10. The transparency of the method allows us to analyze the algorithms in\ndetail, particularly revealing that the current weighted state is an order\nparameter for the dynamic phase transition, signaling earlier, at $t<1$, that\nthe sample generation process is almost complete. We contrast these algorithms\nwith other sampling methods, particularly simulated annealing and path integral\nsampling, highlighting their advantages in terms of analytical control,\naccuracy, and computational efficiency on benchmark problems.\n Additionally, we extend the methodology to more general cases where the\nunderlying stochastic differential equation includes an external deterministic,\npossibly non-conservative force, and where the cost function incorporates a\ngauge potential term.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2409.15166v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08506v1","updated":"2024-11-13T10:43:31Z","published":"2024-11-13T10:43:31Z","title":"An Information Theoretic Approach to Operationalize Right to Data\n Protection","summary":" The widespread practice of indiscriminate data scraping to fine-tune language\nmodels (LMs) raises significant legal and ethical concerns, particularly\nregarding compliance with data protection laws such as the General Data\nProtection Regulation (GDPR). This practice often results in the unauthorized\nuse of personal information, prompting growing debate within the academic and\nregulatory communities. Recent works have introduced the concept of generating\nunlearnable datasets (by adding imperceptible noise to the clean data), such\nthat the underlying model achieves lower loss during training but fails to\ngeneralize to the unseen test setting. 
Though somewhat effective, these\napproaches are predominantly designed for images and are limited by several\npractical constraints like requiring knowledge of the target model. To this\nend, we introduce RegText, a framework that injects imperceptible spurious\ncorrelations into natural language datasets, effectively rendering them\nunlearnable without affecting semantic content. We demonstrate RegText's\nutility through rigorous empirical analysis of small and large LMs. Notably,\nRegText can restrict newer models like GPT-4o and Llama from learning on our\ngenerated data, resulting in a drop in their test accuracy compared to their\nzero-shot performance and paving the way for generating unlearnable text to\nprotect public data.\n","authors":["Abhinav Java","Simra Shahid","Chirag Agarwal"],"pdf_url":"https://arxiv.org/pdf/2411.08506v1.pdf","comment":"First two authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2301.10369v4","updated":"2024-11-13T10:35:25Z","published":"2023-01-25T00:50:28Z","title":"Exact Fractional Inference via Re-Parametrization & Interpolation\n between Tree-Re-Weighted- and Belief Propagation- Algorithms","summary":" Computing the partition function, $Z$, of an Ising model over a graph of $N$\n\\enquote{spins} is most likely exponential in $N$. Efficient variational\nmethods, such as Belief Propagation (BP) and Tree Re-Weighted (TRW) algorithms,\ncompute $Z$ approximately by minimizing the respective (BP- or TRW-) free\nenergy. We generalize the variational scheme by building a $\\lambda$-fractional\ninterpolation, $Z^{(\\lambda)}$, where $\\lambda=0$ and $\\lambda=1$ correspond to\nTRW- and BP-approximations, respectively. 
This fractional scheme -- coined\nFractional Belief Propagation (FBP) -- guarantees that in the attractive\n(ferromagnetic) case $Z^{(TRW)} \\geq Z^{(\\lambda)} \\geq Z^{(BP)}$, and there\nexists a unique (\\enquote{exact}) $\\lambda_*$ such that $Z=Z^{(\\lambda_*)}$.\nGeneralizing the re-parametrization approach of\n\\citep{wainwright_tree-based_2002} and the loop series approach of\n\\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\\forall\n\\lambda:\\ Z=Z^{(\\lambda)}{\\tilde Z}^{(\\lambda)}$, where the multiplicative\ncorrection, ${\\tilde Z}^{(\\lambda)}$, is an expectation over a node-independent\nprobability distribution built from node-wise fractional marginals. Our\ntheoretical analysis is complemented by extensive experiments with models from\nIsing ensembles over planar and random graphs of medium and large sizes. Our\nempirical study yields a number of interesting observations, such as the\nability to estimate ${\\tilde Z}^{(\\lambda)}$ with $O(N^{2::4})$ fractional\nsamples and suppression of variation in $\\lambda_*$ estimates with an increase\nin $N$ for instances from a particular random Ising ensemble, where $[2::4]$\nindicates a range from $2$ to $4$. We also discuss the applicability of this\napproach to the problem of image de-noising.\n","authors":["Hamidreza Behjoo","Michael Chertkov"],"pdf_url":"https://arxiv.org/pdf/2301.10369v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06058v3","updated":"2024-11-13T10:09:25Z","published":"2023-03-10T16:43:48Z","title":"A General Recipe for the Analysis of Randomized Multi-Armed Bandit\n Algorithms","summary":" In this paper we propose a general methodology to derive regret bounds for\nrandomized multi-armed bandit algorithms. It consists in checking a set of\nsufficient conditions on the sampling probability of each arm and on the family\nof distributions to prove a logarithmic regret. 
As a direct application we\nrevisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and\nThompson Sampling (TS), under various models for the distributions including\nsingle parameter exponential families, Gaussian distributions, bounded\ndistributions, or distributions satisfying some conditions on their moments. In\nparticular, we prove that MED is asymptotically optimal for all these models,\nbut also provide a simple regret analysis of some TS algorithms for which the\noptimality is already known. We then further illustrate the interest of our\napproach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to\nsome families of unbounded reward distributions with a bounded h-moment. This\nmodel can for instance capture some non-parametric families of distributions\nwhose variance is upper bounded by a known constant.\n","authors":["Dorian Baudry","Kazuya Suzuki","Junya Honda"],"pdf_url":"https://arxiv.org/pdf/2303.06058v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14487v3","updated":"2024-11-13T10:09:23Z","published":"2024-08-19T18:47:07Z","title":"Active learning of digenic functions with boolean matrix logic\n programming","summary":" We apply logic-based machine learning techniques to facilitate cellular\nengineering and drive biological discovery, based on comprehensive databases of\nmetabolic processes called genome-scale metabolic network models (GEMs).\nPredicted host behaviours are not always correctly described by GEMs. Learning\nthe intricate genetic interactions within GEMs presents computational and\nempirical challenges. To address these, we describe a novel approach called\nBoolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to\nevaluate large logic programs. We introduce a new system, $BMLP_{active}$,\nwhich efficiently explores the genomic hypothesis space by guiding informative\nexperimentation through active learning. 
In contrast to sub-symbolic methods,\n$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial\nhost in an interpretable and logical representation using datalog logic\nprograms. Notably, $BMLP_{active}$ can successfully learn the interaction\nbetween a gene pair with fewer training examples than random experimentation,\novercoming the increase in experimental design space. $BMLP_{active}$ enables\nrapid optimisation of metabolic models and offers a realistic approach to a\nself-driving lab for microbial engineering.\n","authors":["Lun Ai","Stephen H. Muggleton","Shi-shun Liang","Geoff S. Baldwin"],"pdf_url":"https://arxiv.org/pdf/2408.14487v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2405.06724"},{"id":"http://arxiv.org/abs/2410.17851v2","updated":"2024-11-13T10:01:38Z","published":"2024-10-23T13:20:42Z","title":"The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty\n Quantification","summary":" Tsetlin Machines (TMs) have emerged as a compelling alternative to\nconventional deep learning methods, offering notable advantages such as smaller\nmemory footprint, faster inference, fault-tolerant properties, and\ninterpretability. Although various adaptations of TMs have expanded their\napplicability across diverse domains, a fundamental gap remains in\nunderstanding how TMs quantify uncertainty in their predictions. In response,\nthis paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed\nat providing a robust, reliable, and interpretable approach for uncertainty\nquantification. Unlike the original TM, the PTM learns the probability of\nstaying on each state of each Tsetlin Automaton (TA) across all clauses. These\nprobabilities are updated using the feedback tables that are part of the TM\nframework: Type I and Type II feedback. 
During inference, TAs decide their\nactions by sampling states based on learned probability distributions, akin to\nBayesian neural networks when generating weight values. In our experimental\nanalysis, we first illustrate the spread of the probabilities across TA states\nfor the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models\nusing both simulated and real-world datasets. The experiments on the simulated\ndataset reveal the PTM's effectiveness in uncertainty quantification,\nparticularly in delineating decision boundaries and identifying regions of high\nuncertainty. Moreover, when applied to multiclass classification tasks using\nthe Iris dataset, the PTM demonstrates competitive performance in terms of\npredictive entropy and expected calibration error, showcasing its potential as\na reliable tool for uncertainty estimation. Our findings underscore the\nimportance of selecting appropriate models for accurate uncertainty\nquantification in predictive tasks, with the PTM offering a particularly\ninterpretable and effective solution.\n","authors":["K. Darshana Abeyrathna","Sara El Mekkaoui","Andreas Hafver","Christian Agrell"],"pdf_url":"https://arxiv.org/pdf/2410.17851v2.pdf","comment":"12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024,\n London"},{"id":"http://arxiv.org/abs/2411.08482v1","updated":"2024-11-13T10:01:33Z","published":"2024-11-13T10:01:33Z","title":"Methodology for a Statistical Analysis of Influencing Factors on 3D\n Object Detection Performance","summary":" In autonomous driving, object detection is an essential task to perceive the\nenvironment by localizing and classifying objects. Most object detection\nalgorithms rely on deep learning for their superior performance. However, their\nblack box nature makes it challenging to ensure safety. 
In this paper, we\npropose a first-of-its-kind methodology for statistical analysis of the\ninfluence of various factors related to the objects to detect or the\nenvironment on the detection performance of both LiDAR- and camera-based 3D\nobject detectors. We perform a univariate analysis between each of the factors\nand the detection error in order to compare the strength of influence. To\nbetter identify potential sources of detection errors, we also analyze the\nperformance in dependency of the influencing factors and examine the\ninterdependencies between the different influencing factors. Recognizing the\nfactors that influence detection performance helps identify robustness issues\nin the trained object detector and supports the safety approval of object\ndetection systems.\n","authors":["Anton Kuznietsov","Dirk Schweickard","Steven Peters"],"pdf_url":"https://arxiv.org/pdf/2411.08482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08478v1","updated":"2024-11-13T09:55:59Z","published":"2024-11-13T09:55:59Z","title":"Learning Model Agnostic Explanations via Constraint Programming","summary":" Interpretable Machine Learning faces a recurring challenge of explaining the\npredictions made by opaque classifiers such as ensemble models, kernel methods,\nor neural networks in terms that are understandable to humans. When the model\nis viewed as a black box, the objective is to identify a small set of features\nthat jointly determine the black box response with minimal error. However,\nfinding such model-agnostic explanations is computationally demanding, as the\nproblem is intractable even for binary classifiers. In this paper, the task is\nframed as a Constraint Optimization Problem, where the constraint solver seeks\nan explanation of minimum error and bounded size for an input data instance and\na set of samples generated by the black box. 
From a theoretical perspective,\nthis constraint programming approach offers PAC-style guarantees for the output\nexplanation. We evaluate the approach empirically on various datasets and show\nthat it statistically outperforms the state-of-the-art heuristic Anchors\nmethod.\n","authors":["Frederic Koriche","Jean-Marie Lagniez","Stefan Mengel","Chi Tran"],"pdf_url":"https://arxiv.org/pdf/2411.08478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07979v2","updated":"2024-11-13T09:52:45Z","published":"2024-11-12T17:58:40Z","title":"Exact, Tractable Gauss-Newton Optimization in Deep Reversible\n Architectures Reveal Poor Generalization","summary":" Second-order optimization has been shown to accelerate the training of deep\nneural networks in many applications, often yielding faster progress per\niteration on the training loss compared to first-order optimizers. However, the\ngeneralization properties of second-order methods are still being debated.\nTheoretical investigations have proved difficult to carry out outside the\ntractable settings of heavily simplified model classes -- thus, the relevance\nof existing theories to practical deep learning applications remains unclear.\nSimilarly, empirical studies in large-scale models and real datasets are\nsignificantly confounded by the necessity to approximate second-order updates\nin practice. It is often unclear whether the observed generalization behaviour\narises specifically from the second-order nature of the parameter updates, or\ninstead reflects the specific structured (e.g.\\ Kronecker) approximations used\nor any damping-based interpolation towards first-order updates. Here, we show\nfor the first time that exact Gauss-Newton (GN) updates take on a tractable\nform in a class of deep reversible architectures that are sufficiently\nexpressive to be meaningfully applied to common benchmark datasets. We exploit\nthis novel setting to study the training and generalization properties of the\nGN optimizer. 
We find that exact GN generalizes poorly. In the mini-batch\ntraining setting, this manifests as rapidly saturating progress even on the\n\\emph{training} loss, with parameter updates found to overfit each\nmini-batchatch without producing the features that would support generalization\nto other mini-batches. We show that our experiments run in the ``lazy'' regime,\nin which the neural tangent kernel (NTK) changes very little during the course\nof training. This behaviour is associated with having no significant changes in\nneural representations, explaining the lack of generalization.\n","authors":["Davide Buffelli","Jamie McGowan","Wangkun Xu","Alexandru Cioba","Da-shan Shiu","Guillaume Hennequin","Alberto Bernacchia"],"pdf_url":"https://arxiv.org/pdf/2411.07979v2.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.17804v3","updated":"2024-11-13T09:50:48Z","published":"2024-06-22T15:24:33Z","title":"A Review of Electromagnetic Elimination Methods for low-field portable\n MRI scanner","summary":" This paper analyzes conventional and deep learning methods for eliminating\nelectromagnetic interference (EMI) in MRI systems. We compare traditional\nanalytical and adaptive techniques with advanced deep learning approaches. Key\nstrengths and limitations of each method are highlighted. Recent advancements\nin active EMI elimination, such as external EMI receiver coils, are discussed\nalongside deep learning methods, which show superior EMI suppression by\nleveraging neural networks trained on MRI data. While deep learning improves\nEMI elimination and diagnostic capabilities, it introduces security and safety\nconcerns, particularly in commercial applications. 
A balanced approach,\nintegrating conventional reliability with deep learning's advanced\ncapabilities, is proposed for more effective EMI suppression in MRI systems.\n","authors":["Wanyu Bian","Panfeng Li","Mengyao Zheng","Chihang Wang","Anying Li","Ying Li","Haowei Ni","Zixuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2406.17804v3.pdf","comment":"Accepted by 2024 5th International Conference on Machine Learning and\n Computer Application"},{"id":"http://arxiv.org/abs/2304.08310v2","updated":"2024-11-13T09:47:41Z","published":"2023-04-17T14:27:19Z","title":"TreeC: a method to generate interpretable energy management systems\n using a metaheuristic algorithm","summary":" Energy management systems (EMS) have traditionally been implemented using\nrule-based control (RBC) and model predictive control (MPC) methods. However,\nrecent research has explored the use of reinforcement learning (RL) as a\npromising alternative. This paper introduces TreeC, a machine learning method\nthat utilizes the covariance matrix adaptation evolution strategy metaheuristic\nalgorithm to generate an interpretable EMS modeled as a decision tree. Unlike\nRBC and MPC approaches, TreeC learns the decision strategy of the EMS based on\nhistorical data, adapting the control model to the controlled energy grid. The\ndecision strategy is represented as a decision tree, providing interpretability\ncompared to RL methods that often rely on black-box models like neural\nnetworks. TreeC is evaluated against MPC with perfect forecast and RL EMSs in\ntwo case studies taken from literature: an electric grid case and a household\nheating case. In the electric grid case, TreeC achieves an average energy loss\nand constraint violation score of 19.2, which is close to MPC and RL EMSs that\nachieve scores of 14.4 and 16.2 respectively. All three methods control the\nelectric grid well especially when compared to the random EMS, which obtains an\naverage score of 12 875. 
In the household heating case, TreeC performs\nsimilarly to MPC on the adjusted and averaged electricity cost and total\ndiscomfort (0.033 EUR/m$^2$ and 0.42 Kh for TreeC compared to 0.037 EUR/m$^2$\nand 2.91 kH for MPC), while outperforming RL (0.266 EUR/m$^2$ and 24.41 Kh).\n","authors":["Julian Ruddick","Luis Ramirez Camargo","Muhammad Andy Putratama","Maarten Messagie","Thierry Coosemans"],"pdf_url":"https://arxiv.org/pdf/2304.08310v2.pdf","comment":"Accepted version Knowledge based system"},{"id":"http://arxiv.org/abs/2411.08460v1","updated":"2024-11-13T09:31:06Z","published":"2024-11-13T09:31:06Z","title":"Trap-MID: Trapdoor-based Defense against Model Inversion Attacks","summary":" Model Inversion (MI) attacks pose a significant threat to the privacy of Deep\nNeural Networks by recovering training data distribution from well-trained\nmodels. While existing defenses often rely on regularization techniques to\nreduce information leakage, they remain vulnerable to recent attacks. In this\npaper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to\nmislead MI attacks. A trapdoor is integrated into the model to predict a\nspecific label when the input is injected with the corresponding trigger.\nConsequently, this trapdoor information serves as the \"shortcut\" for MI\nattacks, leading them to extract trapdoor triggers rather than private data. We\nprovide theoretical insights into the impacts of trapdoor's effectiveness and\nnaturalness on deceiving MI attacks. In addition, empirical experiments\ndemonstrate the state-of-the-art defense performance of Trap-MID against\nvarious MI attacks without the requirements for extra data or large\ncomputational overhead. 
Our source code is publicly available at\nhttps://github.com/ntuaislab/Trap-MID.\n","authors":["Zhen-Ting Liu","Shang-Tse Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08460v1.pdf","comment":"Accepted by Neural Information Processing Systems (NeurIPS) 2024"},{"id":"http://arxiv.org/abs/2307.12594v2","updated":"2024-11-13T09:29:36Z","published":"2023-07-24T08:11:59Z","title":"The effect of dataset size and the process of big data mining for\n investigating solar-thermal desalination by using machine learning","summary":" Machine learning's application in solar-thermal desalination is limited by\ndata shortage and inconsistent analysis. This study develops an optimized\ndataset collection and analysis process for the representative solar still. By\nultra-hydrophilic treatment on the condensation cover, the dataset collection\nprocess reduces the collection time by 83.3%. Over 1,000 datasets are\ncollected, which is nearly one order of magnitude larger than up-to-date works.\nThen, a new interdisciplinary process flow is proposed. Some meaningful results\nare obtained that were not addressed by previous studies. It is found that\nRadom Forest might be a better choice for datasets larger than 1,000 due to\nboth high accuracy and fast speed. Besides, the dataset range affects the\nquantified importance (weighted value) of factors significantly, with up to a\n115% increment. Moreover, the results show that machine learning has a high\naccuracy on the extrapolation prediction of productivity, where the minimum\nmean relative prediction error is just around 4%. The results of this work not\nonly show the necessity of the dataset characteristics' effect but also provide\na standard process for studying solar-thermal desalination by machine learning,\nwhich would pave the way for interdisciplinary study.\n","authors":["Guilong Peng","Senshan Sun","Zhenwei Xu","Juxin Du","Yangjun Qin","Swellam W. Sharshir","A. W. Kandel","A. E. 
Kabeel","Nuo Yang"],"pdf_url":"https://arxiv.org/pdf/2307.12594v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00393v4","updated":"2024-11-13T09:27:41Z","published":"2024-11-01T06:40:47Z","title":"Advantages of Neural Population Coding for Deep Learning","summary":" Scalar variables, e.g., the orientation of a shape in an image, are commonly\npredicted using a single output neuron in a neural network. In contrast, the\nmammalian cortex represents variables with a population of neurons. In this\npopulation code, each neuron is most active at its preferred value and shows\npartial activity for other values. Here, we investigate the benefit of using a\npopulation code for the output layer of a neural network. We compare population\ncodes against single-neuron outputs and one-hot vectors. First, we show\ntheoretically and in experiments with synthetic data that population codes\nimprove robustness to input noise in networks of stacked linear layers. Second,\nwe demonstrate the benefit of using population codes to encode ambiguous\noutputs, such as the pose of symmetric objects. Using the T-LESS dataset of\nfeature-less real-world objects, we show that population codes improve the\naccuracy of predicting 3D object orientation from image input.\n","authors":["Heiko Hoffmann"],"pdf_url":"https://arxiv.org/pdf/2411.00393v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08443v1","updated":"2024-11-13T08:56:35Z","published":"2024-11-13T08:56:35Z","title":"Machine Unlearning on Pre-trained Models by Residual Feature Alignment\n Using LoRA","summary":" Machine unlearning is new emerged technology that removes a subset of the\ntraining data from a trained model without affecting the model performance on\nthe remaining data. This topic is becoming increasingly important in protecting\nuser privacy and eliminating harmful or outdated data. 
The key challenge lies\nin effectively and efficiently unlearning specific information without\ncompromising the model's utility on the retained data. For the pre-trained\nmodels, fine-tuning is an important way to achieve the unlearning target.\nPrevious work typically fine-tuned the entire model's parameters, which incurs\nsignificant computation costs. In addition, the fine-tuning process may cause\nshifts in the intermediate layer features, affecting the model's overall\nutility. In this work, we propose a novel and efficient machine unlearning\nmethod on pre-trained models. We term the method as Residual Feature Alignment\nUnlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose\nthe model's intermediate features into pre-trained features and residual\nfeatures. By adjusting the residual features, we align the unlearned model with\nthe pre-trained model at the intermediate feature level to achieve both\nunlearning and remaining targets. The method aims to learn the zero residuals\non the retained set and shifted residuals on the unlearning set. Extensive\nexperiments on numerous datasets validate the effectiveness of our approach.\n","authors":["Laiqiao Qin","Tianqing Zhu","Linlin Wang","Wanlei Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.06979v2","updated":"2024-11-13T08:41:50Z","published":"2024-06-11T06:18:29Z","title":"AudioMarkBench: Benchmarking Robustness of Audio Watermarking","summary":" The increasing realism of synthetic speech, driven by advancements in\ntext-to-speech models, raises ethical concerns regarding impersonation and\ndisinformation. Audio watermarking offers a promising solution via embedding\nhuman-imperceptible watermarks into AI-generated audios. However, the\nrobustness of audio watermarking against common/adversarial perturbations\nremains understudied. 
We present AudioMarkBench, the first systematic benchmark\nfor evaluating the robustness of audio watermarking against watermark removal\nand watermark forgery. AudioMarkBench includes a new dataset created from\nCommon-Voice across languages, biological sexes, and ages, 3 state-of-the-art\nwatermarking methods, and 15 types of perturbations. We benchmark the\nrobustness of these methods against the perturbations in no-box, black-box, and\nwhite-box settings. Our findings highlight the vulnerabilities of current\nwatermarking techniques and emphasize the need for more robust and fair audio\nwatermarking solutions. Our dataset and code are publicly available at\nhttps://github.com/moyangkuo/AudioMarkBench.\n","authors":["Hongbin Liu","Moyang Guo","Zhengyuan Jiang","Lun Wang","Neil Zhenqiang Gong"],"pdf_url":"https://arxiv.org/pdf/2406.06979v2.pdf","comment":"To appear in NeurIPS Datasets and Benchmarks, 2024"},{"id":"http://arxiv.org/abs/2410.21283v2","updated":"2024-11-13T08:33:17Z","published":"2024-10-11T03:19:44Z","title":"pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2","summary":" Recent advancements in protein structure prediction, particularly AlphaFold2,\nhave revolutionized structural biology by achieving near-experimental accuracy\n($\\text{average RMSD} < 1.5\\text{\\AA}$). However, the computational demands of\nthese models (approximately 30 minutes per protein on an RTX 4090)\nsignificantly limit their application in high-throughput protein screening.\nWhile large language models like ESM (Evolutionary Scale Modeling) have shown\npromise in extracting structural information directly from protein sequences,\nrapid assessment of protein structure quality for large-scale analyses remains\na major challenge.\n We introduce pLDDT-Predictor, a high-speed protein screening tool that\nachieves a $250,000\\times$ speedup compared to AlphaFold2 by leveraging\npre-trained ESM2 protein embeddings and a Transformer architecture. 
Our model\npredicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores\nwith a Pearson correlation of 0.7891 and processes proteins in just 0.007\nseconds on average. Using a comprehensive dataset of 1.5 million diverse\nprotein sequences (ranging from 50 to 2048 amino acids), we demonstrate that\npLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70)\nwith 91.2\\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's\npredictions.\n The source code and pre-trained models are freely available at\n\\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research\ncommunity to perform rapid, large-scale protein structure quality assessments.\n","authors":["Joongwon Chae","Zhenyu Wang","Ijaz Gul","Jiansong Ji","Zhenglin Chen","Peiwu Qin"],"pdf_url":"https://arxiv.org/pdf/2410.21283v2.pdf","comment":"6 pages main topic, 8 pages including citiation, 4 figures"},{"id":"http://arxiv.org/abs/2411.08432v1","updated":"2024-11-13T08:32:42Z","published":"2024-11-13T08:32:42Z","title":"One STEP at a time: Language Agents are Stepwise Planners","summary":" Language agents have shown promising adaptability in dynamic environments to\nperform complex tasks. However, despite the versatile knowledge embedded in\nlarge language models, these agents still fall short when it comes to tasks\nthat require planning. We introduce STEP, a novel framework designed to\nefficiently learn from previous experiences to enhance the planning\ncapabilities of language agents in future steps. Concretely, STEP functions\nthrough four interconnected components. First, the Planner takes on the task,\nbreaks it down into subtasks and provides relevant insights. Then the Executor\ngenerates action candidates, while the Evaluator ensures the actions align with\nlearned rules from previous experiences. Lastly, Memory stores experiences to\ninform future decisions. 
In the ScienceWorld benchmark, our results show that\nSTEP consistently outperforms state-of-the-art models, achieving an overall\nscore of 67.4 and successfully completing 12 out of 18 tasks. These findings\nhighlight STEP's potential as a framework for enhancing planning capabilities\nin language agents, paving the way for more sophisticated task-solving in\ndynamic environments.\n","authors":["Minh Nguyen","Ehsan Shareghi"],"pdf_url":"https://arxiv.org/pdf/2411.08432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03364v2","updated":"2024-11-13T08:30:59Z","published":"2024-11-05T06:54:38Z","title":"DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural\n Networks","summary":" Graph has become increasingly integral to the advancement of recommendation\nsystems, particularly with the fast development of graph neural network(GNN).\nBy exploring the virtue of rich node features and link information, GNN is\ndesigned to provide personalized and accurate suggestions. Meanwhile, the\nprivacy leakage of GNN in such contexts has also captured special attention.\nPrior work has revealed that a malicious user can utilize auxiliary knowledge\nto extract sensitive link data of the target graph, integral to recommendation\nsystems, via the decision made by the target GNN model. This poses a\nsignificant risk to the integrity and confidentiality of data used in\nrecommendation system. Though important, previous works on GNN's privacy\nleakage are still challenged in three aspects, i.e., limited stealing attack\nscenarios, sub-optimal attack performance, and adaptation against defense. To\naddress these issues, we propose a diffusion model based link stealing attack,\nnamed DM4Steal. It differs previous work from three critical aspects. (i)\nGenerality: aiming at six attack scenarios with limited auxiliary knowledge, we\npropose a novel training strategy for diffusion models so that DM4Steal is\ntransferable to diverse attack scenarios. 
(ii) Effectiveness: benefiting from\nthe retention of semantic structure in the diffusion model during the training\nprocess, DM4Steal is capable to learn the precise topology of the target graph\nthrough the GNN decision process. (iii) Adaptation: when GNN is defensive\n(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling\nthe score model multiple times to keep performance degradation to a minimum,\nthus DM4Steal implements successful adaptive attack on defensive GNN.\n","authors":["Jinyin Chen","Haonan Ma","Haibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2411.03364v2.pdf","comment":"We found that there were critical problems in our paper, and we\n needed to redo the experiment, which was incomplete"},{"id":"http://arxiv.org/abs/2411.07501v2","updated":"2024-11-13T08:30:52Z","published":"2024-11-12T02:57:15Z","title":"LAuReL: Learned Augmented Residual Layer","summary":" One of the core pillars of efficient deep learning methods is architectural\nimprovements such as the residual/skip connection, which has led to\nsignificantly better model convergence and quality. Since then the residual\nconnection has become ubiquitous in not just convolutional neural networks but\nalso transformer-based architectures, the backbone of LLMs.\n In this paper we introduce \\emph{Learned Augmented Residual Layer} (LAuReL)\n-- a novel generalization of the canonical residual connection -- with the goal\nto be an in-situ replacement of the latter while outperforming on both model\nquality and footprint metrics. Our experiments show that using \\laurel can help\nboost performance for both vision and language models. 
For example, on the\nResNet-50, ImageNet 1K task, it achieves $60\\%$ of the gains from adding an\nextra layer, while only adding $0.003\\%$ more parameters, and matches it while\nadding $2.6\\times$ fewer parameters.\n","authors":["Gaurav Menghani","Ravi Kumar","Sanjiv Kumar"],"pdf_url":"https://arxiv.org/pdf/2411.07501v2.pdf","comment":"Accepted at the 2nd Efficient Systems for Foundation Models Workshop\n at the International Conference on Machine Learning (ICML) 2024"},{"id":"http://arxiv.org/abs/2411.01137v2","updated":"2024-11-13T08:24:09Z","published":"2024-11-02T04:48:41Z","title":"Data movement limits to frontier model training","summary":" We present a theoretical model of distributed training, and use it to analyze\nhow far dense and sparse training runs can be scaled. Under our baseline\nassumptions, given a three month training duration, data movement bottlenecks\nbegin to significantly lower hardware utilization for training runs exceeding\nabout $10^{28}$ FLOP, two orders of magnitude above the largest training run to\ndate, suggesting the arrival of fundamental barriers to scaling in three years\ngiven recent rates of growth. A training run exceeding about $10^{31}$ FLOP is\ninfeasible even at low utilization. However, more aggressive batch size scaling\nand/or shorter and fatter model shapes, if achievable, have the potential to\npermit much larger training runs.\n","authors":["Ege Erdil","David Schneider-Joseph"],"pdf_url":"https://arxiv.org/pdf/2411.01137v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08425v1","updated":"2024-11-13T08:18:03Z","published":"2024-11-13T08:18:03Z","title":"Properties of fairness measures in the context of varying class\n imbalance and protected group ratios","summary":" Society is increasingly relying on predictive models in fields like criminal\njustice, credit risk management, or hiring. 
To prevent such automated systems\nfrom discriminating against people belonging to certain groups, fairness\nmeasures have become a crucial component in socially relevant applications of\nmachine learning. However, existing fairness measures have been designed to\nassess the bias between predictions for protected groups without considering\nthe imbalance in the classes of the target variable. Current research on the\npotential effect of class imbalance on fairness focuses on practical\napplications rather than dataset-independent measure properties. In this paper,\nwe study the general properties of fairness measures for changing class and\nprotected group proportions. For this purpose, we analyze the probability mass\nfunctions of six of the most popular group fairness measures. We also measure\nhow the probability of achieving perfect fairness changes for varying class\nimbalance ratios. Moreover, we relate the dataset-independent properties of\nfairness measures described in this paper to classifier fairness in real-life\ntasks. Our results show that measures such as Equal Opportunity and Positive\nPredictive Parity are more sensitive to changes in class imbalance than\nAccuracy Equality. These findings can help guide researchers and practitioners\nin choosing the most appropriate fairness measures for their classification\nproblems.\n","authors":["Dariusz Brzezinski","Julia Stachowiak","Jerzy Stefanowski","Izabela Szczech","Robert Susmaga","Sofya Aksenyuk","Uladzimir Ivashka","Oleksandr Yasinskyi"],"pdf_url":"https://arxiv.org/pdf/2411.08425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.15586v2","updated":"2024-11-13T08:17:38Z","published":"2024-05-24T14:14:24Z","title":"DAGER: Exact Gradient Inversion for Large Language Models","summary":" Federated learning works by aggregating locally computed gradients from\nmultiple clients, thus enabling collaborative training without sharing private\nclient data. 
However, prior work has shown that the data can actually be\nrecovered by the server using so-called gradient inversion attacks. While these\nattacks perform well when applied on images, they are limited in the text\ndomain and only permit approximate reconstruction of small batches and short\ninput sequences. In this work, we propose DAGER, the first algorithm to recover\nwhole batches of input text exactly. DAGER leverages the low-rank structure of\nself-attention layer gradients and the discrete nature of token embeddings to\nefficiently check if a given token sequence is part of the client data. We use\nthis check to exactly recover full batches in the honest-but-curious setting\nwithout any prior on the data for both encoder- and decoder-based architectures\nusing exhaustive heuristic search and a greedy approach, respectively. We\nprovide an efficient GPU implementation of DAGER and show experimentally that\nit recovers full batches of size up to 128 on large language models (LLMs),\nbeating prior attacks in speed (20x at same batch size), scalability (10x\nlarger batches), and reconstruction quality (ROUGE-1/2 > 0.99).\n","authors":["Ivo Petrov","Dimitar I. Dimitrov","Maximilian Baader","Mark Niklas Müller","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2405.15586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.21063v2","updated":"2024-11-13T08:13:36Z","published":"2024-05-31T17:51:07Z","title":"Neural Network Verification with Branch-and-Bound for General\n Nonlinearities","summary":" Branch-and-bound (BaB) is among the most effective techniques for neural\nnetwork (NN) verification. However, existing works on BaB for NN verification\nhave mostly focused on NNs with piecewise linear activations, especially ReLU\nnetworks. In this paper, we develop a general framework, named GenBaB, to\nconduct BaB on general nonlinearities to verify NNs with general architectures,\nbased on linear bound propagation for NN verification. 
To decide which neuron\nto branch, we design a new branching heuristic which leverages linear bounds as\nshortcuts to efficiently estimate the potential improvement after branching. To\ndecide nontrivial branching points for general nonlinear functions, we propose\nto pre-optimize branching points, which can be efficiently leveraged during\nverification with a lookup table. We demonstrate the effectiveness of our\nGenBaB on verifying a wide range of NNs, including NNs with activation\nfunctions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving\nmulti-dimensional nonlinear operations such as multiplications in LSTMs and\nVision Transformers. Our framework also allows the verification of general\nnonlinear computation graphs and enables verification applications beyond\nsimple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of\nthe latest $\\alpha,\\!\\beta$-CROWN, the winner of the 4th and the 5th\nInternational Verification of Neural Networks Competition (VNN-COMP 2023 and\n2024).\n","authors":["Zhouxing Shi","Qirui Jin","Zico Kolter","Suman Jana","Cho-Jui Hsieh","Huan Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.21063v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2411.08414v1","updated":"2024-11-13T08:07:21Z","published":"2024-11-13T08:07:21Z","title":"Material Property Prediction with Element Attribute Knowledge Graphs and\n Multimodal Representation Learning","summary":" Machine learning has become a crucial tool for predicting the properties of\ncrystalline materials. However, existing methods primarily represent material\ninformation by constructing multi-edge graphs of crystal structures, often\noverlooking the chemical and physical properties of elements (such as atomic\nradius, electronegativity, melting point, and ionization energy), which have a\nsignificant impact on material performance. 
To address this limitation, we\nfirst constructed an element property knowledge graph and utilized an embedding\nmodel to encode the element attributes within the knowledge graph. Furthermore,\nwe propose a multimodal fusion framework, ESNet, which integrates element\nproperty features with crystal structure features to generate joint multimodal\nrepresentations. This provides a more comprehensive perspective for predicting\nthe performance of crystalline materials, enabling the model to consider both\nmicrostructural composition and chemical characteristics of the materials. We\nconducted experiments on the Materials Project benchmark dataset, which showed\nleading performance in the bandgap prediction task and achieved results on a\npar with existing benchmarks in the formation energy prediction task.\n","authors":["Chao Huang","Chunyan Chen","Ling Shi","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2411.08414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08404v1","updated":"2024-11-13T07:45:40Z","published":"2024-11-13T07:45:40Z","title":"Quantifying Qualitative Insights: Leveraging LLMs to Market Predict","summary":" Recent advancements in Large Language Models (LLMs) have the potential to\ntransform financial analytics by integrating numerical and textual data.\nHowever, challenges such as insufficient context when fusing multimodal\ninformation and the difficulty in measuring the utility of qualitative outputs,\nwhich LLMs generate as text, have limited their effectiveness in tasks such as\nfinancial forecasting. This study addresses these challenges by leveraging\ndaily reports from securities firms to create high-quality contextual\ninformation. The reports are segmented into text-based key factors and combined\nwith numerical data, such as price information, to form context sets. 
By\ndynamically updating few-shot examples based on the query time, the sets\nincorporate the latest information, forming a highly relevant set closely\naligned with the query point. Additionally, a crafted prompt is designed to\nassign scores to the key factors, converting qualitative insights into\nquantitative results. The derived scores undergo a scaling process,\ntransforming them into real-world values that are used for prediction. Our\nexperiments demonstrate that LLMs outperform time-series models in market\nforecasting, though challenges such as imperfect reproducibility and limited\nexplainability remain.\n","authors":["Hoyoung Lee","Youngsoo Choi","Yuhee Kwon"],"pdf_url":"https://arxiv.org/pdf/2411.08404v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.08397v1","updated":"2024-11-13T07:32:58Z","published":"2024-11-13T07:32:58Z","title":"CLaSP: Learning Concepts for Time-Series Signals from Natural Language\n Supervision","summary":" This paper proposes a foundation model called \"CLaSP\" that can search time\nseries signals using natural language that describes the characteristics of the\nsignals as queries. Previous efforts to represent time series signal data in\nnatural language have had challenges in designing a conventional class of time\nseries signal characteristics, formulating their quantification, and creating a\ndictionary of synonyms. To overcome these limitations, the proposed method\nintroduces a neural network based on contrastive learning. This network is\nfirst trained using the datasets TRUCE and SUSHI, which consist of time series\nsignals and their corresponding natural language descriptions. Previous studies\nhave proposed vocabularies that data analysts use to describe signal\ncharacteristics, and SUSHI was designed to cover these terms. We believe that a\nneural network trained on these datasets will enable data analysts to search\nusing natural language vocabulary. 
Furthermore, our method does not require a\ndictionary of predefined synonyms, and it leverages common sense knowledge\nembedded in a large-scale language model (LLM). Experimental results\ndemonstrate that CLaSP enables natural language search of time series signal\ndata and can accurately learn the points at which signal data changes.\n","authors":["Aoi Ito","Kota Dohi","Yohei Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2411.08397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08384v1","updated":"2024-11-13T07:10:18Z","published":"2024-11-13T07:10:18Z","title":"Interpretable Syntactic Representations Enable Hierarchical Word Vectors","summary":" The distributed representations currently used are dense and uninterpretable,\nleading to interpretations that themselves are relative, overcomplete, and hard\nto interpret. We propose a method that transforms these word vectors into\nreduced syntactic representations. The resulting representations are compact\nand interpretable allowing better visualization and comparison of the word\nvectors and we successively demonstrate that the drawn interpretations are in\nline with human judgment. The syntactic representations are then used to create\nhierarchical word vectors using an incremental learning approach similar to the\nhierarchical aspect of human learning. As these representations are drawn from\npre-trained vectors, the generation process and learning approach are\ncomputationally efficient. 
Most importantly, we find out that syntactic\nrepresentations provide a plausible interpretation of the vectors and\nsubsequent hierarchical vectors outperform the original vectors in benchmark\ntests.\n","authors":["Biraj Silwal"],"pdf_url":"https://arxiv.org/pdf/2411.08384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08378v1","updated":"2024-11-13T07:03:47Z","published":"2024-11-13T07:03:47Z","title":"Physics Informed Distillation for Diffusion Models","summary":" Diffusion models have recently emerged as a potent tool in generative\nmodeling. However, their inherent iterative nature often results in sluggish\nimage generation due to the requirement for multiple model evaluations. Recent\nprogress has unveiled the intrinsic link between diffusion models and\nProbability Flow Ordinary Differential Equations (ODEs), thus enabling us to\nconceptualize diffusion models as ODE systems. Simultaneously, Physics Informed\nNeural Networks (PINNs) have substantiated their effectiveness in solving\nintricate differential equations through implicit modeling of their solutions.\nBuilding upon these foundational insights, we introduce Physics Informed\nDistillation (PID), which employs a student model to represent the solution of\nthe ODE system corresponding to the teacher diffusion model, akin to the\nprinciples employed in PINNs. Through experiments on CIFAR 10 and ImageNet\n64x64, we observe that PID achieves performance comparable to recent\ndistillation methods. Notably, it demonstrates predictable trends concerning\nmethod-specific hyperparameters and eliminates the need for synthetic dataset\ngeneration during the distillation process. Both of which contribute to its\neasy-to-use nature as a distillation approach for Diffusion Models. 
Our code\nand pre-trained checkpoint are publicly available at:\nhttps://github.com/pantheon5100/pid_diffusion.git.\n","authors":["Joshua Tian Jin Tee","Kang Zhang","Hee Suk Yoon","Dhananjaya Nagaraja Gowda","Chanwoo Kim","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2411.08378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.02775v4","updated":"2024-11-13T06:57:35Z","published":"2022-01-08T06:18:17Z","title":"ADI: Adversarial Dominating Inputs in Vertical Federated Learning\n Systems","summary":" Vertical federated learning (VFL) system has recently become prominent as a\nconcept to process data distributed across many individual sources without the\nneed to centralize it. Multiple participants collaboratively train models based\non their local data in a privacy-aware manner. To date, VFL has become a de\nfacto solution to securely learn a model among organizations, allowing\nknowledge to be shared without compromising privacy of any individuals. Despite\nthe prosperous development of VFL systems, we find that certain inputs of a\nparticipant, named adversarial dominating inputs (ADIs), can dominate the joint\ninference towards the direction of the adversary's will and force other\n(victim) participants to make negligible contributions, losing rewards that are\nusually offered regarding the importance of their contributions in federated\nlearning scenarios. We conduct a systematic study on ADIs by first proving\ntheir existence in typical VFL systems. We then propose gradient-based methods\nto synthesize ADIs of various formats and exploit common VFL systems. We\nfurther launch greybox fuzz testing, guided by the saliency score of ``victim''\nparticipants, to perturb adversary-controlled inputs and systematically explore\nthe VFL attack surface in a privacy-preserving manner. We conduct an in-depth\nstudy on the influence of critical parameters and settings in synthesizing\nADIs. 
Our study reveals new VFL attack opportunities, promoting the\nidentification of unknown threats before breaches and building more secure VFL\nsystems.\n","authors":["Qi Pang","Yuanyuan Yuan","Shuai Wang","Wenting Zheng"],"pdf_url":"https://arxiv.org/pdf/2201.02775v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08374v1","updated":"2024-11-13T06:54:05Z","published":"2024-11-13T06:54:05Z","title":"Federated Graph Learning with Graphless Clients","summary":" Federated Graph Learning (FGL) is tasked with training machine learning\nmodels, such as Graph Neural Networks (GNNs), for multiple clients, each with\nits own graph data. Existing methods usually assume that each client has both\nnode features and graph structure of its graph data. In real-world scenarios,\nhowever, there exist federated systems where only a part of the clients have\nsuch data while other clients (i.e. graphless clients) may only have node\nfeatures. This naturally leads to a novel problem in FGL: how to jointly train\na model over distributed graph data with graphless clients? In this paper, we\npropose a novel framework FedGLS to tackle the problem in FGL with graphless\nclients. In FedGLS, we devise a local graph learner on each graphless client\nwhich learns the local graph structure with the structure knowledge transferred\nfrom other clients. To enable structure knowledge transfer, we design a GNN\nmodel and a feature encoder on each client. During local training, the feature\nencoder retains the local graph structure knowledge together with the GNN model\nvia knowledge distillation, and the structure knowledge is transferred among\nclients in global update. 
Our extensive experiments demonstrate the superiority\nof the proposed FedGLS over five baselines.\n","authors":["Xingbo Fu","Song Wang","Yushun Dong","Binchi Zhang","Chen Chen","Jundong Li"],"pdf_url":"https://arxiv.org/pdf/2411.08374v1.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2410.21564v2","updated":"2024-11-13T06:35:53Z","published":"2024-10-28T21:54:44Z","title":"Mitigating Gradient Overlap in Deep Residual Networks with Gradient\n Normalization for Improved Non-Convex Optimization","summary":" In deep learning, Residual Networks (ResNets) have proven effective in\naddressing the vanishing gradient problem, allowing for the successful training\nof very deep networks. However, skip connections in ResNets can lead to\ngradient overlap, where gradients from both the learned transformation and the\nskip connection combine, potentially resulting in overestimated gradients. This\noverestimation can cause inefficiencies in optimization, as some updates may\novershoot optimal regions, affecting weight updates. To address this, we\nexamine Z-score Normalization (ZNorm) as a technique to manage gradient\noverlap. ZNorm adjusts the gradient scale, standardizing gradients across\nlayers and reducing the negative impact of overlapping gradients. Our\nexperiments demonstrate that ZNorm improves training process, especially in\nnon-convex optimization scenarios common in deep learning, where finding\noptimal solutions is challenging. 
These findings suggest that ZNorm can affect\nthe gradient flow, enhancing performance in large-scale data processing where\naccuracy is critical.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2410.21564v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07934v2","updated":"2024-11-13T06:34:07Z","published":"2024-11-12T17:04:56Z","title":"Doubly Mild Generalization for Offline Reinforcement Learning","summary":" Offline Reinforcement Learning (RL) suffers from the extrapolation error and\nvalue overestimation. From a generalization perspective, this issue can be\nattributed to the over-generalization of value functions or policies towards\nout-of-distribution (OOD) actions. Significant efforts have been devoted to\nmitigating such generalization, and recent in-sample learning approaches have\nfurther succeeded in entirely eschewing it. Nevertheless, we show that mild\ngeneralization beyond the dataset can be trusted and leveraged to improve\nperformance under certain conditions. To appropriately exploit generalization\nin offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild\naction generalization and (ii) mild generalization propagation. The former\nrefers to selecting actions in a close neighborhood of the dataset to maximize\nthe Q values. Even so, the potential erroneous generalization can still be\npropagated, accumulated, and exacerbated by bootstrapping. In light of this,\nthe latter concept is introduced to mitigate the generalization propagation\nwithout impeding the propagation of RL learning signals. Theoretically, DMG\nguarantees better performance than the in-sample optimal policy in the oracle\ngeneralization scenario. Even under worst-case generalization, DMG can still\ncontrol value overestimation at a certain level and lower bound the\nperformance. Empirically, DMG achieves state-of-the-art performance across\nGym-MuJoCo locomotion tasks and challenging AntMaze tasks. 
Moreover, benefiting\nfrom its flexibility in both generalization aspects, DMG enjoys a seamless\ntransition from offline to online learning and attains strong online\nfine-tuning performance.\n","authors":["Yixiu Mao","Qi Wang","Yun Qu","Yuhang Jiang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2411.07934v2.pdf","comment":"Accepted to NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08367v1","updated":"2024-11-13T06:32:17Z","published":"2024-11-13T06:32:17Z","title":"Surprisingly Popular Voting for Concentric Rank-Order Models","summary":" An important problem on social information sites is the recovery of ground\ntruth from individual reports when the experts are in the minority. The wisdom\nof the crowd, i.e. the collective opinion of a group of individuals fails in\nsuch a scenario. However, the surprisingly popular (SP)\nalgorithm~\\cite{prelec2017solution} can recover the ground truth even when the\nexperts are in the minority, by asking the individuals to report additional\nprediction reports--their beliefs about the reports of others. Several recent\nworks have extended the surprisingly popular algorithm to an equivalent voting\nrule (SP-voting) to recover the ground truth ranking over a set of $m$\nalternatives. However, we are yet to fully understand when SP-voting can\nrecover the ground truth ranking, and if so, how many samples (votes and\npredictions) it needs. We answer this question by proposing two rank-order\nmodels and analyzing the sample complexity of SP-voting under these models. In\nparticular, we propose concentric mixtures of Mallows and Plackett-Luce models\nwith $G (\\ge 2)$ groups. Our models generalize previously proposed concentric\nmixtures of Mallows models with $2$ groups, and we highlight the importance of\n$G > 2$ groups by identifying three distinct groups (expert, intermediate, and\nnon-expert) from existing datasets. 
Next, we provide conditions on the\nparameters of the underlying models so that SP-voting can recover ground-truth\nrankings with high probability, and also derive sample complexities under the\nsame. We complement the theoretical results by evaluating SP-voting on\nsimulated and real datasets.\n","authors":["Hadi Hosseini","Debmalya Mandal","Amrit Puhan"],"pdf_url":"https://arxiv.org/pdf/2411.08367v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08360v1","updated":"2024-11-13T06:16:12Z","published":"2024-11-13T06:16:12Z","title":"Coverage Analysis for Digital Cousin Selection -- Improving\n Multi-Environment Q-Learning","summary":" Q-learning is widely employed for optimizing various large-dimensional\nnetworks with unknown system dynamics. Recent advancements include\nmulti-environment mixed Q-learning (MEMQ) algorithms, which utilize multiple\nindependent Q-learning algorithms across multiple, structurally related but\ndistinct environments and outperform several state-of-the-art Q-learning\nalgorithms in terms of accuracy, complexity, and robustness. We herein conduct\na comprehensive probabilistic coverage analysis to ensure optimal data coverage\nconditions for MEMQ algorithms. First, we derive upper and lower bounds on the\nexpectation and variance of different coverage coefficients (CC) for MEMQ\nalgorithms. Leveraging these bounds, we develop a simple way of comparing the\nutilities of multiple environments in MEMQ algorithms. This approach appears to\nbe near optimal versus our previously proposed partial ordering approach. We\nalso present a novel CC-based MEMQ algorithm to improve the accuracy and\ncomplexity of existing MEMQ algorithms. Numerical experiments are conducted\nusing random network graphs with four different graph properties. Our algorithm\ncan reduce the average policy error (APE) by 65% compared to partial ordering\nand is 95% faster than the exhaustive search. 
It also achieves 60% less APE\nthan several state-of-the-art reinforcement learning and prior MEMQ algorithms.\nAdditionally, we numerically verify the theoretical results and show their\nscalability with the action-space size.\n","authors":["Talha Bozkus","Tara Javidi","Urbashi Mitra"],"pdf_url":"https://arxiv.org/pdf/2411.08360v1.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2409.18696v2","updated":"2024-11-13T06:01:51Z","published":"2024-09-27T12:34:08Z","title":"Rethinking the Power of Timestamps for Robust Time Series Forecasting: A\n Global-Local Fusion Perspective","summary":" Time series forecasting has played a pivotal role across various industries,\nincluding finance, transportation, energy, healthcare, and climate. Due to the\nabundant seasonal information they contain, timestamps possess the potential to\noffer robust global guidance for forecasting techniques. However, existing\nworks primarily focus on local observations, with timestamps being treated\nmerely as an optional supplement that remains underutilized. When data gathered\nfrom the real world is polluted, the absence of global information will damage\nthe robust prediction capability of these algorithms. To address these\nproblems, we propose a novel framework named GLAFF. Within this framework, the\ntimestamps are modeled individually to capture the global dependencies. Working\nas a plugin, GLAFF adaptively adjusts the combined weights for global and local\ninformation, enabling seamless collaboration with any time series forecasting\nbackbone. 
Extensive experiments conducted on nine real-world datasets\ndemonstrate that GLAFF significantly enhances the average performance of widely\nused mainstream forecasting models by 12.5%, surpassing the previous\nstate-of-the-art method by 5.5%.\n","authors":["Chengsen Wang","Qi Qi","Jingyu Wang","Haifeng Sun","Zirui Zhuang","Jinming Wu","Jianxin Liao"],"pdf_url":"https://arxiv.org/pdf/2409.18696v2.pdf","comment":"Accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.08355v1","updated":"2024-11-13T05:59:04Z","published":"2024-11-13T05:59:04Z","title":"Communication Efficient Decentralization for Smoothed Online Convex\n Optimization","summary":" We study the multi-agent Smoothed Online Convex Optimization (SOCO) problem,\nwhere $N$ agents interact through a communication graph. In each round, each\nagent $i$ receives a strongly convex hitting cost function $f^i_t$ in an online\nfashion and selects an action $x^i_t \\in \\mathbb{R}^d$. The objective is to\nminimize the global cumulative cost, which includes the sum of individual\nhitting costs $f^i_t(x^i_t)$, a temporal \"switching cost\" for changing\ndecisions, and a spatial \"dissimilarity cost\" that penalizes deviations in\ndecisions among neighboring agents. We propose the first decentralized\nalgorithm for multi-agent SOCO and prove its asymptotic optimality. Our\napproach allows each agent to operate using only local information from its\nimmediate neighbors in the graph. For finite-time performance, we establish\nthat the optimality gap in competitive ratio decreases with the time horizon\n$T$ and can be conveniently tuned based on the per-round computation available\nto each agent. Moreover, our results hold even when the communication graph\nchanges arbitrarily and adaptively over time. 
Finally, we establish that the\ncomputational complexity per round depends only logarithmically on the number\nof agents and almost linearly on their degree within the graph, ensuring\nscalability for large-system implementations.\n","authors":["Neelkamal Bhuyan","Debankur Mukherjee","Adam Wierman"],"pdf_url":"https://arxiv.org/pdf/2411.08355v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2411.04493v2","updated":"2024-11-13T05:52:23Z","published":"2024-11-07T07:41:04Z","title":"Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised\n Medical Image Segmentation","summary":" Semi-supervised learning has received considerable attention for its\npotential to leverage abundant unlabeled data to enhance model robustness.\nPseudo labeling is a widely used strategy in semi supervised learning. However,\nexisting methods often suffer from noise contamination, which can undermine\nmodel performance. To tackle this challenge, we introduce a novel\nSynergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework.\nBuilt upon the mean teacher network, we employ a Mix Augmentation module to\nenhance the unlabeled data. By evaluating the synergy before and after\naugmentation, we strategically partition the pseudo labels into distinct\nregions. Additionally, we introduce a Region Loss Evaluation module to assess\nthe loss across each delineated area. 
Extensive experiments conducted on the LA\ndataset have demonstrated superior performance over state-of-the-art\ntechniques, underscoring the efficiency and practicality of our framework.\n","authors":["Tao Wang","Xinlin Zhang","Yuanbin Chen","Yuanbo Zhou","Longxuan Zhao","Tao Tan","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2411.04493v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08344v1","updated":"2024-11-13T05:22:45Z","published":"2024-11-13T05:22:45Z","title":"Bangla Grammatical Error Detection Leveraging Transformer-based Token\n Classification","summary":" Bangla is the seventh most spoken language by a total number of speakers in\nthe world, and yet the development of an automated grammar checker in this\nlanguage is an understudied problem. Bangla grammatical error detection is a\ntask of detecting sub-strings of a Bangla text that contain grammatical,\npunctuation, or spelling errors, which is crucial for developing an automated\nBangla typing assistant. Our approach involves breaking down the task as a\ntoken classification problem and utilizing state-of-the-art transformer-based\nmodels. Finally, we combine the output of these models and apply rule-based\npost-processing to generate a more reliable and comprehensive result. Our\nsystem is evaluated on a dataset consisting of over 25,000 texts from various\nsources. Our best model achieves a Levenshtein distance score of 1.04. 
Finally,\nwe provide a detailed analysis of different components of our system.\n","authors":["Shayekh Bin Islam","Ridwanul Hasan Tanvir","Sihat Afnan"],"pdf_url":"https://arxiv.org/pdf/2411.08344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03497v3","updated":"2024-11-13T04:41:31Z","published":"2024-08-07T01:37:10Z","title":"Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and\n Tabnet with SMOTEENN","summary":" Bank credit risk is a significant challenge in modern financial transactions,\nand the ability to identify qualified credit card holders among a large number\nof applicants is crucial for the profitability of a bank'sbank's credit card\nbusiness. In the past, screening applicants'applicants' conditions often\nrequired a significant amount of manual labor, which was time-consuming and\nlabor-intensive. Although the accuracy and reliability of previously used ML\nmodels have been continuously improving, the pursuit of more reliable and\npowerful AI intelligent models is undoubtedly the unremitting pursuit by major\nbanks in the financial industry. In this study, we used a dataset of over\n40,000 records provided by a commercial bank as the research object. We\ncompared various dimensionality reduction techniques such as PCA and T-SNE for\npreprocessing high-dimensional datasets and performed in-depth adaptation and\ntuning of distributed models such as LightGBM and XGBoost, as well as deep\nmodels like Tabnet. After a series of research and processing, we obtained\nexcellent research results by combining SMOTEENN with these techniques. 
The\nexperiments demonstrated that LightGBM combined with PCA and SMOTEENN\ntechniques can assist banks in accurately predicting potential high-quality\ncustomers, showing relatively outstanding performance compared to other models.\n","authors":["Chang Yu","Yixin Jin","Qianwen Xing","Ye Zhang","Shaobo Guo","Shuchen Meng"],"pdf_url":"https://arxiv.org/pdf/2408.03497v3.pdf","comment":"8 pagess on IEEE ICPICS"},{"id":"http://arxiv.org/abs/2411.08332v1","updated":"2024-11-13T04:27:25Z","published":"2024-11-13T04:27:25Z","title":"Learning-Augmented Algorithms for Online Concave Packing and Convex\n Covering Problems","summary":" Learning-augmented algorithms have been extensively studied across the\ncomputer science community in the recent years, driven by advances in machine\nlearning predictors, which can provide additional information to augment\nclassical algorithms. Such predictions are especially powerful in the context\nof online problems, where decisions have to be made without knowledge of the\nfuture, and which traditionally exhibits impossibility results bounding the\nperformance of any online algorithm. The study of learning-augmented algorithms\nthus aims to use external advice prudently, to overcome classical impossibility\nresults when the advice is accurate, and still perform comparably to the\nstate-of-the-art online algorithms even when the advice is inaccurate.\n In this paper, we present learning-augmented algorithmic frameworks for two\nfundamental optimizations settings, extending and generalizing prior works. For\nonline packing with concave objectives, we present a simple but overarching\nstrategy that switches between the advice and the state-of-the-art online\nalgorithm. For online covering with convex objectives, we greatly extend\nprimal-dual methods for online convex covering programs by Azar et al. (FOCS\n2016) and previous learning-augmented framework for online covering linear\nprograms from the literature, to many new applications. 
We show that our\nalgorithms break impossibility results when the advice is accurate, while\nmaintaining comparable performance with state-of-the-art classical online\nalgorithms even when the advice is erroneous.\n","authors":["Elena Grigorescu","Young-San Lin","Maoyuan Song"],"pdf_url":"https://arxiv.org/pdf/2411.08332v1.pdf","comment":"38 pages. In submission"},{"id":"http://arxiv.org/abs/2411.08326v1","updated":"2024-11-13T04:20:29Z","published":"2024-11-13T04:20:29Z","title":"Neural Conjugate Flows: Physics-informed architectures with flow\n structure","summary":" We introduce Neural Conjugate Flows (NCF), a class of neural network\narchitectures equipped with exact flow structure. By leveraging topological\nconjugation, we prove that these networks are not only naturally isomorphic to\na continuous group, but are also universal approximators for flows of ordinary\ndifferential equation (ODEs). Furthermore, topological properties of these\nflows can be enforced by the architecture in an interpretable manner. We\ndemonstrate in numerical experiments how this topological group structure leads\nto concrete computational gains over other physics informed neural networks in\nestimating and extrapolating latent dynamics of ODEs, while training up to five\ntimes faster than other flow-based architectures.\n","authors":["Arthur Bizzi","Lucas Nissenbaum","João M. Pereira"],"pdf_url":"https://arxiv.org/pdf/2411.08326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08324v1","updated":"2024-11-13T04:20:20Z","published":"2024-11-13T04:20:20Z","title":"Are LLMs Prescient? A Continuous Evaluation using Daily News as the\n Oracle","summary":" Many existing evaluation benchmarks for Large Language Models (LLMs) quickly\nbecome outdated due to the emergence of new models and training data. These\nbenchmarks also fall short in assessing how LLM performance changes over time,\nas they consist of static questions without a temporal dimension. 
To address\nthese limitations, we propose using future event prediction as a continuous\nevaluation method to assess LLMs' temporal generalization and forecasting\nabilities. Our benchmark, Daily Oracle, automatically generates question-answer\n(QA) pairs from daily news, challenging LLMs to predict \"future\" event\noutcomes. Our findings reveal that as pre-training data becomes outdated, LLM\nperformance degrades over time. While Retrieval Augmented Generation (RAG) has\nthe potential to enhance prediction accuracy, the performance degradation\npattern persists, highlighting the need for continuous model updates.\n","authors":["Hui Dai","Ryan Teehan","Mengye Ren"],"pdf_url":"https://arxiv.org/pdf/2411.08324v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07207v2","updated":"2024-11-13T04:15:38Z","published":"2024-11-11T18:32:44Z","title":"General Geospatial Inference with a Population Dynamics Foundation Model","summary":" Supporting the health and well-being of dynamic populations around the world\nrequires governmental agencies, organizations and researchers to understand and\nreason over complex relationships between human behavior and local contexts in\norder to identify high-risk groups and strategically allocate limited\nresources. Traditional approaches to these classes of problems often entail\ndeveloping manually curated, task-specific features and models to represent\nhuman behavior and the natural and built environment, which can be challenging\nto adapt to new, or even, related tasks. To address this, we introduce a\nPopulation Dynamics Foundation Model (PDFM) that aims to capture the\nrelationships between diverse data modalities and is applicable to a broad\nrange of geospatial tasks. 
We first construct a geo-indexed dataset for postal\ncodes and counties across the United States, capturing rich aggregated\ninformation on human behavior from maps, busyness, and aggregated search\ntrends, and environmental factors such as weather and air quality. We then\nmodel this data and the complex relationships between locations using a graph\nneural network, producing embeddings that can be adapted to a wide range of\ndownstream tasks using relatively simple models. We evaluate the effectiveness\nof our approach by benchmarking it on 27 downstream tasks spanning three\ndistinct domains: health indicators, socioeconomic factors, and environmental\nmeasurements. The approach achieves state-of-the-art performance on all 27\ngeospatial interpolation tasks, and on 25 out of the 27 extrapolation and\nsuper-resolution tasks. We combined the PDFM with a state-of-the-art\nforecasting foundation model, TimesFM, to predict unemployment and poverty,\nachieving performance that surpasses fully supervised forecasting. 
The full set\nof embeddings and sample code are publicly available for researchers.\n","authors":["Mohit Agarwal","Mimi Sun","Chaitanya Kamath","Arbaaz Muslim","Prithul Sarker","Joydeep Paul","Hector Yee","Marcin Sieniek","Kim Jablonski","Yael Mayer","David Fork","Sheila de Guia","Jamie McPike","Adam Boulanger","Tomer Shekel","David Schottlander","Yao Xiao","Manjit Chakravarthy Manukonda","Yun Liu","Neslihan Bulut","Sami Abu-el-haija","Arno Eigenwillig","Parth Kothari","Bryan Perozzi","Monica Bharel","Von Nguyen","Luke Barrington","Niv Efron","Yossi Matias","Greg Corrado","Krish Eswaran","Shruthi Prabhakara","Shravya Shetty","Gautam Prasad"],"pdf_url":"https://arxiv.org/pdf/2411.07207v2.pdf","comment":"28 pages, 16 figures, preprint; v2: updated github url"},{"id":"http://arxiv.org/abs/2411.08314v1","updated":"2024-11-13T03:42:55Z","published":"2024-11-13T03:42:55Z","title":"Conditional Variable Flow Matching: Transforming Conditional Densities\n with Amortized Conditional Optimal Transport","summary":" Forecasting stochastic nonlinear dynamical systems under the influence of\nconditioning variables is a fundamental challenge repeatedly encountered across\nthe biological and physical sciences. While flow-based models can impressively\npredict the temporal evolution of probability distributions representing\npossible outcomes of a specific process, existing frameworks cannot\nsatisfactorily account for the impact of conditioning variables on these\ndynamics. Amongst several limitations, existing methods require training data\nwith paired conditions and are developed for discrete conditioning variables.\nWe propose Conditional Variable Flow Matching (CVFM), a framework for learning\nflows transforming conditional distributions with amortization across\ncontinuous conditioning variables - permitting predictions across the\nconditional density manifold. 
This is accomplished through several novel\nadvances, in particular, simultaneous sample conditioned flows over the main\nand conditioning variables, alongside a conditional Wasserstein distance and\nkernel facilitating conditional optimal transport. Collectively, these advances\nallow for learning system dynamics provided measurement data whose states and\nconditioning variables are not in correspondence. We demonstrate CVFM on a\nsuite of increasingly challenging problems, including discrete and continuous\nconditional mapping benchmarks, image-to-image domain transfer, and modeling\nthe temporal evolution of materials internal structure during manufacturing\nprocesses. We observe that CVFM results in improved performance and convergence\ncharacteristics over alternative conditional variants.\n","authors":["Adam P. Generale","Andreas E. Robertson","Surya R. Kalidindi"],"pdf_url":"https://arxiv.org/pdf/2411.08314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08621v3","updated":"2024-11-13T03:24:24Z","published":"2024-02-13T17:42:27Z","title":"A Unified Framework for Analyzing Meta-algorithms in Online Convex\n Optimization","summary":" In this paper, we analyze the problem of online convex optimization in\ndifferent settings, including different feedback types\n(full-information/semi-bandit/bandit/etc) in either stochastic or\nnon-stochastic setting and different notions of regret (static adversarial\nregret/dynamic regret/adaptive regret). This is done through a framework which\nallows us to systematically propose and analyze meta-algorithms for the various\nsettings described above. We show that any algorithm for online linear\noptimization with fully adaptive adversaries is an algorithm for online convex\noptimization. We also show that any such algorithm that requires\nfull-information feedback may be transformed to an algorithm with semi-bandit\nfeedback with comparable regret bound. 
We further show that algorithms that are\ndesigned for fully adaptive adversaries using deterministic semi-bandit\nfeedback can obtain similar bounds using only stochastic semi-bandit feedback\nwhen facing oblivious adversaries. We use this to describe general\nmeta-algorithms to convert first order algorithms to zeroth order algorithms\nwith comparable regret bounds. Our framework allows us to analyze online\noptimization in various settings, recovers several results in the literature\nwith a simplified proof technique, and provides new results.\n","authors":["Mohammad Pedramfar","Vaneet Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2402.08621v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08306v1","updated":"2024-11-13T03:08:33Z","published":"2024-11-13T03:08:33Z","title":"SDDBench: A Benchmark for Synthesizable Drug Design","summary":" A significant challenge in wet lab experiments with current drug design\ngenerative models is the trade-off between pharmacological properties and\nsynthesizability. Molecules predicted to have highly desirable properties are\noften difficult to synthesize, while those that are easily synthesizable tend\nto exhibit less favorable properties. As a result, evaluating the\nsynthesizability of molecules in general drug design scenarios remains a\nsignificant challenge in the field of drug discovery. The commonly used\nsynthetic accessibility (SA) score aims to evaluate the ease of synthesizing\ngenerated molecules, but it falls short of guaranteeing that synthetic routes\ncan actually be found. Inspired by recent advances in top-down synthetic route\ngeneration, we propose a new, data-driven metric to evaluate molecule\nsynthesizability. Our approach directly assesses the feasibility of synthetic\nroutes for a given molecule through our proposed round-trip score. 
This novel\nmetric leverages the synergistic duality between retrosynthetic planners and\nreaction predictors, both of which are trained on extensive reaction datasets.\nTo demonstrate the efficacy of our method, we conduct a comprehensive\nevaluation of round-trip scores alongside search success rate across a range of\nrepresentative molecule generative models. Code is available at\nhttps://github.com/SongtaoLiu0823/SDDBench.\n","authors":["Songtao Liu","Zhengkai Tu","Hanjun Dai","Peng Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08306v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07468v2","updated":"2024-11-13T03:07:36Z","published":"2024-11-12T01:09:52Z","title":"Privacy-Preserving Verifiable Neural Network Inference Service","summary":" Machine learning has revolutionized data analysis and pattern recognition,\nbut its resource-intensive training has limited accessibility. Machine Learning\nas a Service (MLaaS) simplifies this by enabling users to delegate their data\nsamples to an MLaaS provider and obtain the inference result using a\npre-trained model. Despite its convenience, leveraging MLaaS poses significant\nprivacy and reliability concerns to the client. Specifically, sensitive\ninformation from the client inquiry data can be leaked to an adversarial MLaaS\nprovider. Meanwhile, the lack of a verifiability guarantee can potentially\nresult in biased inference results or even unfair payment issues. While\nexisting trustworthy machine learning techniques, such as those relying on\nverifiable computation or secure computation, offer solutions to privacy and\nreliability concerns, they fall short of simultaneously protecting the privacy\nof client data and providing provable inference verifiability.\n In this paper, we propose vPIN, a privacy-preserving and verifiable CNN\ninference scheme that preserves privacy for client data samples while ensuring\nverifiability for the inference. 
vPIN makes use of partial homomorphic\nencryption and commit-and-prove succinct non-interactive argument of knowledge\ntechniques to achieve desirable security properties. In vPIN, we develop\nvarious optimization techniques to minimize the proving circuit for homomorphic\ninference evaluation thereby, improving the efficiency and performance of our\ntechnique. We fully implemented and evaluated our vPIN scheme on standard\ndatasets (e.g., MNIST, CIFAR-10). Our experimental results show that vPIN\nachieves high efficiency in terms of proving time, verification time, and proof\nsize, while providing client data privacy guarantees and provable\nverifiability.\n","authors":["Arman Riasi","Jorge Guajardo","Thang Hoang"],"pdf_url":"https://arxiv.org/pdf/2411.07468v2.pdf","comment":"Accepted at the Annual Computer Security Applications Conference\n (ACSAC) 2024. Source code: github.com/vt-asaplab/vPIN"},{"id":"http://arxiv.org/abs/2411.07954v2","updated":"2024-11-13T02:56:56Z","published":"2024-11-12T17:30:31Z","title":"Learning Memory Mechanisms for Decision Making through Demonstrations","summary":" In Partially Observable Markov Decision Processes, integrating an agent's\nhistory into memory poses a significant challenge for decision-making.\nTraditional imitation learning, relying on observation-action pairs for expert\ndemonstrations, fails to capture the expert's memory mechanisms used in\ndecision-making. To capture memory processes as demonstrations, we introduce\nthe concept of memory dependency pairs $(p, q)$ indicating that events at time\n$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner\nto leverage memory dependency pairs in Transformers and find significant\nimprovements across several tasks compared to standard Transformers when\nevaluated on Memory Gym and the Long-term Memory Benchmark. 
Code is available\nat https://github.com/WilliamYue37/AttentionTuner.\n","authors":["William Yue","Bo Liu","Peter Stone"],"pdf_url":"https://arxiv.org/pdf/2411.07954v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07463v2","updated":"2024-11-13T02:39:12Z","published":"2024-11-12T00:54:26Z","title":"MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation\n Models, Convolutional Neural Networks, and Uncertainty Quantification for\n High-Speed Video Phase Detection Data","summary":" Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in\nnuclear reactors, chemical processing, and electronics cooling for detecting\nvapor, liquid, and microlayer phases. Traditional segmentation models face\npixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ\nintroduces VideoSAM, a hybrid framework leveraging convolutional neural\nnetworks (CNNs) and transformer-based vision models to enhance segmentation\naccuracy and generalizability across complex multimodal PD tasks. Methods:\nVideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced\nfeature extraction and segmentation across diverse HSV PD modalities, spanning\nfluids like water, FC-72, nitrogen, and argon under varied heat flux\nconditions. The framework also incorporates uncertainty quantification (UQ) to\nassess pixel-based discretization errors, delivering reliable metrics such as\ncontact line density and dry area fraction under experimental conditions.\nResults: VideoSAM outperforms SAM and modality-specific CNN models in\nsegmentation accuracy, excelling in environments with complex phase boundaries,\noverlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid\narchitecture supports cross-dataset generalization, adapting effectively to\nvarying modalities. 
The UQ module provides accurate error estimates, enhancing\nthe reliability of segmentation outputs for advanced HSV PD research.\nConclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD\nsegmentation, addressing previous limitations with advanced deep learning and\nUQ techniques. The open-source datasets and tools introduced enable scalable,\nprecise, and adaptable segmentation for multimodal PD datasets, supporting\nadvancements in HSV analysis and autonomous experimentation. The codes and data\nused for this paper are publicly available at:\n\\url{https://github.com/chikap421/mseg_vcuq}\n","authors":["Chika Maduabuchi","Ericmoore Jossou","Matteo Bucci"],"pdf_url":"https://arxiv.org/pdf/2411.07463v2.pdf","comment":"Under Review in EAAI"},{"id":"http://arxiv.org/abs/2411.07249v2","updated":"2024-11-13T02:38:02Z","published":"2024-10-26T21:27:53Z","title":"SPDIM: Source-Free Unsupervised Conditional and Label Shift Adaptation\n in EEG","summary":" The non-stationary nature of electroencephalography (EEG) introduces\ndistribution shifts across domains (e.g., days and subjects), posing a\nsignificant challenge to EEG-based neurotechnology generalization. Without\nlabeled calibration data for target domains, the problem is a source-free\nunsupervised domain adaptation (SFUDA) problem. For scenarios with constant\nlabel distribution, Riemannian geometry-aware statistical alignment frameworks\non the symmetric positive definite (SPD) manifold are considered\nstate-of-the-art. However, many practical scenarios, including EEG-based sleep\nstaging, exhibit label shifts. Here, we propose a geometric deep learning\nframework for SFUDA problems under specific distribution shifts, including\nlabel shifts. We introduce a novel, realistic generative model and show that\nprior Riemannian statistical alignment methods on the SPD manifold can\ncompensate for specific marginal and conditional distribution shifts but hurt\ngeneralization under label shifts. 
As a remedy, we propose a\nparameter-efficient manifold optimization strategy termed SPDIM. SPDIM uses the\ninformation maximization principle to learn a single SPD-manifold-constrained\nparameter per target domain. In simulations, we demonstrate that SPDIM can\ncompensate for the shifts under our generative model. Moreover, using public\nEEG-based brain-computer interface and sleep staging datasets, we show that\nSPDIM outperforms prior approaches.\n","authors":["Shanglin Li","Motoaki Kawanabe","Reinmar J. Kobler"],"pdf_url":"https://arxiv.org/pdf/2411.07249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08297v1","updated":"2024-11-13T02:32:38Z","published":"2024-11-13T02:32:38Z","title":"TowerDebias: A Novel Debiasing Method based on the Tower Property","summary":" Decision-making processes have increasingly come to rely on sophisticated\nmachine learning tools, raising concerns about the fairness of their\npredictions with respect to any sensitive groups. The widespread use of\ncommercial black-box machine learning models necessitates careful consideration\nof their legal and ethical implications on consumers. In situations where users\nhave access to these \"black-box\" models, a key question emerges: how can we\nmitigate or eliminate the influence of sensitive attributes, such as race or\ngender? We propose towerDebias (tDB), a novel approach designed to reduce the\ninfluence of sensitive variables in predictions made by black-box models. Using\nthe Tower Property from probability theory, tDB aims to improve prediction\nfairness during the post-processing stage in a manner amenable to the\nFairness-Utility Tradeoff. This method is highly flexible, requiring no prior\nknowledge of the original model's internal structure, and can be extended to a\nrange of different applications. 
We provide a formal improvement theorem for\ntDB and demonstrate its effectiveness in both regression and classification\ntasks, underscoring its impact on the fairness-utility tradeoff.\n","authors":["Norman Matloff","Aditya Mittal"],"pdf_url":"https://arxiv.org/pdf/2411.08297v1.pdf","comment":"To be submitted to a journal soon"},{"id":"http://arxiv.org/abs/2411.08290v1","updated":"2024-11-13T02:17:03Z","published":"2024-11-13T02:17:03Z","title":"RESOLVE: Relational Reasoning with Symbolic and Object-Level Features\n Using Vector Symbolic Processing","summary":" Modern transformer-based encoder-decoder architectures struggle with\nreasoning tasks due to their inability to effectively extract relational\ninformation between input objects (data/tokens). Recent work introduced the\nAbstractor module, embedded between transformer layers, to address this gap.\nHowever, the Abstractor layer while excelling at capturing relational\ninformation (pure relational reasoning), faces challenges in tasks that require\nboth object and relational-level reasoning (partial relational reasoning). To\naddress this, we propose RESOLVE, a neuro-vector symbolic architecture that\ncombines object-level features with relational representations in\nhigh-dimensional spaces, using fast and efficient operations such as bundling\n(summation) and binding (Hadamard product) allowing both object-level features\nand relational representations to coexist within the same structure without\ninterfering with one another. RESOLVE is driven by a novel attention mechanism\nthat operates in a bipolar high dimensional space, allowing fast attention\nscore computation compared to the state-of-the-art. By leveraging this design,\nthe model achieves both low compute latency and memory efficiency. 
RESOLVE also\noffers better generalizability while achieving higher accuracy in purely\nrelational reasoning tasks such as sorting as well as partial relational\nreasoning tasks such as math problem-solving compared to state-of-the-art\nmethods.\n","authors":["Mohamed Mejri","Chandramouli Amarnath","Abhijit Chatterjee"],"pdf_url":"https://arxiv.org/pdf/2411.08290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02349v2","updated":"2024-11-13T02:03:01Z","published":"2023-11-04T08:28:33Z","title":"Sample Complexity of Opinion Formation on Networks with Linear\n Regression Models","summary":" Consider public health officials aiming to spread awareness about a new\nvaccine in a community interconnected by a social network. How can they\ndistribute information with minimal resources, so as to avoid polarization and\nensure community-wide convergence of opinion? To tackle such challenges, we\ninitiate the study of sample complexity of opinion convergence in networks. Our\nframework is built on the recognized opinion formation game, where we regard\nthe opinion of each agent as a data-derived model, unlike previous works that\ntreat opinions as data-independent scalars. The opinion model for every agent\nis initially learned from its local samples and evolves game-theoretically as\nall agents communicate with neighbors and revise their models towards an\nequilibrium. Our focus is on the sample complexity needed to ensure that the\nopinions converge to an equilibrium such that the final model of every agent\nhas low generalization error.\n Our paper has two main technical results. First, we present a novel\npolynomial time optimization framework to quantify the total sample complexity\nfor arbitrary networks, when the underlying learning problem is (generalized)\nlinear regression. Second, we leverage this optimization to study the network\ngain which measures the improvement of sample complexity when learning over a\nnetwork compared to that in isolation. 
Towards this end, we derive network gain\nbounds for various network classes including cliques, star graphs, and random\nregular graphs. Additionally, our framework provides a method to study sample\ndistribution within the network, suggesting that it is sufficient to allocate\nsamples inversely to the degree. Empirical results on both synthetic and\nreal-world networks strongly support our theoretical findings.\n","authors":["Haolin Liu","Rajmohan Rajaraman","Ravi Sundaram","Anil Vullikanti","Omer Wasim","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.02349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08286v1","updated":"2024-11-13T02:02:52Z","published":"2024-11-13T02:02:52Z","title":"Hashing for Protein Structure Similarity Search","summary":" Protein structure similarity search (PSSS), which tries to search proteins\nwith similar structures, plays a crucial role across diverse domains from drug\ndesign to protein function prediction and molecular evolution. Traditional\nalignment-based PSSS methods, which directly calculate alignment on the protein\nstructures, are highly time-consuming with high memory cost. Recently,\nalignment-free methods, which represent protein structures as fixed-length\nreal-valued vectors, are proposed for PSSS. Although these methods have lower\ntime and memory cost than alignment-based methods, their time and memory cost\nis still too high for large-scale PSSS, and their accuracy is unsatisfactory.\nIn this paper, we propose a novel method, called\n$\\underline{\\text{p}}$r$\\underline{\\text{o}}$tein\n$\\underline{\\text{s}}$tructure $\\underline{\\text{h}}$ashing (POSH), for PSSS.\nPOSH learns a binary vector representation for each protein structure, which\ncan dramatically reduce the time and memory cost for PSSS compared with\nreal-valued vector representation based methods. 
Furthermore, in POSH we also\npropose expressive hand-crafted features and a structure encoder to well model\nboth node and edge interactions in proteins. Experimental results on real\ndatasets show that POSH can outperform other methods to achieve\nstate-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more\nthan six times and speed improvement of more than four times, compared with\nother methods.\n","authors":["Jin Han","Wu-Jun Li"],"pdf_url":"https://arxiv.org/pdf/2411.08286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12723v4","updated":"2024-11-13T01:45:11Z","published":"2024-06-18T15:45:21Z","title":"BIOSCAN-5M: A Multimodal Dataset for Insect Biodiversity","summary":" As part of an ongoing worldwide effort to comprehend and monitor insect\nbiodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine\nlearning community and establish several benchmark tasks. BIOSCAN-5M is a\ncomprehensive dataset containing multi-modal information for over 5 million\ninsect specimens, and it significantly expands existing image-based biological\ndatasets by including taxonomic labels, raw nucleotide barcode sequences,\nassigned barcode index numbers, geographical, and size information. We propose\nthree benchmark experiments to demonstrate the impact of the multi-modal data\ntypes on the classification and clustering accuracy. First, we pretrain a\nmasked language model on the DNA barcode sequences of the BIOSCAN-5M dataset,\nand demonstrate the impact of using this large reference library on species-\nand genus-level classification performance. Second, we propose a zero-shot\ntransfer learning task applied to images and DNA barcodes to cluster feature\nembeddings obtained from self-supervised learning, to investigate whether\nmeaningful clusters can be derived from these representation embeddings. Third,\nwe benchmark multi-modality by performing contrastive learning on DNA barcodes,\nimage data, and taxonomic information. 
This yields a general shared embedding\nspace enabling taxonomic classification using multiple types of information and\nmodalities. The code repository of the BIOSCAN-5M Insect dataset is available\nat https://github.com/bioscan-ml/BIOSCAN-5M.\n","authors":["Zahra Gharaee","Scott C. Lowe","ZeMing Gong","Pablo Millan Arias","Nicholas Pellegrino","Austin T. Wang","Joakim Bruslund Haurum","Iuliia Zarubiieva","Lila Kari","Dirk Steinke","Graham W. Taylor","Paul Fieguth","Angel X. Chang"],"pdf_url":"https://arxiv.org/pdf/2406.12723v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.05386v2","updated":"2024-11-13T01:40:53Z","published":"2024-05-08T19:31:06Z","title":"Interpretability Needs a New Paradigm","summary":" Interpretability is the study of explaining models in understandable terms to\nhumans. At present, interpretability is divided into two paradigms: the\nintrinsic paradigm, which believes that only models designed to be explained\ncan be explained, and the post-hoc paradigm, which believes that black-box\nmodels can be explained. At the core of this debate is how each paradigm\nensures its explanations are faithful, i.e., true to the model's behavior. This\nis important, as false but convincing explanations lead to unsupported\nconfidence in artificial intelligence (AI), which can be dangerous. This\npaper's position is that we should think about new paradigms while staying\nvigilant regarding faithfulness. First, by examining the history of paradigms\nin science, we see that paradigms are constantly evolving. Then, by examining\nthe current paradigms, we can understand their underlying beliefs, the value\nthey bring, and their limitations. Finally, this paper presents 3 emerging\nparadigms for interpretability. The first paradigm designs models such that\nfaithfulness can be easily measured. Another optimizes models such that\nexplanations become faithful. 
The last paradigm proposes to develop models that\nproduce both a prediction and an explanation.\n","authors":["Andreas Madsen","Himabindu Lakkaraju","Siva Reddy","Sarath Chandar"],"pdf_url":"https://arxiv.org/pdf/2405.05386v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.14798v2","updated":"2024-11-13T01:36:33Z","published":"2024-06-21T00:16:55Z","title":"Probabilistic Emulation of a Global Climate Model with Spherical\n DYffusion","summary":" Data-driven deep learning models are transforming global weather forecasting.\nIt is an open question if this success can extend to climate modeling, where\nthe complexity of the data and long inference rollouts pose significant\nchallenges. Here, we present the first conditional generative model that\nproduces accurate and physically consistent global climate ensemble simulations\nby emulating a coarse version of the United States' primary operational global\nforecast model, FV3GFS. Our model integrates the dynamics-informed diffusion\nframework (DYffusion) with the Spherical Fourier Neural Operator (SFNO)\narchitecture, enabling stable 100-year simulations at 6-hourly timesteps while\nmaintaining low computational overhead compared to single-step deterministic\nbaselines. The model achieves near gold-standard performance for climate model\nemulation, outperforming existing approaches and demonstrating promising\nensemble skill. This work represents a significant advance towards efficient,\ndata-driven climate simulations that can enhance our understanding of the\nclimate system and inform adaptation strategies.\n","authors":["Salva Rühling Cachay","Brian Henn","Oliver Watt-Meyer","Christopher S. 
Bretherton","Rose Yu"],"pdf_url":"https://arxiv.org/pdf/2406.14798v2.pdf","comment":"NeurIPS 2024; Code is available at\n https://github.com/Rose-STL-Lab/spherical-dyffusion"},{"id":"http://arxiv.org/abs/2411.08267v1","updated":"2024-11-13T00:42:40Z","published":"2024-11-13T00:42:40Z","title":"Least Squares Training of Quadratic Convolutional Neural Networks with\n Applications to System Theory","summary":" This paper provides a least squares formulation for the training of a 2-layer\nconvolutional neural network using quadratic activation functions, a 2-norm\nloss function, and no regularization term. Using this method, an analytic\nexpression for the globally optimal weights is obtained alongside a quadratic\ninput-output equation for the network. These properties make the network a\nviable tool in system theory by enabling further analysis, such as the\nsensitivity of the output to perturbations in the input, which is crucial for\nsafety-critical systems such as aircraft or autonomous vehicles.The least\nsquares method is compared to previously proposed strategies for training\nquadratic networks and to a back-propagation-trained ReLU network. The proposed\nmethod is applied to a system identification problem and a GPS position\nestimation problem. The least squares network is shown to have a significantly\nreduced training time with minimal compromises on prediction accuracy alongside\nthe advantages of having an analytic input-output equation. 
Although these\nresults only apply to 2-layer networks, this paper motivates the exploration of\ndeeper quadratic networks in the context of system theory.\n","authors":["Zachary Yetman Van Egmond","Luis Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2411.08267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09995v2","updated":"2024-11-13T00:41:01Z","published":"2024-04-15T17:59:57Z","title":"Taming Latent Diffusion Model for Neural Radiance Field Inpainting","summary":" Neural Radiance Field (NeRF) is a representation for 3D reconstruction from\nmulti-view images. Despite some recent work showing preliminary success in\nediting a reconstructed NeRF with diffusion prior, they remain struggling to\nsynthesize reasonable geometry in completely uncovered regions. One major\nreason is the high diversity of synthetic contents from the diffusion model,\nwhich hinders the radiance field from converging to a crisp and deterministic\ngeometry. Moreover, applying latent diffusion models on real data often yields\na textural shift incoherent to the image condition due to auto-encoding errors.\nThese two problems are further reinforced with the use of pixel-distance\nlosses. To address these issues, we propose tempering the diffusion model's\nstochasticity with per-scene customization and mitigating the textural shift\nwith masked adversarial training. During the analyses, we also found the\ncommonly used pixel and perceptual losses are harmful in the NeRF inpainting\ntask. Through rigorous experiments, our framework yields state-of-the-art NeRF\ninpainting results on various real-world scenes. Project page:\nhttps://hubert0527.github.io/MALD-NeRF\n","authors":["Chieh Hubert Lin","Changil Kim","Jia-Bin Huang","Qinbo Li","Chih-Yao Ma","Johannes Kopf","Ming-Hsuan Yang","Hung-Yu Tseng"],"pdf_url":"https://arxiv.org/pdf/2404.09995v2.pdf","comment":"Accepted to ECCV 2024. 
Project page:\n https://hubert0527.github.io/MALD-NeRF"},{"id":"http://arxiv.org/abs/2402.17457v2","updated":"2024-11-13T00:38:48Z","published":"2024-02-27T12:28:01Z","title":"Super Consistency of Neural Network Landscapes and Learning Rate\n Transfer","summary":" Recently, there has been growing evidence that if the width and depth of a\nneural network are scaled toward the so-called rich feature learning limit\n(\\mup and its depth extension), then some hyperparameters -- such as the\nlearning rate -- exhibit transfer from small to very large models. From an\noptimization perspective, this phenomenon is puzzling, as it implies that the\nloss landscape is consistently similar across very different model sizes. In\nthis work, we study the landscape through the lens of the loss Hessian, with a\nfocus on its largest eigenvalue (i.e. the sharpness), and find that certain\nspectral properties under $\\mu$P are largely independent of the size of the\nnetwork, and remain consistent as training progresses. We name this property\nSuper Consistency of the landscape. On the other hand, we show that in the\nNeural Tangent Kernel (NTK) and other scaling regimes, the sharpness exhibits\nvery different dynamics at different scales. But what causes these differences\nin the sharpness dynamics? Through a connection between the Hessian's and the\nNTK's spectrum, we argue that the cause lies in the presence (for $\\mu$P) or\nprogressive absence (for the NTK scaling) of feature learning. We corroborate\nour claims with a substantial suite of experiments, covering a wide range of\ndatasets and architectures: from ResNets and Vision Transformers trained on\nbenchmark vision datasets to Transformers-based language models trained on\nWikiText.\n","authors":["Lorenzo Noci","Alexandru Meterez","Thomas Hofmann","Antonio Orvieto"],"pdf_url":"https://arxiv.org/pdf/2402.17457v2.pdf","comment":"The paper has been accepted at Neurips 2024. 
This is a revised\n version of the paper previously titled \"Why do Learning Rates Transfer?\n Reconciling Optimization and Scaling Limits for Deep Learning\""},{"id":"http://arxiv.org/abs/2410.01272v2","updated":"2024-11-13T00:19:34Z","published":"2024-10-02T06:30:49Z","title":"\"No Matter What You Do\": Purifying GNN Models via Backdoor Unlearning","summary":" Recent studies have exposed that GNNs are vulnerable to several adversarial\nattacks, among which backdoor attack is one of the toughest. Similar to Deep\nNeural Networks (DNNs), backdoor attacks in GNNs lie in the fact that the\nattacker modifies a portion of graph data by embedding triggers and enforces\nthe model to learn the trigger feature during the model training process.\nDespite the massive prior backdoor defense works on DNNs, defending against\nbackdoor attacks in GNNs is largely unexplored, severely hindering the\nwidespread application of GNNs in real-world tasks. To bridge this gap, we\npresent GCleaner, the first backdoor mitigation method on GNNs. GCleaner can\nmitigate the presence of the backdoor logic within backdoored GNNs by reversing\nthe backdoor learning procedure, aiming to restore the model performance to a\nlevel similar to that is directly trained on the original clean dataset. To\nachieve this objective, we ask: How to recover universal and hard backdoor\ntriggers in GNNs? How to unlearn the backdoor trigger feature while maintaining\nthe model performance? We conduct the graph trigger recovery via the\nexplanation method to identify optimal trigger locations, facilitating the\nsearch of universal and hard backdoor triggers in the feature space of the\nbackdoored model through maximal similarity. 
Subsequently, we introduce the\nbackdoor unlearning mechanism, which combines knowledge distillation and\ngradient-based explainable knowledge for fine-grained backdoor erasure.\nExtensive experimental evaluations on four benchmark datasets demonstrate that\nGCleaner can reduce the backdoor attack success rate to 10% with only 1% of\nclean data, and has almost negligible degradation in model performance, which\nfar outperforms the state-of-the-art (SOTA) defense methods.\n","authors":["Jiale Zhang","Chengcheng Zhu","Bosen Rao","Hao Sui","Xiaobing Sun","Bing Chen","Chunyi Zhou","Shouling Ji"],"pdf_url":"https://arxiv.org/pdf/2410.01272v2.pdf","comment":"18 pages, 12 figures, 9 tables"},{"id":"http://arxiv.org/abs/2409.18164v2","updated":"2024-11-13T00:15:46Z","published":"2024-09-26T17:30:28Z","title":"Data-Prep-Kit: getting your data ready for LLM application development","summary":" Data preparation is the first and a very important step towards any Large\nLanguage Model (LLM) development. This paper introduces an easy-to-use,\nextensible, and scale-flexible open-source data preparation toolkit called Data\nPrep Kit (DPK). DPK is architected and designed to enable users to scale their\ndata preparation to their needs. With DPK they can prepare data on a local\nmachine or effortlessly scale to run on a cluster with thousands of CPU Cores.\nDPK comes with a highly scalable, yet extensible set of modules that transform\nnatural language and code data. If the user needs additional transforms, they\ncan be easily developed using extensive DPK support for transform creation.\nThese modules can be used independently or pipelined to perform a series of\noperations. In this paper, we describe DPK architecture and show its\nperformance from a small scale to a very large number of CPUs. The modules from\nDPK have been used for the preparation of Granite Models [1] [2]. 
We believe\nDPK is a valuable contribution to the AI community to easily prepare data to\nenhance the performance of their LLM models or to fine-tune models with\nRetrieval-Augmented Generation (RAG).\n","authors":["David Wood","Boris Lublinsky","Alexy Roytman","Shivdeep Singh","Constantin Adam","Abdulhamid Adebayo","Sungeun An","Yuan Chi Chang","Xuan-Hong Dang","Nirmit Desai","Michele Dolfi","Hajar Emami-Gohari","Revital Eres","Takuya Goto","Dhiraj Joshi","Yan Koyfman","Mohammad Nassar","Hima Patel","Paramesvaran Selvam","Yousaf Shah","Saptha Surendran","Daiki Tsuzuku","Petros Zerfos","Shahrokh Daijavad"],"pdf_url":"https://arxiv.org/pdf/2409.18164v2.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2411.08257v1","updated":"2024-11-13T00:14:09Z","published":"2024-11-13T00:14:09Z","title":"GPTree: Towards Explainable Decision-Making via LLM-powered Decision\n Trees","summary":" Traditional decision tree algorithms are explainable but struggle with\nnon-linear, high-dimensional data, limiting its applicability in complex\ndecision-making. Neural networks excel at capturing complex patterns but\nsacrifice explainability in the process. In this work, we present GPTree, a\nnovel framework combining explainability of decision trees with the advanced\nreasoning capabilities of LLMs. GPTree eliminates the need for feature\nengineering and prompt chaining, requiring only a task-specific prompt and\nleveraging a tree-based structure to dynamically split samples. We also\nintroduce an expert-in-the-loop feedback mechanism to further enhance\nperformance by enabling human intervention to refine and rebuild decision\npaths, emphasizing the harmony between human expertise and machine\nintelligence. 
Our decision tree achieved a 7.8% precision rate for identifying\n\"unicorn\" startups at the inception stage of a startup, surpassing gpt-4o with\nfew-shot learning as well as the best human decision-makers (3.1% to 5.6%).\n","authors":["Sichao Xiong","Yigit Ihlamur","Fuat Alican","Aaron Ontoyin Yin"],"pdf_url":"https://arxiv.org/pdf/2411.08257v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.08730v1","updated":"2024-11-13T16:14:13Z","published":"2024-11-13T16:14:13Z","title":"3D Modelling to Address Pandemic Challenges: A Project-Based Learning\n Methodology","summary":" The use of 3D modelling in medical education is a revolutionary tool during\nthe learning process. In fact, this type of technology enables a more\ninteractive teaching approach, making information retention more effective and\nenhancing students' understanding. 3D modelling allows for the creation of\nprecise representations of the human body, as well as interaction with\nthree-dimensional models, giving students a better spatial understanding of the\ndifferent organs and systems and enabling simulations of surgical and technical\nprocedures. This way, medical education is enriched with a more realistic and\nsafe educational experience. The goal is to understand whether, when students\nand schools are challenged, they play an important role in addressing health\nissues in their community. School-led projects are directed towards educational\nscenarios that emphasize STEM education, tackling relevant public health\nproblems through open-school initiatives. 
By implementing an educational\nscenario focused on 3D modelling and leveraging technology, we aim to raise\ncommunity awareness on public health issues.\n","authors":["Tânia Rocha","Ana Ribeiro","Joana Oliveira","Ricardo Nunes","Diana Carvalho","Hugo Paredes","Paulo Martins"],"pdf_url":"https://arxiv.org/pdf/2411.08730v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08409v1","updated":"2024-11-13T07:55:41Z","published":"2024-11-13T07:55:41Z","title":"DiVR: incorporating context from diverse VR scenes for human trajectory\n prediction","summary":" Virtual environments provide a rich and controlled setting for collecting\ndetailed data on human behavior, offering unique opportunities for predicting\nhuman trajectories in dynamic scenes. However, most existing approaches have\noverlooked the potential of these environments, focusing instead on static\ncontexts without considering userspecific factors. Employing the CREATTIVE3D\ndataset, our work models trajectories recorded in virtual reality (VR) scenes\nfor diverse situations including road-crossing tasks with user interactions and\nsimulated visual impairments. We propose Diverse Context VR Human Motion\nPrediction (DiVR), a cross-modal transformer based on the Perceiver\narchitecture that integrates both static and dynamic scene context using a\nheterogeneous graph convolution network. We conduct extensive experiments\ncomparing DiVR against existing architectures including MLP, LSTM, and\ntransformers with gaze and point cloud context. Additionally, we also stress\ntest our model's generalizability across different users, tasks, and scenes.\nResults show that DiVR achieves higher accuracy and adaptability compared to\nother models and to static graphs. This work highlights the advantages of using\nVR datasets for context-aware human trajectory modeling, with potential\napplications in enhancing user experiences in the metaverse. 
Our source code is\npublicly available at https://gitlab.inria.fr/ffrancog/creattive3d-divr-model.\n","authors":["Franz Franco Gallo","Hui-Yin Wu","Lucile Sassatelli"],"pdf_url":"https://arxiv.org/pdf/2411.08409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08334v1","updated":"2024-11-13T04:32:58Z","published":"2024-11-13T04:32:58Z","title":"Enhancing Multimodal Query Representation via Visual Dialogues for\n End-to-End Knowledge Retrieval","summary":" Existing multimodal retrieval systems often rely on disjointed models for\nimage comprehension, such as object detectors and caption generators, leading\nto cumbersome implementations and training processes. To overcome this\nlimitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a\ntext retriever with the ability to understand multimodal queries via dynamic\nmodality interaction. Ret-XKnow leverages a partial convolution mechanism to\nfocus on visual information relevant to the given textual query, thereby\nenhancing multimodal query representations. To effectively learn multimodal\ninteraction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset\nautomatically constructed from visual dialogue datasets. Our dataset\nconstruction process ensures that the dialogues are transformed into suitable\ninformation retrieval tasks using a text retriever. 
We demonstrate that our\napproach not only significantly improves retrieval performance in zero-shot\nsettings but also achieves substantial improvements in fine-tuning scenarios.\nOur code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow.\n","authors":["Yeong-Joon Ju","Ho-Joong Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2411.08334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08307v1","updated":"2024-11-13T03:14:10Z","published":"2024-11-13T03:14:10Z","title":"PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for\n Long-Term Expressive Symbolic Music Generation","summary":" Music generation has progressed significantly, especially in the domain of\naudio generation. However, generating symbolic music that is both\nlong-structured and expressive remains a significant challenge. In this paper,\nwe propose PerceiverS (Segmentation and Scale), a novel architecture designed\nto address this issue by leveraging both Effective Segmentation and Multi-Scale\nattention mechanisms. Our approach enhances symbolic music generation by\nsimultaneously learning long-term structural dependencies and short-term\nexpressive details. By combining cross-attention and self-attention in a\nMulti-Scale setting, PerceiverS captures long-range musical structure while\npreserving performance nuances. The proposed model, evaluated on datasets like\nMaestro, demonstrates improvements in generating coherent and diverse music\nwith both structural consistency and expressive variation. 
The project demos\nand the generated music samples can be accessed through the link:\nhttps://perceivers.github.io.\n","authors":["Yungang Yi","Weihua Li","Matthew Kuo","Quan Bai"],"pdf_url":"https://arxiv.org/pdf/2411.08307v1.pdf","comment":null}]},"2024-11-12T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2411.08248v1","updated":"2024-11-12T23:54:58Z","published":"2024-11-12T23:54:58Z","title":"Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial\n Approach","summary":" Deep learning underpins most of the currently advanced natural language\nprocessing (NLP) tasks such as textual classification, neural machine\ntranslation (NMT), abstractive summarization and question-answering (QA).\nHowever, the robustness of the models, particularly QA models, against\nadversarial attacks is a critical concern that remains insufficiently explored.\nThis paper introduces QA-Attack (Question Answering Attack), a novel word-level\nadversarial strategy that fools QA models. Our attention-based attack exploits\nthe customized attention mechanism and deletion ranking strategy to identify\nand target specific words within contextual passages. It creates deceptive\ninputs by carefully choosing and substituting synonyms, preserving grammatical\nintegrity while misleading the model to produce incorrect responses. Our\napproach demonstrates versatility across various question types, particularly\nwhen dealing with extensive long textual inputs. 
Extensive experiments on\nmultiple benchmark datasets demonstrate that QA-Attack successfully deceives\nbaseline QA models and surpasses existing adversarial techniques regarding\nsuccess rate, semantics changes, BLEU score, fluency and grammar error rate.\n","authors":["Jiyao Li","Mingze Ni","Yongshun Gong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08243v1","updated":"2024-11-12T23:43:20Z","published":"2024-11-12T23:43:20Z","title":"Beyond the Safety Bundle: Auditing the Helpful and Harmless Dataset","summary":" In an effort to mitigate the harms of large language models (LLMs), learning\nfrom human feedback (LHF) has been used to steer LLMs towards outputs that are\nintended to be both less harmful and more helpful. Despite the widespread\nadoption of LHF in practice, the quality of this feedback and its effectiveness\nas a safety mitigation technique remain unclear. This study addresses these\nissues by auditing the widely-used Helpful and Harmless (HH) dataset by\nAnthropic. Our work includes: (1) a thorough investigation of the dataset's\ncontent through both manual and automated evaluation; (2) experiments\ndemonstrating the dataset's impact on models' safety; and (3) an analysis of\nthe 100 most influential papers citing this dataset. Through our audit, we\nshowcase how conceptualization failures and quality issues identified in the HH\ndataset can create additional harms by leading to disparate safety behaviors\nacross demographic groups. 
Our findings highlight the need for more nuanced,\ncontext-sensitive approaches to safety mitigation in LLMs.\n","authors":["Khaoula Chehbouni","Jonathan Colaço-Carr","Yash More","Jackie CK Cheung","Golnoosh Farnadi"],"pdf_url":"https://arxiv.org/pdf/2411.08243v1.pdf","comment":"Prepared for conference submission"},{"id":"http://arxiv.org/abs/2403.06399v3","updated":"2024-11-12T22:17:46Z","published":"2024-03-11T03:21:15Z","title":"GlossLM: A Massively Multilingual Corpus and Pretrained Model for\n Interlinear Glossed Text","summary":" Language documentation projects often involve the creation of annotated text\nin a format such as interlinear glossed text (IGT), which captures fine-grained\nmorphosyntactic analyses in a morpheme-by-morpheme format. However, there are\nfew existing resources providing large amounts of standardized, easily\naccessible IGT data, limiting their applicability to linguistic research, and\nmaking it difficult to use such data in NLP modeling.\n We compile the largest existing corpus of IGT data from a variety of sources,\ncovering over 450k examples across 1.8k languages, to enable research on\ncrosslingual transfer and IGT generation. We normalize much of our data to\nfollow a standard set of labels across languages.\n Furthermore, we explore the task of automatically generating IGT in order to\naid documentation projects. As many languages lack sufficient monolingual data,\nwe pretrain a large multilingual model on our corpus. We demonstrate the\nutility of this model by finetuning it on monolingual corpora, outperforming\nSOTA models by up to 6.6\\%. Our pretrained model and dataset are available on\nHugging Face.\n","authors":["Michael Ginn","Lindia Tjuatja","Taiqi He","Enora Rice","Graham Neubig","Alexis Palmer","Lori Levin"],"pdf_url":"https://arxiv.org/pdf/2403.06399v3.pdf","comment":"EMNLP 2024. 
First two authors are equal contribution"},{"id":"http://arxiv.org/abs/2408.07832v6","updated":"2024-11-12T20:51:07Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. We validate\nour method with \\textbf{five} image classification datasets.\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. 
Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08165v1","updated":"2024-11-12T20:15:58Z","published":"2024-11-12T20:15:58Z","title":"Retrieval, Reasoning, Re-ranking: A Context-Enriched Framework for\n Knowledge Graph Completion","summary":" The Knowledge Graph Completion~(KGC) task aims to infer the missing entity\nfrom an incomplete triple. Existing embedding-based methods rely solely on\ntriples in the KG, which is vulnerable to specious relation patterns and\nlong-tail entities. On the other hand, text-based methods struggle with the\nsemantic gap between KG triples and natural language. Apart from triples,\nentity contexts (e.g., labels, descriptions, aliases) also play a significant\nrole in augmenting KGs. To address these limitations, we propose KGR3, a\ncontext-enriched framework for KGC. KGR3 is composed of three modules. Firstly,\nthe Retrieval module gathers supporting triples from the KG, collects plausible\ncandidate answers from a base embedding model, and retrieves context for each\nrelated entity. Then, the Reasoning module employs a large language model to\ngenerate potential answers for each query triple. Finally, the Re-ranking\nmodule combines candidate answers from the two modules mentioned above, and\nfine-tunes an LLM to provide the best answer. 
Extensive experiments on widely\nused datasets demonstrate that KGR3 consistently improves various KGC methods.\nSpecifically, the best variant of KGR3 achieves absolute Hits@1 improvements of\n12.3% and 5.6% on the FB15k237 and WN18RR datasets.\n","authors":["Muzhi Li","Cehao Yang","Chengjin Xu","Xuhui Jiang","Yiyan Qi","Jian Guo","Ho-fung Leung","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2411.08165v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04427v2","updated":"2024-11-12T20:11:58Z","published":"2024-11-07T04:38:58Z","title":"One fish, two fish, but not the whole sea: Alignment reduces language\n models' conceptual diversity","summary":" Researchers in social science and psychology have recently proposed using\nlarge language models (LLMs) as replacements for humans in behavioral research.\nIn addition to arguments about whether LLMs accurately capture population-level\npatterns, this has raised questions about whether LLMs capture human-like\nconceptual diversity. Separately, it is debated whether post-training alignment\n(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies,\nwe use a new way of measuring the conceptual diversity of\nsynthetically-generated LLM \"populations\" by relating the internal variability\nof simulated individuals to the population-level variability. We use this\napproach to evaluate non-aligned and aligned LLMs on two domains with rich\nhuman behavioral data. While no model reaches human-like diversity, aligned\nmodels generally display less diversity than their instruction fine-tuned\ncounterparts. Our findings highlight potential trade-offs between increasing\nmodels' value alignment and decreasing the diversity of their conceptual\nrepresentations.\n","authors":["Sonia K. 
Murthy","Tomer Ullman","Jennifer Hu"],"pdf_url":"https://arxiv.org/pdf/2411.04427v2.pdf","comment":"17 pages, 10 figures; corrected figure version"},{"id":"http://arxiv.org/abs/2411.08147v1","updated":"2024-11-12T19:53:00Z","published":"2024-11-12T19:53:00Z","title":"Large Language Models Can Self-Improve in Long-context Reasoning","summary":" Large language models (LLMs) have achieved substantial progress in processing\nlong contexts but still struggle with long-context reasoning. Existing\napproaches typically involve fine-tuning LLMs with synthetic data, which\ndepends on annotations from human experts or advanced models like GPT-4, thus\nrestricting further advancements. To address this issue, we investigate the\npotential for LLMs to self-improve in long-context reasoning and propose \\ours,\nan approach specifically designed for this purpose. This approach is\nstraightforward: we sample multiple outputs for each question, score them with\nMinimum Bayes Risk, and then apply supervised fine-tuning or preference\noptimization based on these outputs. Extensive experiments on several leading\nLLMs demonstrate the effectiveness of \\ours, with an absolute improvement of\n$4.2$ points for Llama-3.1-8B-Instruct. Furthermore, \\ours achieves superior\nperformance compared to prior approaches that depend on data produced by human\nexperts or advanced models. 
We anticipate that this work will open new avenues\nfor self-improvement techniques in long-context scenarios, which are essential\nfor the continual advancement of LLMs.\n","authors":["Siheng Li","Cheng Yang","Zesen Cheng","Lemao Liu","Mo Yu","Yujiu Yang","Wai Lam"],"pdf_url":"https://arxiv.org/pdf/2411.08147v1.pdf","comment":"Project Page: https://github.com/SihengLi99/SEALONG"},{"id":"http://arxiv.org/abs/2411.04329v2","updated":"2024-11-12T19:37:20Z","published":"2024-11-07T00:09:54Z","title":"CodeTree: Agent-guided Tree Search for Code Generation with Large\n Language Models","summary":" Pre-trained on massive amounts of code and text data, large language models\n(LLMs) have demonstrated remarkable achievements in performing code generation\ntasks. With additional execution-based feedback, these models can act as agents\nwith capabilities to self-refine and improve generated code autonomously.\nHowever, on challenging coding tasks with extremely large search space, current\nagentic approaches still struggle with multi-stage planning, generating, and\ndebugging. To address this problem, we propose CodeTree, a framework for LLM\nagents to efficiently explore the search space in different stages of the code\ngeneration process. Specifically, we adopted a unified tree structure to\nexplicitly explore different coding strategies, generate corresponding coding\nsolutions, and subsequently refine the solutions. In each stage, critical\ndecision-making (ranking, termination, expanding) of the exploration process is\nguided by both the environmental execution-based feedback and\nLLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code\ngeneration benchmarks and demonstrated the significant performance gains of\nCodeTree against strong baselines. Using GPT-4o as the base model, we\nconsistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0\non CodeContests. 
On the challenging SWEBench benchmark, our approach led to\nsignificant performance gains.\n","authors":["Jierui Li","Hung Le","Yingbo Zhou","Caiming Xiong","Silvio Savarese","Doyen Sahoo"],"pdf_url":"https://arxiv.org/pdf/2411.04329v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16803v2","updated":"2024-11-12T19:28:34Z","published":"2024-10-22T08:28:05Z","title":"Context-aware Inductive Knowledge Graph Completion with Latent Type\n Constraints and Subgraph Reasoning","summary":" Inductive knowledge graph completion (KGC) aims to predict missing triples\nwith unseen entities. Recent works focus on modeling reasoning paths between\nthe head and tail entity as direct supporting evidence. However, these methods\ndepend heavily on the existence and quality of reasoning paths, which limits\ntheir general applicability in different scenarios. In addition, we observe\nthat latent type constraints and neighboring facts inherent in KGs are also\nvital in inferring missing triples. To effectively utilize all useful\ninformation in KGs, we introduce CATS, a novel context-aware inductive KGC\nsolution. With sufficient guidance from proper prompts and supervised\nfine-tuning, CATS activates the strong semantic understanding and reasoning\ncapabilities of large language models to assess the existence of query triples,\nwhich consist of two modules. First, the type-aware reasoning module evaluates\nwhether the candidate entity matches the latent entity type as required by the\nquery relation. Then, the subgraph reasoning module selects relevant reasoning\npaths and neighboring facts, and evaluates their correlation to the query\ntriple. 
Experiment results on three widely used datasets demonstrate that CATS\nsignificantly outperforms state-of-the-art methods in 16 out of 18\ntransductive, inductive, and few-shot settings with an average absolute MRR\nimprovement of 7.2%.\n","authors":["Muzhi Li","Cehao Yang","Chengjin Xu","Zixing Song","Xuhui Jiang","Jian Guo","Ho-fung Leung","Irwin King"],"pdf_url":"https://arxiv.org/pdf/2410.16803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08135v1","updated":"2024-11-12T19:26:43Z","published":"2024-11-12T19:26:43Z","title":"On the Role of Speech Data in Reducing Toxicity Detection Bias","summary":" Text toxicity detection systems exhibit significant biases, producing\ndisproportionate rates of false positives on samples mentioning demographic\ngroups. But what about toxicity detection in speech? To investigate the extent\nto which text-based biases are mitigated by speech-based systems, we produce a\nset of high-quality group annotations for the multilingual MuTox dataset, and\nthen leverage these annotations to systematically compare speech- and\ntext-based toxicity classifiers. Our findings indicate that access to speech\ndata during inference supports reduced bias against group mentions,\nparticularly for ambiguous and disagreement-inducing samples. Our results also\nsuggest that improving classifiers, rather than transcription pipelines, is\nmore helpful for reducing group bias. We publicly release our annotations and\nprovide recommendations for future toxicity dataset construction.\n","authors":["Samuel J. Bell","Mariano Coria Meglioli","Megan Richards","Eduardo Sánchez","Christophe Ropers","Skyler Wang","Adina Williams","Levent Sagun","Marta R. 
Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2411.08135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08019v1","updated":"2024-11-12T18:50:35Z","published":"2024-11-12T18:50:35Z","title":"Language Models as Causal Effect Generators","summary":" We present a framework for large language model (LLM) based data generation\nwith controllable causal structure. In particular, we define a procedure for\nturning any language model and any directed acyclic graph (DAG) into a\nsequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM\nis a causal model with user-defined structure and LLM-defined structural\nequations. We characterize how an SD-SCM allows sampling from observational,\ninterventional, and counterfactual distributions according to the desired\ncausal structure. We then leverage this procedure to propose a new type of\nbenchmark for causal inference methods, generating individual-level\ncounterfactual data without needing to manually specify functional\nrelationships between variables. We create an example benchmark consisting of\nthousands of datasets, and test a suite of popular estimation methods on these\ndatasets for average, conditional average, and individual treatment effect\nestimation, both with and without hidden confounding. Apart from generating\ndata, the same procedure also allows us to test for the presence of a causal\neffect that might be encoded in an LLM. This procedure can underpin auditing\nLLMs for misinformation, discrimination, or otherwise undesirable behavior. We\nbelieve SD-SCMs can serve as a useful tool in any application that would\nbenefit from sequential data with controllable causal structure.\n","authors":["Lucius E. J. 
Bynum","Kyunghyun Cho"],"pdf_url":"https://arxiv.org/pdf/2411.08019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08010v1","updated":"2024-11-12T18:35:28Z","published":"2024-11-12T18:35:28Z","title":"ExpressivityArena: Can LLMs Express Information Implicitly?","summary":" While Large Language Models (LLMs) have demonstrated remarkable performance\nin certain dimensions, their ability to express implicit language cues that\nhuman use for effective communication remains unclear. This paper presents\nExpressivityArena, a Python library for measuring the implicit communication\nabilities of LLMs. We provide a comprehensive framework to evaluate\nexpressivity of arbitrary LLMs and explore its practical implications. To this\nend, we refine the definition and measurements of ``expressivity,'' and use our\nframework in a set of small experiments. These experiments test LLMs in\ncreative and logical tasks such as poetry, coding, and emotion-based responses.\nThey are then evaluated by an automated grader, through ExpressivityArena,\nwhich we verify to be the most pragmatic for testing expressivity. Building on\nthese experiments, we deepen our understanding of the expressivity of LLMs by\nassessing their ability to remain expressive in conversations. Our findings\nindicate that LLMs are capable of generating and understanding expressive\ncontent, however, with some limitations. These insights will inform the future\ndevelopment and deployment of expressive LLMs. 
We provide the code for\nExpressivityArena alongside our paper.\n","authors":["Joshua Tint","Som Sagar","Aditya Taparia","Kelly Raines","Bimsara Pathiraja","Caleb Liu","Ransalu Senanayake"],"pdf_url":"https://arxiv.org/pdf/2411.08010v1.pdf","comment":"8 pages, 22 figures"},{"id":"http://arxiv.org/abs/2411.08003v1","updated":"2024-11-12T18:28:57Z","published":"2024-11-12T18:28:57Z","title":"Can adversarial attacks by large language models be attributed?","summary":" Attributing outputs from Large Language Models (LLMs) in adversarial\nsettings-such as cyberattacks and disinformation-presents significant\nchallenges that are likely to grow in importance. We investigate this\nattribution problem using formal language theory, specifically language\nidentification in the limit as introduced by Gold and extended by Angluin. By\nmodeling LLM outputs as formal languages, we analyze whether finite text\nsamples can uniquely pinpoint the originating model. Our results show that due\nto the non-identifiability of certain language classes, under some mild\nassumptions about overlapping outputs from fine-tuned models it is\ntheoretically impossible to attribute outputs to specific LLMs with certainty.\nThis holds also when accounting for expressivity limitations of Transformer\narchitectures. Even with direct model access or comprehensive monitoring,\nsignificant computational hurdles impede attribution efforts. 
These findings\nhighlight an urgent need for proactive measures to mitigate risks posed by\nadversarial LLM use as their influence continues to expand.\n","authors":["Manuel Cebrian","Jan Arne Telle"],"pdf_url":"https://arxiv.org/pdf/2411.08003v1.pdf","comment":"7 pages, 1 figure"},{"id":"http://arxiv.org/abs/2411.07990v1","updated":"2024-11-12T18:15:19Z","published":"2024-11-12T18:15:19Z","title":"Derivational Morphology Reveals Analogical Generalization in Large\n Language Models","summary":" What mechanisms underlie linguistic generalization in large language models\n(LLMs)? This question has attracted considerable attention, with most studies\nanalyzing the extent to which the language skills of LLMs resemble rules. As of\nyet, it is not known whether linguistic generalization in LLMs could equally\nwell be explained as the result of analogical processes, which can be\nformalized as similarity operations on stored exemplars. A key shortcoming of\nprior research is its focus on linguistic phenomena with a high degree of\nregularity, for which rule-based and analogical approaches make the same\npredictions. Here, we instead examine derivational morphology, specifically\nEnglish adjective nominalization, which displays notable variability. We\nintroduce a new method for investigating linguistic generalization in LLMs:\nfocusing on GPT-J, we fit cognitive models that instantiate rule-based and\nanalogical learning to the LLM training data and compare their predictions on a\nset of nonce adjectives with those of the LLM, allowing us to draw direct\nconclusions regarding underlying mechanisms. As expected, rule-based and\nanalogical models explain the predictions of GPT-J equally well for adjectives\nwith regular nominalization patterns. 
However, for adjectives with variable\nnominalization patterns, the analogical model provides a much better match.\nFurthermore, GPT-J's behavior is sensitive to the individual word frequencies,\neven for regular forms, a behavior that is consistent with an analogical\naccount of regular forms but not a rule-based one. These findings refute the\nhypothesis that GPT-J's linguistic generalization on adjective nominalization\ninvolves rules, suggesting similarity operations on stored exemplars as the\nunderlying mechanism. Overall, our study suggests that analogical processes\nplay a bigger role in the linguistic generalization of LLMs than previously\nthought.\n","authors":["Valentin Hofmann","Leonie Weissweiler","David Mortensen","Hinrich Schütze","Janet Pierrehumbert"],"pdf_url":"https://arxiv.org/pdf/2411.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07975v1","updated":"2024-11-12T17:55:10Z","published":"2024-11-12T17:55:10Z","title":"JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified\n Multimodal Understanding and Generation","summary":" We present JanusFlow, a powerful framework that unifies image understanding\nand generation in a single model. JanusFlow introduces a minimalist\narchitecture that integrates autoregressive language models with rectified\nflow, a state-of-the-art method in generative modeling. Our key finding\ndemonstrates that rectified flow can be straightforwardly trained within the\nlarge language model framework, eliminating the need for complex architectural\nmodifications. To further improve the performance of our unified model, we\nadopt two key strategies: (i) decoupling the understanding and generation\nencoders, and (ii) aligning their representations during unified training.\nExtensive experiments show that JanusFlow achieves comparable or superior\nperformance to specialized models in their respective domains, while\nsignificantly outperforming existing unified approaches across standard\nbenchmarks. 
This work represents a step toward more efficient and versatile\nvision-language models.\n","authors":["Yiyang Ma","Xingchao Liu","Xiaokang Chen","Wen Liu","Chengyue Wu","Zhiyu Wu","Zizheng Pan","Zhenda Xie","Haowei Zhang","Xingkai yu","Liang Zhao","Yisong Wang","Jiaying Liu","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2411.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07965v1","updated":"2024-11-12T17:41:16Z","published":"2024-11-12T17:41:16Z","title":"From General to Specific: Utilizing General Hallucation to Automatically\n Measure the Role Relationship Fidelity for Specific Role-Play Agents","summary":" The advanced role-playing capabilities of Large Language Models (LLMs) have\npaved the way for developing Role-Playing Agents (RPAs). However, existing\nbenchmarks, such as HPD, which incorporates manually scored character\nrelationships into the context for LLMs to sort coherence, and SocialBench,\nwhich uses specific profiles generated by LLMs in the context of\nmultiple-choice tasks to assess character preferences, face limitations like\npoor generalizability, implicit and inaccurate judgments, and excessive context\nlength. To address the above issues, we propose an automatic, scalable, and\ngeneralizable paradigm. Specifically, we construct a benchmark by extracting\nrelations from a general knowledge graph and leverage RPA's inherent\nhallucination properties to prompt it to interact across roles, employing\nChatGPT for stance detection and defining relationship hallucination along with\nthree related metrics. Extensive experiments validate the effectiveness and\nstability of our metrics. 
Our findings further explore factors influencing\nthese metrics and discuss the trade-off between relationship hallucination and\nfactuality.\n","authors":["Chuyi Kong","Ziyang Luo","Hongzhan Lin","Zhiyuan Fan","Yaxin Fan","Yuxi Sun","Jing Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.08479v5","updated":"2024-11-12T17:38:29Z","published":"2024-02-13T14:12:32Z","title":"Plausible Extractive Rationalization through Semi-Supervised Entailment\n Signal","summary":" The increasing use of complex and opaque black box models requires the\nadoption of interpretable measures, one such option is extractive rationalizing\nmodels, which serve as a more interpretable alternative. These models, also\nknown as Explain-Then-Predict models, employ an explainer model to extract\nrationales and subsequently condition the predictor with the extracted\ninformation. Their primary objective is to provide precise and faithful\nexplanations, represented by the extracted rationales. In this paper, we take a\nsemi-supervised approach to optimize for the plausibility of extracted\nrationales. We adopt a pre-trained natural language inference (NLI) model and\nfurther fine-tune it on a small set of supervised rationales ($10\\%$). The NLI\npredictor is leveraged as a source of supervisory signals to the explainer via\nentailment alignment. We show that, by enforcing the alignment agreement\nbetween the explanation and answer in a question-answering task, the\nperformance can be improved without access to ground truth labels. 
We evaluate\nour approach on the ERASER dataset and show that our approach achieves\ncomparable results with supervised extractive models and outperforms\nunsupervised approaches by $> 100\\%$.\n","authors":["Wei Jie Yeo","Ranjan Satapathy","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2402.08479v5.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2406.11275v2","updated":"2024-11-12T17:37:10Z","published":"2024-06-17T07:25:09Z","title":"Self-training Large Language Models through Knowledge Detection","summary":" Large language models (LLMs) often necessitate extensive labeled datasets and\ntraining compute to achieve impressive performance across downstream tasks.\nThis paper explores a self-training paradigm, where the LLM autonomously\ncurates its own labels and selectively trains on unknown data samples\nidentified through a reference-free consistency method. Empirical evaluations\ndemonstrate significant improvements in reducing hallucination in generation\nacross multiple subjects. Furthermore, the selective training framework\nmitigates catastrophic forgetting in out-of-distribution benchmarks, addressing\na critical limitation in training LLMs. 
Our findings suggest that such an\napproach can substantially reduce the dependency on large labeled datasets,\npaving the way for more scalable and cost-effective language model training.\n","authors":["Wei Jie Yeo","Teddy Ferdinan","Przemyslaw Kazienko","Ranjan Satapathy","Erik Cambria"],"pdf_url":"https://arxiv.org/pdf/2406.11275v2.pdf","comment":"EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2411.07917v1","updated":"2024-11-12T16:49:51Z","published":"2024-11-12T16:49:51Z","title":"CryptoLLM: Unleashing the Power of Prompted LLMs for SmartQnA and\n Classification of Crypto Posts","summary":" The rapid growth of social media has resulted in an large volume of\nuser-generated content, particularly in niche domains such as cryptocurrency.\nThis task focuses on developing robust classification models to accurately\ncategorize cryptocurrency-related social media posts into predefined classes,\nincluding but not limited to objective, positive, negative, etc. Additionally,\nthe task requires participants to identify the most relevant answers from a set\nof posts in response to specific questions. By leveraging advanced LLMs, this\nresearch aims to enhance the understanding and filtering of cryptocurrency\ndiscourse, thereby facilitating more informed decision-making in this volatile\nsector. We have used a prompt-based technique to solve the classification task\nfor reddit posts and twitter posts. 
Also, we have used 64-shot technique along\nwith prompts on GPT-4-Turbo model to determine whether a answer is relevant to\na question or not.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2411.07917v1.pdf","comment":"Accepted at FIRE 2024 (Track: Opinion Extraction and Question\n Answering from CryptoCurrency-Related Tweets and Reddit posts (CryptOQA))"},{"id":"http://arxiv.org/abs/2406.13230v2","updated":"2024-11-12T16:47:49Z","published":"2024-06-19T05:33:34Z","title":"Enhancing Language Model Factuality via Activation-Based Confidence\n Calibration and Guided Decoding","summary":" Calibrating language models (LMs) aligns their generation confidence with the\nactual likelihood of answer correctness, which can inform users about LMs'\nreliability and mitigate hallucinated content. However, prior calibration\nmethods, such as self-consistency-based and logit-based approaches, are either\nlimited in inference-time efficiency or fall short of providing informative\nsignals. Moreover, simply filtering out low-confidence responses reduces the\nLM's helpfulness when the answers are correct. Therefore, effectively using\ncalibration techniques to enhance an LM's factuality remains an unsolved\nchallenge. In this paper, we first propose an activation-based calibration\nmethod, ActCab, which trains a linear layer on top of the LM's last-layer\nactivations that can better capture the representations of knowledge. Built on\ntop of ActCab, we further propose CoDec, a confidence-guided decoding strategy\nto elicit truthful answers with high confidence from LMs. By evaluating on five\npopular QA benchmarks, ActCab achieves superior calibration performance than\nall competitive baselines, e.g., by reducing the average expected calibration\nerror (ECE) score by up to 39%. 
Further experiments on CoDec show consistent\nimprovements in several LMs' factuality on challenging QA datasets, such as\nTruthfulQA, highlighting the value of confidence signals in enhancing\nfactuality.\n","authors":["Xin Liu","Farima Fatahi Bayat","Lu Wang"],"pdf_url":"https://arxiv.org/pdf/2406.13230v2.pdf","comment":"EMNLP 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2310.10429v2","updated":"2024-11-12T16:39:55Z","published":"2023-10-16T14:13:38Z","title":"Exploiting User Comments for Early Detection of Fake News Prior to\n Users' Commenting","summary":" Both accuracy and timeliness are key factors in detecting fake news on social\nmedia. However, most existing methods encounter an accuracy-timeliness dilemma:\nContent-only methods guarantee timeliness but perform moderately because of\nlimited available information, while social con-text-based ones generally\nperform better but inevitably lead to latency because of social context\naccumulation needs. To break such a dilemma, a feasible but not well-studied\nsolution is to leverage social contexts (e.g., comments) from historical news\nfor training a detection model and apply it to newly emerging news without\nsocial contexts. This requires the model to (1) sufficiently learn helpful\nknowledge from social contexts, and (2) be well compatible with situations that\nsocial contexts are available or not. To achieve this goal, we propose to\nabsorb and parameterize useful knowledge from comments in historical news and\nthen inject it into a content-only detection model. Specifically, we design the\nComments ASsisted FakE News Detection method (CAS-FEND), which transfers useful\nknowledge from a comment-aware teacher model to a content-only student model\nand detects newly emerging news with the student model. 
Experiments show that\nthe CAS-FEND student model outperforms all content-only methods and even\ncomment-aware ones with 1/4 comments as inputs, demonstrating its superiority\nfor early detection.\n","authors":["Qiong Nan","Qiang Sheng","Juan Cao","Yongchun Zhu","Danding Wang","Guang Yang","Jintao Li"],"pdf_url":"https://arxiv.org/pdf/2310.10429v2.pdf","comment":"19 pages, 6 figures, 7 tables. The article has been accepted by\n Frontiers of Computer Science (FCS), with the DOI:\n {10.1007/s11704-024-40674-6}"},{"id":"http://arxiv.org/abs/2406.11813v3","updated":"2024-11-12T16:38:37Z","published":"2024-06-17T17:54:40Z","title":"How Do Large Language Models Acquire Factual Knowledge During\n Pretraining?","summary":" Despite the recent observation that large language models (LLMs) can store\nsubstantial factual knowledge, there is a limited understanding of the\nmechanisms of how they acquire factual knowledge through pretraining. This work\naddresses this gap by studying how LLMs acquire factual knowledge during\npretraining. The findings reveal several important insights into the dynamics\nof factual knowledge acquisition during pretraining. First, counterintuitively,\nwe observe that pretraining on more data shows no significant improvement in\nthe model's capability to acquire and maintain factual knowledge. Next, there\nis a power-law relationship between training steps and forgetting of\nmemorization and generalization of factual knowledge, and LLMs trained with\nduplicated training data exhibit faster forgetting. Third, training LLMs with\nlarger batch sizes can enhance the models' robustness to forgetting. Overall,\nour observations suggest that factual knowledge acquisition in LLM pretraining\noccurs by progressively increasing the probability of factual knowledge\npresented in the pretraining data at each step. However, this increase is\ndiluted by subsequent forgetting. 
Based on this interpretation, we demonstrate\nthat we can provide plausible explanations for recently observed behaviors of\nLLMs, such as the poor performance of LLMs on long-tail knowledge and the\nbenefits of deduplicating the pretraining corpus.\n","authors":["Hoyeon Chang","Jinho Park","Seonghyeon Ye","Sohee Yang","Youngkyung Seo","Du-Seong Chang","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2406.11813v3.pdf","comment":"Accepted at NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.06855v2","updated":"2024-11-12T16:03:24Z","published":"2024-11-11T10:37:11Z","title":"A Unified Multi-Task Learning Architecture for Hate Detection Leveraging\n User-Based Information","summary":" Hate speech, offensive language, aggression, racism, sexism, and other\nabusive language are common phenomena in social media. There is a need for\nArtificial Intelligence(AI)based intervention which can filter hate content at\nscale. Most existing hate speech detection solutions have utilized the features\nby treating each post as an isolated input instance for the classification.\nThis paper addresses this issue by introducing a unique model that improves\nhate speech identification for the English language by utilising intra-user and\ninter-user-based information. The experiment is conducted over single-task\nlearning (STL) and multi-task learning (MTL) paradigms that use deep neural\nnetworks, such as convolutional neural networks (CNN), gated recurrent unit\n(GRU), bidirectional encoder representations from the transformer (BERT), and A\nLite BERT (ALBERT). We use three benchmark datasets and conclude that combining\ncertain user features with textual features gives significant improvements in\nmacro-F1 and weighted-F1.\n","authors":["Prashant Kapil","Asif Ekbal"],"pdf_url":"https://arxiv.org/pdf/2411.06855v2.pdf","comment":"7 pages, 1 figure, and two tables. 
Accepted at the 20th International\n Conference on Natural Language Processing (ICON) 2023.\n https://aclanthology.org/2023.icon-1.53"},{"id":"http://arxiv.org/abs/2411.07892v1","updated":"2024-11-12T15:56:48Z","published":"2024-11-12T15:56:48Z","title":"Mapping the Podcast Ecosystem with the Structured Podcast Research\n Corpus","summary":" Podcasts provide highly diverse content to a massive listener base through a\nunique on-demand modality. However, limited data has prevented large-scale\ncomputational analysis of the podcast ecosystem. To fill this gap, we introduce\na massive dataset of over 1.1M podcast transcripts that is largely\ncomprehensive of all English language podcasts available through public RSS\nfeeds from May and June of 2020. This data is not limited to text, but rather\nincludes audio features and speaker turns for a subset of 370K episodes, and\nspeaker role inferences and other metadata for all 1.1M episodes. Using this\ndata, we also conduct a foundational investigation into the content, structure,\nand responsiveness of this ecosystem. Together, our data and analyses open the\ndoor to continued computational research of this popular and impactful medium.\n","authors":["Benjamin Litterer","David Jurgens","Dallas Card"],"pdf_url":"https://arxiv.org/pdf/2411.07892v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.05508v2","updated":"2024-11-12T15:36:04Z","published":"2024-11-08T12:08:17Z","title":"An Early FIRST Reproduction and Improvements to Single-Token Decoding\n for Fast Listwise Reranking","summary":" Recent advances have demonstrated that large language models (LLMs) excel as\nlistwise rerankers, but their high computational demands remain a barrier to\nwidespread adoption. Further, the traditional language modeling (LM) objective\nis not ideally suited for reranking tasks. 
FIRST is a novel approach that\naddresses these challenges by integrating a learning-to-rank objective and\nleveraging the logits of only the first generated token, thereby significantly\nreducing inference latency compared to traditional LLM rerankers. In this\nstudy, we extend the evaluation of FIRST to the TREC Deep Learning datasets\n(DL19-22), validating its robustness across diverse domains. We investigate the\ninfluence of different first-stage retrievers on FIRST rerankers, observing\ndiminishing returns and patterns consistent with traditional LLM rerankers.\nThrough applying the FIRST objective to a broader range of backbone models, we\nachieve effectiveness surpassing the original implementation. Our experiments\nconfirm that fast reranking with single-token logits does not compromise\nout-of-domain reranking quality. To better quantify the computational savings\nin the original study, we measure and compare latency to find a 21%-42% gain\nacross various models and benchmarks. Moreover, while LM training implicitly\nimproves zero-shot single-token reranking, our experiments also raise questions\nabout whether LM pre-training may hinder subsequent fine-tuning with the FIRST\nobjective. These findings pave the way for more efficient and effective\nlistwise reranking in future applications.\n","authors":["Zijian Chen","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.05508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07870v1","updated":"2024-11-12T15:26:17Z","published":"2024-11-12T15:26:17Z","title":"Trustful LLMs: Customizing and Grounding Text Generation with Knowledge\n Bases and Dual Decoders","summary":" Although people are impressed by the content generation skills of large\nlanguage models, the use of LLMs, such as ChatGPT, is limited by the domain\ngrounding of the content. 
The correctness and groundedness of the generated\ncontent need to be based on a verified context, such as results from\nRetrieval-Augmented Generation (RAG). One important issue when adapting LLMs to\na customized domain is that the generated responses are often incomplete, or\nthe additions are not verified and may even be hallucinated. Prior studies on\nhallucination detection have focused on evaluation metrics, which are not\neasily adaptable to dynamic domains and can be vulnerable to attacks like\njail-breaking. In this work, we propose 1) a post-processing algorithm that\nleverages knowledge triplets in RAG context to correct hallucinations and 2) a\ndual-decoder model that fuses RAG context to guide the generation process.\n","authors":["Xiaofeng Zhu","Jaya Krishna Mandivarapu"],"pdf_url":"https://arxiv.org/pdf/2411.07870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07858v1","updated":"2024-11-12T15:15:20Z","published":"2024-11-12T15:15:20Z","title":"Verbosity $\\neq$ Veracity: Demystify Verbosity Compensation Behavior of\n Large Language Models","summary":" When unsure about an answer, humans often respond with more words than\nnecessary, hoping that part of the response will be correct. We observe a\nsimilar behavior in large language models (LLMs), which we term \"Verbosity\nCompensation\" (VC). VC is harmful because it confuses the user understanding,\nleading to low efficiency, and influences the LLM services by increasing the\nlatency and cost of generating useless tokens. In this paper, we present the\nfirst work that defines and analyzes Verbosity Compensation, explores its\ncauses, and proposes a simple mitigating approach. We define Verbosity\nCompensation as the behavior of generating responses that can be compressed\nwithout information loss when prompted to write concisely. Our experiments,\nconducted on five datasets of knowledge and reasoning-based QA tasks with 14\nnewly developed LLMs, reveal three conclusions. 
1) We reveal a pervasive\npresence of verbosity compensation across all models and all datasets. Notably,\nGPT-4 exhibits a VC frequency of 50.40%. 2) We reveal the large performance gap\nbetween verbose and concise responses, with a notable difference of 27.61% on\nthe Qasper dataset. We also demonstrate that this difference does not naturally\ndiminish as LLM capability increases. Both 1) and 2) highlight the urgent need\nto mitigate the frequency of VC behavior and disentangle verbosity with\nveracity. We propose a simple yet effective cascade algorithm that replaces the\nverbose responses with the other model-generated responses. The results show\nthat our approach effectively alleviates the VC of the Mistral model from\n63.81% to 16.16% on the Qasper dataset. 3) We also find that verbose responses\nexhibit higher uncertainty across all five datasets, suggesting a strong\nconnection between verbosity and model uncertainty. Our dataset and code are\navailable at https://github.com/psunlpgroup/VerbosityLLM.\n","authors":["Yusen Zhang","Sarkar Snigdha Sarathi Das","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07858v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2411.07854v1","updated":"2024-11-12T15:06:06Z","published":"2024-11-12T15:06:06Z","title":"Tucano: Advancing Neural Text Generation for Portuguese","summary":" Significant advances have been made in natural language processing in recent\nyears. However, our current deep learning approach to language modeling\nrequires substantial resources in terms of data and computation. One of the\nside effects of this data-hungry paradigm is the current schism between\nlanguages, separating those considered high-resource, where most of the\ndevelopment happens and resources are available, and the low-resource ones,\nwhich struggle to attain the same level of performance and autonomy. 
This study\naims to introduce a new set of resources to stimulate the future development of\nneural text generation in Portuguese. In this work, we document the development\nof GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting\nto 200 billion tokens. Via this corpus, we trained a series of\ndecoder-transformers named Tucano. Our models perform equal or superior to\nother Portuguese and multilingual language models of similar size in several\nPortuguese benchmarks. The evaluation of our models also reveals that model\nperformance on many currently available benchmarks used by the Portuguese NLP\ncommunity has little to no correlation with the scaling of token ingestion\nduring training, highlighting the limitations of such evaluations when it comes\nto the assessment of Portuguese generative language models. All derivatives of\nour study are openly released on GitHub and Hugging Face. See\nhttps://nkluge-correa.github.io/Tucano/\n","authors":["Nicholas Kluge Corrêa","Aniket Sen","Sophia Falk","Shiza Fatimah"],"pdf_url":"https://arxiv.org/pdf/2411.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07850v1","updated":"2024-11-12T15:01:47Z","published":"2024-11-12T15:01:47Z","title":"IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems","summary":" Adversarial examples, which are inputs deliberately perturbed with\nimperceptible changes to induce model errors, have raised serious concerns for\nthe reliability and security of deep neural networks (DNNs). While adversarial\nattacks have been extensively studied in continuous data domains such as\nimages, the discrete nature of text presents unique challenges. In this paper,\nwe propose Irony-based Adversarial Examples (IAE), a method that transforms\nstraightforward sentences into ironic ones to create adversarial text. 
This\napproach exploits the rhetorical device of irony, where the intended meaning is\nopposite to the literal interpretation, requiring a deeper understanding of\ncontext to detect. The IAE method is particularly challenging due to the need\nto accurately locate evaluation words, substitute them with appropriate\ncollocations, and expand the text with suitable ironic elements while\nmaintaining semantic coherence. Our research makes the following key\ncontributions: (1) We introduce IAE, a strategy for generating textual\nadversarial examples using irony. This method does not rely on pre-existing\nirony corpora, making it a versatile tool for creating adversarial text in\nvarious NLP tasks. (2) We demonstrate that the performance of several\nstate-of-the-art deep learning models on sentiment analysis tasks significantly\ndeteriorates when subjected to IAE attacks. This finding underscores the\nsusceptibility of current NLP systems to adversarial manipulation through\nirony. (3) We compare the impact of IAE on human judgment versus NLP systems,\nrevealing that humans are less susceptible to the effects of irony in text.\n","authors":["Xiaoyin Yi","Jiacheng Huang"],"pdf_url":"https://arxiv.org/pdf/2411.07850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07845v1","updated":"2024-11-12T14:53:12Z","published":"2024-11-12T14:53:12Z","title":"Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics\n Statements","summary":" What ethical concerns, if any, do LLM researchers have? We introduce EthiCon,\na corpus of 1,580 ethical concern statements extracted from scientific papers\npublished in the ACL Anthology. We extract ethical concern keywords from the\nstatements and show promising results in automating the concern identification\nprocess. Through a survey, we compare the ethical concerns of the corpus to the\nconcerns listed by the general public and professionals in the field. 
Finally,\nwe compare our retrieved ethical concerns with existing taxonomies pointing to\ngaps and future research directions.\n","authors":["Antonia Karamolegkou","Sandrine Schiller Hansen","Ariadni Christopoulou","Filippos Stamatiou","Anne Lauscher","Anders Søgaard"],"pdf_url":"https://arxiv.org/pdf/2411.07845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07843v1","updated":"2024-11-12T14:51:41Z","published":"2024-11-12T14:51:41Z","title":"Chain Association-based Attacking and Shielding Natural Language\n Processing Systems","summary":" Association as a gift enables people do not have to mention something in\ncompletely straightforward words and allows others to understand what they\nintend to refer to. In this paper, we propose a chain association-based\nadversarial attack against natural language processing systems, utilizing the\ncomprehension gap between humans and machines. We first generate a chain\nassociation graph for Chinese characters based on the association paradigm for\nbuilding search space of potential adversarial examples. Then, we introduce an\ndiscrete particle swarm optimization algorithm to search for the optimal\nadversarial examples. We conduct comprehensive experiments and show that\nadvanced natural language processing models and applications, including large\nlanguage models, are vulnerable to our attack, while humans appear good at\nunderstanding the perturbed text. We also explore two methods, including\nadversarial training and associative graph-based recovery, to shield systems\nfrom chain association-based attack. 
Since a few examples that use some\nderogatory terms, this paper contains materials that may be offensive or\nupsetting to some people.\n","authors":["Jiacheng Huang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2411.07843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. 
Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06008v2","updated":"2024-11-12T14:30:28Z","published":"2024-11-08T23:02:59Z","title":"The Dark Patterns of Personalized Persuasion in Large Language Models:\n Exposing Persuasive Linguistic Features for Big Five Personality Traits in\n LLMs Responses","summary":" This study explores how the Large Language Models (LLMs) adjust linguistic\nfeatures to create personalized persuasive outputs. While research showed that\nLLMs personalize outputs, a gap remains in understanding the linguistic\nfeatures of their persuasive capabilities. We identified 13 linguistic features\ncrucial for influencing personalities across different levels of the Big Five\nmodel of personality. We analyzed how prompts with personality trait\ninformation influenced the output of 19 LLMs across five model families. The\nfindings show that models use more anxiety-related words for neuroticism,\nincrease achievement-related words for conscientiousness, and employ fewer\ncognitive processes words for openness to experience. Some model families excel\nat adapting language for openness to experience, others for conscientiousness,\nwhile only one model adapts language for neuroticism. 
Our findings show how\nLLMs tailor responses based on personality cues in prompts, indicating their\npotential to create persuasive content affecting the mind and well-being of the\nrecipients.\n","authors":["Wiktoria Mieleszczenko-Kowszewicz","Dawid Płudowski","Filip Kołodziejczyk","Jakub Świstak","Julian Sienkiewicz","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2411.06008v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2405.05894v3","updated":"2024-11-12T14:06:49Z","published":"2024-05-09T16:45:27Z","title":"Efficient LLM Comparative Assessment: a Product of Experts Framework for\n Pairwise Comparisons","summary":" LLM-as-a-judge approaches are a practical and effective way of assessing a\nrange of text tasks. However, when using pairwise comparisons to rank a set of\ncandidates, the computational cost scales quadratically with the number of\ncandidates, which has practical limitations. This paper introduces a Product of\nExpert (PoE) framework for efficient LLM Comparative Assessment. Here\nindividual comparisons are considered experts that provide information on a\npair's score difference. The PoE framework combines the information from these\nexperts to yield an expression that can be maximized with respect to the\nunderlying set of candidates, and is highly flexible where any form of expert\ncan be assumed. When Gaussian experts are used one can derive simple\nclosed-form solutions for the optimal candidate ranking, and expressions for\nselecting which comparisons should be made to maximize the probability of this\nranking. Our approach enables efficient comparative assessment, where by using\nonly a small subset of the possible comparisons, one can generate score\npredictions that correlate well with human judgements. We evaluate the approach\non multiple NLG tasks and demonstrate that our framework can yield considerable\ncomputational savings when performing pairwise comparative assessment. 
With\nmany candidate texts, using as few as 2% of comparisons the PoE solution can\nachieve similar performance to when all comparisons are used.\n","authors":["Adian Liusie","Vatsal Raina","Yassir Fathullah","Mark Gales"],"pdf_url":"https://arxiv.org/pdf/2405.05894v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12186v3","updated":"2024-11-12T13:24:25Z","published":"2024-09-18T17:57:57Z","title":"Qwen2.5-Coder Technical Report","summary":" In this report, we introduce the Qwen2.5-Coder series, a significant upgrade\nfrom its predecessor, CodeQwen1.5. This series includes six models:\nQwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model,\nQwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretrained\non a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning,\nscalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder\ndemonstrates impressive code generation capabilities while retaining general\nand math skills. These models have been evaluated on a wide range of\ncode-related tasks, achieving state-of-the-art (SOTA) performance across more\nthan 10 benchmarks, including code generation, completion, reasoning, and\nrepair, consistently outperforming larger models of the same model size. 
We\nbelieve that the release of the Qwen2.5-Coder series will advance research in\ncode intelligence and, with its permissive licensing, support wider adoption by\ndevelopers in real-world applications.\n","authors":["Binyuan Hui","Jian Yang","Zeyu Cui","Jiaxi Yang","Dayiheng Liu","Lei Zhang","Tianyu Liu","Jiajun Zhang","Bowen Yu","Keming Lu","Kai Dang","Yang Fan","Yichang Zhang","An Yang","Rui Men","Fei Huang","Bo Zheng","Yibo Miao","Shanghaoran Quan","Yunlong Feng","Xingzhang Ren","Xuancheng Ren","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12186v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07773v1","updated":"2024-11-12T13:14:09Z","published":"2024-11-12T13:14:09Z","title":"Likelihood as a Performance Gauge for Retrieval-Augmented Generation","summary":" Recent work finds that retrieval-augmented generation with large language\nmodels is prone to be influenced by the order of retrieved documents in the\ncontext. However, the lack of in-depth analysis limits the use of this\nphenomenon for prompt engineering in practice. In this study, we posit that\nlikelihoods serve as an effective gauge for language model performance. Through\nexperiments on two question-answering datasets with a variety of\nstate-of-the-art language models, we reveal correlations between answer\naccuracy and the likelihood of the question at both the corpus level and the\ninstance level. In addition, we find that question likelihood can also indicate\nthe position of the task-relevant information in the context. Based on these\nfindings, we propose two methods that use question likelihood as a gauge for\nselecting and constructing prompts that lead to better performance. We\ndemonstrate their effectiveness with experiments. In addition, our\nlikelihood-based methods are efficient, as they only need to compute the\nlikelihood of the input, requiring much fewer language model passes than\nheuristic prompt engineering methods that require generating responses. 
Our\nanalysis deepens our understanding of how input prompts affect model\nperformance and provides a promising direction for efficient prompt\noptimization.\n","authors":["Tianyu Liu","Jirui Qi","Paul He","Arianna Bisazza","Mrinmaya Sachan","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07773v1.pdf","comment":"Under review at NAACL 2025. Code is available at\n https://github.com/lyutyuh/poptimizer"},{"id":"http://arxiv.org/abs/2411.07772v1","updated":"2024-11-12T13:13:20Z","published":"2024-11-12T13:13:20Z","title":"Automatic Album Sequencing","summary":" Album sequencing is a critical part of the album production process.\nRecently, a data-driven approach was proposed that sequences general\ncollections of independent media by extracting the narrative essence of the\nitems in the collections. While this approach implies an album sequencing\ntechnique, it is not widely accessible to a less technical audience, requiring\nadvanced knowledge of machine learning techniques to use. To address this, we\nintroduce a new user-friendly web-based tool that allows a less technical\naudience to upload music tracks, execute this technique in one click, and\nsubsequently presents the result in a clean visualization to the user. To both\nincrease the number of templates available to the user and address shortcomings\nof previous work, we also introduce a new direct transformer-based album\nsequencing method. We find that our more direct method outperforms a random\nbaseline but does not reach the same performance as the narrative essence\napproach. Both methods are included in our web-based user interface, and this\n-- alongside a full copy of our implementation -- is publicly available at\nhttps://github.com/dylanashley/automatic-album-sequencing\n","authors":["Vincent Herrmann","Dylan R. 
Ashley","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2411.07772v1.pdf","comment":"presented as a late breaking demo in the 25th International Society\n for Music Information Retrieval Conference; 3 pages in main text, 3 figures\n in main text; source code available at\n https://github.com/dylanashley/automatic-album-sequencing"},{"id":"http://arxiv.org/abs/2411.04799v2","updated":"2024-11-12T12:57:58Z","published":"2024-11-07T15:38:25Z","title":"Kwai-STaR: Transform LLMs into State-Transition Reasoners","summary":" Mathematical reasoning presents a significant challenge to the cognitive\ncapabilities of LLMs. Various methods have been proposed to enhance the\nmathematical ability of LLMs. However, few recognize the value of state\ntransition for LLM reasoning. In this work, we define mathematical\nproblem-solving as a process of transiting from an initial unsolved state to\nthe final resolved state, and propose Kwai-STaR framework, which transforms\nLLMs into State-Transition Reasoners to improve their intuitive reasoning\ncapabilities. Our approach comprises three main steps: (1) Define the state\nspace tailored to the mathematical reasoning. (2) Generate state-transition\ndata based on the state space. (3) Convert original LLMs into State-Transition\nReasoners via a curricular training strategy. Our experiments validate the\neffectiveness of Kwai-STaR in enhancing mathematical reasoning: After training\non the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and\nLLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard\ndataset. Additionally, the state transition-based design endows Kwai-STaR with\nremarkable training and inference efficiency. 
Further experiments are underway\nto establish the generality of Kwai-STaR.\n","authors":["Xingyu Lu","Yuhang Hu","Changyi Liu","Tianke Zhang","Zhenyu Yang","Zhixiang Ding","Shengsheng Qian","Meng Du","Ruiwen Kang","Kaiyu Tang","Fan Yang","Tingting Gao","Di Zhang","Hai-Tao Zheng","Bin Wen"],"pdf_url":"https://arxiv.org/pdf/2411.04799v2.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2411.07763v1","updated":"2024-11-12T12:52:17Z","published":"2024-11-12T12:52:17Z","title":"Spider 2.0: Evaluating Language Models on Real-World Enterprise\n Text-to-SQL Workflows","summary":" Real-world enterprise text-to-SQL workflows often involve complex cloud or\nlocal data across various database systems, multiple SQL queries in various\ndialects, and diverse operations from data transformation to analytics. We\nintroduce Spider 2.0, an evaluation framework comprising 632 real-world\ntext-to-SQL workflow problems derived from enterprise-level database use cases.\nThe databases in Spider 2.0 are sourced from real data applications, often\ncontaining over 1,000 columns and stored in local or cloud database systems\nsuch as BigQuery and Snowflake. We show that solving problems in Spider 2.0\nfrequently requires understanding and searching through database metadata,\ndialect documentation, and even project-level codebases. This challenge calls\nfor models to interact with complex SQL workflow environments, process\nextremely long contexts, perform intricate reasoning, and generate multiple SQL\nqueries with diverse operations, often exceeding 100 lines, which goes far\nbeyond traditional text-to-SQL challenges. Our evaluations indicate that based\non o1-preview, our code agent framework successfully solves only 17.0% of the\ntasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. 
Our results on\nSpider 2.0 show that while language models have demonstrated remarkable\nperformance in code generation -- especially in prior text-to-SQL benchmarks --\nthey require significant improvement in order to achieve adequate performance\nfor real-world enterprise usage. Progress on Spider 2.0 represents crucial\nsteps towards developing intelligent, autonomous, code agents for real-world\nenterprise settings. Our code, baseline models, and data are available at\nhttps://spider2-sql.github.io.\n","authors":["Fangyu Lei","Jixuan Chen","Yuxiao Ye","Ruisheng Cao","Dongchan Shin","Hongjin Su","Zhaoqing Suo","Hongcheng Gao","Wenjing Hu","Pengcheng Yin","Victor Zhong","Caiming Xiong","Ruoxi Sun","Qian Liu","Sida Wang","Tao Yu"],"pdf_url":"https://arxiv.org/pdf/2411.07763v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00722v2","updated":"2024-11-12T11:49:33Z","published":"2024-04-26T11:57:21Z","title":"LLMs for Generating and Evaluating Counterfactuals: A Comprehensive\n Study","summary":" As NLP models become more complex, understanding their decisions becomes more\ncrucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's\nprediction, offer a way to explain these models. While Large Language Models\n(LLMs) have shown remarkable performance in NLP tasks, their efficacy in\ngenerating high-quality CFs remains uncertain. This work fills this gap by\ninvestigating how well LLMs generate CFs for two NLU tasks. We conduct a\ncomprehensive comparison of several common LLMs, and evaluate their CFs,\nassessing both intrinsic metrics, and the impact of these CFs on data\naugmentation. Moreover, we analyze differences between human and LLM-generated\nCFs, providing insights for future research directions. 
Our results show that\nLLMs generate fluent CFs, but struggle to keep the induced changes minimal.\nGenerating CFs for Sentiment Analysis (SA) is less challenging than NLI where\nLLMs show weaknesses in generating CFs that flip the original label. This also\nreflects on the data augmentation performance, where we observe a large gap\nbetween augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs'\nability to assess CFs in a mislabelled data setting, and show that they have a\nstrong bias towards agreeing with the provided labels. GPT4 is more robust\nagainst this bias and its scores correlate well with automatic metrics. Our\nfindings reveal several limitations and point to potential future work\ndirections.\n","authors":["Van Bach Nguyen","Paul Youssef","Christin Seifert","Jörg Schlötterer"],"pdf_url":"https://arxiv.org/pdf/2405.00722v2.pdf","comment":"Accepted to EMNLP Findings 2024"},{"id":"http://arxiv.org/abs/2405.07863v3","updated":"2024-11-12T11:18:43Z","published":"2024-05-13T15:50:39Z","title":"RLHF Workflow: From Reward Modeling to Online RLHF","summary":" We present the workflow of Online Iterative Reinforcement Learning from Human\nFeedback (RLHF) in this technical report, which is widely reported to\noutperform its offline counterpart by a large margin in the recent large\nlanguage model (LLM) literature. However, existing open-source RLHF projects\nare still largely confined to the offline learning setting. In this technical\nreport, we aim to fill in this gap and provide a detailed recipe that is easy\nto reproduce for online iterative RLHF. In particular, since online human\nfeedback is usually infeasible for open-source communities with limited\nresources, we start by constructing preference models using a diverse set of\nopen-source datasets and use the constructed proxy preference model to\napproximate human feedback. 
Then, we discuss the theoretical insights and\nalgorithmic principles behind online iterative RLHF, followed by a detailed\npractical implementation. Our trained LLM achieves impressive performance on\nLLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as\nwell as other academic benchmarks such as HumanEval and TruthfulQA. We have\nshown that supervised fine-tuning (SFT) and iterative RLHF can obtain\nstate-of-the-art performance with fully open-source datasets. Further, we have\nmade our models, curated datasets, and comprehensive step-by-step code\nguidebooks publicly available. Please refer to\nhttps://github.com/RLHFlow/RLHF-Reward-Modeling and\nhttps://github.com/RLHFlow/Online-RLHF for more detailed information.\n","authors":["Hanze Dong","Wei Xiong","Bo Pang","Haoxiang Wang","Han Zhao","Yingbo Zhou","Nan Jiang","Doyen Sahoo","Caiming Xiong","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2405.07863v3.pdf","comment":"Published in Transactions on Machine Learning Research (09/2024)"},{"id":"http://arxiv.org/abs/2407.14192v2","updated":"2024-11-12T11:09:35Z","published":"2024-07-19T10:40:10Z","title":"LeKUBE: A Legal Knowledge Update BEnchmark","summary":" Recent advances in Large Language Models (LLMs) have significantly shaped the\napplications of AI in multiple fields, including the studies of legal\nintelligence. Trained on extensive legal texts, including statutes and legal\ndocuments, the legal LLMs can capture important legal knowledge/concepts\neffectively and provide important support for downstream legal applications\nsuch as legal consultancy. Yet, the dynamic nature of legal statutes and\ninterpretations also poses new challenges to the use of LLMs in legal\napplications. Particularly, how to update the legal knowledge of LLMs\neffectively and efficiently has become an important research problem in\npractice. 
Existing benchmarks for evaluating knowledge update methods are\nmostly designed for the open domain and cannot address the specific challenges\nof the legal domain, such as the nuanced application of new legal knowledge,\nthe complexity and lengthiness of legal regulations, and the intricate nature\nof legal reasoning. To address this gap, we introduce the Legal Knowledge\nUpdate BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for\nlegal LLMs across five dimensions. Specifically, we categorize the needs of\nknowledge updates in the legal domain with the help of legal professionals, and\nthen hire annotators from law schools to create synthetic updates to the\nChinese Criminal and Civil Code as well as sets of questions of which the\nanswers would change after the updates. Through a comprehensive evaluation of\nstate-of-the-art knowledge update methods, we reveal a notable gap between\nexisting knowledge update methods and the unique needs of the legal domain,\nemphasizing the need for further research and development of knowledge update\nmechanisms tailored for legal LLMs.\n","authors":["Changyue Wang","Weihang Su","Hu Yiran","Qingyao Ai","Yueyue Wu","Cheng Luo","Yiqun Liu","Min Zhang","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2407.14192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12036v2","updated":"2024-11-12T10:12:49Z","published":"2024-07-01T05:37:17Z","title":"Exploring Advanced Large Language Models with LLMsuite","summary":" This tutorial explores the advancements and challenges in the development of\nLarge Language Models (LLMs) such as ChatGPT and Gemini. It addresses inherent\nlimitations like temporal knowledge cutoffs, mathematical inaccuracies, and the\ngeneration of incorrect information, proposing solutions like Retrieval\nAugmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks\nsuch as ReAct and LangChain. 
The integration of these techniques enhances LLM\nperformance and reliability, especially in multi-step reasoning and complex\ntask execution. The paper also covers fine-tuning strategies, including\ninstruction fine-tuning, parameter-efficient methods like LoRA, and\nReinforcement Learning from Human Feedback (RLHF) as well as Reinforced\nSelf-Training (ReST). Additionally, it provides a comprehensive survey of\ntransformer architectures and training techniques for LLMs. The source code can\nbe accessed by contacting the author via email for a request.\n","authors":["Giorgio Roffo"],"pdf_url":"https://arxiv.org/pdf/2407.12036v2.pdf","comment":"Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison,\n LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset\n Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing\n Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite,\n Comprehensive LLM Evaluation Toolkit"},{"id":"http://arxiv.org/abs/2406.16620v3","updated":"2024-11-12T10:02:12Z","published":"2024-06-24T13:05:39Z","title":"OmAgent: A Multi-modal Agent Framework for Complex Video Understanding\n with Task Divide-and-Conquer","summary":" Recent advancements in Large Language Models (LLMs) have expanded their\ncapabilities to multimodal contexts, including comprehensive video\nunderstanding. However, processing extensive videos such as 24-hour CCTV\nfootage or full-length films presents significant challenges due to the vast\ndata and processing demands. Traditional methods, like extracting key frames or\nconverting frames to text, often result in substantial information loss. To\naddress these shortcomings, we develop OmAgent, efficiently stores and\nretrieves relevant video frames for specific queries, preserving the detailed\ncontent of videos. 
Additionally, it features an Divide-and-Conquer Loop capable\nof autonomous reasoning, dynamically invoking APIs and tools to enhance query\nprocessing and accuracy. This approach ensures robust video understanding,\nsignificantly reducing information loss. Experimental results affirm OmAgent's\nefficacy in handling various types of videos and complex tasks. Moreover, we\nhave endowed it with greater autonomy and a robust tool-calling system,\nenabling it to accomplish even more intricate tasks.\n","authors":["Lu Zhang","Tiancheng Zhao","Heting Ying","Yibo Ma","Kyusong Lee"],"pdf_url":"https://arxiv.org/pdf/2406.16620v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07656v1","updated":"2024-11-12T09:14:16Z","published":"2024-11-12T09:14:16Z","title":"Mitigating Bias in Queer Representation within Large Language Models: A\n Collaborative Agent Approach","summary":" Large Language Models (LLMs) often perpetuate biases in pronoun usage,\nleading to misrepresentation or exclusion of queer individuals. This paper\naddresses the specific problem of biased pronoun usage in LLM outputs,\nparticularly the inappropriate use of traditionally gendered pronouns (\"he,\"\n\"she\") when inclusive language is needed to accurately represent all\nidentities. We introduce a collaborative agent pipeline designed to mitigate\nthese biases by analyzing and optimizing pronoun usage for inclusivity. Our\nmulti-agent framework includes specialized agents for both bias detection and\ncorrection. Experimental evaluations using the Tango dataset-a benchmark\nfocused on gender pronoun usage-demonstrate that our approach significantly\nimproves inclusive pronoun classification, achieving a 32.6 percentage point\nincrease over GPT-4o in correctly disagreeing with inappropriate traditionally\ngendered pronouns $(\\chi^2 = 38.57, p < 0.0001)$. 
These results accentuate the\npotential of agent-driven frameworks in enhancing fairness and inclusivity in\nAI-generated content, demonstrating their efficacy in reducing biases and\npromoting socially responsible AI.\n","authors":["Tianyi Huang","Arya Somasundaram"],"pdf_url":"https://arxiv.org/pdf/2411.07656v1.pdf","comment":"NeurIPS 2024 Queer in AI Workshop"},{"id":"http://arxiv.org/abs/2409.18412v3","updated":"2024-11-12T09:11:37Z","published":"2024-09-27T03:00:29Z","title":"SciDFM: A Large Language Model with Mixture-of-Experts for Science","summary":" Recently, there has been a significant upsurge of interest in leveraging\nlarge language models (LLMs) to assist scientific discovery. However, most LLMs\nonly focus on general science, while they lack domain-specific knowledge, such\nas chemical molecules and amino acid sequences. To bridge these gaps, we\nintroduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and\nis able to conduct college-level scientific reasoning and understand molecules\nand amino acid sequences. We collect a large-scale training corpus containing\nnumerous scientific papers and books from different disciplines as well as data\nfrom domain-specific databases. We further fine-tune the pre-trained model on\nlots of instruction data to improve performances on downstream benchmarks. From\nexperiment results, we show that SciDFM achieves strong performance on general\nscientific benchmarks such as SciEval and SciQ, and it reaches a SOTA\nperformance on domain-specific benchmarks among models of similar size. We\nfurther analyze the expert layers and show that the results of expert selection\nvary with data from different disciplines. 
To benefit the broader research\ncommunity, we open-source SciDFM at\nhttps://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0.\n","authors":["Liangtai Sun","Danyu Luo","Da Ma","Zihan Zhao","Baocai Chen","Zhennan Shen","Su Zhu","Lu Chen","Xin Chen","Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2409.18412v3.pdf","comment":"12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS\n 2024 Workshop FM4Science"},{"id":"http://arxiv.org/abs/2404.12866v2","updated":"2024-11-12T08:59:30Z","published":"2024-04-19T13:05:37Z","title":"How Does the Textual Information Affect the Retrieval of Multimodal\n In-Context Learning?","summary":" The increase in parameter size of multimodal large language models (MLLMs)\nintroduces significant capabilities, particularly in-context learning, where\nMLLMs enhance task performance without updating pre-trained parameters. This\neffectiveness, however, hinges on the appropriate selection of in-context\nexamples, a process that is currently biased towards visual data, overlooking\ntextual information. Furthermore, the area of supervised retrievers for MLLMs,\ncrucial for optimal in-context example selection, continues to be\nuninvestigated. Our study offers an in-depth evaluation of the impact of\ntextual information on the unsupervised selection of in-context examples in\nmultimodal contexts, uncovering a notable sensitivity of retriever performance\nto the employed modalities. Responding to this, we introduce a novel supervised\nMLLM-retriever MSIER that employs a neural network to select examples that\nenhance multimodal in-context learning efficiency. This approach is validated\nthrough extensive testing across three distinct tasks, demonstrating the\nmethod's effectiveness. Additionally, we investigate the influence of\nmodalities on our supervised retrieval method's training and pinpoint factors\ncontributing to our model's success. 
This exploration paves the way for future\nadvancements, highlighting the potential for refined in-context learning in\nMLLMs through the strategic use of multimodal data.\n","authors":["Yang Luo","Zangwei Zheng","Zirui Zhu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2404.12866v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2311.10944v5","updated":"2024-11-12T08:31:22Z","published":"2023-11-18T02:44:33Z","title":"Deception Detection from Linguistic and Physiological Data Streams Using\n Bimodal Convolutional Neural Networks","summary":" Deception detection is gaining increasing interest due to ethical and\nsecurity concerns. This paper explores the application of convolutional neural\nnetworks for the purpose of multimodal deception detection. We use a dataset\nbuilt by interviewing 104 subjects about two topics, with one truthful and one\nfalsified response from each subject about each topic. In particular, we make\nthree main contributions. First, we extract linguistic and physiological\nfeatures from this data to train and construct the neural network models.\nSecond, we propose a fused convolutional neural network model using both\nmodalities in order to achieve an improved overall performance. Third, we\ncompare our new approach with earlier methods designed for multimodal deception\ndetection. 
We find that our system outperforms regular classification methods;\nour results indicate the feasibility of using neural networks for deception\ndetection even in the presence of limited amounts of data.\n","authors":["Panfeng Li","Mohamed Abouelenien","Rada Mihalcea","Zhicheng Ding","Qikai Yang","Yiming Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.10944v5.pdf","comment":"Accepted by 2024 5th International Conference on Information Science,\n Parallel and Distributed Systems"},{"id":"http://arxiv.org/abs/2405.06219v3","updated":"2024-11-12T08:18:45Z","published":"2024-05-10T03:06:24Z","title":"SKVQ: Sliding-window Key and Value Cache Quantization for Large Language\n Models","summary":" Large language models (LLMs) can now handle longer sequences of tokens,\nenabling complex tasks like book understanding and generating lengthy novels.\nHowever, the key-value (KV) cache required for LLMs consumes substantial memory\nas context length increasing, becoming the bottleneck for deployment. In this\npaper, we present a strategy called SKVQ, which stands for sliding-window KV\ncache quantization, to address the issue of extremely low bitwidth KV cache\nquantization. To achieve this, SKVQ rearranges the channels of the KV cache in\norder to improve the similarity of channels in quantization groups, and applies\nclipped dynamic quantization at the group level. Additionally, SKVQ ensures\nthat the most recent window tokens in the KV cache are preserved with high\nprecision. This helps maintain the accuracy of a small but important portion of\nthe KV cache.SKVQ achieves high compression ratios while maintaining accuracy.\nOur evaluation on LLMs demonstrates that SKVQ surpasses previous quantization\napproaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit\nvalues with minimal loss of accuracy. 
With SKVQ, it is possible to process\ncontext lengths of up to 1M on an 80GB memory GPU for a 7b model and up to 7\ntimes faster decoding.\n","authors":["Haojie Duanmu","Zhihang Yuan","Xiuhong Li","Jiangfei Duan","Xingcheng Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2405.06219v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07623v1","updated":"2024-11-12T08:10:54Z","published":"2024-11-12T08:10:54Z","title":"Annotating Constructions with UD: the experience of the Italian\n Constructicon","summary":" The paper descirbes a first attempt of linking the Italian constructicon to\nUD resources\n","authors":["Ludovica Pannitto","Beatrice Bernasconi","Lucia Busso","Flavio Pisciotta","Giulia Rambelli","Francesca Masini"],"pdf_url":"https://arxiv.org/pdf/2411.07623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07618v1","updated":"2024-11-12T07:54:13Z","published":"2024-11-12T07:54:13Z","title":"Direct Preference Optimization Using Sparse Feature-Level Constraints","summary":" The alignment of large language models (LLMs) with human preferences remains\na key challenge. While post-training techniques like Reinforcement Learning\nfrom Human Feedback (RLHF) and Direct Preference Optimization (DPO) have\nachieved notable success, they often introduce computational inefficiencies and\ntraining instability. In this paper, we propose Feature-level constrained\nPreference Optimization (FPO), a novel method designed to simplify the\nalignment process while ensuring stability. FPO leverages pre-trained Sparse\nAutoencoders (SAEs) and introduces feature-level constraints, allowing for\nefficient, sparsity-enforced alignment. 
Our approach enjoys efficiency by using\nsparse features activated in a well-trained sparse autoencoder and the quality\nof sequential KL divergence by using the feature-level offline reference.\nExperimental results on benchmark datasets demonstrate that FPO achieves a\n5.08% absolute improvement in win rate with much lower computational cost\ncompared to state-of-the-art baselines, making it a promising solution for\nefficient and controllable LLM alignments.\n","authors":["Qingyu Yin","Chak Tou Leong","Hongbo Zhang","Minjun Zhu","Hanqi Yan","Qiang Zhang","Yulan He","Wenjie Li","Jun Wang","Yue Zhang","Linyi Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06634v2","updated":"2024-11-12T07:52:33Z","published":"2024-08-13T04:53:31Z","title":"Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM\n Approach","summary":" Accurate stock market predictions following earnings reports are crucial for\ninvestors. Traditional methods, particularly classical machine learning models,\nstruggle with these predictions because they cannot effectively process and\ninterpret extensive textual data contained in earnings reports and often\noverlook nuances that influence market movements. This paper introduces an\nadvanced approach by employing Large Language Models (LLMs) instruction\nfine-tuned with a novel combination of instruction-based techniques and\nquantized low-rank adaptation (QLoRA) compression. Our methodology integrates\n'base factors', such as financial metric growth and earnings transcripts, with\n'external factors', including recent market indices performances and analyst\ngrades, to create a rich, supervised dataset. This comprehensive dataset\nenables our models to achieve superior predictive performance in terms of\naccuracy, weighted F1, and Matthews correlation coefficient (MCC), especially\nevident in the comparison with benchmarks such as GPT-4. 
We specifically\nhighlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases\nsignificant improvements over baseline models. The paper also discusses the\npotential of expanding the output capabilities to include a 'Hold' option and\nextending the prediction horizon, aiming to accommodate various investment\nstyles and time frames. This study not only demonstrates the power of\nintegrating cutting-edge AI with fine-tuned financial data but also paves the\nway for future research in enhancing AI-driven financial analysis tools.\n","authors":["Haowei Ni","Shuchen Meng","Xupeng Chen","Ziqing Zhao","Andi Chen","Panfeng Li","Shiyao Zhang","Qifu Yin","Yuanqing Wang","Yuxi Chan"],"pdf_url":"https://arxiv.org/pdf/2408.06634v2.pdf","comment":"Accepted by 2024 6th International Conference on Data-driven\n Optimization of Complex Systems"},{"id":"http://arxiv.org/abs/2411.07611v1","updated":"2024-11-12T07:34:56Z","published":"2024-11-12T07:34:56Z","title":"Multimodal Clinical Reasoning through Knowledge-augmented Rationale\n Generation","summary":" Clinical rationales play a pivotal role in accurate disease diagnosis;\nhowever, many models predominantly use discriminative methods and overlook the\nimportance of generating supportive rationales. Rationale distillation is a\nprocess that transfers knowledge from large language models (LLMs) to smaller\nlanguage models (SLMs), thereby enhancing the latter's ability to break down\ncomplex tasks. Despite its benefits, rationale distillation alone is inadequate\nfor addressing domain knowledge limitations in tasks requiring specialized\nexpertise, such as disease diagnosis. Effectively embedding domain knowledge in\nSLMs poses a significant challenge. While current LLMs are primarily geared\ntoward processing textual data, multimodal LLMs that incorporate time series\ndata, especially electronic health records (EHRs), are still evolving. 
To\ntackle these limitations, we introduce ClinRaGen, an SLM optimized for\nmultimodal rationale generation in disease diagnosis. ClinRaGen incorporates a\nunique knowledge-augmented attention mechanism to merge domain knowledge with\ntime series EHR data, utilizing a stepwise rationale distillation strategy to\nproduce both textual and time series-based clinical rationales. Our evaluations\nshow that ClinRaGen markedly improves the SLM's capability to interpret\nmultimodal EHR data and generate accurate clinical rationales, supporting more\nreliable disease diagnosis, advancing LLM applications in healthcare, and\nnarrowing the performance divide between LLMs and SLMs.\n","authors":["Shuai Niu","Jing Ma","Liang Bai","Zhihua Wang","Yida Xu","Yunya Song","Xian Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07611v1.pdf","comment":"11 pages. 4 figures"},{"id":"http://arxiv.org/abs/2411.07602v1","updated":"2024-11-12T07:24:41Z","published":"2024-11-12T07:24:41Z","title":"Circuit Complexity Bounds for RoPE-based Transformer Architecture","summary":" Characterizing the express power of the Transformer architecture is critical\nto understanding its capacity limits and scaling law. Recent works provide the\ncircuit complexity bounds to Transformer-like architecture. On the other hand,\nRotary Position Embedding ($\\mathsf{RoPE}$) has emerged as a crucial technique\nin modern large language models, offering superior performance in capturing\npositional information compared to traditional position embeddings, which shows\ngreat potential in application prospects, particularly for the long context\nscenario. Empirical evidence also suggests that $\\mathsf{RoPE}$-based\nTransformer architectures demonstrate greater generalization capabilities\ncompared to conventional Transformer models. In this work, we establish a\ntighter circuit complexity bound for Transformers with $\\mathsf{RoPE}$\nattention. 
Our key contribution is that we show that unless $\\mathsf{TC}^0 =\n\\mathsf{NC}^1$, a $\\mathsf{RoPE}$-based Transformer with\n$\\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \\leq O(n)$\ncannot solve the arithmetic problem or the Boolean formula value problem. This\nresult significantly demonstrates the fundamental limitation of the\nexpressivity of the $\\mathsf{RoPE}$-based Transformer architecture, although it\nachieves giant empirical success. Our theoretical framework not only\nestablishes tighter complexity bounds but also may instruct further work on the\n$\\mathsf{RoPE}$-based Transformer.\n","authors":["Bo Chen","Xiaoyu Li","Yingyu Liang","Jiangxuan Long","Zhenmei Shi","Zhao Song"],"pdf_url":"https://arxiv.org/pdf/2411.07602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12196v2","updated":"2024-11-12T07:22:21Z","published":"2024-07-16T21:43:47Z","title":"MASIVE: Open-Ended Affective State Identification in English and Spanish","summary":" In the field of emotion analysis, much NLP research focuses on identifying a\nlimited number of discrete emotion categories, often applied across languages.\nThese basic sets, however, are rarely designed with textual data in mind, and\nculture, language, and dialect can influence how particular emotions are\ninterpreted. In this work, we broaden our scope to a practically unbounded set\nof \\textit{affective states}, which includes any terms that humans use to\ndescribe their experiences of feeling. We collect and publish MASIVE, a dataset\nof Reddit posts in English and Spanish containing over 1,000 unique affective\nstates each. We then define the new problem of \\textit{affective state\nidentification} for language generation models framed as a masked span\nprediction task. On this task, we find that smaller finetuned multilingual\nmodels outperform much larger LLMs, even on region-specific Spanish affective\nstates. 
Additionally, we show that pretraining on MASIVE improves model\nperformance on existing emotion benchmarks. Finally, through machine\ntranslation experiments, we find that native speaker-written data is vital to\ngood performance on this task.\n","authors":["Nicholas Deas","Elsbeth Turcan","Iván Pérez Mejía","Kathleen McKeown"],"pdf_url":"https://arxiv.org/pdf/2407.12196v2.pdf","comment":"EMNLP 2024"},{"id":"http://arxiv.org/abs/2404.13565v3","updated":"2024-11-12T07:21:04Z","published":"2024-04-21T07:34:44Z","title":"Exploring Diverse Methods in Visual Question Answering","summary":" This study explores innovative methods for improving Visual Question\nAnswering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and\nattention mechanisms. Leveraging a balanced VQA dataset, we investigate three\ndistinct strategies. Firstly, GAN-based approaches aim to generate answer\nembeddings conditioned on image and question inputs, showing potential but\nstruggling with more complex tasks. Secondly, autoencoder-based techniques\nfocus on learning optimal embeddings for questions and images, achieving\ncomparable results with GAN due to better ability on complex questions. Lastly,\nattention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB),\naddress language priors and attention modeling, albeit with a\ncomplexity-performance trade-off. 
This study underscores the challenges and\nopportunities in VQA and suggests avenues for future research, including\nalternative GAN formulations and attentional mechanisms.\n","authors":["Panfeng Li","Qikai Yang","Xieming Geng","Wenjing Zhou","Zhicheng Ding","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13565v3.pdf","comment":"Accepted by 2024 5th International Conference on Electronic\n Communication and Artificial Intelligence"},{"id":"http://arxiv.org/abs/2411.07598v1","updated":"2024-11-12T07:16:51Z","published":"2024-11-12T07:16:51Z","title":"Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring\n Conversations","summary":" Many open-ended conversations (e.g., tutoring lessons or business meetings)\nrevolve around pre-defined reference materials, like worksheets or meeting\nbullets. To provide a framework for studying such conversation structure, we\nintroduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly\nbreaking down conversations into segments and linking each segment to the\nrelevant reference item. As a case study, we apply POSR to education where\neffectively structuring lessons around problems is critical yet difficult. We\npresent LessonLink, the first dataset of real-world tutoring lessons, featuring\n3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT\nmath problems. We define and evaluate several joint and independent approaches\nfor POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT),\nand large language models (LLMs) methods. Our results highlight that modeling\nPOSR as one joint task is essential: POSR methods outperform independent\nsegmentation and retrieval pipelines by up to +76% on joint metrics and surpass\ntraditional segmentation methods by up to +78% on segmentation metrics. 
We\ndemonstrate POSR's practical impact on downstream education applications,\nderiving new insights on the language and time use in real-world lesson\nstructures.\n","authors":["Rose E. Wang","Pawan Wirawarn","Kenny Lam","Omar Khattab","Dorottya Demszky"],"pdf_url":"https://arxiv.org/pdf/2411.07598v1.pdf","comment":"EMNLP 2024 Findings. Our code and dataset are open-sourced at\n https://github.com/rosewang2008/posr"},{"id":"http://arxiv.org/abs/2411.07595v1","updated":"2024-11-12T07:09:44Z","published":"2024-11-12T07:09:44Z","title":"Entropy Controllable Direct Preference Optimization","summary":" In the post-training of large language models (LLMs), Reinforcement Learning\nfrom Human Feedback (RLHF) is an effective approach to achieve generation\naligned with human preferences. Direct Preference Optimization (DPO) allows for\npolicy training with a simple binary cross-entropy loss without a reward model.\nThe objective of DPO is regularized by reverse KL divergence that encourages\nmode-seeking fitting to the reference policy. Nonetheless, we indicate that\nminimizing reverse KL divergence could fail to capture a mode of the reference\ndistribution, which may hurt the policy's performance. Based on this\nobservation, we propose a simple modification to DPO, H-DPO, which allows for\ncontrol over the entropy of the resulting policy, enhancing the distribution's\nsharpness and thereby enabling mode-seeking fitting more effectively. 
In our\nexperiments, we show that H-DPO outperformed DPO across various tasks,\ndemonstrating superior results in pass@$k$ evaluations for mathematical tasks.\nMoreover, H-DPO is simple to implement, requiring only minor modifications to\nthe loss calculation of DPO, which makes it highly practical and promising for\nwide-ranging applications in the training of LLMs.\n","authors":["Motoki Omura","Yasuhiro Fujita","Toshiki Kataoka"],"pdf_url":"https://arxiv.org/pdf/2411.07595v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.09059v2","updated":"2024-11-12T06:15:50Z","published":"2024-03-14T02:56:38Z","title":"LAMP: A Language Model on the Map","summary":" Large Language Models (LLMs) are poised to play an increasingly important\nrole in our lives, providing assistance across a wide array of tasks. In the\ngeospatial domain, LLMs have demonstrated the ability to answer generic\nquestions, such as identifying a country's capital; nonetheless, their utility\nis hindered when it comes to answering fine-grained questions about specific\nplaces, such as grocery stores or restaurants, which constitute essential\naspects of people's everyday lives. This is mainly because the places in our\ncities haven't been systematically fed into LLMs, so as to understand and\nmemorize them. This study introduces a novel framework for fine-tuning a\npre-trained model on city-specific data, to enable it to provide accurate\nrecommendations, while minimizing hallucinations. We share our model, LAMP, and\nthe data used to train it. We conduct experiments to analyze its ability to\ncorrectly retrieving spatial objects, and compare it to well-known open- and\nclosed- source language models, such as GPT-4. 
Finally, we explore its emerging\ncapabilities through a case study on day planning.\n","authors":["Pasquale Balsebre","Weiming Huang","Gao Cong"],"pdf_url":"https://arxiv.org/pdf/2403.09059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05990v2","updated":"2024-11-12T05:46:46Z","published":"2024-11-08T22:02:22Z","title":"Game-theoretic LLM: Agent Workflow for Negotiation Games","summary":" This paper investigates the rationality of large language models (LLMs) in\nstrategic decision-making contexts, specifically within the framework of game\ntheory. We evaluate several state-of-the-art LLMs across a spectrum of\ncomplete-information and incomplete-information games. Our findings reveal that\nLLMs frequently deviate from rational strategies, particularly as the\ncomplexity of the game increases with larger payoff matrices or deeper\nsequential trees.\n To address these limitations, we design multiple game-theoretic workflows\nthat guide the reasoning and decision-making processes of LLMs. These workflows\naim to enhance the models' ability to compute Nash Equilibria and make rational\nchoices, even under conditions of uncertainty and incomplete information.\nExperimental results demonstrate that the adoption of these workflows\nsignificantly improves the rationality and robustness of LLMs in game-theoretic\ntasks. Specifically, with the workflow, LLMs exhibit marked improvements in\nidentifying optimal strategies, achieving near-optimal allocations in\nnegotiation scenarios, and reducing susceptibility to exploitation during\nnegotiations. 
Furthermore, we explore the meta-strategic considerations of\nwhether it is rational for agents to adopt such workflows, recognizing that the\ndecision to use or forgo the workflow constitutes a game-theoretic issue in\nitself.\n Our research contributes to a deeper understanding of LLMs' decision-making\ncapabilities in strategic contexts and provides insights into enhancing their\nrationality through structured workflows. The findings have implications for\nthe development of more robust and strategically sound AI agents capable of\nnavigating complex interactive environments. Code and data supporting this\nstudy are available at \\url{https://github.com/Wenyueh/game_theory}.\n","authors":["Wenyue Hua","Ollie Liu","Lingyao Li","Alfonso Amayuelas","Julie Chen","Lucas Jiang","Mingyu Jin","Lizhou Fan","Fei Sun","William Wang","Xintong Wang","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.05990v2.pdf","comment":"45 pages, 12 figures"},{"id":"http://arxiv.org/abs/2408.11856v2","updated":"2024-11-12T05:37:15Z","published":"2024-08-15T19:13:38Z","title":"Dynamic Adaptive Optimization for Effective Sentiment Analysis\n Fine-Tuning on Large Language Models","summary":" Sentiment analysis plays a crucial role in various domains, such as business\nintelligence and financial forecasting. Large language models (LLMs) have\nbecome a popular paradigm for sentiment analysis, leveraging multi-task\nlearning to address specific tasks concurrently. However, LLMs with fine-tuning\nfor sentiment analysis often underperforms due to the inherent challenges in\nmanaging diverse task complexities. Moreover, constant-weight approaches in\nmulti-task learning struggle to adapt to variations in data characteristics,\nfurther complicating model effectiveness. To address these issues, we propose a\nnovel multi-task learning framework with a dynamic adaptive optimization (DAO)\nmodule. 
This module is designed as a plug-and-play component that can be\nseamlessly integrated into existing models, providing an effective and flexible\nsolution for multi-task learning. The key component of the DAO module is\ndynamic adaptive loss, which dynamically adjusts the weights assigned to\ndifferent tasks based on their relative importance and data characteristics\nduring training. Sentiment analyses on a standard and customized financial text\ndataset demonstrate that the proposed framework achieves superior performance.\nSpecifically, this work improves the Mean Squared Error (MSE) and Accuracy\n(ACC) by 15.58% and 1.24% respectively, compared with previous work.\n","authors":["Hongcheng Ding","Xuanze Zhao","Shamsul Nahar Abdullah","Deshinta Arrova Dewi","Zixiao Jiang","Xiangyu Shi"],"pdf_url":"https://arxiv.org/pdf/2408.11856v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.10839v3","updated":"2024-11-12T05:33:05Z","published":"2024-06-16T08:20:12Z","title":"Reminding Multimodal Large Language Models of Object-aware Knowledge\n with Retrieved Tags","summary":" Despite recent advances in the general visual instruction-following ability\nof Multimodal Large Language Models (MLLMs), they still struggle with critical\nproblems when required to provide a precise and detailed response to a visual\ninstruction: (1) failure to identify novel objects or entities, (2) mention of\nnon-existent objects, and (3) neglect of object's attributed details. Intuitive\nsolutions include improving the size and quality of data or using larger\nfoundation models. They show effectiveness in mitigating these issues, but at\nan expensive cost of collecting a vast amount of new data and introducing a\nsignificantly larger model. Standing at the intersection of these approaches,\nwe examine the three object-oriented problems from the perspective of the\nimage-to-text mapping process by the multimodal connector. 
In this paper, we\nfirst identify the limitations of multimodal connectors stemming from\ninsufficient training data. Driven by this, we propose to enhance the mapping\nwith retrieval-augmented tag tokens, which contain rich object-aware\ninformation such as object names and attributes. With our Tag-grounded visual\ninstruction tuning with retrieval Augmentation (TUNA), we outperform baselines\nthat share the same language model and training data on 12 benchmarks.\nFurthermore, we show the zero-shot capability of TUNA when provided with\nspecific datastores.\n","authors":["Daiqing Qi","Handong Zhao","Zijun Wei","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2406.10839v3.pdf","comment":"Main Conference at EMNLP 2024"},{"id":"http://arxiv.org/abs/2401.12585v6","updated":"2024-11-12T05:09:34Z","published":"2024-01-23T09:33:31Z","title":"SLANG: New Concept Comprehension of Large Language Models","summary":" The dynamic nature of language, particularly evident in the realm of slang\nand memes on the Internet, poses serious challenges to the adaptability of\nlarge language models (LLMs). Traditionally anchored to static datasets, these\nmodels often struggle to keep up with the rapid linguistic evolution\ncharacteristic of online communities. This research aims to bridge this gap by\nenhancing LLMs' comprehension of the evolving new concepts on the Internet,\nwithout the high cost of continual retraining. In pursuit of this goal, we\nintroduce $\\textbf{SLANG}$, a benchmark designed to autonomously integrate\nnovel data and assess LLMs' ability to comprehend emerging concepts, alongside\n$\\textbf{FOCUS}$, an approach uses causal inference to enhance LLMs to\nunderstand new phrases and their colloquial context. Our benchmark and approach\ninvolves understanding real-world instances of linguistic shifts, serving as\ncontextual beacons, to form more precise and contextually relevant connections\nbetween newly emerging expressions and their meanings. 
The empirical analysis\nshows that our causal inference-based approach outperforms the baseline methods\nin terms of precision and relevance in the comprehension of Internet slang and\nmemes.\n","authors":["Lingrui Mei","Shenghua Liu","Yiwei Wang","Baolong Bi","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.12585v6.pdf","comment":"EMNLP 2024 Main"},{"id":"http://arxiv.org/abs/2411.07546v1","updated":"2024-11-12T04:50:10Z","published":"2024-11-12T04:50:10Z","title":"Contrastive Language Prompting to Ease False Positives in Medical\n Anomaly Detection","summary":" A pre-trained visual-language model, contrastive language-image pre-training\n(CLIP), successfully accomplishes various downstream tasks with text prompts,\nsuch as finding images or localizing regions within the image. Despite CLIP's\nstrong multi-modal data capabilities, it remains limited in specialized\nenvironments, such as medical applications. For this purpose, many CLIP\nvariants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives\nrelated to normal regions persist. Thus, we aim to present a simple yet\nimportant goal of reducing false positives in medical anomaly detection. We\nintroduce a Contrastive LAnguage Prompting (CLAP) method that leverages both\npositive and negative text prompts. This straightforward approach identifies\npotential lesion regions by visual attention to the positive prompts in the\ngiven image. To reduce false positives, we attenuate attention on normal\nregions using negative prompts. Extensive experiments with the BMAD dataset,\nincluding six biomedical benchmarks, demonstrate that CLAP method enhances\nanomaly detection performance. 
Our future plans include developing an automated\nfine prompting method for more practical usage.\n","authors":["YeongHyeon Park","Myung Jin Kim","Hyeong Seok Kim"],"pdf_url":"https://arxiv.org/pdf/2411.07546v1.pdf","comment":"4 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2407.01523v3","updated":"2024-11-12T04:37:44Z","published":"2024-07-01T17:59:26Z","title":"MMLongBench-Doc: Benchmarking Long-context Document Understanding with\n Visualizations","summary":" Understanding documents with rich layouts and multi-modal components is a\nlong-standing and practical task. Recent Large Vision-Language Models (LVLMs)\nhave made remarkable strides in various tasks, particularly in single-page\ndocument understanding (DU). However, their abilities on long-context DU remain\nan open problem. This work presents MMLongBench-Doc, a long-context,\nmulti-modal benchmark comprising 1,062 expert-annotated questions. Distinct\nfrom previous datasets, it is constructed upon 130 lengthy PDF-formatted\ndocuments with an average of 49.4 pages and 20,971 textual tokens. Towards\ncomprehensive evaluation, answers to these questions rely on pieces of evidence\nfrom (1) different sources (text, image, chart, table, and layout structure)\nand (2) various locations (i.e. page number). Moreover, 33.2% of the questions\nare cross-page questions requiring evidence across multiple pages. 22.8% of the\nquestions are designed to be unanswerable for detecting potential\nhallucinations. Experiments on 14 LVLMs demonstrate that long-context DU\ngreatly challenges current models. Notably, the best-performing model, GPT-4o,\nachieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores\n31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse\nperformance than their LLM counterparts which are fed with lossy-parsed OCR\ndocuments. These results validate the necessity of future research toward more\ncapable long-context LVLMs. 
Project Page:\nhttps://mayubo2333.github.io/MMLongBench-Doc\n","authors":["Yubo Ma","Yuhang Zang","Liangyu Chen","Meiqi Chen","Yizhu Jiao","Xinze Li","Xinyuan Lu","Ziyu Liu","Yan Ma","Xiaoyi Dong","Pan Zhang","Liangming Pan","Yu-Gang Jiang","Jiaqi Wang","Yixin Cao","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2407.01523v3.pdf","comment":"Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight)"},{"id":"http://arxiv.org/abs/2410.09982v3","updated":"2024-11-12T04:20:00Z","published":"2024-10-13T19:53:40Z","title":"Self-Data Distillation for Recovering Quality in Pruned Large Language\n Models","summary":" Large language models have driven significant progress in natural language\nprocessing, but their deployment requires substantial compute and memory\nresources. As models scale, compression techniques become essential for\nbalancing model quality with computational efficiency. Structured pruning,\nwhich removes less critical components of the model, is a promising strategy\nfor reducing complexity. However, one-shot pruning often results in significant\nquality degradation, particularly in tasks requiring multi-step reasoning. To\nrecover lost quality, supervised fine-tuning (SFT) is commonly applied, but it\ncan lead to catastrophic forgetting by shifting the model's learned data\ndistribution. Therefore, addressing the degradation from both pruning and SFT\nis essential to preserve the original model's quality. In this work, we utilize\nself-data distilled fine-tuning to address these challenges. Our approach\nleverages the original, unpruned model to generate a distilled dataset that\npreserves semantic richness and mitigates catastrophic forgetting by\nmaintaining alignment with the base model's knowledge. Empirically, we\ndemonstrate that self-data distillation consistently outperforms standard SFT,\nimproving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard\nv1. 
Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct\n(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B\nparameters), our method retains 91.2% of the original model's accuracy compared\nto 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore,\ncombining self-data distilled models through model merging yields enhanced\nquality retention. Additionally, leveraging these pruned models in speculative\ndecoding increases token acceptance rates, thereby improving inference\nefficiency in applied settings.\n","authors":["Vithursan Thangarasa","Ganesh Venkatesh","Mike Lasby","Nish Sinnadurai","Sean Lie"],"pdf_url":"https://arxiv.org/pdf/2410.09982v3.pdf","comment":"13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary\n Material)"},{"id":"http://arxiv.org/abs/2411.07533v1","updated":"2024-11-12T04:16:44Z","published":"2024-11-12T04:16:44Z","title":"Large Language Models as Neurolinguistic Subjects: Identifying Internal\n Representations for Form and Meaning","summary":" This study investigates the linguistic understanding of Large Language Models\n(LLMs) regarding signifier (form) and signified (meaning) by distinguishing two\nLLM evaluation paradigms: psycholinguistic and neurolinguistic. Traditional\npsycholinguistic evaluations often reflect statistical biases that may\nmisrepresent LLMs' true linguistic capabilities. We introduce a neurolinguistic\napproach, utilizing a novel method that combines minimal pair and diagnostic\nprobing to analyze activation patterns across model layers. This method allows\nfor a detailed examination of how LLMs represent form and meaning, and whether\nthese representations are consistent across languages. 
Our contributions are\nthree-fold: (1) We compare neurolinguistic and psycholinguistic methods,\nrevealing distinct patterns in LLM assessment; (2) We demonstrate that LLMs\nexhibit higher competence in form compared to meaning, with the latter largely\ncorrelated to the former; (3) We present new conceptual minimal pair datasets\nfor Chinese (COMPS-ZH) and German (COMPS-DE), complementing existing English\ndatasets.\n","authors":["Linyang He","Ercong Nie","Helmut Schmid","Hinrich Schütze","Nima Mesgarani","Jonathan Brennan"],"pdf_url":"https://arxiv.org/pdf/2411.07533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07070v2","updated":"2024-11-12T04:12:32Z","published":"2024-11-11T15:46:07Z","title":"On Active Privacy Auditing in Supervised Fine-tuning for White-Box\n Language Models","summary":" The pretraining and fine-tuning approach has become the leading technique for\nvarious NLP applications. However, recent studies reveal that fine-tuning data,\ndue to their sensitive nature, domain-specific characteristics, and\nidentifiability, pose significant privacy concerns. To help develop more\nprivacy-resilient fine-tuning models, we introduce a novel active privacy\nauditing framework, dubbed Parsing, designed to identify and quantify privacy\nleakage risks during the supervised fine-tuning (SFT) of language models (LMs).\nThe framework leverages improved white-box membership inference attacks (MIAs)\nas the core technology, utilizing novel learning objectives and a two-stage\npipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the\nexposure of privacy risks. Additionally, we have improved the effectiveness of\nMIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our\nresearch aims to provide the SFT community of LMs with a reliable, ready-to-use\nprivacy auditing tool, and to offer valuable insights into safeguarding privacy\nduring the fine-tuning process. 
Experimental results confirm the framework's\nefficiency across various models and tasks, emphasizing notable privacy\nconcerns in the fine-tuning process. Project code available for\nhttps://anonymous.4open.science/r/PARSING-4817/.\n","authors":["Qian Sun","Hanpeng Wu","Xi Sheryl Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07070v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07133v2","updated":"2024-11-12T04:05:54Z","published":"2024-11-11T17:06:48Z","title":"Stronger Models are NOT Stronger Teachers for Instruction Tuning","summary":" Instruction tuning has been widely adopted to ensure large language models\n(LLMs) follow user instructions effectively. The resulting\ninstruction-following capabilities of LLMs heavily rely on the instruction\ndatasets used for tuning. Recently, synthetic instruction datasets have emerged\nas an economically viable solution to provide LLMs diverse and high-quality\ninstructions. However, existing approaches typically assume that larger or\nstronger models are stronger teachers for instruction tuning, and hence simply\nadopt these models as response generators to the synthetic instructions. In\nthis paper, we challenge this commonly-adopted assumption. Our extensive\nexperiments across five base models and twenty response generators reveal that\nlarger and stronger models are not necessarily stronger teachers of smaller\nmodels. We refer to this phenomenon as the Larger Models' Paradox. We observe\nthat existing metrics cannot precisely predict the effectiveness of response\ngenerators since they ignore the compatibility between teachers and base models\nbeing fine-tuned. We thus develop a novel metric, named as\nCompatibility-Adjusted Reward (CAR) to measure the effectiveness of response\ngenerators. 
Our experiments across five base models demonstrate that CAR\noutperforms almost all baselines.\n","authors":["Zhangchen Xu","Fengqing Jiang","Luyao Niu","Bill Yuchen Lin","Radha Poovendran"],"pdf_url":"https://arxiv.org/pdf/2411.07133v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07528v1","updated":"2024-11-12T03:56:07Z","published":"2024-11-12T03:56:07Z","title":"SecEncoder: Logs are All You Need in Security","summary":" Large and Small Language Models (LMs) are typically pretrained using\nextensive volumes of text, which are sourced from publicly accessible platforms\nsuch as Wikipedia, Book Corpus, or through web scraping. These models, due to\ntheir exposure to a wide range of language data, exhibit impressive\ngeneralization capabilities and can perform a multitude of tasks\nsimultaneously. However, they often fall short when it comes to domain-specific\ntasks due to their broad training data. This paper introduces SecEncoder, a\nspecialized small language model that is pretrained using security logs.\nSecEncoder is designed to address the domain-specific limitations of general\nLMs by focusing on the unique language and patterns found in security logs.\nExperimental results indicate that SecEncoder outperforms other LMs, such as\nBERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002)\nmodels, which are pretrained mainly on natural language, across various tasks.\nFurthermore, although SecEncoder is primarily pretrained on log data, it\noutperforms models pretrained on natural language for a range of tasks beyond\nlog analysis, such as incident prioritization and threat intelligence document\nretrieval. This suggests that domain specific pretraining with logs can\nsignificantly enhance the performance of LMs in security. 
These findings pave\nthe way for future research into security-specific LMs and their potential\napplications.\n","authors":["Muhammed Fatih Bulut","Yingqi Liu","Naveed Ahmad","Maximilian Turner","Sami Ait Ouahmane","Cameron Andrews","Lloyd Greenwald"],"pdf_url":"https://arxiv.org/pdf/2411.07528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07527v1","updated":"2024-11-12T03:55:27Z","published":"2024-11-12T03:55:27Z","title":"Prompt-enhanced Network for Hateful Meme Classification","summary":" The dynamic expansion of social media has led to an inundation of hateful\nmemes on media platforms, accentuating the growing need for efficient\nidentification and removal. Acknowledging the constraints of conventional\nmultimodal hateful meme classification, which heavily depends on external\nknowledge and poses the risk of including irrelevant or redundant content, we\ndeveloped Pen -- a prompt-enhanced network framework based on the prompt\nlearning approach. Specifically, after constructing the sequence through the\nprompt method and encoding it with a language model, we performed region\ninformation global extraction on the encoded sequence for multi-view\nperception. By capturing global information about inference instances and\ndemonstrations, Pen facilitates category selection by fully leveraging sequence\ninformation. This approach significantly improves model classification\naccuracy. Additionally, to bolster the model's reasoning capabilities in the\nfeature space, we introduced prompt-aware contrastive learning into the\nframework to improve the quality of sample feature distributions. Through\nextensive ablation experiments on two public datasets, we evaluate the\neffectiveness of the Pen framework, concurrently comparing it with\nstate-of-the-art model baselines. Our research findings highlight that Pen\nsurpasses manual prompt methods, showcasing superior generalization and\nclassification accuracy in hateful meme classification tasks. 
Our code is\navailable at https://github.com/juszzi/Pen.\n","authors":["Junxi Liu","Yanyan Feng","Jiehai Chen","Yun Xue","Fenghuan Li"],"pdf_url":"https://arxiv.org/pdf/2411.07527v1.pdf","comment":"Published in Proceedings of the Thirty-Third International Joint\n Conference on Artificial Intelligence Main Track. Pages 6397-6405"},{"id":"http://arxiv.org/abs/2411.07516v1","updated":"2024-11-12T03:25:33Z","published":"2024-11-12T03:25:33Z","title":"SparrowVQE: Visual Question Explanation for Course Content Understanding","summary":" Visual Question Answering (VQA) research seeks to create AI systems to answer\nnatural language questions in images, yet VQA methods often yield overly\nsimplistic and short answers. This paper aims to advance the field by\nintroducing Visual Question Explanation (VQE), which enhances the ability of\nVQA to provide detailed explanations rather than brief responses and address\nthe need for more complex interaction with visual content. We first created an\nMLVQE dataset from a 14-week streamed video machine learning course, including\n885 slide images, 110,407 words of transcripts, and 9,416 designed\nquestion-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3\nbillion parameters multimodal model. We trained our model with a three-stage\ntraining mechanism consisting of multimodal pre-training (slide images and\ntranscripts feature alignment), instruction tuning (tuning the pre-trained\nmodel with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide\nimage and QA pairs). Eventually, our SparrowVQE can understand and connect\nvisual information using the SigLIP model with transcripts using the Phi-2\nlanguage model with an MLP adapter. 
Experimental results demonstrate that our\nSparrowVQE achieves better performance in our developed MLVQE dataset and\noutperforms state-of-the-art methods in the other five benchmark VQA datasets.\nThe source code is available at\n\\url{https://github.com/YoushanZhang/SparrowVQE}.\n","authors":["Jialu Li","Manish Kumar Thota","Ruslan Gokhman","Radek Holik","Youshan Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07494v1","updated":"2024-11-12T02:44:49Z","published":"2024-11-12T02:44:49Z","title":"Rapid Response: Mitigating LLM Jailbreaks with a Few Examples","summary":" As large language models (LLMs) grow more powerful, ensuring their safety\nagainst misuse becomes crucial. While researchers have focused on developing\nrobust defenses, no method has yet achieved complete invulnerability to\nattacks. We propose an alternative approach: instead of seeking perfect\nadversarial robustness, we develop rapid response techniques to look to block\nwhole classes of jailbreaks after observing only a handful of attacks. To study\nthis setting, we develop RapidResponseBench, a benchmark that measures a\ndefense's robustness against various jailbreak strategies after adapting to a\nfew observed examples. We evaluate five rapid response methods, all of which\nuse jailbreak proliferation, where we automatically generate additional\njailbreaks similar to the examples observed. Our strongest method, which\nfine-tunes an input classifier to block proliferated jailbreaks, reduces attack\nsuccess rate by a factor greater than 240 on an in-distribution set of\njailbreaks and a factor greater than 15 on an out-of-distribution set, having\nobserved just one example of each jailbreaking strategy. 
Moreover, further\nstudies suggest that the quality of proliferation model and number of\nproliferated examples play an key role in the effectiveness of this defense.\nOverall, our results highlight the potential of responding rapidly to novel\njailbreaks to limit LLM misuse.\n","authors":["Alwin Peng","Julian Michael","Henry Sleight","Ethan Perez","Mrinank Sharma"],"pdf_url":"https://arxiv.org/pdf/2411.07494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.00774v2","updated":"2024-11-12T02:18:38Z","published":"2024-11-01T17:59:51Z","title":"Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model\n with Frozen LLM","summary":" Rapidly developing large language models (LLMs) have brought tremendous\nintelligent applications. GPT-4o's excellent duplex speech interaction ability\nhas recently brought impressive experience to users. Researchers have recently\nproposed several multi-modal LLMs in this direction that can achieve\nspeech-to-speech dialogue. This paper proposes a novel speech-text multimodal\nLLM architecture called Freeze-Omni. Our main contribution is that the speech\ninput and output modalities can be easily connected to a textual LLM while\nkeeping the LLM's parameters frozen throughout the training process. We\ndesigned 3-stage training strategies both for the modeling of speech input and\noutput, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using\ntext-speech paired data (such as ASR and TTS data) and only 60,000 multi-round\ntext Q&A data on 8 GPUs. Moreover, we can effectively ensure that the\nintelligence of the Freeze-Omni in the speech modality is at the same level\ncompared with that in the text modality of its backbone LLM, while the\nend-to-end latency of the spoken response achieves a low level. In addition, we\nalso designed a method to achieve duplex dialogue ability through multi-task\ntraining, making Freeze-Omni have a more natural style of dialogue ability\nbetween the users. 
Freeze-Omni mainly provides a possibility for researchers to\nconduct multimodal LLM under the condition of a frozen LLM, avoiding various\nimpacts caused by the catastrophic forgetting of LLM caused by fewer data and\ntraining resources.\n","authors":["Xiong Wang","Yangze Li","Chaoyou Fu","Yunhang Shen","Lei Xie","Ke Li","Xing Sun","Long Ma"],"pdf_url":"https://arxiv.org/pdf/2411.00774v2.pdf","comment":"Project Page: https://freeze-omni.github.io/"},{"id":"http://arxiv.org/abs/2409.13755v2","updated":"2024-11-12T02:01:37Z","published":"2024-09-15T10:50:51Z","title":"Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation\n Extraction in Long Sentences","summary":" Relation extraction as an important natural Language processing (NLP) task is\nto identify relations between named entities in text. Recently, graph\nconvolutional networks over dependency trees have been widely used to capture\nsyntactic features and achieved attractive performance. However, most existing\ndependency-based approaches ignore the positive influence of the words outside\nthe dependency trees, sometimes conveying rich and useful information on\nrelation extraction. In this paper, we propose a novel model, Entity-aware\nSelf-attention Contextualized GCN (ESC-GCN), which efficiently incorporates\nsyntactic structure of input sentences and semantic context of sequences. To be\nspecific, relative position self-attention obtains the overall semantic\npairwise correlation related to word position, and contextualized graph\nconvolutional networks capture rich intra-sentence dependencies between words\nby adequately pruning operations. Furthermore, entity-aware attention layer\ndynamically selects which token is more decisive to make final relation\nprediction. In this way, our proposed model not only reduces the noisy impact\nfrom dependency trees, but also obtains easily-ignored entity-related semantic\nrepresentation. 
Extensive experiments on various tasks demonstrate that our\nmodel achieves encouraging performance as compared to existing dependency-based\nand sequence-based models. Specially, our model excels in extracting relations\nbetween entities of long sentences.\n","authors":["Xin Wang","Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.13755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.00625v2","updated":"2024-11-12T01:47:40Z","published":"2024-09-01T05:59:54Z","title":"Entity-Aware Biaffine Attention Model for Improved Constituent Parsing\n with Reduced Entity Violations","summary":" Constituency parsing involves analyzing a sentence by breaking it into\nsub-phrases, or constituents. While many deep neural models have achieved\nstate-of-the-art performance in this task, they often overlook the\nentity-violating issue, where an entity fails to form a complete sub-tree in\nthe resultant parsing tree. To address this, we propose an entity-aware\nbiaffine attention model for constituent parsing. This model incorporates\nentity information into the biaffine attention mechanism by using additional\nentity role vectors for potential phrases, which enhances the parsing accuracy.\nWe introduce a new metric, the Entity Violating Rate (EVR), to quantify the\nextent of entity violations in parsing results. Experiments on three popular\ndatasets-ONTONOTES, PTB, and CTB-demonstrate that our model achieves the lowest\nEVR while maintaining high precision, recall, and F1-scores comparable to\nexisting models. 
Further evaluation in downstream tasks, such as sentence\nsentiment analysis, highlights the effectiveness of our model and the validity\nof the proposed EVR metric.\n","authors":["Xinyi Bai"],"pdf_url":"https://arxiv.org/pdf/2409.00625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07474v1","updated":"2024-11-12T01:26:41Z","published":"2024-11-12T01:26:41Z","title":"Controlled Evaluation of Syntactic Knowledge in Multilingual Language\n Models","summary":" Language models (LMs) are capable of acquiring elements of human-like\nsyntactic knowledge. Targeted syntactic evaluation tests have been employed to\nmeasure how well they form generalizations about syntactic phenomena in\nhigh-resource languages such as English. However, we still lack a thorough\nunderstanding of LMs' capacity for syntactic generalizations in low-resource\nlanguages, which are responsible for much of the diversity of syntactic\npatterns worldwide. In this study, we develop targeted syntactic evaluation\ntests for three low-resource languages (Basque, Hindi, and Swahili) and use\nthem to evaluate five families of open-access multilingual Transformer LMs. We\nfind that some syntactic tasks prove relatively easy for LMs while others\n(agreement in sentences containing indirect objects in Basque, agreement across\na prepositional phrase in Swahili) are challenging. 
We additionally uncover\nissues with publicly available Transformers, including a bias toward the\nhabitual aspect in Hindi in multilingual BERT and underperformance compared to\nsimilar-sized models in XGLM-4.5B.\n","authors":["Daria Kryvosheieva","Roger Levy"],"pdf_url":"https://arxiv.org/pdf/2411.07474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.01332v3","updated":"2024-11-12T01:06:22Z","published":"2024-03-29T22:49:43Z","title":"Explaining Large Language Models Decisions Using Shapley Values","summary":" The emergence of large language models (LLMs) has opened up exciting\npossibilities for simulating human behavior and cognitive processes, with\npotential applications in various domains, including marketing research and\nconsumer behavior analysis. However, the validity of utilizing LLMs as\nstand-ins for human subjects remains uncertain due to glaring divergences that\nsuggest fundamentally different underlying processes at play and the\nsensitivity of LLM responses to prompt variations. This paper presents a novel\napproach based on Shapley values from cooperative game theory to interpret LLM\nbehavior and quantify the relative contribution of each prompt component to the\nmodel's output. Through two applications - a discrete choice experiment and an\ninvestigation of cognitive biases - we demonstrate how the Shapley value method\ncan uncover what we term \"token noise\" effects, a phenomenon where LLM\ndecisions are disproportionately influenced by tokens providing minimal\ninformative content. This phenomenon raises concerns about the robustness and\ngeneralizability of insights obtained from LLMs in the context of human\nbehavior simulation. Our model-agnostic approach extends its utility to\nproprietary LLMs, providing a valuable tool for practitioners and researchers\nto strategically optimize prompts and mitigate apparent cognitive biases. 
Our\nfindings underscore the need for a more nuanced understanding of the factors\ndriving LLM responses before relying on them as substitutes for human subjects\nin survey settings. We emphasize the importance of researchers reporting\nresults conditioned on specific prompt templates and exercising caution when\ndrawing parallels between human behavior and LLMs.\n","authors":["Behnam Mohammadi"],"pdf_url":"https://arxiv.org/pdf/2404.01332v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07466v1","updated":"2024-11-12T01:05:55Z","published":"2024-11-12T01:05:55Z","title":"IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark","summary":" Recent evaluations of LLMs on coreference resolution have revealed that\ntraditional output formats and evaluation metrics do not fully capture the\nmodels' referential understanding. To address this, we introduce IdentifyMe, a\nnew benchmark for mention resolution presented in a multiple-choice question\n(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long\nnarratives and employs heuristics to exclude easily identifiable mentions,\ncreating a more challenging task. The benchmark also consists of a curated\nmixture of different mention types and corresponding entities, allowing for a\nfine-grained analysis of model performance. We evaluate both closed- and open\nsource LLMs on IdentifyMe and observe a significant performance gap (20-30%)\nbetween the state-of-the-art sub-10B open models vs. closed ones. We observe\nthat pronominal mentions, which have limited surface information, are typically\nmuch harder for models to resolve than nominal mentions. Additionally, we find\nthat LLMs often confuse entities when their mentions overlap in nested\nstructures. 
The highest-scoring model, GPT-4o, achieves 81.9% accuracy,\nhighlighting the strong referential capabilities of state-of-the-art LLMs while\nalso indicating room for further improvement.\n","authors":["Kawshik Manikantan","Makarand Tapaswi","Vineet Gandhi","Shubham Toshniwal"],"pdf_url":"https://arxiv.org/pdf/2411.07466v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.04916v4","updated":"2024-11-12T01:01:32Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. 
This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v4.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4pi-1.23/)"},{"id":"http://arxiv.org/abs/2407.02885v5","updated":"2024-11-12T01:00:53Z","published":"2024-07-03T07:59:52Z","title":"CogErgLLM: Exploring Large Language Model Systems Design Perspective\n Using Cognitive Ergonomics","summary":" Integrating cognitive ergonomics with LLMs is crucial for improving safety,\nreliability, and user satisfaction in human-AI interactions. Current LLM\ndesigns often lack this integration, resulting in systems that may not fully\nalign with human cognitive capabilities and limitations. This oversight\nexacerbates biases in LLM outputs and leads to suboptimal user experiences due\nto inconsistent application of user-centered design principles. Researchers are\nincreasingly leveraging NLP, particularly LLMs, to model and understand human\nbehavior across social sciences, psychology, psychiatry, health, and\nneuroscience. Our position paper explores the need to integrate cognitive\nergonomics into LLM design, providing a comprehensive framework and practical\nguidelines for ethical development. By addressing these challenges, we aim to\nadvance safer, more reliable, and ethically sound human-AI interactions.\n","authors":["Azmine Toushik Wasi","Mst Rafia Islam"],"pdf_url":"https://arxiv.org/pdf/2407.02885v5.pdf","comment":"10 Page, 3 Figures. 
Accepted in: (i) ICML'24: LLMs & Cognition\n Workshop (Non-archival; OpenReview:\n https://openreview.net/forum?id=63C9YSc77p) (ii) EMNLP'24 : NLP for Science\n Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4science-1.22/)"},{"id":"http://arxiv.org/abs/2411.07464v1","updated":"2024-11-12T00:57:30Z","published":"2024-11-12T00:57:30Z","title":"BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating\n Machine Learning Tasks","summary":" Large Language Models (LLMs) excel in diverse applications including\ngeneration of code snippets, but often struggle with generating code for\ncomplex Machine Learning (ML) tasks. Although existing LLM single-agent based\nsystems give varying performance depending on the task complexity, they purely\nrely on larger and expensive models such as GPT-4. Our investigation reveals\nthat no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama\nperform far worse than GPT-4 in a single-agent setting. With the motivation of\ndeveloping a cost-efficient LLM based solution for solving ML tasks, we propose\nan LLM Multi-Agent based system which leverages combination of experts using\nprofiling, efficient retrieval of past observations, LLM cascades, and\nask-the-expert calls. Through empirical analysis on ML engineering tasks in the\nMLAgentBench benchmark, we demonstrate the effectiveness of our system, using\nno-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and\nexpert to serve occasional ask-the-expert calls for planning. 
With 94.2\\%\nreduction in the cost (from \\$0.931 per run cost averaged over all tasks for\nGPT-4 single agent system to \\$0.054), our system is able to yield better\naverage success rate of 32.95\\% as compared to GPT-4 single-agent system\nyielding 22.72\\% success rate averaged over all the tasks of MLAgentBench.\n","authors":["Shubham Gandhi","Manasi Patwardhan","Lovekesh Vig","Gautam Shroff"],"pdf_url":"https://arxiv.org/pdf/2411.07464v1.pdf","comment":"Presented at AIMLSystems '24"},{"id":"http://arxiv.org/abs/2411.07457v1","updated":"2024-11-12T00:48:01Z","published":"2024-11-12T00:48:01Z","title":"DecoPrompt : Decoding Prompts Reduces Hallucinations when Large Language\n Models Meet False Premises","summary":" While large language models (LLMs) have demonstrated increasing power, they\nhave also called upon studies on their hallucinated outputs that deviate from\nfactually correct statements. In this paper, we focus on one important scenario\nof false premises, where LLMs are distracted by misaligned claims although the\nmodel possesses the required factual knowledge to answer original questions\naccurately. Inspired by the observation that entropy of the false-premise\nprompt is closely related to its likelihood to elicit hallucination generation,\nwe propose a new prompting algorithm, named DecoPrompt, to mitigate\nhallucination. DecoPrompt leverages LLMs to \"decode\" the false-premise prompts\nwithout really eliciting hallucination output from LLMs. We perform experiments\non two datasets, demonstrating that DecoPrompt can reduce hallucinations\neffectively on outputs from different LLMs. 
Moreover, DecoPrompt exhibits\ncross-model transferability, which facilitates its applications to scenarios\nsuch as LLMs of large sizes or unavailable model logits.\n","authors":["Nan Xu","Xuezhe Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07457v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07446v1","updated":"2024-11-12T00:07:29Z","published":"2024-11-12T00:07:29Z","title":"Efficient and Accurate Prompt Optimization: the Benefit of Memory in\n Exemplar-Guided Reflection","summary":" Automatic prompt engineering aims to enhance the generation quality of large\nlanguage models (LLMs). Recent works utilize feedbacks generated from erroneous\ncases to guide the prompt optimization. During inference, they may further\nretrieve several semantically-related exemplars and concatenate them to the\noptimized prompts to improve the performance. However, those works only utilize\nthe feedback at the current step, ignoring historical and unseleccted feedbacks\nwhich are potentially beneficial. Moreover, the selection of exemplars only\nconsiders the general semantic relationship and may not be optimal in terms of\ntask performance and matching with the optimized prompt. In this work, we\npropose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize\nmore efficient and accurate prompt optimization. Specifically, we design an\nexemplar-guided reflection mechanism where the feedback generation is\nadditionally guided by the generated exemplars. We further build two kinds of\nmemory to fully utilize the historical feedback information and support more\neffective exemplar retrieval. 
Empirical evaluations show our method surpasses\nprevious state-of-the-arts with less optimization steps, i.e., improving F1\nscore by 10.1 on LIAR dataset, and reducing half of the optimization steps on\nProTeGi.\n","authors":["Cilin Yan","Jingyun Wang","Lin Zhang","Ruihui Zhao","Xiaopu Wu","Kai Xiong","Qingsong Liu","Guoliang Kang","Yangyang Kang"],"pdf_url":"https://arxiv.org/pdf/2411.07446v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2411.08227v1","updated":"2024-11-12T22:43:16Z","published":"2024-11-12T22:43:16Z","title":"DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution\n Detection","summary":" Out-of-distribution (OOD) detection is essential for ensuring the robustness\nof machine learning models by identifying samples that deviate from the\ntraining distribution. While traditional OOD detection has primarily focused on\nsingle-modality inputs, such as images, recent advances in multimodal models\nhave demonstrated the potential of leveraging multiple modalities (e.g., video,\noptical flow, audio) to enhance detection performance. However, existing\nmethods often overlook intra-class variability within in-distribution (ID)\ndata, assuming that samples of the same class are perfectly cohesive and\nconsistent. This assumption can lead to performance degradation, especially\nwhen prediction discrepancies are uniformly amplified across all samples. To\naddress this issue, we propose Dynamic Prototype Updating (DPU), a novel\nplug-and-play framework for multimodal OOD detection that accounts for\nintra-class variations. Our method dynamically updates class center\nrepresentations for each class by measuring the variance of similar samples\nwithin each batch, enabling adaptive adjustments. This approach allows us to\namplify prediction discrepancies based on the updated class centers, thereby\nimproving the model's robustness and generalization across different\nmodalities. 
Extensive experiments on two tasks, five datasets, and nine base\nOOD algorithms demonstrate that DPU significantly improves OOD detection\nperformance, setting a new state-of-the-art in multimodal OOD detection, with\nimprovements of up to 80 percent in Far-OOD detection. To facilitate\naccessibility and reproducibility, our code is publicly available on GitHub.\n","authors":["Shawn Li","Huixian Gong","Hao Dong","Tiankai Yang","Zhengzhong Tu","Yue Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08216v1","updated":"2024-11-12T22:16:50Z","published":"2024-11-12T22:16:50Z","title":"GTA: Global Tracklet Association for Multi-Object Tracking in Sports","summary":" Multi-object tracking in sports scenarios has become one of the focal points\nin computer vision, experiencing significant advancements through the\nintegration of deep learning techniques. Despite these breakthroughs,\nchallenges remain, such as accurately re-identifying players upon re-entry into\nthe scene and minimizing ID switches. In this paper, we propose an\nappearance-based global tracklet association algorithm designed to enhance\ntracking performance by splitting tracklets containing multiple identities and\nconnecting tracklets seemingly from the same identity. This method can serve as\na plug-and-play refinement tool for any multi-object tracker to further boost\ntheir performance. The proposed method achieved a new state-of-the-art\nperformance on the SportsMOT dataset with HOTA score of 81.04%. Similarly, on\nthe SoccerNet dataset, our method enhanced multiple trackers' performance,\nconsistently increasing the HOTA score from 79.41% to 83.11%. These significant\nand consistent improvements across different trackers and datasets underscore\nour proposed method's potential impact on the application of sports player\ntracking. 
We open-source our project codebase at\nhttps://github.com/sjc042/gta-link.git.\n","authors":["Jiacheng Sun","Hsiang-Wei Huang","Cheng-Yen Yang","Zhongyu Jiang","Jenq-Neng Hwang"],"pdf_url":"https://arxiv.org/pdf/2411.08216v1.pdf","comment":"Accepted by ACCV 2024 MLCSA Workshop"},{"id":"http://arxiv.org/abs/2411.08196v1","updated":"2024-11-12T21:34:30Z","published":"2024-11-12T21:34:30Z","title":"Latent Space Disentanglement in Diffusion Transformers Enables Precise\n Zero-shot Semantic Editing","summary":" Diffusion Transformers (DiTs) have recently achieved remarkable success in\ntext-guided image generation. In image editing, DiTs project text and image\ninputs to a joint latent space, from which they decode and synthesize new\nimages. However, it remains largely unexplored how multimodal information\ncollectively forms this joint space and how they guide the semantics of the\nsynthesized images. In this paper, we investigate the latent space of DiT\nmodels and uncover two key properties: First, DiT's latent space is inherently\nsemantically disentangled, where different semantic attributes can be\ncontrolled by specific editing directions. Second, consistent semantic editing\nrequires utilizing the entire joint latent space, as neither encoded image nor\ntext alone contains enough semantic information. We show that these editing\ndirections can be obtained directly from text prompts, enabling precise\nsemantic control without additional training or mask annotations. Based on\nthese insights, we propose a simple yet effective Encode-Identify-Manipulate\n(EIM) framework for zero-shot fine-grained image editing. Specifically, we\nfirst encode both the given source image and the text prompt that describes the\nimage, to obtain the joint latent embedding. Then, using our proposed Hessian\nScore Distillation Sampling (HSDS) method, we identify editing directions that\ncontrol specific target attributes while preserving other image features. 
These\ndirections are guided by text prompts and used to manipulate the latent\nembeddings. Moreover, we propose a new metric to quantify the disentanglement\ndegree of the latent space of diffusion models. Extensive experiment results on\nour new curated benchmark dataset and analysis demonstrate DiT's\ndisentanglement properties and effectiveness of the EIM framework.\n","authors":["Zitao Shuai","Chenwei Wu","Zhengxu Tang","Bowen Song","Liyue Shen"],"pdf_url":"https://arxiv.org/pdf/2411.08196v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2408.13335"},{"id":"http://arxiv.org/abs/2411.08195v1","updated":"2024-11-12T21:33:11Z","published":"2024-11-12T21:33:11Z","title":"An Explainable Machine Learning Approach for Age and Gender Estimation\n in Living Individuals Using Dental Biometrics","summary":" Objectives: Age and gender estimation is crucial for various applications,\nincluding forensic investigations and anthropological studies. This research\naims to develop a predictive system for age and gender estimation in living\nindividuals, leveraging dental measurements such as Coronal Height (CH),\nCoronal Pulp Cavity Height (CPCH), and Tooth Coronal Index (TCI). Methods:\nMachine learning models were employed in our study, including Cat Boost\nClassifier (Catboost), Gradient Boosting Machine (GBM), Ada Boost Classifier\n(AdaBoost), Random Forest (RF), eXtreme Gradient Boosting (XGB), Light Gradient\nBoosting Machine (LGB), and Extra Trees Classifier (ETC), to analyze dental\ndata from 862 living individuals (459 males and 403 females). Specifically,\nperiapical radiographs from six teeth per individual were utilized, including\npremolars and molars from both maxillary and mandibular. A novel ensemble\nlearning technique was developed, which uses multiple models each tailored to\ndistinct dental metrics, to estimate age and gender accurately. 
Furthermore, an\nexplainable AI model has been created utilizing SHAP, enabling dental experts\nto make judicious decisions based on comprehensible insight. Results: The RF\nand XGB models were particularly effective, yielding the highest F1 score for\nage and gender estimation. Notably, the XGB model showed a slightly better\nperformance in age estimation, achieving an F1 score of 73.26%. A similar trend\nfor the RF model was also observed in gender estimation, achieving a F1 score\nof 77.53%. Conclusions: This study marks a significant advancement in dental\nforensic methods, showcasing the potential of machine learning to automate age\nand gender estimation processes with improved accuracy.\n","authors":["Mohsin Ali","Haider Raza","John Q Gan","Ariel Pokhojaev","Matanel Katz","Esra Kosan","Dian Agustin Wahjuningrum","Omnina Saleh","Rachel Sarig","Akhilanada Chaurasia"],"pdf_url":"https://arxiv.org/pdf/2411.08195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00169v2","updated":"2024-11-12T21:16:52Z","published":"2024-07-31T21:42:42Z","title":"Strike the Balance: On-the-Fly Uncertainty based User Interactions for\n Long-Term Video Object Segmentation","summary":" In this paper, we introduce a variant of video object segmentation (VOS) that\nbridges interactive and semi-automatic approaches, termed Lazy Video Object\nSegmentation (ziVOS). In contrast, to both tasks, which handle video object\nsegmentation in an off-line manner (i.e., pre-recorded sequences), we propose\nthrough ziVOS to target online recorded sequences. Here, we strive to strike a\nbalance between performance and robustness for long-term scenarios by\nsoliciting user feedback's on-the-fly during the segmentation process. Hence,\nwe aim to maximize the tracking duration of an object of interest, while\nrequiring minimal user corrections to maintain tracking over an extended\nperiod. We propose a competitive baseline, i.e., Lazy-XMem, as a reference for\nfuture works in ziVOS. 
Our proposed approach uses an uncertainty estimation of\nthe tracking state to determine whether a user interaction is necessary to\nrefine the model's prediction. To quantitatively assess the performance of our\nmethod and the user's workload, we introduce complementary metrics alongside\nthose already established in the field. We evaluate our approach using the\nrecently introduced LVOS dataset, which offers numerous long-term videos. Our\ncode is publicly available at https://github.com/Vujas-Eteph/LazyXMem.\n","authors":["Stéphane Vujasinović","Stefan Becker","Sebastian Bullinger","Norbert Scherer-Negenborn","Michael Arens","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2408.00169v2.pdf","comment":"Accepted at ACCV 2024"},{"id":"http://arxiv.org/abs/2411.08187v1","updated":"2024-11-12T21:12:51Z","published":"2024-11-12T21:12:51Z","title":"TractoEmbed: Modular Multi-level Embedding framework for white matter\n tract segmentation","summary":" White matter tract segmentation is crucial for studying brain structural\nconnectivity and neurosurgical planning. However, segmentation remains\nchallenging due to issues like class imbalance between major and minor tracts,\nstructural similarity, subject variability, symmetric streamlines between\nhemispheres etc. To address these challenges, we propose TractoEmbed, a modular\nmulti-level embedding framework, that encodes localized representations through\nlearning tasks in respective encoders. In this paper, TractoEmbed introduces a\nnovel hierarchical streamline data representation that captures maximum spatial\ninformation at each level i.e. individual streamlines, clusters, and patches.\nExperiments show that TractoEmbed outperforms state-of-the-art methods in white\nmatter tract segmentation across different datasets, and spanning various age\ngroups. 
The modular framework directly allows the integration of additional\nembeddings in future works.\n","authors":["Anoushkrit Goel","Bipanjit Singh","Ankita Joshi","Ranjeet Ranjan Jha","Chirag Ahuja","Aditya Nigam","Arnav Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2411.08187v1.pdf","comment":"Accepted at 27th International Conference on Pattern Recognition\n (ICPR), 2024 15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2408.07832v6","updated":"2024-11-12T20:51:07Z","published":"2024-07-31T14:49:35Z","title":"LADDER: Language Driven Slice Discovery and Error Rectification","summary":" Error slice discovery associates structured patterns with model errors.\nExisting methods discover error slices by clustering the error-prone samples\nwith similar patterns or assigning discrete attributes to each sample for\npost-hoc analysis. While these methods aim for interpretability and easier\nmitigation through reweighting or rebalancing, they may not capture the full\ncomplexity of error patterns due to incomplete or missing attributes. Contrary\nto the existing approach, this paper utilizes the reasoning capabilities of the\nLarge Language Model (LLM) to analyze complex error patterns and generate\ntestable hypotheses. This paper proposes LADDER: Language Driven slice\nDiscovery and Error Rectification. It first projects the model's representation\ninto a language-aligned feature space (eg CLIP) to preserve semantics in the\noriginal model feature space. This ensures the accurate retrieval of sentences\nthat highlight the model's errors. Next, the LLM utilizes the sentences and\ngenerates hypotheses to discover error slices. Finally, we mitigate the error\nby fine-tuning the classification head by creating a group-balanced dataset\nusing the hypotheses. Our entire method does not require any attribute\nannotation, either explicitly or through external tagging models. 
We validate\nour method with \\textbf{five} image classification datasets.\n","authors":["Shantanu Ghosh","Rayan Syed","Chenyu Wang","Clare B. Poynton","Shyam Visweswaran","Kayhan Batmanghelich"],"pdf_url":"https://arxiv.org/pdf/2408.07832v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08171v1","updated":"2024-11-12T20:30:23Z","published":"2024-11-12T20:30:23Z","title":"Comprehensive and Comparative Analysis between Transfer Learning and\n Custom Built VGG and CNN-SVM Models for Wildfire Detection","summary":" Contemporary Artificial Intelligence (AI) and Machine Learning (ML) research\nplaces a significant emphasis on transfer learning, showcasing its\ntransformative potential in enhancing model performance across diverse domains.\nThis paper examines the efficiency and effectiveness of transfer learning in\nthe context of wildfire detection. Three purpose-built models -- Visual\nGeometry Group (VGG)-7, VGG-10, and Convolutional Neural Network (CNN)-Support\nVector Machine(SVM) CNN-SVM -- are rigorously compared with three pretrained\nmodels -- VGG-16, VGG-19, and Residual Neural Network (ResNet) ResNet101. We\ntrained and evaluated these models using a dataset that captures the\ncomplexities of wildfires, incorporating variables such as varying lighting\nconditions, time of day, and diverse terrains. The objective is to discern how\ntransfer learning performs against models trained from scratch in addressing\nthe intricacies of the wildfire detection problem. By assessing the performance\nmetrics, including accuracy, precision, recall, and F1 score, a comprehensive\nunderstanding of the advantages and disadvantages of transfer learning in this\nspecific domain is obtained. This study contributes valuable insights to the\nongoing discourse, guiding future directions in AI and ML research. Keywords:\nWildfire prediction, deep learning, machine learning fire, detection\n","authors":["Aditya V. Jonnalagadda","Hashim A. 
Hashim","Andrew Harris"],"pdf_url":"https://arxiv.org/pdf/2411.08171v1.pdf","comment":"In Proc. of the 2024 IEEE International Conference On Intelligent\n Computing in Data Sciences"},{"id":"http://arxiv.org/abs/2411.08164v1","updated":"2024-11-12T20:15:32Z","published":"2024-11-12T20:15:32Z","title":"EAPCR: A Universal Feature Extractor for Scientific Data without\n Explicit Feature Relation Patterns","summary":" Conventional methods, including Decision Tree (DT)-based methods, have been\neffective in scientific tasks, such as non-image medical diagnostics, system\nanomaly detection, and inorganic catalysis efficiency prediction. However, most\ndeep-learning techniques have struggled to surpass or even match this level of\nsuccess as traditional machine-learning methods. The primary reason is that\nthese applications involve multi-source, heterogeneous data where features lack\nexplicit relationships. This contrasts with image data, where pixels exhibit\nspatial relationships; textual data, where words have sequential dependencies;\nand graph data, where nodes are connected through established associations. The\nabsence of explicit Feature Relation Patterns (FRPs) presents a significant\nchallenge for deep learning techniques in scientific applications that are not\nimage, text, and graph-based. In this paper, we introduce EAPCR, a universal\nfeature extractor designed for data without explicit FRPs. Tested across\nvarious scientific tasks, EAPCR consistently outperforms traditional methods\nand bridges the gap where deep learning models fall short. 
To further\ndemonstrate its robustness, we synthesize a dataset without explicit FRPs.\nWhile Kolmogorov-Arnold Network (KAN) and feature extractors like Convolutional\nNeural Networks (CNNs), Graph Convolutional Networks (GCNs), and Transformers\nstruggle, EAPCR excels, demonstrating its robustness and superior performance\nin scientific tasks without FRPs.\n","authors":["Zhuohang Yu","Ling An","Yansong Li","Yu Wu","Zeyu Dong","Zhangdi Liu","Le Gao","Zhenyu Zhang","Chichun Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.08164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08158v1","updated":"2024-11-12T20:07:59Z","published":"2024-11-12T20:07:59Z","title":"TomoGRAF: A Robust and Generalizable Reconstruction Network for\n Single-View Computed Tomography","summary":" Computed tomography (CT) provides high spatial resolution visualization of 3D\nstructures for scientific and clinical applications. Traditional\nanalytical/iterative CT reconstruction algorithms require hundreds of angular\ndata samplings, a condition that may not be met in practice due to physical and\nmechanical limitations. Sparse view CT reconstruction has been proposed using\nconstrained optimization and machine learning methods with varying success,\nless so for ultra-sparse view CT reconstruction with one to two views. Neural\nradiance field (NeRF) is a powerful tool for reconstructing and rendering 3D\nnatural scenes from sparse views, but its direct application to 3D medical\nimage reconstruction has been minimally successful due to the differences\nbetween optical and X-ray photon transportation. Here, we develop a novel\nTomoGRAF framework incorporating the unique X-ray transportation physics to\nreconstruct high-quality 3D volumes using ultra-sparse projections without\nprior. TomoGRAF captures the CT imaging geometry, simulates the X-ray casting\nand tracing process, and penalizes the difference between simulated and ground\ntruth CT sub-volume during training. 
We evaluated the performance of TomoGRAF\non an unseen dataset of distinct imaging characteristics from the training data\nand demonstrated a vast leap in performance compared with state-of-the-art deep\nlearning and NeRF methods. TomoGRAF provides the first generalizable solution\nfor image-guided radiotherapy and interventional radiology applications, where\nonly one or a few X-ray views are available, but 3D volumetric information is\ndesired.\n","authors":["Di Xu","Yang Yang","Hengjie Liu","Qihui Lyu","Martina Descovich","Dan Ruan","Ke Sheng"],"pdf_url":"https://arxiv.org/pdf/2411.08158v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.05249v5","updated":"2024-11-12T19:59:51Z","published":"2024-10-07T17:52:56Z","title":"LoTLIP: Improving Language-Image Pre-training for Long Text\n Understanding","summary":" Understanding long text is of great demands in practice but beyond the reach\nof most language-image pre-training (LIP) models. In this work, we empirically\nconfirm that the key reason causing such an issue is that the training images\nare usually paired with short captions, leaving certain tokens easily\novershadowed by salient tokens. Towards this problem, our initial attempt is to\nrelabel the data with long captions, however, directly learning with which may\nlead to performance degradation in understanding short text (e.g., in the image\nclassification task). Then, after incorporating corner tokens to aggregate\ndiverse textual information, we manage to help the model catch up to its\noriginal level of short text understanding yet greatly enhance its capability\nof long text understanding. We further look into whether the model can\ncontinuously benefit from longer captions and notice a clear trade-off between\nthe performance and the efficiency. Finally, we validate the effectiveness of\nour approach using a self-constructed large-scale dataset, which consists of\n100M long caption oriented text-image pairs. 
Our method demonstrates superior\nperformance in long-text-image retrieval tasks. The project page is available\nat https://wuw2019.github.io/lot-lip.\n","authors":["Wei Wu","Kecheng Zheng","Shuailei Ma","Fan Lu","Yuxin Guo","Yifei Zhang","Wei Chen","Qingpei Guo","Yujun Shen","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2410.05249v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08128v1","updated":"2024-11-12T19:12:12Z","published":"2024-11-12T19:12:12Z","title":"CameraHMR: Aligning People with Perspective","summary":" We address the challenge of accurate 3D human pose and shape estimation from\nmonocular images. The key to accuracy and robustness lies in high-quality\ntraining data. Existing training datasets containing real images with pseudo\nground truth (pGT) use SMPLify to fit SMPL to sparse 2D joint locations,\nassuming a simplified camera with default intrinsics. We make two contributions\nthat improve pGT accuracy. First, to estimate camera intrinsics, we develop a\nfield-of-view prediction model (HumanFoV) trained on a dataset of images\ncontaining people. We use the estimated intrinsics to enhance the 4D-Humans\ndataset by incorporating a full perspective camera model during SMPLify\nfitting. Second, 2D joints provide limited constraints on 3D body shape,\nresulting in average-looking bodies. To address this, we use the BEDLAM dataset\nto train a dense surface keypoint detector. We apply this detector to the\n4D-Humans dataset and modify SMPLify to fit the detected keypoints, resulting\nin significantly more realistic body shapes. Finally, we upgrade the HMR2.0\narchitecture to include the estimated camera parameters. We iterate model\ntraining and SMPLify fitting initialized with the previously trained model.\nThis leads to more accurate pGT and a new model, CameraHMR, with\nstate-of-the-art accuracy. Code and pGT are available for research purposes.\n","authors":["Priyanka Patel","Michael J. 
Black"],"pdf_url":"https://arxiv.org/pdf/2411.08128v1.pdf","comment":"3DV 2025"},{"id":"http://arxiv.org/abs/2411.08127v1","updated":"2024-11-12T19:09:45Z","published":"2024-11-12T19:09:45Z","title":"TIPO: Text to Image with Text Presampling for Prompt Optimization","summary":" TIPO (Text to Image with text pre-sampling for Prompt Optimization) is an\ninnovative framework designed to enhance text-to-image (T2I) generation by\nlanguage model (LM) for automatic prompt engineering. By refining and extending\nuser-provided prompts, TIPO bridges the gap between simple inputs and the\ndetailed prompts required for high-quality image generation. Unlike previous\napproaches that rely on Large Language Models (LLMs) or reinforcement learning\n(RL), TIPO adjusts user input prompts with the distribution of a trained prompt\ndataset, eliminating the need for complex runtime cost via lightweight model.\nThis pre-sampling approach enables efficient and scalable prompt optimization,\ngrounded in the model's training distribution. Experimental results demonstrate\nTIPO's effectiveness in improving aesthetic scores, reducing image corruption,\nand better aligning generated images with dataset distributions. These findings\nhighlight the critical role of prompt engineering in T2I systems and open\navenues for broader applications of automatic prompt refinement.\n","authors":["Shih-Ying Yeh","Sang-Hyun Park","Giyeong Oh","Min Song","Youngjae Yu"],"pdf_url":"https://arxiv.org/pdf/2411.08127v1.pdf","comment":"21 pages, 13 figures"},{"id":"http://arxiv.org/abs/2411.08037v1","updated":"2024-11-12T18:59:59Z","published":"2024-11-12T18:59:59Z","title":"Material Transforms from Disentangled NeRF Representations","summary":" In this paper, we first propose a novel method for transferring material\ntransformations across different scenes. 
Building on disentangled Neural\nRadiance Field (NeRF) representations, our approach learns to map Bidirectional\nReflectance Distribution Functions (BRDF) from pairs of scenes observed in\nvarying conditions, such as dry and wet. The learned transformations can then\nbe applied to unseen scenes with similar materials, therefore effectively\nrendering the transformation learned with an arbitrary level of intensity.\nExtensive experiments on synthetic scenes and real-world objects validate the\neffectiveness of our approach, showing that it can learn various\ntransformations such as wetness, painting, coating, etc. Our results highlight\nnot only the versatility of our method but also its potential for practical\napplications in computer graphics. We publish our method implementation, along\nwith our synthetic/real datasets on\nhttps://github.com/astra-vision/BRDFTransform\n","authors":["Ivan Lopes","Jean-François Lalonde","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2411.08037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08033v1","updated":"2024-11-12T18:59:32Z","published":"2024-11-12T18:59:32Z","title":"GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D\n Generation","summary":" While 3D content generation has advanced significantly, existing methods\nstill face challenges with input formats, latent space design, and output\nrepresentations. This paper introduces a novel 3D generation framework that\naddresses these challenges, offering scalable, high-quality 3D generation with\nan interactive Point Cloud-structured Latent space. Our framework employs a\nVariational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal)\nrenderings as input, using a unique latent space design that preserves 3D shape\ninformation, and incorporates a cascaded latent diffusion model for improved\nshape-texture disentanglement. 
The proposed method, GaussianAnything, supports\nmulti-modal conditional 3D generation, allowing for point cloud, caption, and\nsingle/multi-view image inputs. Notably, the newly proposed latent space\nnaturally enables geometry-texture disentanglement, thus allowing 3D-aware\nediting. Experimental results demonstrate the effectiveness of our approach on\nmultiple datasets, outperforming existing methods in both text- and\nimage-conditioned 3D generation.\n","authors":["Yushi Lan","Shangchen Zhou","Zhaoyang Lyu","Fangzhou Hong","Shuai Yang","Bo Dai","Xingang Pan","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2411.08033v1.pdf","comment":"project page: https://nirvanalan.github.io/projects/GA/"},{"id":"http://arxiv.org/abs/2411.08027v1","updated":"2024-11-12T18:56:58Z","published":"2024-11-12T18:56:58Z","title":"LLMPhy: Complex Physical Reasoning Using Large Language Models and World\n Models","summary":" Physical reasoning is an important skill needed for robotic agents when\noperating in the real world. However, solving such reasoning problems often\ninvolves hypothesizing and reflecting over complex multi-body interactions\nunder the effect of a multitude of physical forces and thus learning all such\ninteractions poses a significant hurdle for state-of-the-art machine learning\nframeworks, including large language models (LLMs). To study this problem, we\npropose a new physical reasoning task and a dataset, dubbed TraySim. Our task\ninvolves predicting the dynamics of several objects on a tray that is given an\nexternal impact -- the domino effect of the ensued object interactions and\ntheir dynamics thus offering a challenging yet controlled setup, with the goal\nof reasoning being to infer the stability of the objects after the impact. 
To\nsolve this complex physical reasoning task, we present LLMPhy, a zero-shot\nblack-box optimization framework that leverages the physics knowledge and\nprogram synthesis abilities of LLMs, and synergizes these abilities with the\nworld models built into modern physics engines. Specifically, LLMPhy uses an\nLLM to generate code to iteratively estimate the physical hyperparameters of\nthe system (friction, damping, layout, etc.) via an implicit\nanalysis-by-synthesis approach using a (non-differentiable) simulator in the\nloop and uses the inferred parameters to imagine the dynamics of the scene\ntowards solving the reasoning task. To show the effectiveness of LLMPhy, we\npresent experiments on our TraySim dataset to predict the steady-state poses of\nthe objects. Our results show that the combination of the LLM and the physics\nengine leads to state-of-the-art zero-shot physical reasoning performance,\nwhile demonstrating superior convergence against standard black-box\noptimization methods and better estimation of the physical parameters.\n","authors":["Anoop Cherian","Radu Corcodel","Siddarth Jain","Diego Romeres"],"pdf_url":"https://arxiv.org/pdf/2411.08027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17245v6","updated":"2024-11-12T18:50:19Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advances in real-time neural rendering using point-based techniques\nhave enabled broader adoption of 3D representations. However, foundational\napproaches like 3D Gaussian Splatting impose substantial storage overhead, as\nStructure-from-Motion (SfM) points can grow to millions, often requiring\ngigabyte-level disk space for a single unbounded scene. This growth presents\nscalability challenges and hinders splatting efficiency. To address this, we\nintroduce LightGaussian, a method for transforming 3D Gaussians into a more\ncompact format. 
Inspired by Network Pruning, LightGaussian identifies Gaussians\nwith minimal global significance on scene reconstruction, and applies a pruning\nand recovery process to reduce redundancy while preserving visual quality.\nKnowledge distillation and pseudo-view augmentation then transfer spherical\nharmonic coefficients to a lower degree, yielding compact representations.\nGaussian Vector Quantization, based on each Gaussian's global significance,\nfurther lowers bitwidth with minimal accuracy loss. LightGaussian achieves an\naverage 15x compression rate while boosting FPS from 144 to 237 within the\n3D-GS framework, enabling efficient complex scene representation on the\nMip-NeRF 360 and Tank & Temple datasets. The proposed Gaussian pruning approach\nis also adaptable to other 3D representations (e.g., Scaffold-GS),\ndemonstrating strong generalization capabilities.\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v6.pdf","comment":"NeurIPS 2024, Project page: https://lightgaussian.github.io/"},{"id":"http://arxiv.org/abs/2411.08017v1","updated":"2024-11-12T18:49:06Z","published":"2024-11-12T18:49:06Z","title":"Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model\n with Compact Wavelet Encodings","summary":" Large-scale 3D generative models require substantial computational resources\nyet often fall short in capturing fine details and complex geometries at high\nresolutions. We attribute this limitation to the inefficiency of current\nrepresentations, which lack the compactness required to model the generative\nmodels effectively. To address this, we introduce a novel approach called\nWavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based,\ncompact latent encodings. Specifically, we compress a $256^3$ signed distance\nfield into a $12^3 \\times 4$ latent grid, achieving an impressive 2427x\ncompression ratio with minimal loss of detail. 
This high level of compression\nallows our method to efficiently train large-scale generative networks without\nincreasing the inference time. Our models, both conditional and unconditional,\ncontain approximately one billion parameters and successfully generate\nhigh-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid\ninference, producing shapes within two to four seconds depending on the\ncondition, despite the model's scale. We demonstrate state-of-the-art\nperformance across multiple datasets, with significant improvements in\ngeneration quality, diversity, and computational efficiency. We open-source our\ncode and, to the best of our knowledge, release the largest pretrained 3D\ngenerative models across different modalities.\n","authors":["Aditya Sanghi","Aliasghar Khani","Pradyumna Reddy","Arianna Rampini","Derek Cheung","Kamal Rahimi Malekshan","Kanika Madan","Hooman Shayani"],"pdf_url":"https://arxiv.org/pdf/2411.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.20099v2","updated":"2024-11-12T18:46:33Z","published":"2024-06-28T17:59:51Z","title":"Odd-One-Out: Anomaly Detection by Comparing with Neighbors","summary":" This paper introduces a novel anomaly detection (AD) problem that focuses on\nidentifying `odd-looking' objects relative to the other instances in a given\nscene. In contrast to the traditional AD benchmarks, anomalies in our task are\nscene-specific, defined by the regular instances that make up the majority.\nSince object instances may be only partly visible from a single viewpoint, our\nsetting employs multiple views of each scene as input. To provide a testbed for\nfuture research in this task, we introduce two benchmarks, ToysAD-8K and\nPartsAD-15K. We propose a novel method that constructs 3D object-centric\nrepresentations from multiple 2D views for each instance and detects the\nanomalous ones through a cross-instance comparison. 
We rigorously analyze our\nmethod quantitatively and qualitatively on the presented benchmarks.\n","authors":["Ankan Bhunia","Changjian Li","Hakan Bilen"],"pdf_url":"https://arxiv.org/pdf/2406.20099v2.pdf","comment":"Codes & Dataset at https://github.com/VICO-UoE/OddOneOutAD"},{"id":"http://arxiv.org/abs/2411.08014v1","updated":"2024-11-12T18:44:13Z","published":"2024-11-12T18:44:13Z","title":"Artistic Neural Style Transfer Algorithms with Activation Smoothing","summary":" The works of Gatys et al. demonstrated the capability of Convolutional Neural\nNetworks (CNNs) in creating artistic style images. This process of transferring\ncontent images in different styles is called Neural Style Transfer (NST). In\nthis paper, we re-implement image-based NST, fast NST, and arbitrary NST. We\nalso explore to utilize ResNet with activation smoothing in NST. Extensive\nexperimental results demonstrate that smoothing transformation can greatly\nimprove the quality of stylization results.\n","authors":["Xiangtian Li","Han Cao","Zhaoyang Zhang","Jiacheng Hu","Yuhui Jin","Zihao Zhao"],"pdf_url":"https://arxiv.org/pdf/2411.08014v1.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2411.07975v1","updated":"2024-11-12T17:55:10Z","published":"2024-11-12T17:55:10Z","title":"JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified\n Multimodal Understanding and Generation","summary":" We present JanusFlow, a powerful framework that unifies image understanding\nand generation in a single model. JanusFlow introduces a minimalist\narchitecture that integrates autoregressive language models with rectified\nflow, a state-of-the-art method in generative modeling. Our key finding\ndemonstrates that rectified flow can be straightforwardly trained within the\nlarge language model framework, eliminating the need for complex architectural\nmodifications. 
To further improve the performance of our unified model, we\nadopt two key strategies: (i) decoupling the understanding and generation\nencoders, and (ii) aligning their representations during unified training.\nExtensive experiments show that JanusFlow achieves comparable or superior\nperformance to specialized models in their respective domains, while\nsignificantly outperforming existing unified approaches across standard\nbenchmarks. This work represents a step toward more efficient and versatile\nvision-language models.\n","authors":["Yiyang Ma","Xingchao Liu","Xiaokang Chen","Wen Liu","Chengyue Wu","Zhiyu Wu","Zizheng Pan","Zhenda Xie","Haowei Zhang","Xingkai yu","Liang Zhao","Yisong Wang","Jiaying Liu","Chong Ruan"],"pdf_url":"https://arxiv.org/pdf/2411.07975v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07956v1","updated":"2024-11-12T17:31:51Z","published":"2024-11-12T17:31:51Z","title":"Commissioning An All-Sky Infrared Camera Array for Detection Of Airborne\n Objects","summary":" To date there is little publicly available scientific data on Unidentified\nAerial Phenomena (UAP) whose properties and kinematics purportedly reside\noutside the performance envelope of known phenomena. To address this\ndeficiency, the Galileo Project is designing, building, and commissioning a\nmulti-modal ground-based observatory to continuously monitor the sky and\nconduct a rigorous long-term aerial census of all aerial phenomena, including\nnatural and human-made. One of the key instruments is an all-sky infrared\ncamera array using eight uncooled long-wave infrared FLIR Boson 640 cameras.\nTheir calibration includes a novel extrinsic calibration method using airplane\npositions from Automatic Dependent Surveillance-Broadcast (ADS-B) data. We\nestablish a first baseline for the system performance over five months of field\noperation, using a real-world dataset derived from ADS-B data, synthetic 3-D\ntrajectories, and a hand-labelled real-world dataset. 
We report acceptance\nrates (e.g. viewable airplanes that are recorded) and detection efficiencies\n(e.g. recorded airplanes which are successfully detected) for a variety of\nweather conditions, range and aircraft size. We reconstruct $\\sim$500,000\ntrajectories of aerial objects from this commissioning period. A toy outlier\nsearch focused on large sinuosity of the 2-D reconstructed trajectories flags\nabout 16% of trajectories as outliers. After manual review, 144 trajectories\nremain ambiguous: they are likely mundane objects but cannot be elucidated at\nthis stage of development without distance and kinematics estimation or other\nsensor modalities. Our observed count of ambiguous outliers combined with\nsystematic uncertainties yields an upper limit of 18,271 outliers count for the\nfive-month interval at a 95% confidence level. This likelihood-based method to\nevaluate significance is applicable to all of our future outlier searches.\n","authors":["Laura Dominé","Ankit Biswas","Richard Cloete","Alex Delacroix","Andriy Fedorenko","Lucas Jacaruso","Ezra Kelderman","Eric Keto","Sarah Little","Abraham Loeb","Eric Masson","Mike Prior","Forrest Schultz","Matthew Szenher","Wes Watters","Abby White"],"pdf_url":"https://arxiv.org/pdf/2411.07956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07945v1","updated":"2024-11-12T17:17:33Z","published":"2024-11-12T17:17:33Z","title":"SimBase: A Simple Baseline for Temporal Video Grounding","summary":" This paper presents SimBase, a simple yet effective baseline for temporal\nvideo grounding. While recent advances in temporal grounding have led to\nimpressive performance, they have also driven network architectures toward\ngreater complexity, with a range of methods to (1) capture temporal\nrelationships and (2) achieve effective multimodal fusion. In contrast, this\npaper explores the question: How effective can a simplified approach be? 
To\ninvestigate, we design SimBase, a network that leverages lightweight,\none-dimensional temporal convolutional layers instead of complex temporal\nstructures. For cross-modal interaction, SimBase only employs an element-wise\nproduct instead of intricate multimodal fusion. Remarkably, SimBase achieves\nstate-of-the-art results on two large-scale datasets. As a simple yet powerful\nbaseline, we hope SimBase will spark new ideas and streamline future\nevaluations in temporal video grounding.\n","authors":["Peijun Bao","Alex C. Kot"],"pdf_url":"https://arxiv.org/pdf/2411.07945v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2411.07941v1","updated":"2024-11-12T17:11:18Z","published":"2024-11-12T17:11:18Z","title":"DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with\n Generative Adversarial Networks","summary":" Computed tomography (CT) provides highly detailed three-dimensional (3D)\nmedical images but is costly, time-consuming, and often inaccessible in\nintraoperative settings (Organization et al. 2011). Recent advancements have\nexplored reconstructing 3D chest volumes from sparse 2D X-rays, such as\nsingle-view or orthogonal double-view images. However, current models tend to\nprocess 2D images in a planar manner, prioritizing visual realism over\nstructural accuracy. In this work, we introduce DuoLift Generative Adversarial\nNetworks (DuoLift-GAN), a novel architecture with dual branches that\nindependently elevate 2D images and their features into 3D representations.\nThese 3D outputs are merged into a unified 3D feature map and decoded into a\ncomplete 3D chest volume, enabling richer 3D information capture. We also\npresent a masked loss function that directs reconstruction towards critical\nanatomical regions, improving structural accuracy and visual quality. 
This\npaper demonstrates that DuoLift-GAN significantly enhances reconstruction\naccuracy while achieving superior visual realism compared to existing methods.\n","authors":["Zhaoxi Zhang","Yueliang Ying"],"pdf_url":"https://arxiv.org/pdf/2411.07941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07936v1","updated":"2024-11-12T17:05:18Z","published":"2024-11-12T17:05:18Z","title":"Learning Disentangled Representations for Perceptual Point Cloud Quality\n Assessment via Mutual Information Minimization","summary":" No-Reference Point Cloud Quality Assessment (NR-PCQA) aims to objectively\nassess the human perceptual quality of point clouds without relying on\npristine-quality point clouds for reference. It is becoming increasingly\nsignificant with the rapid advancement of immersive media applications such as\nvirtual reality (VR) and augmented reality (AR). However, current NR-PCQA\nmodels attempt to indiscriminately learn point cloud content and distortion\nrepresentations within a single network, overlooking their distinct\ncontributions to quality information. To address this issue, we propose DisPA,\na novel disentangled representation learning framework for NR-PCQA. The\nframework trains a dual-branch disentanglement network to minimize mutual\ninformation (MI) between representations of point cloud content and distortion.\nSpecifically, to fully disentangle representations, the two branches adopt\ndifferent philosophies: the content-aware encoder is pretrained by a masked\nauto-encoding strategy, which can allow the encoder to capture semantic\ninformation from rendered images of distorted point clouds; the\ndistortion-aware encoder takes a mini-patch map as input, which forces the\nencoder to focus on low-level distortion patterns. Furthermore, we utilize an\nMI estimator to estimate the tight upper bound of the actual MI and further\nminimize it to achieve explicit representation disentanglement. 
Extensive\nexperimental results demonstrate that DisPA outperforms state-of-the-art\nmethods on multiple PCQA datasets.\n","authors":["Ziyu Shan","Yujie Zhang","Yipeng Liu","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08085v1","updated":"2024-11-12T16:52:51Z","published":"2024-11-12T16:52:51Z","title":"Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation,\n Embrace Orthogonality","summary":" We introduce a yat-product-powered neural network, the Neural Matter Network\n(NMN), a breakthrough in deep learning that achieves non-linear pattern\nrecognition without activation functions. Our key innovation relies on the\nyat-product and yat-product, which naturally induces non-linearity by\nprojecting inputs into a pseudo-metric space, eliminating the need for\ntraditional activation functions while maintaining only a softmax layer for\nfinal class probability distribution. This approach simplifies network\narchitecture and provides unprecedented transparency into the network's\ndecision-making process. Our comprehensive empirical evaluation across\ndifferent datasets demonstrates that NMN consistently outperforms traditional\nMLPs. The results challenge the assumption that separate activation functions\nare necessary for effective deep-learning models. The implications of this work\nextend beyond immediate architectural benefits, by eliminating intermediate\nactivation functions while preserving non-linear capabilities, yat-MLP\nestablishes a new paradigm for neural network design that combines simplicity\nwith effectiveness. 
Most importantly, our approach provides unprecedented\ninsights into the traditionally opaque \"black-box\" nature of neural networks,\noffering a clearer understanding of how these models process and classify\ninformation.\n","authors":["Taha Bouhsine"],"pdf_url":"https://arxiv.org/pdf/2411.08085v1.pdf","comment":"Submitted to CVPR 2025"},{"id":"http://arxiv.org/abs/2411.07918v1","updated":"2024-11-12T16:50:13Z","published":"2024-11-12T16:50:13Z","title":"Isometric Transformations for Image Augmentation in Mueller Matrix\n Polarimetry","summary":" Mueller matrix polarimetry captures essential information about polarized\nlight interactions with a sample, presenting unique challenges for data\naugmentation in deep learning due to its distinct structure. While\naugmentations are an effective and affordable way to enhance dataset diversity\nand reduce overfitting, standard transformations like rotations and flips do\nnot preserve the polarization properties in Mueller matrix images. To this end,\nwe introduce a versatile simulation framework that applies physically\nconsistent rotations and flips to Mueller matrices, tailored to maintain\npolarization fidelity. Our experimental results across multiple datasets reveal\nthat conventional augmentations can lead to misleading results when applied to\npolarimetric data, underscoring the necessity of our physics-based approach. In\nour experiments, we first compare our polarization-specific augmentations\nagainst real-world captures to validate their physical consistency. We then\napply these augmentations in a semantic segmentation task, achieving\nsubstantial improvements in model generalization and performance. This study\nunderscores the necessity of physics-informed data augmentation for\npolarimetric imaging in deep learning (DL), paving the way for broader adoption\nand more robust applications across diverse research in the field. 
In\nparticular, our framework unlocks the potential of DL models for polarimetric\ndatasets with limited sample sizes. Our code implementation is available at\ngithub.com/hahnec/polar_augment.\n","authors":["Christopher Hahne","Omar Rodriguez-Nunez","Éléa Gros","Théotim Lucas","Ekkehard Hewer","Tatiana Novikova","Theoni Maragkou","Philippe Schucht","Richard McKinley"],"pdf_url":"https://arxiv.org/pdf/2411.07918v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2411.05747v3","updated":"2024-11-12T16:42:02Z","published":"2024-11-08T18:08:33Z","title":"WavShadow: Wavelet Based Shadow Segmentation and Removal","summary":" Shadow removal and segmentation remain challenging tasks in computer vision,\nparticularly in complex real world scenarios. This study presents a novel\napproach that enhances the ShadowFormer model by incorporating Masked\nAutoencoder (MAE) priors and Fast Fourier Convolution (FFC) blocks, leading to\nsignificantly faster convergence and improved performance. We introduce key\ninnovations: (1) integration of MAE priors trained on Places2 dataset for\nbetter context understanding, (2) adoption of Haar wavelet features for\nenhanced edge detection and multiscale analysis, and (3) implementation of a\nmodified SAM Adapter for robust shadow segmentation. 
Extensive experiments on\nthe challenging DESOBA dataset demonstrate that our approach achieves state of\nthe art results, with notable improvements in both convergence speed and shadow\nremoval quality.\n","authors":["Shreyans Jain","Viraj Vekaria","Karan Gandhi","Aadya Arora"],"pdf_url":"https://arxiv.org/pdf/2411.05747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.07155v2","updated":"2024-11-12T16:39:29Z","published":"2024-05-12T04:18:10Z","title":"Meta-Learned Modality-Weighted Knowledge Distillation for Robust\n Multi-Modal Learning with Missing Data","summary":" In multi-modal learning, some modalities are more influential than others,\nand their absence can have a significant impact on classification/segmentation\naccuracy. Addressing this challenge, we propose a novel approach called\nMeta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables\nmulti-modal models to maintain high accuracy even when key modalities are\nmissing. MetaKD adaptively estimates the importance weight of each modality\nthrough a meta-learning process. These learned importance weights guide a\npairwise modality-weighted knowledge distillation process, allowing\nhigh-importance modalities to transfer knowledge to lower-importance ones,\nresulting in robust performance despite missing inputs. Unlike previous methods\nin the field, which are often task-specific and require significant\nmodifications, our approach is designed to work in multiple tasks (e.g.,\nsegmentation and classification) with minimal adaptation. 
Experimental results\non five prevalent datasets, including three Brain Tumor Segmentation datasets\n(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging\nInitiative (ADNI) classification dataset and the Audiovision-MNIST\nclassification dataset, demonstrate the proposed model is able to outperform\nthe compared models by a large margin.\n","authors":["Hu Wang","Salma Hassan","Yuyuan Liu","Congbo Ma","Yuanhong Chen","Yutong Xie","Mostafa Salem","Yu Tian","Jodie Avery","Louise Hull","Ian Reid","Mohammad Yaqub","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2405.07155v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07901v1","updated":"2024-11-12T16:15:25Z","published":"2024-11-12T16:15:25Z","title":"TLDR: Traffic Light Detection using Fourier Domain Adaptation in Hostile\n WeatheR","summary":" The scarcity of comprehensive datasets in the traffic light detection and\nrecognition domain and the poor performance of state-of-the-art models under\nhostile weather conditions present significant challenges. To address these\nissues, this paper proposes a novel approach by merging two widely used\ndatasets, LISA and S2TLD. The merged dataset is further processed to tackle\nclass imbalance, a common problem in this domain. This merged dataset becomes\nour source domain. Synthetic rain and fog are added to the dataset to create\nour target domain. We employ Fourier Domain Adaptation (FDA) to create a final\ndataset with a minimized domain gap between the two datasets, helping the model\ntrained on this final dataset adapt to rainy and foggy weather conditions.\nAdditionally, we explore Semi-Supervised Learning (SSL) techniques to leverage\nthe available data more effectively. Experimental results demonstrate that\nmodels trained on FDA-augmented images outperform those trained without FDA\nacross confidence-dependent and independent metrics, like mAP50, mAP50-95,\nPrecision, and Recall. 
The best-performing model, YOLOv8, achieved a Precision\nincrease of 5.1860%, Recall increase of 14.8009%, mAP50 increase of 9.5074%,\nand mAP50-95 increase of 19.5035%. On average, percentage increases of 7.6892%\nin Precision, 19.9069% in Recall, 15.8506% in mAP50, and 23.8099% in mAP50-95\nwere observed across all models, highlighting the effectiveness of FDA in\nmitigating the impact of adverse weather conditions on model performance. These\nimprovements pave the way for real-world applications where reliable\nperformance in challenging environmental conditions is critical.\n","authors":["Ishaan Gakhar","Aryesh Guha","Aryaman Gupta","Amit Agarwal","Durga Toshniwal","Ujjwal Verma"],"pdf_url":"https://arxiv.org/pdf/2411.07901v1.pdf","comment":"Under Review at IEEE Transactions of Artificial Intelligence. 10\n Pages, 7 Figures"},{"id":"http://arxiv.org/abs/2411.07899v1","updated":"2024-11-12T16:12:51Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. 
In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Ho","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07893v1","updated":"2024-11-12T15:58:09Z","published":"2024-11-12T15:58:09Z","title":"Joint multi-dimensional dynamic attention and transformer for general\n image restoration","summary":" Outdoor images often suffer from severe degradation due to rain, haze, and\nnoise, impairing image quality and challenging high-level tasks. Current image\nrestoration methods struggle to handle complex degradation while maintaining\nefficiency. This paper introduces a novel image restoration architecture that\ncombines multi-dimensional dynamic attention and self-attention within a U-Net\nframework. To leverage the global modeling capabilities of transformers and the\nlocal modeling capabilities of convolutions, we integrate sole CNNs in the\nencoder-decoder and sole transformers in the latent layer. 
Additionally, we\ndesign convolutional kernels with selected multi-dimensional dynamic attention\nto capture diverse degraded inputs efficiently. A transformer block with\ntransposed self-attention further enhances global feature extraction while\nmaintaining efficiency. Extensive experiments demonstrate that our method\nachieves a better balance between performance and computational complexity\nacross five image restoration tasks: deraining, deblurring, denoising,\ndehazing, and enhancement, as well as superior performance for high-level\nvision tasks. The source code will be available at\nhttps://github.com/House-yuyu/MDDA-former.\n","authors":["Huan Zhang","Xu Zhang","Nian Cai","Jianglei Di","Yun Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07885v1","updated":"2024-11-12T15:47:17Z","published":"2024-11-12T15:47:17Z","title":"INTRABENCH: Interactive Radiological Benchmark","summary":" Current interactive segmentation approaches, inspired by the success of\nMETA's Segment Anything model, have achieved notable advancements, however,\nthey come with substantial limitations that hinder their practical application\nin real clinical scenarios. These include unrealistic human interaction\nrequirements, such as slice-by-slice operations for 2D models on 3D data, a\nlack of iterative refinement, and insufficient evaluation experiments. These\nshortcomings prevent accurate assessment of model performance and lead to\ninconsistent outcomes across studies. IntRaBench overcomes these challenges by\noffering a comprehensive and reproducible framework for evaluating interactive\nsegmentation methods in realistic, clinically relevant scenarios. It includes\ndiverse datasets, target structures, and segmentation models, and provides a\nflexible codebase that allows seamless integration of new models and prompting\nstrategies. 
Additionally, we introduce advanced techniques to minimize\nclinician interaction, ensuring fair comparisons between 2D and 3D models. By\nopen-sourcing IntRaBench, we invite the research community to integrate their\nmodels and prompting techniques, ensuring continuous and transparent evaluation\nof interactive segmentation models in 3D medical imaging.\n","authors":["Constantin Ulrich","Tassilo Wald","Emily Tempus","Maximilian Rokuss","Paul F. Jaeger","Klaus Maier-Hein"],"pdf_url":"https://arxiv.org/pdf/2411.07885v1.pdf","comment":"Undergoing Peer-Review"},{"id":"http://arxiv.org/abs/2411.07873v1","updated":"2024-11-12T15:29:50Z","published":"2024-11-12T15:29:50Z","title":"Diverse capability and scaling of diffusion and auto-regressive models\n when learning abstract rules","summary":" Humans excel at discovering regular structures from limited samples and\napplying inferred rules to novel settings. We investigate whether modern\ngenerative models can similarly learn underlying rules from finite samples and\nperform reasoning through conditional sampling. Inspired by Raven's Progressive\nMatrices task, we designed GenRAVEN dataset, where each sample consists of\nthree rows, and one of 40 relational rules governing the object position,\nnumber, or attributes applies to all rows. We trained generative models to\nlearn the data distribution, where samples are encoded as integer arrays to\nfocus on rule learning. We compared two generative model families: diffusion\n(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their\nability to generate structurally consistent samples and perform panel\ncompletion via unconditional and conditional sampling. We found diffusion\nmodels excel at unconditional generation, producing more novel and consistent\nsamples from scratch and memorizing less, but performing less well in panel\ncompletion, even with advanced conditional sampling methods. 
Conversely,\nautoregressive models excel at completing missing panels in a rule-consistent\nmanner but generate less consistent samples unconditionally. We observe diverse\ndata scaling behaviors: for both model families, rule learning emerges at a\ncertain dataset size - around 1000s examples per rule. With more training data,\ndiffusion models improve both their unconditional and conditional generation\ncapabilities. However, for autoregressive models, while panel completion\nimproves with more training data, unconditional generation consistency\ndeclines. Our findings highlight complementary capabilities and limitations of\ndiffusion and autoregressive models in rule learning and reasoning tasks,\nsuggesting avenues for further research into their mechanisms and potential for\nhuman-like reasoning.\n","authors":["Binxu Wang","Jiaqi Shang","Haim Sompolinsky"],"pdf_url":"https://arxiv.org/pdf/2411.07873v1.pdf","comment":"12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2\n Reasoning At Scale as long paper"},{"id":"http://arxiv.org/abs/2411.07863v1","updated":"2024-11-12T15:22:14Z","published":"2024-11-12T15:22:14Z","title":"CDXFormer: Boosting Remote Sensing Change Detection with Extended Long\n Short-Term Memory","summary":" In complex scenes and varied conditions, effectively integrating\nspatial-temporal context is crucial for accurately identifying changes.\nHowever, current RS-CD methods lack a balanced consideration of performance and\nefficiency. CNNs lack global context, Transformers have quadratic computational\ncomplexity, and Mambas are restricted by CUDA acceleration. 
In this paper, we\npropose CDXFormer, with a core component that is a powerful XLSTM-based feature\nenhancement layer, integrating the advantages of linear computational\ncomplexity, global context perception, and strong interpret-ability.\nSpecifically, we introduce a scale-specific Feature Enhancer layer,\nincorporating a Cross-Temporal Global Perceptron customized for\nsemantic-accurate deep features, and a Cross-Temporal Spatial Refiner\ncustomized for detail-rich shallow features. Additionally, we propose a\nCross-Scale Interactive Fusion module to progressively interact global change\nrepresentations with spatial responses. Extensive experimental results\ndemonstrate that CDXFormer achieves state-of-the-art performance across three\nbenchmark datasets, offering a compelling balance between efficiency and\naccuracy. Code is available at https://github.com/xwmaxwma/rschange.\n","authors":["Zhenkai Wu","Xiaowen Ma","Rongrong Lian","Zhentao Lin","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.07863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.04492v4","updated":"2024-11-12T15:16:36Z","published":"2024-10-06T14:11:39Z","title":"Interpret Your Decision: Logical Reasoning Regularization for\n Generalization in Visual Classification","summary":" Vision models excel in image classification but struggle to generalize to\nunseen data, such as classifying images from unseen domains or discovering\nnovel categories. In this paper, we explore the relationship between logical\nreasoning and deep learning generalization in visual classification. A logical\nregularization termed L-Reg is derived which bridges a logical analysis\nframework to image classification. Our work reveals that L-Reg reduces the\ncomplexity of the model in terms of the feature distribution and classifier\nweights. Specifically, we unveil the interpretability brought by L-Reg, as it\nenables the model to extract the salient features, such as faces to persons,\nfor classification. 
Theoretical analysis and experiments demonstrate that L-Reg\nenhances generalization across various scenarios, including multi-domain\ngeneralization and generalized category discovery. In complex real-world\nscenarios where images span unknown classes and unseen domains, L-Reg\nconsistently improves generalization, highlighting its practical efficacy.\n","authors":["Zhaorui Tan","Xi Yang","Qiufeng Wang","Anh Nguyen","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2410.04492v4.pdf","comment":"Accepted by NeurIPS2024 as Spotlight"},{"id":"http://arxiv.org/abs/2407.06001v2","updated":"2024-11-12T15:14:41Z","published":"2024-07-08T14:53:07Z","title":"Pseudo-triplet Guided Few-shot Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging task that aims to retrieve\nthe target image with a multimodal query, i.e., a reference image, and its\ncomplementary modification text. As previous supervised or zero-shot learning\nparadigms all fail to strike a good trade-off between the model's\ngeneralization ability and retrieval performance, recent researchers have\nintroduced the task of few-shot CIR (FS-CIR) and proposed a textual\ninversion-based network based on pretrained CLIP model to realize it. Despite\nits promising performance, the approach encounters two key limitations: simply\nrelying on the few annotated samples for CIR model training and\nindiscriminately selecting training triplets for CIR model fine-tuning. To\naddress these two limitations, we propose a novel two-stage pseudo triplet\nguided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an\nattentive masking and captioning-based pseudo triplet generation method, to\nconstruct pseudo triplets from pure image data and use them to fulfill the\nCIR-task specific pertaining. 
In the second stage, we propose a challenging\ntriplet-based CIR fine-tuning method, where we design a pseudo modification\ntext-based sample challenging score estimation strategy and a robust top\nrange-based random sampling strategy for sampling robust challenging triplets\nto promote the model fine-tuning. Notably, our scheme is plug-and-play and\ncompatible with any existing supervised CIR models. We test our scheme across\ntwo backbones on three public datasets (i.e., FashionIQ, CIRR, and\nBirds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4%\nrespectively, demonstrating our scheme's efficacy.\n","authors":["Bohan Hou","Haoqiang Lin","Haokun Wen","Meng Liu","Mingzhu Xu","Xuemeng Song"],"pdf_url":"https://arxiv.org/pdf/2407.06001v2.pdf","comment":"10pages"},{"id":"http://arxiv.org/abs/2411.07848v1","updated":"2024-11-12T15:01:40Z","published":"2024-11-12T15:01:40Z","title":"NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric\n VLN","summary":" Landmark-based navigation (e.g. go to the wooden desk) and relative\npositional navigation (e.g. move 5 meters forward) are distinct navigation\nchallenges solved very differently in existing robotics navigation methodology.\nWe present a new dataset, OC-VLN, in order to distinctly evaluate grounding\nobject-centric natural language navigation instructions in a method for\nperforming landmark-based navigation. We also propose Natural Language grounded\nSLAM (NL-SLAM), a method to ground natural language instruction to robot\nobservations and poses. We actively perform NL-SLAM in order to follow\nobject-centric natural language navigation instructions. Our methods leverage\npre-trained vision and language foundation models and require no task-specific\ntraining. 
We construct two strong baselines from state-of-the-art methods on\nrelated tasks, Object Goal Navigation and Vision Language Navigation, and we\nshow that our approach, NL-SLAM, outperforms these baselines across all our\nmetrics of success on OC-VLN. Finally, we successfully demonstrate the\neffectiveness of NL-SLAM for performing navigation instruction following in the\nreal world on a Boston Dynamics Spot robot.\n","authors":["Sonia Raychaudhuri","Duy Ta","Katrina Ashton","Angel X. Chang","Jiuguang Wang","Bernadette Bucher"],"pdf_url":"https://arxiv.org/pdf/2411.07848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.12203v3","updated":"2024-11-12T15:00:37Z","published":"2024-03-18T19:25:57Z","title":"Bootstrapping Reinforcement Learning with Imitation for Vision-Based\n Agile Flight","summary":" Learning visuomotor policies for agile quadrotor flight presents significant\ndifficulties, primarily from inefficient policy exploration caused by\nhigh-dimensional visual inputs and the need for precise and low-latency\ncontrol. To address these challenges, we propose a novel approach that combines\nthe performance of Reinforcement Learning (RL) and the sample efficiency of\nImitation Learning (IL) in the task of vision-based autonomous drone racing.\nWhile RL provides a framework for learning high-performance controllers through\ntrial and error, it faces challenges with sample efficiency and computational\ndemands due to the high dimensionality of visual inputs. Conversely, IL\nefficiently learns from visual expert demonstrations, but it remains limited by\nthe expert's performance and state distribution. To overcome these limitations,\nour policy learning framework integrates the strengths of both approaches. Our\nframework contains three phases: training a teacher policy using RL with\nprivileged state information, distilling it into a student policy via IL, and\nadaptive fine-tuning via RL. 
Testing in both simulated and real-world scenarios\nshows our approach can not only learn in scenarios where RL from scratch fails\nbut also outperforms existing IL methods in both robustness and performance,\nsuccessfully navigating a quadrotor through a race course using only visual\ninformation. Videos of the experiments are available at\nhttps://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html.\n","authors":["Jiaxu Xing","Angel Romero","Leonard Bauersfeld","Davide Scaramuzza"],"pdf_url":"https://arxiv.org/pdf/2403.12203v3.pdf","comment":"8th Annual Conference on Robot Learning (CoRL)"},{"id":"http://arxiv.org/abs/2401.11796v2","updated":"2024-11-12T14:58:37Z","published":"2024-01-22T09:53:20Z","title":"REVEX: A Unified Framework for Removal-Based Explainable Artificial\n Intelligence in Video","summary":" We developed REVEX, a removal-based video explanations framework. This work\nextends fine-grained explanation frameworks for computer vision data and adapts\nsix existing techniques to video by adding temporal information and local\nexplanations. The adapted methods were evaluated across networks, datasets,\nimage classes, and evaluation metrics. By decomposing explanation into steps,\nstrengths and weaknesses were revealed in the studied methods, for example, on\npixel clustering and perturbations in the input. Video LIME outperformed other\nmethods with deletion values up to 31\\% lower and insertion up to 30\\% higher,\ndepending on method and network. Video RISE achieved superior performance in\nthe average drop metric, with values 10\\% lower. In contrast,\nlocalization-based metrics revealed low performance across all methods, with\nsignificant variation depending on network. Pointing game accuracy reached\n53\\%, and IoU-based metrics remained below 20\\%. Drawing on the findings across\nXAI methods, we further examine the limitations of the employed XAI evaluation\nmetrics and highlight their suitability in different applications.\n","authors":["F. 
Xavier Gaya-Morey","Jose M. Buades-Rubio","I. Scott MacKenzie","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11796v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00002v2","updated":"2024-11-12T14:55:50Z","published":"2024-07-10T15:03:00Z","title":"Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against\n DenseNet, ResNet, and VGGNet on a Custom Dataset","summary":" This study evaluates the performance of various deep learning models,\nspecifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species\nclassification on a custom dataset. The dataset comprises 575 images of 23\nendangered species sourced from reputable online repositories. The study\nutilizes transfer learning to fine-tune pre-trained models on the dataset,\nfocusing on reducing training time and enhancing classification accuracy. The\nresults demonstrate that YOLOv8 outperforms other models, achieving a training\naccuracy of 97.39% and a validation F1-score of 96.50%. These findings suggest\nthat YOLOv8, with its advanced architecture and efficient feature extraction\ncapabilities, holds great promise for automating wildlife monitoring and\nconservation efforts.\n","authors":["Subek Sharma","Sisir Dhakal","Mansi Bhavsar"],"pdf_url":"https://arxiv.org/pdf/2408.00002v2.pdf","comment":"This is published in Journal of Artificial Intelligence and Capsule\n Networks, December 2024, Volume 6, Issue 4, Pages 415-435"},{"id":"http://arxiv.org/abs/2410.20178v2","updated":"2024-11-12T14:45:18Z","published":"2024-10-26T13:19:57Z","title":"LLMs Can Evolve Continually on Modality for X-Modal Reasoning","summary":" Multimodal Large Language Models (MLLMs) have gained significant attention\ndue to their impressive capabilities in multimodal understanding. However,\nexisting methods rely heavily on extensive modal-specific pretraining and\njoint-modal tuning, leading to significant computational burdens when expanding\nto new modalities. 
In this paper, we propose PathWeave, a flexible and scalable\nframework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs\nto continually EVolve on modalities for $\\mathbb{X}$-modal reasoning. We\nleverage the concept of Continual Learning and develop an incremental training\nstrategy atop pre-trained MLLMs, enabling their expansion to new modalities\nusing uni-modal data, without executing joint-modal pretraining. In detail, a\nnovel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and\ncross-modal adapters are seamlessly integrated to facilitate efficient modality\nalignment and collaboration. Additionally, an MoE-based gating module is\napplied between two types of adapters to further enhance the multimodal\ninteraction. To investigate the proposed method, we establish a challenging\nbenchmark called Continual Learning of Modality (MCL), which consists of\nhigh-quality QA data from five distinct modalities: image, video, audio, depth\nand point cloud. Extensive experiments demonstrate the effectiveness of the\nproposed AnA framework on learning plasticity and memory stability during\ncontinual learning. Furthermore, PathWeave performs comparably to\nstate-of-the-art MLLMs while concurrently reducing parameter training burdens\nby 98.73%. Our code locates at https://github.com/JiazuoYu/PathWeave\n","authors":["Jiazuo Yu","Haomiao Xiong","Lu Zhang","Haiwen Diao","Yunzhi Zhuge","Lanqing Hong","Dong Wang","Huchuan Lu","You He","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2410.20178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07834v1","updated":"2024-11-12T14:36:06Z","published":"2024-11-12T14:36:06Z","title":"Towards Vision Mixture of Experts for Wildlife Monitoring on the Edge","summary":" The explosion of IoT sensors in industrial, consumer and remote sensing use\ncases has come with unprecedented demand for computing infrastructure to\ntransmit and to analyze petabytes of data. 
Concurrently, the world is slowly\nshifting its focus towards more sustainable computing. For these reasons, there\nhas been a recent effort to reduce the footprint of related computing\ninfrastructure, especially by deep learning algorithms, for advanced insight\ngeneration. The `TinyML' community is actively proposing methods to save\ncommunication bandwidth and excessive cloud storage costs while reducing\nalgorithm inference latency and promoting data privacy. Such proposed\napproaches should ideally process multiple types of data, including time\nseries, audio, satellite images, and video, near the network edge as multiple\ndata streams has been shown to improve the discriminative ability of learning\nalgorithms, especially for generating fine grained results. Incidentally, there\nhas been recent work on data driven conditional computation of subnetworks that\nhas shown real progress in using a single model to share parameters among very\ndifferent types of inputs such as images and text, reducing the computation\nrequirement of multi-tower multimodal networks. Inspired by such line of work,\nwe explore similar per patch conditional computation for the first time for\nmobile vision transformers (vision only case), that will eventually be used for\nsingle-tower multimodal edge models. We evaluate the model on Cornell Sap\nSucker Woods 60, a fine grained bird species discrimination dataset. 
Our\ninitial experiments uses $4X$ fewer parameters compared to MobileViTV2-1.0 with\na $1$% accuracy drop on the iNaturalist '21 birds test data provided as part of\nthe SSW60 dataset.\n","authors":["Emmanuel Azuh Mensah","Anderson Lee","Haoran Zhang","Yitong Shan","Kurtis Heimerl"],"pdf_url":"https://arxiv.org/pdf/2411.07834v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.03677v4","updated":"2024-11-12T14:33:36Z","published":"2024-08-07T10:36:26Z","title":"L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection","summary":" LiDAR-based vision systems are integral for 3D object detection, which is\ncrucial for autonomous navigation. However, they suffer from performance\ndegradation in adverse weather conditions due to the quality deterioration of\nLiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is\nexpected to solve this problem. However, the fusion of LiDAR and 4D radar is\nchallenging because they differ significantly in terms of data quality and the\ndegree of degradation in adverse weather. To address these issues, we introduce\nL4DR, a weather-robust 3D object detection method that effectively achieves\nLiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and\nForeground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is\nthe first exploration of the complementarity of early fusion between LiDAR and\n4D radar. Additionally, we design an Inter-Modal and Intra-Modal ({IM}2 )\nparallel feature extraction backbone coupled with a Multi-Scale Gated Fusion\n(MSGF) module to counteract the varying degrees of sensor degradation under\nadverse weather conditions. Experimental evaluation on a VoD dataset with\nsimulated fog proves that L4DR is more adaptable to changing weather\nconditions. It delivers a significant performance increase under different fog\nlevels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only\napproach. 
Moreover, the results on the K-Radar dataset validate the consistent\nperformance improvement of L4DR in real-world adverse weather conditions.\n","authors":["Xun Huang","Ziyu Xu","Hai Wu","Jinlong Wang","Qiming Xia","Yan Xia","Jonathan Li","Kyle Gao","Chenglu Wen","Cheng Wang"],"pdf_url":"https://arxiv.org/pdf/2408.03677v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.15063v4","updated":"2024-11-12T14:29:09Z","published":"2024-08-27T13:47:31Z","title":"Adapting Segment Anything Model to Multi-modal Salient Object Detection\n with Semantic Feature Fusion Guidance","summary":" Although most existing multi-modal salient object detection (SOD) methods\ndemonstrate effectiveness through training models from scratch, the limited\nmulti-modal data hinders these methods from reaching optimality. In this paper,\nwe propose a novel framework to explore and exploit the powerful feature\nrepresentation and zero-shot generalization ability of the pre-trained Segment\nAnything Model (SAM) for multi-modal SOD. Despite serving as a recent vision\nfundamental model, driving the class-agnostic SAM to comprehend and detect\nsalient objects accurately is non-trivial, especially in challenging scenes. To\nthis end, we develop \\underline{SAM} with se\\underline{m}antic\nf\\underline{e}ature fu\\underline{s}ion guidanc\\underline{e} (Sammese), which\nincorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to\nmulti-modal SOD tasks. However, it is difficult for SAM trained on single-modal\ndata to directly mine the complementary benefits of multi-modal inputs and\ncomprehensively utilize them to achieve accurate saliency prediction. To\naddress these issues, we first design a multi-modal complementary fusion module\nto extract robust multi-modal semantic features by integrating information from\nvisible and thermal or depth image pairs. 
Then, we feed the extracted\nmulti-modal semantic features into both the SAM image encoder and mask decoder\nfor fine-tuning and prompting, respectively. Specifically, in the image\nencoder, a multi-modal adapter is proposed to adapt the single-modal SAM to\nmulti-modal information. In the mask decoder, a semantic-geometric prompt\ngeneration strategy is proposed to produce corresponding embeddings with\nvarious saliency cues. Extensive experiments on both RGB-D and RGB-T SOD\nbenchmarks show the effectiveness of the proposed framework. The code will be\navailable at \\url{https://github.com/Angknpng/Sammese}.\n","authors":["Kunpeng Wang","Danying Lin","Chenglong Li","Zhengzheng Tu","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2408.15063v4.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2410.23091v3","updated":"2024-11-12T14:13:17Z","published":"2024-10-30T15:06:44Z","title":"CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for\n Adversarial Defense","summary":" Despite ongoing efforts to defend neural classifiers from adversarial\nattacks, they remain vulnerable, especially to unseen attacks. In contrast,\nhumans are difficult to be cheated by subtle manipulations, since we make\njudgments only based on essential factors. Inspired by this observation, we\nattempt to model label generation with essential label-causative factors and\nincorporate label-non-causative factors to assist data generation. For an\nadversarial example, we aim to discriminate the perturbations as non-causative\nfactors and make predictions only based on the label-causative factors.\nConcretely, we propose a casual diffusion model (CausalDiff) that adapts\ndiffusion models for conditional data generation and disentangles the two types\nof casual factors by learning towards a novel casual information bottleneck\nobjective. 
Empirically, CausalDiff has significantly outperformed\nstate-of-the-art defense methods on various unseen attacks, achieving an\naverage robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on\nCIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition\nBenchmark).\n","authors":["Mingkun Zhang","Keping Bi","Wei Chen","Quanrun Chen","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2410.23091v3.pdf","comment":"accepted by NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03926v2","updated":"2024-11-12T14:04:53Z","published":"2024-11-06T13:57:53Z","title":"Act in Collusion: A Persistent Distributed Multi-Target Backdoor in\n Federated Learning","summary":" Federated learning, a novel paradigm designed to protect data privacy, is\nvulnerable to backdoor attacks due to its distributed nature. Current research\noften designs attacks based on a single attacker with a single backdoor,\noverlooking more realistic and complex threats in federated learning. We\npropose a more practical threat model for federated learning: the distributed\nmulti-target backdoor. In this model, multiple attackers control different\nclients, embedding various triggers and targeting different classes,\ncollaboratively implanting backdoors into the global model via central\naggregation. Empirical validation shows that existing methods struggle to\nmaintain the effectiveness of multiple backdoors in the global model. Our key\ninsight is that similar backdoor triggers cause parameter conflicts and\ninjecting new backdoors disrupts gradient directions, significantly weakening\nsome backdoors performance. To solve this, we propose a Distributed\nMulti-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of\nbackdoors from different malicious clients. To avoid parameter conflicts, we\ndesign a multi-channel dispersed frequency trigger strategy to maximize trigger\ndifferences. 
To mitigate gradient interference, we introduce backdoor replay in\nlocal training to neutralize conflicting gradients. Extensive validation shows\nthat 30 rounds after the attack, Attack Success Rates of three different\nbackdoors from various clients remain above 93%. The code will be made publicly\navailable after the review period.\n","authors":["Tao Liu","Wu Yang","Chen Xu","Jiguang Lv","Huanran Wang","Yuhang Zhang","Shuchun Xu","Dapeng Man"],"pdf_url":"https://arxiv.org/pdf/2411.03926v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07802v1","updated":"2024-11-12T13:57:13Z","published":"2024-11-12T13:57:13Z","title":"Large-scale Remote Sensing Image Target Recognition and Automatic\n Annotation","summary":" This paper presents a method for object recognition and automatic labeling in\nlarge-area remote sensing images called LRSAA. The method integrates YOLOv11\nand MobileNetV3-SSD object detection algorithms through ensemble learning to\nenhance model performance. Furthermore, it employs Poisson disk sampling\nsegmentation techniques and the EIOU metric to optimize the training and\ninference processes of segmented images, followed by the integration of\nresults. This approach not only reduces the demand for computational resources\nbut also achieves a good balance between accuracy and speed. The source code\nfor this project has been made publicly available on\nhttps://github.com/anaerovane/LRSAA.\n","authors":["Wuzheng Dong"],"pdf_url":"https://arxiv.org/pdf/2411.07802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07799v1","updated":"2024-11-12T13:53:22Z","published":"2024-11-12T13:53:22Z","title":"Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and\n Re-Identification using Point Clouds","summary":" Robotic fruit monitoring is a key step toward automated agricultural\nproduction systems. 
Robots can significantly enhance plant and temporal fruit\nmonitoring by providing precise, high-throughput assessments that overcome the\nlimitations of traditional manual methods. Fruit monitoring is a challenging\ntask due to the significant variation in size, shape, orientation, and\nocclusion of fruits. Also, fruits may be harvested or newly grown between\nrecording sessions. Most methods are 2D image-based and they lack the 3D\nstructure, depth, and spatial information, which represent key aspects of fruit\nmonitoring. 3D colored point clouds, instead, can offer this information but\nthey introduce challenges such as their sparsity and irregularity. In this\npaper, we present a novel approach for temporal fruit monitoring that addresses\npoint clouds collected in a greenhouse over time. Our method segments fruits\nusing a learning-based instance segmentation approach directly on the point\ncloud. Each segmented fruit is processed by a 3D sparse convolutional neural\nnetwork to extract descriptors, which are used in an attention-based matching\nnetwork to associate fruits with their instances from previous data\ncollections. Experimental results on a real dataset of strawberries demonstrate\nthat our approach outperforms other methods for fruits re-identification over\ntime, allowing for precise temporal fruit monitoring in real and complex\nscenarios.\n","authors":["Daniel Fusaro","Federico Magistri","Jens Behley","Alberto Pretto","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2411.07799v1.pdf","comment":"Submitted to IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2411.07784v1","updated":"2024-11-12T13:33:26Z","published":"2024-11-12T13:33:26Z","title":"Interaction Asymmetry: A General Principle for Learning Composable\n Abstractions","summary":" Learning disentangled representations of concepts and re-composing them in\nunseen ways is crucial for generalizing to out-of-domain situations. 
However,\nthe underlying properties of concepts that enable such disentanglement and\ncompositional generalization remain poorly understood. In this work, we propose\nthe principle of interaction asymmetry which states: \"Parts of the same concept\nhave more complex interactions than parts of different concepts\". We formalize\nthis via block diagonality conditions on the $(n+1)$th order derivatives of the\ngenerator mapping concepts to observed data, where different orders of\n\"complexity\" correspond to different $n$. Using this formalism, we prove that\ninteraction asymmetry enables both disentanglement and compositional\ngeneralization. Our results unify recent theoretical results for learning\nconcepts of objects, which we show are recovered as special cases with\n$n\\!=\\!0$ or $1$. We provide results for up to $n\\!=\\!2$, thus extending these\nprior works to more flexible generator functions, and conjecture that the same\nproof strategies generalize to larger $n$. Practically, our theory suggests\nthat, to disentangle concepts, an autoencoder should penalize its latent\ncapacity and the interactions between concepts during decoding. We propose an\nimplementation of these criteria using a flexible Transformer-based VAE, with a\nnovel regularizer on the attention weights of the decoder. 
On synthetic image\ndatasets consisting of objects, we provide evidence that this model can achieve\ncomparable object disentanglement to existing models that use more explicit\nobject-centric priors.\n","authors":["Jack Brady","Julius von Kügelgen","Sébastien Lachapelle","Simon Buchholz","Thomas Kipf","Wieland Brendel"],"pdf_url":"https://arxiv.org/pdf/2411.07784v1.pdf","comment":"Preprint, under review"},{"id":"http://arxiv.org/abs/2407.00352v2","updated":"2024-11-12T13:01:48Z","published":"2024-06-29T07:53:47Z","title":"PhyTracker: An Online Tracker for Phytoplankton","summary":" Phytoplankton, a crucial component of aquatic ecosystems, requires efficient\nmonitoring to understand marine ecological processes and environmental\nconditions. Traditional phytoplankton monitoring methods, relying on non-in\nsitu observations, are time-consuming and resource-intensive, limiting timely\nanalysis. To address these limitations, we introduce PhyTracker, an intelligent\nin situ tracking framework designed for automatic tracking of phytoplankton.\nPhyTracker overcomes significant challenges unique to phytoplankton monitoring,\nsuch as constrained mobility within water flow, inconspicuous appearance, and\nthe presence of impurities. Our method incorporates three innovative modules: a\nTexture-enhanced Feature Extraction (TFE) module, an Attention-enhanced\nTemporal Association (ATA) module, and a Flow-agnostic Movement Refinement\n(FMR) module. These modules enhance feature capture, differentiate between\nphytoplankton and impurities, and refine movement characteristics,\nrespectively. Extensive experiments on the PMOT dataset validate the\nsuperiority of PhyTracker in phytoplankton tracking, and additional tests on\nthe MOT dataset demonstrate its general applicability, outperforming\nconventional tracking methods. 
This work highlights key differences between\nphytoplankton and traditional objects, offering an effective solution for\nphytoplankton monitoring.\n","authors":["Yang Yu","Qingxuan Lv","Yuezun Li","Zhiqiang Wei","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2407.00352v2.pdf","comment":"13pages,eleven figures"},{"id":"http://arxiv.org/abs/2411.07765v1","updated":"2024-11-12T12:58:33Z","published":"2024-11-12T12:58:33Z","title":"Novel View Synthesis with Pixel-Space Diffusion Models","summary":" Synthesizing a novel view from a single input image is a challenging task.\nTraditionally, this task was approached by estimating scene depth, warping, and\ninpainting, with machine learning models enabling parts of the pipeline. More\nrecently, generative models are being increasingly employed in novel view\nsynthesis (NVS), often encompassing the entire end-to-end system. In this work,\nwe adapt a modern diffusion model architecture for end-to-end NVS in the pixel\nspace, substantially outperforming previous state-of-the-art (SOTA) techniques.\nWe explore different ways to encode geometric information into the network. Our\nexperiments show that while these methods may enhance performance, their impact\nis minor compared to utilizing improved generative models. Moreover, we\nintroduce a novel NVS training scheme that utilizes single-view datasets,\ncapitalizing on their relative abundance compared to their multi-view\ncounterparts. 
This leads to improved generalization capabilities to scenes with\nout-of-domain content.\n","authors":["Noam Elata","Bahjat Kawar","Yaron Ostrovsky-Berman","Miriam Farber","Ron Sokolovsky"],"pdf_url":"https://arxiv.org/pdf/2411.07765v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2304.04918v2","updated":"2024-11-12T15:42:42Z","published":"2023-04-11T01:10:49Z","title":"Explicit and Implicit Semantic Ranking Framework","summary":" The core challenge in numerous real-world applications is to match an inquiry\nto the best document from a mutable and finite set of candidates. Existing\nindustry solutions, especially latency-constrained services, often rely on\nsimilarity algorithms that sacrifice quality for speed. In this paper we\nintroduce a generic semantic learning-to-rank framework, Self-training Semantic\nCross-attention Ranking (sRank). This transformer-based framework uses linear\npairwise loss with mutable training batch sizes and achieves quality gains and\nhigh efficiency, and has been applied effectively to show gains on two industry\ntasks at Microsoft over real-world large-scale data sets: Smart Reply (SR) and\nAmbient Clinical Intelligence (ACI). In Smart Reply, sRank assists live\ncustomers with technical support by selecting the best reply from predefined\nsolutions based on consumer and support agent messages. It achieves 11.7% gain\nin offline top-one accuracy on the SR task over the previous system, and has\nenabled 38.7% time reduction in composing messages in telemetry recorded since\nits general release in January 2021. In the ACI task, sRank selects relevant\nhistorical physician templates that serve as guidance for a text summarization\nmodel to generate higher quality medical notes. 
It achieves 35.5% top-one\naccuracy gain, along with 46% relative ROUGE-L gain in generated medical notes.\n","authors":["Xiaofeng Zhu","Thomas Lin","Vishal Anand","Matthew Calderwood","Eric Clausen-Brown","Gord Lueck","Wen-wai Yim","Cheng Wu"],"pdf_url":"https://arxiv.org/pdf/2304.04918v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05508v2","updated":"2024-11-12T15:36:04Z","published":"2024-11-08T12:08:17Z","title":"An Early FIRST Reproduction and Improvements to Single-Token Decoding\n for Fast Listwise Reranking","summary":" Recent advances have demonstrated that large language models (LLMs) excel as\nlistwise rerankers, but their high computational demands remain a barrier to\nwidespread adoption. Further, the traditional language modeling (LM) objective\nis not ideally suited for reranking tasks. FIRST is a novel approach that\naddresses these challenges by integrating a learning-to-rank objective and\nleveraging the logits of only the first generated token, thereby significantly\nreducing inference latency compared to traditional LLM rerankers. In this\nstudy, we extend the evaluation of FIRST to the TREC Deep Learning datasets\n(DL19-22), validating its robustness across diverse domains. We investigate the\ninfluence of different first-stage retrievers on FIRST rerankers, observing\ndiminishing returns and patterns consistent with traditional LLM rerankers.\nThrough applying the FIRST objective to a broader range of backbone models, we\nachieve effectiveness surpassing the original implementation. Our experiments\nconfirm that fast reranking with single-token logits does not compromise\nout-of-domain reranking quality. To better quantify the computational savings\nin the original study, we measure and compare latency to find a 21%-42% gain\nacross various models and benchmarks. 
Moreover, while LM training implicitly\nimproves zero-shot single-token reranking, our experiments also raise questions\nabout whether LM pre-training may hinder subsequent fine-tuning with the FIRST\nobjective. These findings pave the way for more efficient and effective\nlistwise reranking in future applications.\n","authors":["Zijian Chen","Ronak Pradeep","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2411.05508v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18480v2","updated":"2024-11-12T13:54:25Z","published":"2024-03-27T11:49:58Z","title":"Content-Based Collaborative Generation for Recommender Systems","summary":" Generative models have emerged as a promising utility to enhance recommender\nsystems. It is essential to model both item content and user-item collaborative\ninteractions in a unified generative framework for better recommendation.\nAlthough some existing large language model (LLM)-based methods contribute to\nfusing content information and collaborative signals, they fundamentally rely\non textual language generation, which is not fully aligned with the\nrecommendation task. How to integrate content knowledge and collaborative\ninteraction signals in a generative framework tailored for item recommendation\nis still an open research challenge.\n In this paper, we propose content-based collaborative generation for\nrecommender systems, namely ColaRec. ColaRec is a sequence-to-sequence\nframework which is tailored for directly generating the recommended item\nidentifier. Precisely, the input sequence comprises data pertaining to the\nuser's interacted items, and the output sequence represents the generative\nidentifier (GID) for the suggested item. To model collaborative signals, the\nGIDs are constructed from a pretrained collaborative filtering model, and the\nuser is represented as the content aggregation of interacted items. To this\nend, ColaRec captures both collaborative signals and content information in a\nunified framework. 
Then an item indexing task is proposed to conduct the\nalignment between the content-based semantic space and the interaction-based\ncollaborative space. Besides, a contrastive loss is further introduced to\nensure that items with similar collaborative GIDs have similar content\nrepresentations. To verify the effectiveness of ColaRec, we conduct experiments\non four benchmark datasets. Empirical results demonstrate the superior\nperformance of ColaRec.\n","authors":["Yidan Wang","Zhaochun Ren","Weiwei Sun","Jiyuan Yang","Zhixiang Liang","Xin Chen","Ruobing Xie","Su Yan","Xu Zhang","Pengjie Ren","Zhumin Chen","Xin Xin"],"pdf_url":"https://arxiv.org/pdf/2403.18480v2.pdf","comment":"Accepted by CIKM 2024; GitHub:\n https://github.com/Junewang0614/ColaRec"},{"id":"http://arxiv.org/abs/2411.07770v1","updated":"2024-11-12T13:06:16Z","published":"2024-11-12T13:06:16Z","title":"A Theoretical Analysis of Recommendation Loss Functions under Negative\n Sampling","summary":" Recommender Systems (RSs) are pivotal in diverse domains such as e-commerce,\nmusic streaming, and social media. This paper conducts a comparative analysis\nof prevalent loss functions in RSs: Binary Cross-Entropy (BCE), Categorical\nCross-Entropy (CCE), and Bayesian Personalized Ranking (BPR). Exploring the\nbehaviour of these loss functions across varying negative sampling settings, we\nreveal that BPR and CCE are equivalent when one negative sample is used.\nAdditionally, we demonstrate that all losses share a common global minimum.\nEvaluation of RSs mainly relies on ranking metrics known as Normalized\nDiscounted Cumulative Gain (NDCG) and Mean Reciprocal Rank (MRR). We produce\nbounds of the different losses for negative sampling settings to establish a\nprobabilistic lower bound for NDCG. We show that the BPR bound on NDCG is\nweaker than that of BCE, contradicting the common assumption that BPR is\nsuperior to BCE in RSs training. 
Experiments on five datasets and four models\nempirically support these theoretical findings. Our code is available at\n\\url{https://anonymous.4open.science/r/recsys_losses} .\n","authors":["Giulia Di Teodoro","Federico Siciliano","Nicola Tonellotto","Fabrizio Silvestri"],"pdf_url":"https://arxiv.org/pdf/2411.07770v1.pdf","comment":"main paper 8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.07739v1","updated":"2024-11-12T12:03:57Z","published":"2024-11-12T12:03:57Z","title":"Unlocking Legal Knowledge with Multi-Layered Embedding-Based Retrieval","summary":" This work addresses the challenge of capturing the complexities of legal\nknowledge by proposing a multi-layered embedding-based retrieval method for\nlegal and legislative texts. Creating embeddings not only for individual\narticles but also for their components (paragraphs, clauses) and structural\ngroupings (books, titles, chapters, etc), we seek to capture the subtleties of\nlegal information through the use of dense vectors of embeddings, representing\nit at varying levels of granularity. Our method meets various information needs\nby allowing the Retrieval Augmented Generation system to provide accurate\nresponses, whether for specific segments or entire sections, tailored to the\nuser's query. We explore the concepts of aboutness, semantic chunking, and\ninherent hierarchy within legal texts, arguing that this method enhances the\nlegal information retrieval. Despite the focus being on Brazil's legislative\nmethods and the Brazilian Constitution, which follow a civil law tradition, our\nfindings should in principle be applicable across different legal systems,\nincluding those adhering to common law traditions. 
Furthermore, the principles\nof the proposed method extend beyond the legal domain, offering valuable\ninsights for organizing and retrieving information in any field characterized\nby information encoded in hierarchical text.\n","authors":["João Alberto de Oliveira Lima"],"pdf_url":"https://arxiv.org/pdf/2411.07739v1.pdf","comment":"27 pages, 10 figures"},{"id":"http://arxiv.org/abs/2411.07658v1","updated":"2024-11-12T09:19:32Z","published":"2024-11-12T09:19:32Z","title":"Advancing Sustainability via Recommender Systems: A Survey","summary":" Human behavioral patterns and consumption paradigms have emerged as pivotal\ndeterminants in environmental degradation and climate change, with quotidian\ndecisions pertaining to transportation, energy utilization, and resource\nconsumption collectively precipitating substantial ecological impacts.\nRecommender systems, which generate personalized suggestions based on user\npreferences and historical interaction data, exert considerable influence on\nindividual behavioral trajectories. However, conventional recommender systems\npredominantly optimize for user engagement and economic metrics, inadvertently\nneglecting the environmental and societal ramifications of their\nrecommendations, potentially catalyzing over-consumption and reinforcing\nunsustainable behavioral patterns. Given their instrumental role in shaping\nuser decisions, there exists an imperative need for sustainable recommender\nsystems that incorporate sustainability principles to foster eco-conscious and\nsocially responsible choices. This comprehensive survey addresses this critical\nresearch gap by presenting a systematic analysis of sustainable recommender\nsystems. As these systems can simultaneously advance multiple sustainability\nobjectives--including resource conservation, sustainable consumer behavior, and\nsocial impact enhancement--examining their implementations across distinct\napplication domains provides a more rigorous analytical framework. 
Through a\nmethodological analysis of domain-specific implementations encompassing\ntransportation, food, buildings, and auxiliary sectors, we can better elucidate\nhow these systems holistically advance sustainability objectives while\naddressing sector-specific constraints and opportunities. Moreover, we\ndelineate future research directions for evolving recommender systems beyond\nsustainability advocacy toward fostering environmental resilience and social\nconsciousness in society.\n","authors":["Xin Zhou","Lei Zhang","Honglei Zhang","Yixin Zhang","Xiaoxiong Zhang","Jie Zhang","Zhiqi Shen"],"pdf_url":"https://arxiv.org/pdf/2411.07658v1.pdf","comment":"20pages, 10 figures. Working paper: https://github.com/enoche/SusRec"},{"id":"http://arxiv.org/abs/2404.13812v4","updated":"2024-11-12T07:44:20Z","published":"2024-04-22T01:16:11Z","title":"A Comparative Study on Enhancing Prediction in Social Network\n Advertisement through Data Augmentation","summary":" In the ever-evolving landscape of social network advertising, the volume and\naccuracy of data play a critical role in the performance of predictive models.\nHowever, the development of robust predictive algorithms is often hampered by\nthe limited size and potential bias present in real-world datasets. This study\npresents and explores a generative augmentation framework of social network\nadvertising data. Our framework explores three generative models for data\naugmentation - Generative Adversarial Networks (GANs), Variational Autoencoders\n(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and\ndiversity in the context of social network advertising analytics effectiveness.\nBy performing synthetic extensions of the feature space, we find that through\ndata augmentation, the performance of various classifiers has been\nquantitatively improved. 
Furthermore, we compare the relative performance gains\nbrought by each data augmentation technique, providing insights for\npractitioners to select appropriate techniques to enhance model performance.\nThis paper contributes to the literature by showing that synthetic data\naugmentation alleviates the limitations imposed by small or imbalanced datasets\nin the field of social network advertising. At the same time, this article also\nprovides a comparative perspective on the practicality of different data\naugmentation methods, thereby guiding practitioners to choose appropriate\ntechniques to enhance model performance.\n","authors":["Qikai Yang","Panfeng Li","Xinhe Xu","Zhicheng Ding","Wenjing Zhou","Yi Nian"],"pdf_url":"https://arxiv.org/pdf/2404.13812v4.pdf","comment":"Accepted by 2024 4th International Conference on Machine Learning and\n Intelligent Systems Engineering (MLISE)"},{"id":"http://arxiv.org/abs/2411.07589v1","updated":"2024-11-12T06:58:03Z","published":"2024-11-12T06:58:03Z","title":"Overhead-free User-side Recommender Systems","summary":" Traditionally, recommendation algorithms have been designed for service\ndevelopers. But recently, a new paradigm called user-side recommender systems\nhas been proposed. User-side recommender systems are built and used by end\nusers, in sharp contrast to traditional provider-side recommender systems. Even\nif the official recommender system offered by the provider is not fair, end\nusers can create and enjoy their own user-side recommender systems by\nthemselves. Although the concept of user-side recommender systems is\nattractive, the problem is they require tremendous communication costs between\nthe user and the official system. Even the most efficient user-side recommender\nsystems require about 5 times more costs than provider-side recommender\nsystems. 
Such high costs hinder the adoption of user-side recommender systems.\nIn this paper, we propose overhead-free user-side recommender systems,\nRecCycle, which realizes user-side recommender systems without any\ncommunication overhead. The main idea of RecCycle is to recycle past\nrecommendation results offered by the provider's recommender systems. The\ningredients of RecCycle can be retrieved ``for free,'' and it greatly reduces\nthe cost of user-side recommendations. In the experiments, we confirm that\nRecCycle performs as well as state-of-the-art user-side recommendation\nalgorithms while RecCycle reduces costs significantly.\n","authors":["Ryoma Sato"],"pdf_url":"https://arxiv.org/pdf/2411.07589v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2208.09864,\n arXiv:2403.15757"},{"id":"http://arxiv.org/abs/2411.07569v1","updated":"2024-11-12T06:03:47Z","published":"2024-11-12T06:03:47Z","title":"Towards Automated Model Design on Recommender Systems","summary":" The increasing popularity of deep learning models has created new\nopportunities for developing AI-based recommender systems. Designing\nrecommender systems using deep neural networks requires careful architecture\ndesign, and further optimization demands extensive co-design efforts on jointly\noptimizing model architecture and hardware. Design automation, such as\nAutomated Machine Learning (AutoML), is necessary to fully exploit the\npotential of recommender model design, including model choices and\nmodel-hardware co-design strategies. We introduce a novel paradigm that\nutilizes weight sharing to explore abundant solution spaces. Our paradigm\ncreates a large supernet to search for optimal architectures and co-design\nstrategies to address the challenges of data multi-modality and heterogeneity\nin the recommendation domain. From a model perspective, the supernet includes a\nvariety of operators, dense connectivity, and dimension search options. 
From a\nco-design perspective, it encompasses versatile Processing-In-Memory (PIM)\nconfigurations to produce hardware-efficient models. Our solution space's\nscale, heterogeneity, and complexity pose several challenges, which we address\nby proposing various techniques for training and evaluating the supernet. Our\ncrafted models show promising results on three Click-Through Rates (CTR)\nprediction benchmarks, outperforming both manually designed and AutoML-crafted\nmodels with state-of-the-art performance when focusing solely on architecture\nsearch. From a co-design perspective, we achieve 2x FLOPs efficiency, 1.8x\nenergy efficiency, and 1.5x performance improvements in recommender models.\n","authors":["Tunhou Zhang","Dehua Cheng","Yuchen He","Zhengxing Chen","Xiaoliang Dai","Liang Xiong","Yudong Liu","Feng Cheng","Yufan Cao","Feng Yan","Hai Li","Yiran Chen","Wei Wen"],"pdf_url":"https://arxiv.org/pdf/2411.07569v1.pdf","comment":"Accepted in ACM Transactions on Recommender Systems. arXiv admin\n note: substantial text overlap with arXiv:2207.07187"},{"id":"http://arxiv.org/abs/2411.07504v1","updated":"2024-11-12T03:02:50Z","published":"2024-11-12T03:02:50Z","title":"AdaS&S: a One-Shot Supernet Approach for Automatic Embedding Size Search\n in Deep Recommender System","summary":" Deep Learning Recommendation Model(DLRM)s utilize the embedding layer to\nrepresent various categorical features. Traditional DLRMs adopt unified\nembedding size for all features, leading to suboptimal performance and\nredundant parameters. Thus, lots of Automatic Embedding size Search (AES) works\nfocus on obtaining mixed embedding sizes with strong model performance.\nHowever, previous AES works can hardly address several challenges together: (1)\nThe search results of embedding sizes are unstable; (2) Recommendation effect\nwith AES results is unsatisfactory; (3) Memory cost of embeddings is\nuncontrollable. 
To address these challenges, we propose a novel one-shot AES\nframework called AdaS&S, in which a supernet encompassing various candidate\nembeddings is built and AES is performed as searching network architectures\nwithin it. Our framework contains two main stages: In the first stage, we\ndecouple training parameters from searching embedding sizes, and propose the\nAdaptive Sampling method to yield a well-trained supernet, which further helps\nto produce stable AES results. In the second stage, to obtain embedding sizes\nthat benefits the model effect, we design a reinforcement learning search\nprocess which utilizes the supernet trained previously. Meanwhile, to adapt\nsearching to specific resource constraint, we introduce the resource\ncompetition penalty to balance the model effectiveness and memory cost of\nembeddings. We conduct extensive experiments on public datasets to show the\nsuperiority of AdaS&S. Our method could improve AUC by about 0.3% while saving\nabout 20% of model parameters. Empirical analysis also shows that the stability\nof searching results in AdaS&S significantly exceeds other methods.\n","authors":["He Wei","Yuekui Yang","Yang Zhang","Haiyang Wu","Meixi Liu","Shaoping Ma"],"pdf_url":"https://arxiv.org/pdf/2411.07504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07482v1","updated":"2024-11-12T02:08:19Z","published":"2024-11-12T02:08:19Z","title":"Enhancing Link Prediction with Fuzzy Graph Attention Networks and\n Dynamic Negative Sampling","summary":" Link prediction is crucial for understanding complex networks but traditional\nGraph Neural Networks (GNNs) often rely on random negative sampling, leading to\nsuboptimal performance. This paper introduces Fuzzy Graph Attention Networks\n(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative\nsampling and enhanced node feature aggregation. 
Fuzzy Negative Sampling (FNS)\nsystematically selects high-quality negative edges based on fuzzy similarities,\nimproving training efficiency. FGAT layer incorporates fuzzy rough set\nprinciples, enabling robust and discriminative node representations.\nExperiments on two research collaboration networks demonstrate FGAT's superior\nlink prediction accuracy, outperforming state-of-the-art baselines by\nleveraging the power of fuzzy rough sets for effective negative sampling and\nnode feature learning.\n","authors":["Jinming Xing"],"pdf_url":"https://arxiv.org/pdf/2411.07482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04916v4","updated":"2024-11-12T01:01:32Z","published":"2023-11-02T04:01:04Z","title":"Explainable Identification of Hate Speech towards Islam using Graph\n Neural Networks","summary":" Islamophobic language on online platforms fosters intolerance, making\ndetection and elimination crucial for promoting harmony. Traditional hate\nspeech detection models rely on NLP techniques like tokenization,\npart-of-speech tagging, and encoder-decoder models. However, Graph Neural\nNetworks (GNNs), with their ability to utilize relationships between data\npoints, offer more effective detection and greater explainability. In this\nwork, we represent speeches as nodes and connect them with edges based on their\ncontext and similarity to develop the graph. This study introduces a novel\nparadigm using GNNs to identify and explain hate speech towards Islam. Our\nmodel leverages GNNs to understand the context and patterns of hate speech by\nconnecting texts via pretrained NLP-generated word embeddings, achieving\nstate-of-the-art performance and enhancing detection accuracy while providing\nvaluable explanations. 
This highlights the potential of GNNs in combating\nonline hate speech and fostering a safer, more inclusive online environment.\n","authors":["Azmine Toushik Wasi"],"pdf_url":"https://arxiv.org/pdf/2311.04916v4.pdf","comment":"Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival)\n (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP\n 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology:\n https://aclanthology.org/2024.nlp4pi-1.23/)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2411.08249v1","updated":"2024-11-12T23:55:11Z","published":"2024-11-12T23:55:11Z","title":"Retrieval Augmented Time Series Forecasting","summary":" Retrieval-augmented generation (RAG) is a central component of modern LLM\nsystems, particularly in scenarios where up-to-date information is crucial for\naccurately responding to user queries or when queries exceed the scope of the\ntraining data. The advent of time-series foundation models (TSFM), such as\nChronos, and the need for effective zero-shot forecasting performance across\nvarious time-series domains motivates the question: Do benefits of RAG\nsimilarly carry over to time series forecasting? In this paper, we advocate\nthat the dynamic and event-driven nature of time-series data makes RAG a\ncrucial component of TSFMs and introduce a principled RAG framework for\ntime-series forecasting, called Retrieval Augmented Forecasting (RAF). Within\nRAF, we develop efficient strategies for retrieving related time-series\nexamples and incorporating them into forecast. 
Through experiments and\nmechanistic studies, we demonstrate that RAF indeed improves the forecasting\naccuracy across diverse time series domains and the improvement is more\nsignificant for larger TSFM sizes.\n","authors":["Kutay Tire","Ege Onur Taga","Muhammed Emrullah Ildiz","Samet Oymak"],"pdf_url":"https://arxiv.org/pdf/2411.08249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08248v1","updated":"2024-11-12T23:54:58Z","published":"2024-11-12T23:54:58Z","title":"Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial\n Approach","summary":" Deep learning underpins most of the currently advanced natural language\nprocessing (NLP) tasks such as textual classification, neural machine\ntranslation (NMT), abstractive summarization and question-answering (QA).\nHowever, the robustness of the models, particularly QA models, against\nadversarial attacks is a critical concern that remains insufficiently explored.\nThis paper introduces QA-Attack (Question Answering Attack), a novel word-level\nadversarial strategy that fools QA models. Our attention-based attack exploits\nthe customized attention mechanism and deletion ranking strategy to identify\nand target specific words within contextual passages. It creates deceptive\ninputs by carefully choosing and substituting synonyms, preserving grammatical\nintegrity while misleading the model to produce incorrect responses. Our\napproach demonstrates versatility across various question types, particularly\nwhen dealing with extensive long textual inputs. 
Extensive experiments on\nmultiple benchmark datasets demonstrate that QA-Attack successfully deceives\nbaseline QA models and surpasses existing adversarial techniques regarding\nsuccess rate, semantics changes, BLEU score, fluency and grammar error rate.\n","authors":["Jiyao Li","Mingze Ni","Yongshun Gong","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2411.08248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08244v1","updated":"2024-11-12T23:43:20Z","published":"2024-11-12T23:43:20Z","title":"NVCiM-PT: An NVCiM-assisted Prompt Tuning Framework for Edge LLMs","summary":" Large Language Models (LLMs) deployed on edge devices, known as edge LLMs,\nneed to continuously fine-tune their model parameters from user-generated data\nunder limited resource constraints. However, most existing learning methods are\nnot applicable for edge LLMs because of their reliance on high resources and\nlow learning capacity. Prompt tuning (PT) has recently emerged as an effective\nfine-tuning method for edge LLMs by only modifying a small portion of LLM\nparameters, but it suffers from user domain shifts, resulting in repetitive\ntraining and losing resource efficiency. Conventional techniques to address\ndomain shift issues often involve complex neural networks and sophisticated\ntraining, which are incompatible for PT for edge LLMs. Therefore, an open\nresearch question is how to address domain shift issues for edge LLMs with\nlimited resources. In this paper, we propose a prompt tuning framework for edge\nLLMs, exploiting the benefits offered by non-volatile computing-in-memory\n(NVCiM) architectures. We introduce a novel NVCiM-assisted PT framework, where\nwe narrow down the core operations to matrix-matrix multiplication, which can\nthen be accelerated by performing in-situ computation on NVCiM. 
To the best of\nour knowledge, this is the first work employing NVCiM to improve the edge LLM\nPT performance.\n","authors":["Ruiyang Qin","Pengyu Ren","Zheyu Yan","Liu Liu","Dancheng Liu","Amir Nassereldine","Jinjun Xiong","Kai Ni","Sharon Hu","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2411.08244v1.pdf","comment":"Accepted by DATE 2025"},{"id":"http://arxiv.org/abs/2411.08241v1","updated":"2024-11-12T23:32:21Z","published":"2024-11-12T23:32:21Z","title":"A Social Outcomes and Priorities centered (SOP) Framework for AI policy","summary":" Rapid developments in AI and its adoption across various domains have\nnecessitated a need to build robust guardrails and risk containment plans while\nensuring equitable benefits for the betterment of society. The current\ntechnology-centered approach has resulted in a fragmented, reactive, and\nineffective policy apparatus. This paper highlights the immediate and urgent\nneed to pivot to a society-centered approach to develop comprehensive,\ncoherent, forward-looking AI policy. To this end, we present a Social Outcomes\nand Priorities centered (SOP) framework for AI policy along with proposals on\nimplementation of its various components. While the SOP framework is presented\nfrom a US-centric view, the takeaways are general and applicable globally.\n","authors":["Mohak Shah"],"pdf_url":"https://arxiv.org/pdf/2411.08241v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.05864v2","updated":"2024-11-12T23:12:55Z","published":"2024-03-09T10:24:12Z","title":"PEaRL: Personalized Privacy of Human-Centric Systems using Early-Exit\n Reinforcement Learning","summary":" In the evolving landscape of human-centric systems, personalized privacy\nsolutions are becoming increasingly crucial due to the dynamic nature of human\ninteractions. Traditional static privacy models often fail to meet the diverse\nand changing privacy needs of users. 
This paper introduces PEaRL, a system\ndesigned to enhance privacy preservation by tailoring its approach to\nindividual behavioral patterns and preferences. While incorporating\nreinforcement learning (RL) for its adaptability, PEaRL primarily focuses on\nemploying an early-exit strategy that dynamically balances privacy protection\nand system utility. This approach addresses the challenges posed by the\nvariability and evolution of human behavior, which static privacy models\nstruggle to handle effectively. We evaluate PEaRL in two distinct contexts:\nSmart Home environments and Virtual Reality (VR) Smart Classrooms. The\nempirical results demonstrate PEaRL's capability to provide a personalized\ntradeoff between user privacy and application utility, adapting effectively to\nindividual user preferences. On average, across both systems, PEaRL enhances\nprivacy protection by 31%, with a corresponding utility reduction of 24%.\n","authors":["Mojtaba Taherisadr","Salma Elmalaki"],"pdf_url":"https://arxiv.org/pdf/2403.05864v2.pdf","comment":"15 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.16362v2","updated":"2024-11-12T23:08:25Z","published":"2023-08-30T23:34:11Z","title":"A Unified Analysis on the Subgradient Upper Bounds for the Subgradient\n Methods Minimizing Composite Nonconvex, Nonsmooth and Non-Lipschitz Functions","summary":" This paper presents a unified analysis for the proximal subgradient method\n(Prox-SubGrad) type approach to minimize an overall objective of $f(x)+r(x)$,\nsubject to convex constraints, where both $f$ and $r$ are weakly convex,\nnonsmooth, and non-Lipschitz. Leveraging on the properties of the Moreau\nenvelope of weakly convex functions, we are able to relate error-bound\nconditions, the growth conditions of the subgradients of the objective, and the\nbehavior of the proximal subgradient iterates on some remarkably broad classes\nof objective functions. 
Various existing as well as new bounding conditions are\nstudied, leading to novel iteration complexity results. The terrain of our\nexploration expands to stochastic proximal subgradient algorithms.\n","authors":["Daoli Zhu","Lei Zhao","Shuzhong Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.16362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.08232v1","updated":"2024-11-12T22:56:28Z","published":"2024-11-12T22:56:28Z","title":"Imitation Learning from Observations: An Autoregressive Mixture of\n Experts Approach","summary":" This paper presents a novel approach to imitation learning from observations,\nwhere an autoregressive mixture of experts model is deployed to fit the\nunderlying policy. The parameters of the model are learned via a two-stage\nframework. By leveraging the existing dynamics knowledge, the first stage of\nthe framework estimates the control input sequences and hence reduces the\nproblem complexity. At the second stage, the policy is learned by solving a\nregularized maximum-likelihood estimation problem using the estimated control\ninput sequences. We further extend the learning procedure by incorporating a\nLyapunov stability constraint to ensure asymptotic stability of the identified\nmodel, for accurate multi-step predictions. 
The effectiveness of the proposed\nframework is validated using two autonomous driving datasets collected from\nhuman demonstrations, demonstrating its practical applicability in modelling\ncomplex nonlinear dynamics.\n","authors":["Renzi Wang","Flavia Sofia Acerbo","Tong Duy Son","Panagiotis Patrinos"],"pdf_url":"https://arxiv.org/pdf/2411.08232v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.07899v1","updated":"2024-11-12T16:12:51Z","published":"2024-11-12T16:12:51Z","title":"Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse\n Tensor-based Transformer","summary":" The evolution of 3D visualization techniques has fundamentally transformed\nhow we interact with digital content. At the forefront of this change is point\ncloud technology, offering an immersive experience that surpasses traditional\n2D representations. However, the massive data size of point clouds presents\nsignificant challenges in data compression. Current methods for lossy point\ncloud attribute compression (PCAC) generally focus on reconstructing the\noriginal point clouds with minimal error. However, for point cloud\nvisualization scenarios, the reconstructed point clouds with distortion still\nneed to undergo a complex rendering process, which affects the final\nuser-perceived quality. In this paper, we propose an end-to-end deep learning\nframework that seamlessly integrates PCAC with differentiable rendering,\ndenoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of\nrendered multiview images for viewing. In a differentiable manner, the impact\nof the rendering process on the reconstructed point clouds is taken into\naccount. Moreover, we characterize point clouds as sparse tensors and propose a\nsparse tensor-based transformer, called SP-Trans. 
By aligning with the local\ndensity of the point cloud and utilizing an enhanced local attention mechanism,\nSP-Trans captures the intricate relationships within the point cloud, further\nimproving feature analysis and synthesis within the framework. Extensive\nexperiments demonstrate that the proposed RO-PCAC achieves state-of-the-art\ncompression performance, compared to existing reconstruction-oriented methods,\nincluding traditional, learning-based, and hybrid methods.\n","authors":["Xiao Huo","Junhui Ho","Shuai Wan","Fuzheng Yang"],"pdf_url":"https://arxiv.org/pdf/2411.07899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.06001v2","updated":"2024-11-12T15:14:41Z","published":"2024-07-08T14:53:07Z","title":"Pseudo-triplet Guided Few-shot Composed Image Retrieval","summary":" Composed Image Retrieval (CIR) is a challenging task that aims to retrieve\nthe target image with a multimodal query, i.e., a reference image, and its\ncomplementary modification text. As previous supervised or zero-shot learning\nparadigms all fail to strike a good trade-off between the model's\ngeneralization ability and retrieval performance, recent researchers have\nintroduced the task of few-shot CIR (FS-CIR) and proposed a textual\ninversion-based network based on pretrained CLIP model to realize it. Despite\nits promising performance, the approach encounters two key limitations: simply\nrelying on the few annotated samples for CIR model training and\nindiscriminately selecting training triplets for CIR model fine-tuning. To\naddress these two limitations, we propose a novel two-stage pseudo triplet\nguided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an\nattentive masking and captioning-based pseudo triplet generation method, to\nconstruct pseudo triplets from pure image data and use them to fulfill the\nCIR-task specific pertaining. 
In the second stage, we propose a challenging\ntriplet-based CIR fine-tuning method, where we design a pseudo modification\ntext-based sample challenging score estimation strategy and a robust top\nrange-based random sampling strategy for sampling robust challenging triplets\nto promote the model fine-tuning. Notably, our scheme is plug-and-play and\ncompatible with any existing supervised CIR models. We test our scheme across\ntwo backbones on three public datasets (i.e., FashionIQ, CIRR, and\nBirds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4%\nrespectively, demonstrating our scheme's efficacy.\n","authors":["Bohan Hou","Haoqiang Lin","Haokun Wen","Meng Liu","Mingzhu Xu","Xuemeng Song"],"pdf_url":"https://arxiv.org/pdf/2407.06001v2.pdf","comment":"10pages"},{"id":"http://arxiv.org/abs/2411.07772v1","updated":"2024-11-12T13:13:20Z","published":"2024-11-12T13:13:20Z","title":"Automatic Album Sequencing","summary":" Album sequencing is a critical part of the album production process.\nRecently, a data-driven approach was proposed that sequences general\ncollections of independent media by extracting the narrative essence of the\nitems in the collections. While this approach implies an album sequencing\ntechnique, it is not widely accessible to a less technical audience, requiring\nadvanced knowledge of machine learning techniques to use. To address this, we\nintroduce a new user-friendly web-based tool that allows a less technical\naudience to upload music tracks, execute this technique in one click, and\nsubsequently presents the result in a clean visualization to the user. To both\nincrease the number of templates available to the user and address shortcomings\nof previous work, we also introduce a new direct transformer-based album\nsequencing method. We find that our more direct method outperforms a random\nbaseline but does not reach the same performance as the narrative essence\napproach. 
Both methods are included in our web-based user interface, and this\n-- alongside a full copy of our implementation -- is publicly available at\nhttps://github.com/dylanashley/automatic-album-sequencing\n","authors":["Vincent Herrmann","Dylan R. Ashley","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2411.07772v1.pdf","comment":"presented as a late breaking demo in the 25th International Society\n for Music Information Retrieval Conference; 3 pages in main text, 3 figures\n in main text; source code available at\n https://github.com/dylanashley/automatic-album-sequencing"},{"id":"http://arxiv.org/abs/2411.07751v1","updated":"2024-11-12T12:23:41Z","published":"2024-11-12T12:23:41Z","title":"SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State\n Space Model","summary":" Speech enhancement plays an essential role in various applications, and the\nintegration of visual information has been demonstrated to bring substantial\nadvantages. However, the majority of current research concentrates on the\nexamination of facial and lip movements, which can be compromised or entirely\ninaccessible in scenarios where occlusions occur or when the camera view is\ndistant. Whereas contextual visual cues from the surrounding environment have\nbeen overlooked: for example, when we see a dog bark, our brain has the innate\nability to discern and filter out the barking noise. To this end, in this\npaper, we introduce a novel task, i.e. SAV-SE. To our best knowledge, this is\nthe first proposal to use rich contextual information from synchronized video\nas auxiliary cues to indicate the type of noise, which eventually improves the\nspeech enhancement performance. Specifically, we propose the VC-S$^2$E method,\nwhich incorporates the Conformer and Mamba modules for their complementary\nstrengths. 
Extensive experiments are conducted on public MUSIC, AVSpeech and\nAudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E\nover other competitive methods. We will make the source code publicly\navailable. Project demo page: https://AVSEPage.github.io/\n","authors":["Xinyuan Qian","Jiaran Gao","Yaodan Zhang","Qiquan Zhang","Hexin Liu","Leibny Paola Garcia","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2411.07751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07650v1","updated":"2024-11-12T09:02:11Z","published":"2024-11-12T09:02:11Z","title":"Understanding Audiovisual Deepfake Detection: Techniques, Challenges,\n Human Factors and Perceptual Insights","summary":" Deep Learning has been successfully applied in diverse fields, and its impact\non deepfake detection is no exception. Deepfakes are fake yet realistic\nsynthetic content that can be used deceitfully for political impersonation,\nphishing, slandering, or spreading misinformation. Despite extensive research\non unimodal deepfake detection, identifying complex deepfakes through joint\nanalysis of audio and visual streams remains relatively unexplored. To fill\nthis gap, this survey first provides an overview of audiovisual deepfake\ngeneration techniques, applications, and their consequences, and then provides\na comprehensive review of state-of-the-art methods that combine audio and\nvisual modalities to enhance detection accuracy, summarizing and critically\nanalyzing their strengths and limitations. Furthermore, we discuss existing\nopen source datasets for a deeper understanding, which can contribute to the\nresearch community and provide necessary information to beginners who want to\nanalyze deep learning-based audiovisual methods for video forensics. 
By\nbridging the gap between unimodal and multimodal approaches, this paper aims to\nimprove the effectiveness of deepfake detection strategies and guide future\nresearch in cybersecurity and media integrity.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2411.07650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07539v1","updated":"2024-11-12T04:34:09Z","published":"2024-11-12T04:34:09Z","title":"Harmonizing Pixels and Melodies: Maestro-Guided Film Score Generation\n and Composition Style Transfer","summary":" We introduce a film score generation framework to harmonize visual pixels and\nmusic melodies utilizing a latent diffusion model. Our framework processes film\nclips as input and generates music that aligns with a general theme while\noffering the capability to tailor outputs to a specific composition style. Our\nmodel directly produces music from video, utilizing a streamlined and efficient\ntuning mechanism on ControlNet. It also integrates a film encoder adept at\nunderstanding the film's semantic depth, emotional impact, and aesthetic\nappeal. Additionally, we introduce a novel, effective yet straightforward\nevaluation metric to evaluate the originality and recognizability of music\nwithin film scores. To fill this gap for film scores, we curate a comprehensive\ndataset of film videos and legendary original scores, injecting domain-specific\nknowledge into our data-driven generation model. Our model outperforms existing\nmethodologies in creating film scores, capable of generating music that\nreflects the guidance of a maestro's style, thereby redefining the benchmark\nfor automated film scores and laying a robust groundwork for future research in\nthis domain. The code and generated samples are available at\nhttps://anonymous.4open.science/r/HPM.\n","authors":["F. Qi","L. Ni","C. 
Xu"],"pdf_url":"https://arxiv.org/pdf/2411.07539v1.pdf","comment":null}]},"2024-11-11T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2410.11119v3","updated":"2024-11-11T23:36:36Z","published":"2024-10-14T22:06:54Z","title":"ChuLo: Chunk-Level Key Information Representation for Long Document\n Processing","summary":" Transformer-based models have achieved remarkable success in various Natural\nLanguage Processing (NLP) tasks, yet their ability to handle long documents is\nconstrained by computational limitations. Traditional approaches, such as\ntruncating inputs, sparse self-attention, and chunking, attempt to mitigate\nthese issues, but they often lead to information loss and hinder the model's\nability to capture long-range dependencies. In this paper, we introduce ChuLo,\na novel chunk representation method for long document classification that\naddresses these limitations. Our ChuLo groups input tokens using unsupervised\nkeyphrase extraction, emphasizing semantically important keyphrase based chunk\nto retain core document content while reducing input length. This approach\nminimizes information loss and improves the efficiency of Transformer-based\nmodels. Preserving all tokens in long document understanding, especially token\nclassification tasks, is especially important to ensure that fine-grained\nannotations, which depend on the entire sequence context, are not lost. 
We\nevaluate our method on multiple long document classification tasks and long\ndocument token classification tasks, demonstrating its effectiveness through\ncomprehensive qualitative and quantitative analyses.\n","authors":["Yan Li","Soyeon Caren Han","Yue Dai","Feiqi Cao"],"pdf_url":"https://arxiv.org/pdf/2410.11119v3.pdf","comment":"The paper has been submitted to a conference and is currently under\n review"},{"id":"http://arxiv.org/abs/2409.06803v2","updated":"2024-11-11T23:33:50Z","published":"2024-09-10T18:14:02Z","title":"Decomposition of surprisal: Unified computational model of ERP\n components in language processing","summary":" The functional interpretation of language-related ERP components has been a\ncentral debate in psycholinguistics for decades. We advance an\ninformation-theoretic model of human language processing in the brain in which\nincoming linguistic input is processed at first shallowly and later with more\ndepth, with these two kinds of information processing corresponding to distinct\nelectroencephalographic signatures. Formally, we show that the information\ncontent (surprisal) of a word in context can be decomposed into two quantities:\n(A) shallow surprisal, which signals shallow processing difficulty for a word,\nand corresponds with the N400 signal; and (B) deep surprisal, which reflects\nthe discrepancy between shallow and deep representations, and corresponds to\nthe P600 signal and other late positivities. Both of these quantities can be\nestimated straightforwardly using modern NLP models. We validate our theory by\nsuccessfully simulating ERP patterns elicited by a variety of linguistic\nmanipulations in previously-reported experimental data from six experiments,\nwith successful novel qualitative and quantitative predictions. Our theory is\ncompatible with traditional cognitive theories assuming a `good-enough' shallow\nrepresentation stage, but with a precise information-theoretic formulation. 
The\nmodel provides an information-theoretic model of ERP components grounded on\ncognitive processes, and brings us closer to a fully-specified\nneuro-computational model of language processing.\n","authors":["Jiaxuan Li","Richard Futrell"],"pdf_url":"https://arxiv.org/pdf/2409.06803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.16914v3","updated":"2024-11-11T23:08:20Z","published":"2024-02-25T17:43:29Z","title":"DrAttack: Prompt Decomposition and Reconstruction Makes Powerful LLM\n Jailbreakers","summary":" The safety alignment of Large Language Models (LLMs) is vulnerable to both\nmanual and automated jailbreak attacks, which adversarially trigger LLMs to\noutput harmful content. However, current methods for jailbreaking LLMs, which\nnest entire harmful prompts, are not effective at concealing malicious intent\nand can be easily identified and rejected by well-aligned LLMs. This paper\ndiscovers that decomposing a malicious prompt into separated sub-prompts can\neffectively obscure its underlying malicious intent by presenting it in a\nfragmented, less detectable form, thereby addressing these limitations. We\nintroduce an automatic prompt \\textbf{D}ecomposition and\n\\textbf{R}econstruction framework for jailbreak \\textbf{Attack} (DrAttack).\nDrAttack includes three key components: (a) `Decomposition' of the original\nprompt into sub-prompts, (b) `Reconstruction' of these sub-prompts implicitly\nby in-context learning with semantically similar but harmless reassembling\ndemo, and (c) a `Synonym Search' of sub-prompts, aiming to find sub-prompts'\nsynonyms that maintain the original intent while jailbreaking LLMs. An\nextensive empirical study across multiple open-source and closed-source LLMs\ndemonstrates that, with a significantly reduced number of queries, DrAttack\nobtains a substantial gain of success rate over prior SOTA prompt-only\nattackers. 
Notably, the success rate of 78.0\\% on GPT-4 with merely 15 queries\nsurpassed previous art by 33.1\\%. The project is available at\nhttps://github.com/xirui-li/DrAttack.\n","authors":["Xirui Li","Ruochen Wang","Minhao Cheng","Tianyi Zhou","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2402.16914v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06770v3","updated":"2024-11-11T23:05:04Z","published":"2023-10-10T16:47:29Z","title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","summary":" Language models have outpaced our ability to evaluate them effectively, but\nfor their future development it is essential to study the frontier of their\ncapabilities. We find real-world software engineering to be a rich,\nsustainable, and challenging testbed for evaluating the next generation of\nlanguage models. To this end, we introduce SWE-bench, an evaluation framework\nconsisting of $2,294$ software engineering problems drawn from real GitHub\nissues and corresponding pull requests across $12$ popular Python repositories.\nGiven a codebase along with a description of an issue to be resolved, a\nlanguage model is tasked with editing the codebase to address the issue.\nResolving issues in SWE-bench frequently requires understanding and\ncoordinating changes across multiple functions, classes, and even files\nsimultaneously, calling for models to interact with execution environments,\nprocess extremely long contexts and perform complex reasoning that goes far\nbeyond traditional code generation tasks. Our evaluations show that both\nstate-of-the-art proprietary models and our fine-tuned model SWE-Llama can\nresolve only the simplest issues. The best-performing model, Claude 2, is able\nto solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps\ntowards LMs that are more practical, intelligent, and autonomous.\n","authors":["Carlos E. 
Jimenez","John Yang","Alexander Wettig","Shunyu Yao","Kexin Pei","Ofir Press","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2310.06770v3.pdf","comment":"Data, code, and leaderboard are available at https://www.swebench.com\n ICLR 2024, https://openreview.net/forum?id=VTF8yNQM66"},{"id":"http://arxiv.org/abs/2405.03111v2","updated":"2024-11-11T22:53:21Z","published":"2024-05-06T02:07:13Z","title":"Temporal Dynamics of Emotion and Cognition in Human Translation:\n Integrating the Task Segment Framework and the HOF Taxonomy","summary":" The paper develops a novel generative model of human translation processes\ngrounded in empirical translation process data. Assuming three processes that\nunfold concurrently in the translating mind, it integrates the Task Segment\nFramework (Munoz & Apfelthaler 2022) and the HOF taxonomy (Carl et al 2024)\ninto a coherent architecture: uninterrupted translation production is caused by\nroutinized/automated processes, cognitive/reflective interventions lead to\nlonger keystroke pauses, while emotional/affective states of the mind are\nidentified by distinctive gazing patterns. 
Utilizing data from the CRITT\nTranslation Process Research Database (TPR-DB), the paper illustrates how the\ntemporal structure of keystroke and gazing data can be related to the three\nassumed hidden mental processes that are believed to cause the observable data.\nThe paper relates this embedded generative model with Robinsons (2023)\nideosomatic theory of translation, opening exciting, new theoretical horizons\nfor Cognitive Translation Studies, grounded in empirical data and evaluation.\n","authors":["Michael Carl"],"pdf_url":"https://arxiv.org/pdf/2405.03111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07417v1","updated":"2024-11-11T22:44:29Z","published":"2024-11-11T22:44:29Z","title":"Untangling Hate Speech Definitions: A Semantic Componential Analysis\n Across Cultures and Domains","summary":" Hate speech relies heavily on cultural influences, leading to varying\nindividual interpretations. For that reason, we propose a Semantic Componential\nAnalysis (SCA) framework for a cross-cultural and cross-domain analysis of hate\nspeech definitions. We create the first dataset of definitions derived from\nfive domains: online dictionaries, research papers, Wikipedia articles,\nlegislation, and online platforms, which are later analyzed into semantic\ncomponents. Our analysis reveals that the components differ from definition to\ndefinition, yet many domains borrow definitions from one another without taking\ninto account the target culture. We conduct zero-shot model experiments using\nour proposed dataset, employing three popular open-sourced LLMs to understand\nthe impact of different definitions on hate speech detection. 
Our findings\nindicate that LLMs are sensitive to definitions: responses for hate speech\ndetection change according to the complexity of definitions used in the prompt.\n","authors":["Katerina Korre","Arianna Muti","Federico Ruggeri","Alberto Barrón-Cedeño"],"pdf_url":"https://arxiv.org/pdf/2411.07417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07407v1","updated":"2024-11-11T22:27:36Z","published":"2024-11-11T22:27:36Z","title":"Using Generative AI and Multi-Agents to Provide Automatic Feedback","summary":" This study investigates the use of generative AI and multi-agent systems to\nprovide automatic feedback in educational contexts, particularly for student\nconstructed responses in science assessments. The research addresses a key gap\nin the field by exploring how multi-agent systems, called AutoFeedback, can\nimprove the quality of GenAI-generated feedback, overcoming known issues such\nas over-praise and over-inference that are common in single-agent large\nlanguage models (LLMs). The study developed a multi-agent system consisting of\ntwo AI agents: one for generating feedback and another for validating and\nrefining it. The system was tested on a dataset of 240 student responses, and\nits performance was compared to that of a single-agent LLM. Results showed that\nAutoFeedback significantly reduced the occurrence of over-praise and\nover-inference errors, providing more accurate and pedagogically sound\nfeedback. 
The findings suggest that multi-agent systems can offer a more\nreliable solution for generating automated feedback in educational settings,\nhighlighting their potential for scalable and personalized learning support.\nThese results have important implications for educators and researchers seeking\nto leverage AI in formative assessments, offering a pathway to more effective\nfeedback mechanisms that enhance student learning outcomes.\n","authors":["Shuchen Guo","Ehsan Latif","Yifan Zhou","Xuan Huang","Xiaoming Zhai"],"pdf_url":"https://arxiv.org/pdf/2411.07407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07404v1","updated":"2024-11-11T22:22:21Z","published":"2024-11-11T22:22:21Z","title":"Controllable Context Sensitivity and the Knob Behind It","summary":" When making predictions, a language model must trade off how much it relies\non its context vs. its prior knowledge. Choosing how sensitive the model is to\nits context is a fundamental functionality, as it enables the model to excel at\ntasks like retrieval-augmented generation and question-answering. In this\npaper, we search for a knob which controls this sensitivity, determining\nwhether language models answer from the context or their prior knowledge. To\nguide this search, we design a task for controllable context sensitivity. In\nthis task, we first feed the model a context (Paris is in England) and a\nquestion (Where is Paris?); we then instruct the model to either use its prior\nor contextual knowledge and evaluate whether it generates the correct answer\nfor both intents (either France or England). When fine-tuned on this task,\ninstruction-tuned versions of Llama-3.1, Mistral-v0.3, and Gemma-2 can solve it\nwith high accuracy (85-95%). Analyzing these high-performing models, we narrow\ndown which layers may be important to context sensitivity using a novel linear\ntime algorithm. 
Then, in each model, we identify a 1-D subspace in a single\nlayer that encodes whether the model follows context or prior knowledge.\nInterestingly, while we identify this subspace in a fine-tuned model, we find\nthat the exact same subspace serves as an effective knob in not only that model\nbut also non-fine-tuned instruct and base models of that model family. Finally,\nwe show a strong correlation between a model's performance and how distinctly\nit separates context-agreeing from context-ignoring answers in this subspace.\nThese results suggest a single subspace facilitates how the model chooses\nbetween context and prior knowledge, hinting at a simple fundamental mechanism\nthat controls this behavior.\n","authors":["Julian Minder","Kevin Du","Niklas Stoehr","Giovanni Monea","Chris Wendler","Robert West","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2411.07404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.20724v2","updated":"2024-11-11T22:18:14Z","published":"2024-10-28T04:39:32Z","title":"Simple is Effective: The Roles of Graphs and Large Language Models in\n Knowledge-Graph-Based Retrieval-Augmented Generation","summary":" Large Language Models (LLMs) demonstrate strong reasoning abilities but face\nlimitations such as hallucinations and outdated knowledge. Knowledge Graph\n(KG)-based Retrieval-Augmented Generation (RAG) addresses these issues by\ngrounding LLM outputs in structured external knowledge from KGs. However,\ncurrent KG-based RAG frameworks still struggle to optimize the trade-off\nbetween retrieval effectiveness and efficiency in identifying a suitable amount\nof relevant graph information for the LLM to digest. We introduce SubgraphRAG,\nextending the KG-based RAG framework that retrieves subgraphs and leverages\nLLMs for reasoning and answer prediction. 
Our approach innovatively integrates\na lightweight multilayer perceptron with a parallel triple-scoring mechanism\nfor efficient and flexible subgraph retrieval while encoding directional\nstructural distances to enhance retrieval effectiveness. The size of retrieved\nsubgraphs can be flexibly adjusted to match the query's need and the downstream\nLLM's capabilities. This design strikes a balance between model complexity and\nreasoning power, enabling scalable and generalizable retrieval processes.\nNotably, based on our retrieved subgraphs, smaller LLMs like\nLlama3.1-8B-Instruct deliver competitive results with explainable reasoning,\nwhile larger models like GPT-4o achieve state-of-the-art accuracy compared with\nprevious baselines -- all without fine-tuning. Extensive evaluations on the\nWebQSP and CWQ benchmarks highlight SubgraphRAG's strengths in efficiency,\naccuracy, and reliability by reducing hallucinations and improving response\ngrounding.\n","authors":["Mufei Li","Siqi Miao","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2410.20724v2.pdf","comment":"Code available at https://github.com/Graph-COM/SubgraphRAG"},{"id":"http://arxiv.org/abs/2311.08303v2","updated":"2024-11-11T22:17:17Z","published":"2023-11-14T16:46:15Z","title":"Extrinsically-Focused Evaluation of Omissions in Medical Summarization","summary":" Large language models (LLMs) have shown promise in safety-critical\napplications such as healthcare, yet the ability to quantify performance has\nlagged. An example of this challenge is in evaluating a summary of the\npatient's medical record. A resulting summary can enable the provider to get a\nhigh-level overview of the patient's health status quickly. Yet, a summary that\nomits important facts about the patient's record can produce a misleading\npicture. This can lead to negative consequences on medical decision-making. We\npropose MED-OMIT as a metric to explore this challenge. 
We focus on using\nprovider-patient history conversations to generate a subjective (a summary of\nthe patient's history) as a case study. We begin by discretizing facts from the\ndialogue and identifying which are omitted from the subjective. To determine\nwhich facts are clinically relevant, we measure the importance of each fact to\na simulated differential diagnosis. We compare MED-OMIT's performance to that\nof clinical experts and find broad agreement We use MED-OMIT to evaluate LLM\nperformance on subjective generation and find some LLMs (gpt-4 and\nllama-3.1-405b) work well with little effort, while others (e.g. Llama 2)\nperform worse.\n","authors":["Elliot Schumacher","Daniel Rosenthal","Dhruv Naik","Varun Nair","Luladay Price","Geoffrey Tso","Anitha Kannan"],"pdf_url":"https://arxiv.org/pdf/2311.08303v2.pdf","comment":"Accepted to ML4H 2024"},{"id":"http://arxiv.org/abs/2409.17912v2","updated":"2024-11-11T22:14:04Z","published":"2024-09-26T14:56:38Z","title":"Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan\n Arabic Dialect","summary":" We introduce Atlas-Chat, the first-ever collection of LLMs specifically\ndeveloped for dialectal Arabic. 
Focusing on Moroccan Arabic, also known as\nDarija, we construct our instruction dataset by consolidating existing Darija\nlanguage resources, creating novel datasets both manually and synthetically,\nand translating English instructions with stringent quality control.\nAtlas-Chat-2B, 9B, and 27B models, fine-tuned on the dataset, exhibit superior\nability in following Darija instructions and performing standard NLP tasks.\nNotably, our models outperform both state-of-the-art and Arabic-specialized\nLLMs like LLaMa, Jais, and AceGPT, e.g., our 9B model gains a 13% performance\nboost over a larger 13B model on DarijaMMLU, in our newly introduced evaluation\nsuite for Darija covering both discriminative and generative tasks.\nFurthermore, we perform an experimental analysis of various fine-tuning\nstrategies and base model choices to determine optimal configurations. All our\nresources are publicly accessible, and we believe our work offers comprehensive\ndesign methodologies of instruction-tuning for low-resource languages, which\nare often neglected in favor of data-rich languages by contemporary LLMs.\n","authors":["Guokan Shang","Hadi Abdine","Yousef Khoubrane","Amr Mohamed","Yassine Abbahaddou","Sofiane Ennadir","Imane Momayiz","Xuguang Ren","Eric Moulines","Preslav Nakov","Michalis Vazirgiannis","Eric Xing"],"pdf_url":"https://arxiv.org/pdf/2409.17912v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07398v1","updated":"2024-11-11T22:08:48Z","published":"2024-11-11T22:08:48Z","title":"Beyond Keywords: A Context-based Hybrid Approach to Mining Ethical\n Concern-related App Reviews","summary":" With the increasing proliferation of mobile applications in our everyday\nexperiences, the concerns surrounding ethics have surged significantly. Users\ngenerally communicate their feedback, report issues, and suggest new\nfunctionalities in application (app) reviews, frequently emphasizing safety,\nprivacy, and accountability concerns. 
Incorporating these reviews is essential\nto developing successful products. However, app reviews related to ethical\nconcerns generally use domain-specific language and are expressed using a more\nvaried vocabulary. Thus making automated ethical concern-related app review\nextraction a challenging and time-consuming effort.\n This study proposes a novel Natural Language Processing (NLP) based approach\nthat combines Natural Language Inference (NLI), which provides a deep\ncomprehension of language nuances, and a decoder-only (LLaMA-like) Large\nLanguage Model (LLM) to extract ethical concern-related app reviews at scale.\nUtilizing 43,647 app reviews from the mental health domain, the proposed\nmethodology 1) Evaluates four NLI models to extract potential privacy reviews\nand compares the results of domain-specific privacy hypotheses with generic\nprivacy hypotheses; 2) Evaluates four LLMs for classifying app reviews to\nprivacy concerns; and 3) Uses the best NLI and LLM models further to extract\nnew privacy reviews from the dataset. Results show that the\nDeBERTa-v3-base-mnli-fever-anli NLI model with domain-specific hypotheses\nyields the best performance, and Llama3.1-8B-Instruct LLM performs best in the\nclassification of app reviews. Then, using NLI+LLM, an additional 1,008 new\nprivacy-related reviews were extracted that were not identified through the\nkeyword-based approach in previous research, thus demonstrating the\neffectiveness of the proposed approach.\n","authors":["Aakash Sorathiya","Gouri Ginde"],"pdf_url":"https://arxiv.org/pdf/2411.07398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07396v1","updated":"2024-11-11T22:06:51Z","published":"2024-11-11T22:06:51Z","title":"Toward Optimal Search and Retrieval for RAG","summary":" Retrieval-augmented generation (RAG) is a promising method for addressing\nsome of the memory-related challenges associated with Large Language Models\n(LLMs). 
Two separate systems form the RAG pipeline, the retriever and the\nreader, and the impact of each on downstream task performance is not\nwell-understood. Here, we work towards the goal of understanding how retrievers\ncan be optimized for RAG pipelines for common tasks such as Question Answering\n(QA). We conduct experiments focused on the relationship between retrieval and\nRAG performance on QA and attributed QA and unveil a number of insights useful\nto practitioners developing high-performance RAG pipelines. For example,\nlowering search accuracy has minor implications for RAG performance while\npotentially increasing retrieval speed and memory efficiency.\n","authors":["Alexandria Leto","Cecilia Aguerrebere","Ishwar Bhati","Ted Willke","Mariano Tepper","Vy Ai Vo"],"pdf_url":"https://arxiv.org/pdf/2411.07396v1.pdf","comment":"Accepted to NeurIPS 2024 Workshop ATTRIB"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.07439v1","updated":"2024-11-11T23:40:45Z","published":"2024-11-11T23:40:45Z","title":"Music Discovery Dialogue Generation Using Human Intent Analysis and\n Large Language Models","summary":" A conversational music retrieval system can help users discover music that\nmatches their preferences through dialogue. To achieve this, a conversational\nmusic retrieval system should seamlessly engage in multi-turn conversation by\n1) understanding user queries and 2) responding with natural language and\nretrieved music. A straightforward solution would be a data-driven approach\nutilizing such conversation logs. However, few datasets are available for the\nresearch and are limited in terms of volume and quality. In this paper, we\npresent a data generation framework for rich music discovery dialogue using a\nlarge language model (LLM) and user intents, system actions, and musical\nattributes. 
This is done by i) dialogue intent analysis using grounded theory,\nii) generating attribute sequences via cascading database filtering, and iii)\ngenerating utterances using large language models. By applying this framework\nto the Million Song dataset, we create LP-MusicDialog, a Large Language Model\nbased Pseudo Music Dialogue dataset, containing over 288k music conversations\nusing more than 319k music items. Our evaluation shows that the synthetic\ndataset is competitive with an existing, small human dialogue dataset in terms\nof dialogue consistency, item relevance, and naturalness. Furthermore, using\nthe dataset, we train a conversational music retrieval model and show promising\nresults.\n","authors":["SeungHeon Doh","Keunwoo Choi","Daeyong Kwon","Taesu Kim","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2411.07439v1.pdf","comment":"Accepted for publication at the 25th International Society for Music\n Information Retrieval Conference (ISMIR 2024)"},{"id":"http://arxiv.org/abs/2411.05059v2","updated":"2024-11-11T21:48:52Z","published":"2024-11-07T18:22:14Z","title":"FineTuneBench: How well do commercial fine-tuning APIs infuse knowledge\n into LLMs?","summary":" There is great interest in fine-tuning frontier large language models (LLMs)\nto inject new information and update existing knowledge. While commercial LLM\nfine-tuning APIs from providers such as OpenAI and Google promise flexible\nadaptation for various applications, the efficacy of fine-tuning remains\nunclear. In this study, we introduce FineTuneBench, an evaluation framework and\ndataset for understanding how well commercial fine-tuning APIs can successfully\nlearn new and updated knowledge. 
We analyze five frontier LLMs with\ncommercially available fine-tuning APIs, including GPT-4o and Gemini 1.5 Pro,\non their effectiveness in two settings: (1) ingesting novel information, such\nas recent news events and new people profiles, and (2) updating existing\nknowledge, such as updated medical guidelines and code frameworks. Our results\nreveal substantial shortcomings in all the models' abilities to effectively\nlearn new information through fine-tuning, with an average generalization\naccuracy of 37% across all models. When updating existing knowledge, such as\nincorporating medical guideline updates, commercial fine-tuning APIs show even\nmore limited capability (average generalization accuracy of 19%). Overall,\nfine-tuning GPT-4o mini is the most effective for infusing new knowledge and\nupdating knowledge, followed by GPT-3.5 Turbo and GPT-4o. The fine-tuning APIs\nfor Gemini 1.5 Flesh and Gemini 1.5 Pro are unable to learn new knowledge or\nupdate existing knowledge. These findings underscore a major shortcoming in\nusing current commercial fine-tuning services to achieve reliable knowledge\ninfusion in common scenarios. We open source the FineTuneBench dataset at\nhttps://github.com/kevinwu23/StanfordFineTuneBench.\n","authors":["Eric Wu","Kevin Wu","James Zou"],"pdf_url":"https://arxiv.org/pdf/2411.05059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.02606v2","updated":"2024-11-11T21:40:16Z","published":"2024-06-02T18:26:50Z","title":"Know Your Neighborhood: General and Zero-Shot Capable Binary Function\n Search Powered by Call Graphlets","summary":" Binary code similarity detection is an important problem with applications in\nareas such as malware analysis, vulnerability research and license violation\ndetection. This paper proposes a novel graph neural network architecture\ncombined with a novel graph data representation called call graphlets. 
A call\ngraphlet encodes the neighborhood around each function in a binary executable,\ncapturing the local and global context through a series of statistical\nfeatures. A specialized graph neural network model operates on this graph\nrepresentation, learning to map it to a feature vector that encodes semantic\nbinary code similarities using deep-metric learning. The proposed approach is\nevaluated across five distinct datasets covering different architectures,\ncompiler tool chains, and optimization levels. Experimental results show that\nthe combination of call graphlets and the novel graph neural network\narchitecture achieves comparable or state-of-the-art performance compared to\nbaseline techniques across cross-architecture, mono-architecture and zero shot\ntasks. In addition, our proposed approach also performs well when evaluated\nagainst an out-of-domain function inlining task. The work provides a general\nand effective graph neural network-based solution for conducting binary code\nsimilarity detection.\n","authors":["Joshua Collyer","Tim Watson","Iain Phillips"],"pdf_url":"https://arxiv.org/pdf/2406.02606v2.pdf","comment":"13 pages, Under-Review"},{"id":"http://arxiv.org/abs/2411.02537v3","updated":"2024-11-11T18:49:52Z","published":"2024-11-04T19:16:53Z","title":"INQUIRE: A Natural World Text-to-Image Retrieval Benchmark","summary":" We introduce INQUIRE, a text-to-image retrieval benchmark designed to\nchallenge multimodal vision-language models on expert-level queries. INQUIRE\nincludes iNaturalist 2024 (iNat24), a new dataset of five million natural world\nimages, along with 250 expert-level retrieval queries. These queries are paired\nwith all relevant images comprehensively labeled within iNat24, comprising\n33,000 total matches. Queries span categories such as species identification,\ncontext, behavior, and appearance, emphasizing tasks that require nuanced image\nunderstanding and domain expertise. 
Our benchmark evaluates two core retrieval\ntasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2)\nINQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed\nevaluation of a range of recent multimodal models demonstrates that INQUIRE\nposes a significant challenge, with the best models failing to achieve an\nmAP@50 above 50%. In addition, we show that reranking with more powerful\nmultimodal models can enhance retrieval performance, yet there remains a\nsignificant margin for improvement. By focusing on scientifically-motivated\necological challenges, INQUIRE aims to bridge the gap between AI capabilities\nand the needs of real-world scientific inquiry, encouraging the development of\nretrieval systems that can assist with accelerating ecological and biodiversity\nresearch. Our dataset and code are available at\nhttps://inquire-benchmark.github.io\n","authors":["Edward Vendrow","Omiros Pantazis","Alexander Shepard","Gabriel Brostow","Kate E. Jones","Oisin Mac Aodha","Sara Beery","Grant Van Horn"],"pdf_url":"https://arxiv.org/pdf/2411.02537v3.pdf","comment":"Published in NeurIPS 2024, Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2411.07166v1","updated":"2024-11-11T17:46:34Z","published":"2024-11-11T17:46:34Z","title":"The Shapley index for music streaming platforms","summary":" We study an index to measure the popularity of artists in music streaming\nplatforms. This index, which can be used to allocate the amount raised via paid\nsubscriptions among participating artists, is based on the Shapley value, a\ncenterpiece in cooperative game theory. We characterize this Shapley index\ncombining several axioms formalizing principles with normative appeal. This\npermits to place the index in the literature, as an alternative to the\nwell-known (and widely used in the industry) pro-rata and user-centric indices.\n","authors":["Gustavo Bergantiños","Juan D. 
Moreno-Ternero"],"pdf_url":"https://arxiv.org/pdf/2411.07166v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.07021v1","updated":"2024-11-11T14:25:37Z","published":"2024-11-11T14:25:37Z","title":"Invar-RAG: Invariant LLM-aligned Retrieval for Better Generation","summary":" Retrieval-augmented generation (RAG) has shown impressive capability in\nproviding reliable answer predictions and addressing hallucination problems. A\ntypical RAG implementation uses powerful retrieval models to extract external\ninformation and large language models (LLMs) to generate answers. In contrast,\nrecent LLM-based retrieval has gained attention for its substantial\nimprovements in information retrieval (IR) due to the LLMs' semantic\nunderstanding capability. However, directly applying LLM to RAG systems\npresents challenges. This may cause feature locality problems as massive\nparametric knowledge can hinder effective usage of global information across\nthe corpus; for example, an LLM-based retriever often inputs document summaries\ninstead of full documents. Moreover, various pre-trained tasks in LLMs\nintroduce variance, further weakening performance as a retriever.\n To address these issues, we propose a novel two-stage fine-tuning\narchitecture called Invar-RAG. In the retrieval stage, an LLM-based retriever\nis constructed by integrating LoRA-based representation learning to tackle\nfeature locality issues. To enhance retrieval performance, we develop two\npatterns (invariant and variant patterns) and an invariance loss to reduce LLM\nvariance. In the generation stage, a refined fine-tuning method is employed to\nimprove LLM accuracy in generating answers based on retrieved information.\nExperimental results show that Invar-RAG significantly outperforms existing\nbaselines across three open-domain question answering (ODQA) datasets. 
Code is\navailable in the Supplementary Material for reproducibility.\n","authors":["Ziwei Liu","Liang Zhang","Qian Li","Jianghua Wu","Guangxu Zhu"],"pdf_url":"https://arxiv.org/pdf/2411.07021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.02971v3","updated":"2024-11-11T11:46:16Z","published":"2022-08-05T03:45:56Z","title":"CROLoss: Towards a Customizable Loss for Retrieval Models in Recommender\n Systems","summary":" In large-scale recommender systems, retrieving top N relevant candidates\naccurately with resource constrain is crucial. To evaluate the performance of\nsuch retrieval models, Recall@N, the frequency of positive samples being\nretrieved in the top N ranking, is widely used. However, most of the\nconventional loss functions for retrieval models such as softmax cross-entropy\nand pairwise comparison methods do not directly optimize Recall@N. Moreover,\nthose conventional loss functions cannot be customized for the specific\nretrieval size N required by each application and thus may lead to sub-optimal\nperformance. In this paper, we proposed the Customizable Recall@N Optimization\nLoss (CROLoss), a loss function that can directly optimize the Recall@N metrics\nand is customizable for different choices of N. This proposed CROLoss\nformulation defines a more generalized loss function space, covering most of\nthe conventional loss functions as special cases. Furthermore, we develop the\nLambda method, a gradient-based method that invites more flexibility and can\nfurther boost the system performance. We evaluate the proposed CROLoss on two\npublic benchmark datasets. The results show that CROLoss achieves SOTA results\nover conventional loss functions for both datasets with various choices of\nretrieval size N. 
CROLoss has been deployed onto our online E-commerce\nadvertising platform, where a fourteen-day online A/B test demonstrated that\nCROLoss contributes to a significant business revenue growth of 4.75%.\n","authors":["Yongxiang Tang","Wentao Bai","Guilin Li","Xialong Liu","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2208.02971v3.pdf","comment":"9 pages, 5 figures. Accepted by by CIKM 2022"},{"id":"http://arxiv.org/abs/2411.06877v1","updated":"2024-11-11T11:17:35Z","published":"2024-11-11T11:17:35Z","title":"LLM-Assisted Relevance Assessments: When Should We Ask LLMs for Help?","summary":" Test collections are information retrieval tools that allow researchers to\nquickly and easily evaluate ranking algorithms. While test collections have\nbecome an integral part of IR research, the process of data creation involves\nsignificant efforts in manual annotations, which often makes it very expensive\nand time-consuming. Thus, the test collections could become small when the\nbudget is limited, which may lead to unstable evaluations. As an alternative,\nrecent studies have proposed the use of large language models (LLMs) to\ncompletely replace human assessors. However, while LLMs seem to somewhat\ncorrelate with human judgments, they are not perfect and often show bias.\nMoreover, even if a well-performing LLM or prompt is found on one dataset,\nthere is no guarantee that it will perform similarly in practice, due to\ndifference in tasks and data. Thus a complete replacement with LLMs is argued\nto be too risky and not fully trustable.\n Thus, in this paper, we propose \\textbf{L}LM-\\textbf{A}ssisted\n\\textbf{R}elevance \\textbf{A}ssessments (\\textbf{LARA}), an effective method to\nbalance manual annotations with LLM annotations, which helps to make a rich and\nreliable test collection. We use the LLM's predicted relevance probabilities in\norder to select the most profitable documents to manually annotate under a\nbudget constraint. 
While solely relying on LLM's predicted probabilities to\nmanually annotate performs fairly well, with theoretical reasoning, LARA guides\nthe human annotation process even more effectively via online calibration\nlearning. Then, using the calibration model learned from the limited manual\nannotations, LARA debiases the LLM predictions to annotate the remaining\nnon-assessed data. Empirical evaluations on TREC-COVID and TREC-8 Ad Hoc\ndatasets show that LARA outperforms the alternative solutions under almost any\nbudget constraint.\n","authors":["Rikiya Takehi","Ellen M. Voorhees","Tetsuya Sakai"],"pdf_url":"https://arxiv.org/pdf/2411.06877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.13704v2","updated":"2024-11-11T10:02:24Z","published":"2024-09-05T10:27:32Z","title":"Entity Extraction from High-Level Corruption Schemes via Large Language\n Models","summary":" The rise of financial crime that has been observed in recent years has\ncreated an increasing concern around the topic and many people, organizations\nand governments are more and more frequently trying to combat it. Despite the\nincrease of interest in this area, there is a lack of specialized datasets that\ncan be used to train and evaluate works that try to tackle those problems. This\narticle proposes a new micro-benchmark dataset for algorithms and models that\nidentify individuals and organizations, and their multiple writings, in news\narticles, and presents an approach that assists in its creation. Experimental\nefforts are also reported, using this dataset, to identify individuals and\norganizations in financial-crime-related articles using various low-billion\nparameter Large Language Models (LLMs). For these experiments, standard metrics\n(Accuracy, Precision, Recall, F1 Score) are reported and various prompt\nvariants comprising the best practices of prompt engineering are tested. 
In\naddition, to address the problem of ambiguous entity mentions, a simple, yet\neffective LLM-based disambiguation method is proposed, ensuring that the\nevaluation aligns with reality. Finally, the proposed approach is compared\nagainst a widely used state-of-the-art open-source baseline, showing the\nsuperiority of the proposed method.\n","authors":["Panagiotis Koletsis","Panagiotis-Konstantinos Gemos","Christos Chronis","Iraklis Varlamis","Vasilis Efthymiou","Georgios Th. Papadopoulos"],"pdf_url":"https://arxiv.org/pdf/2409.13704v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06826v1","updated":"2024-11-11T09:39:31Z","published":"2024-11-11T09:39:31Z","title":"Adaptive Conditional Expert Selection Network for Multi-domain\n Recommendation","summary":" Mixture-of-Experts (MOE) has recently become the de facto standard in\nMulti-domain recommendation (MDR) due to its powerful expressive ability.\nHowever, such MOE-based method typically employs all experts for each instance,\nleading to scalability issue and low-discriminability between domains and\nexperts. Furthermore, the design of commonly used domain-specific networks\nexacerbates the scalability issues. To tackle the problems, We propose a novel\nmethod named CESAA consists of Conditional Expert Selection (CES) Module and\nAdaptive Expert Aggregation (AEA) Module to tackle these challenges.\nSpecifically, CES first combines a sparse gating strategy with domain-shared\nexperts. Then AEA utilizes mutual information loss to strengthen the\ncorrelations between experts and specific domains, and significantly improve\nthe distinction between experts. As a result, only domain-shared experts and\nselected domain-specific experts are activated for each instance, striking a\nbalance between computational efficiency and model performance. 
Experimental\nresults on both public ranking and industrial retrieval datasets verify the\neffectiveness of our method in MDR tasks.\n","authors":["Kuiyao Dong","Xingyu Lou","Feng Liu","Ruian Wang","Wenyi Yu","Ping Wang","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2411.06826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06823v1","updated":"2024-11-11T09:31:46Z","published":"2024-11-11T09:31:46Z","title":"Large Language Model in Medical Informatics: Direct Classification and\n Enhanced Text Representations for Automatic ICD Coding","summary":" Addressing the complexity of accurately classifying International\nClassification of Diseases (ICD) codes from medical discharge summaries is\nchallenging due to the intricate nature of medical documentation. This paper\nexplores the use of Large Language Models (LLM), specifically the LLAMA\narchitecture, to enhance ICD code classification through two methodologies:\ndirect application as a classifier and as a generator of enriched text\nrepresentations within a Multi-Filter Residual Convolutional Neural Network\n(MultiResCNN) framework. We evaluate these methods by comparing them against\nstate-of-the-art approaches, revealing LLAMA's potential to significantly\nimprove classification outcomes by providing deep contextual insights into\nmedical texts.\n","authors":["Zeyd Boukhers","AmeerAli Khan","Qusai Ramadan","Cong Yang"],"pdf_url":"https://arxiv.org/pdf/2411.06823v1.pdf","comment":"accepted at the 2024 IEEE International Conference on Bioinformatics\n and Biomedicine (BIBM 2024)"},{"id":"http://arxiv.org/abs/2411.06805v1","updated":"2024-11-11T09:03:52Z","published":"2024-11-11T09:03:52Z","title":"AssistRAG: Boosting the Potential of Large Language Models with an\n Intelligent Information Assistant","summary":" The emergence of Large Language Models (LLMs) has significantly advanced\nnatural language processing, but these models often generate factually\nincorrect information, known as \"hallucination\". 
Initial retrieval-augmented\ngeneration (RAG) methods like the \"Retrieve-Read\" framework was inadequate for\ncomplex reasoning tasks. Subsequent prompt-based RAG strategies and Supervised\nFine-Tuning (SFT) methods improved performance but required frequent retraining\nand risked altering foundational LLM capabilities. To cope with these\nchallenges, we propose Assistant-based Retrieval-Augmented Generation\n(AssistRAG), integrating an intelligent information assistant within LLMs. This\nassistant manages memory and knowledge through tool usage, action execution,\nmemory building, and plan specification. Using a two-phase training approach,\nCurriculum Assistant Learning and Reinforced Preference Optimization. AssistRAG\nenhances information retrieval and decision-making. Experiments show AssistRAG\nsignificantly outperforms benchmarks, especially benefiting less advanced LLMs,\nby providing superior reasoning capabilities and accurate responses.\n","authors":["Yujia Zhou","Zheng Liu","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2411.06805v1.pdf","comment":"Accepted by NeurIPS 2024 (poster)"},{"id":"http://arxiv.org/abs/2411.06784v1","updated":"2024-11-11T08:23:37Z","published":"2024-11-11T08:23:37Z","title":"Boosting the Targeted Transferability of Adversarial Examples via\n Salient Region & Weighted Feature Drop","summary":" Deep neural networks can be vulnerable to adversarially crafted examples,\npresenting significant risks to practical applications. A prevalent approach\nfor adversarial attacks relies on the transferability of adversarial examples,\nwhich are generated from a substitute model and leveraged to attack unknown\nblack-box models. Despite various proposals aimed at improving transferability,\nthe success of these attacks in targeted black-box scenarios is often hindered\nby the tendency for adversarial examples to overfit to the surrogate models. 
In\nthis paper, we introduce a novel framework based on Salient region & Weighted\nFeature Drop (SWFD) designed to enhance the targeted transferability of\nadversarial examples. Drawing from the observation that examples with higher\ntransferability exhibit smoother distributions in the deep-layer outputs, we\npropose the weighted feature drop mechanism to modulate activation values\naccording to weights scaled by norm distribution, effectively addressing the\noverfitting issue when generating adversarial examples. Additionally, by\nleveraging salient region within the image to construct auxiliary images, our\nmethod enables the adversarial example's features to be transferred to the\ntarget category in a model-agnostic manner, thereby enhancing the\ntransferability. Comprehensive experiments confirm that our approach\noutperforms state-of-the-art methods across diverse configurations. On average,\nthe proposed SWFD raises the attack success rate for normally trained models\nand robust models by 16.31% and 7.06% respectively.\n","authors":["Shanjun Xu","Linghui Li","Kaiguo Yuan","Bingyu Li"],"pdf_url":"https://arxiv.org/pdf/2411.06784v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2409.14038v4","updated":"2024-11-11T06:26:39Z","published":"2024-09-21T06:49:34Z","title":"OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model\n Hallucinations in Ontology Matching","summary":" Hallucinations of large language models (LLMs) commonly occur in\ndomain-specific downstream tasks, with no exception in ontology matching (OM).\nThe prevalence of using LLMs for OM raises the need for benchmarks to better\nunderstand LLM hallucinations. The OAEI-LLM dataset is an extended version of\nthe Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate\nLLM-specific hallucinations in OM tasks. 
We outline the methodology used in\ndataset construction and schema extension, and provide examples of potential\nuse cases.\n","authors":["Zhangcheng Qiang","Kerry Taylor","Weiqing Wang","Jing Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.14038v4.pdf","comment":"5 pages, 1 figure, 1 table"},{"id":"http://arxiv.org/abs/2310.09874v4","updated":"2024-11-11T06:16:24Z","published":"2023-10-15T16:15:07Z","title":"TF-DCon: Leveraging Large Language Models (LLMs) to Empower\n Training-Free Dataset Condensation for Content-Based Recommendation","summary":" Modern techniques in Content-based Recommendation (CBR) leverage item content\ninformation to provide personalized services to users, but suffer from\nresource-intensive training on large datasets. To address this issue, we\nexplore the dataset condensation for textual CBR in this paper. The goal of\ndataset condensation is to synthesize a small yet informative dataset, upon\nwhich models can achieve performance comparable to those trained on large\ndatasets. While existing condensation approaches are tailored to classification\ntasks for continuous data like images or embeddings, direct application of them\nto CBR has limitations. To bridge this gap, we investigate efficient dataset\ncondensation for content-based recommendation. Inspired by the remarkable\nabilities of large language models (LLMs) in text comprehension and generation,\nwe leverage LLMs to empower the generation of textual content during\ncondensation. To handle the interaction data involving both users and items, we\ndevise a dual-level condensation method: content-level and user-level. At\ncontent-level, we utilize LLMs to condense all contents of an item into a new\ninformative title. At user-level, we design a clustering-based synthesis\nmodule, where we first utilize LLMs to extract user interests. Then, the user\ninterests and user embeddings are incorporated to condense users and generate\ninteractions for condensed users. 
Notably, the condensation paradigm of this\nmethod is forward and free from iterative optimization on the synthesized\ndataset. Extensive empirical findings from our study, conducted on three\nauthentic datasets, substantiate the efficacy of the proposed method.\nParticularly, we are able to approximate up to 97% of the original performance\nwhile reducing the dataset size by 95% (i.e., on dataset MIND).\n","authors":["Jiahao Wu","Qijiong Liu","Hengchang Hu","Wenqi Fan","Shengcai Liu","Qing Li","Xiao-Ming Wu","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2310.09874v4.pdf","comment":"An updated version"}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.07428v1","updated":"2024-11-11T23:05:02Z","published":"2024-11-11T23:05:02Z","title":"Just Label the Repeats for In-The-Wild Audio-to-Score Alignment","summary":" We propose an efficient workflow for high-quality offline alignment of\nin-the-wild performance audio and corresponding sheet music scans (images).\nRecent work on audio-to-score alignment extends dynamic time warping (DTW) to\nbe theoretically able to handle jumps in sheet music induced by repeat\nsigns-this method requires no human annotations, but we show that it often\nyields low-quality alignments. As an alternative, we propose a workflow and\ninterface that allows users to quickly annotate jumps (by clicking on repeat\nsigns), requiring a small amount of human supervision but yielding much higher\nquality alignments on average. Additionally, we refine audio and score feature\nrepresentations to improve alignment quality by: (1) integrating measure\ndetection into the score feature representation, and (2) using raw onset\nprediction probabilities from a music transcription model instead of piano\nroll. We propose an evaluation protocol for audio-to-score alignment that\ncomputes the distance between the estimated and ground truth alignment in units\nof measures. 
Under this evaluation, we find that our proposed jump annotation\nworkflow and improved feature representations together improve alignment\naccuracy by 150% relative to prior work (33% to 82%).\n","authors":["Irmak Bukey","Michael Feffer","Chris Donahue"],"pdf_url":"https://arxiv.org/pdf/2411.07428v1.pdf","comment":"25th International Society for Music Information Retrieval\n Conference, San Francisco, 2024"},{"id":"http://arxiv.org/abs/2411.07335v1","updated":"2024-11-11T19:53:05Z","published":"2024-11-11T19:53:05Z","title":"Multimodal Fusion Balancing Through Game-Theoretic Regularization","summary":" Multimodal learning can complete the picture of information extraction by\nuncovering key dependencies between data sources. However, current systems fail\nto fully leverage multiple modalities for optimal performance. This has been\nattributed to modality competition, where modalities strive for training\nresources, leaving some underoptimized. We show that current balancing methods\nstruggle to train multimodal models that surpass even simple baselines, such as\nensembles. This raises the question: how can we ensure that all modalities in\nmultimodal training are sufficiently trained, and that learning from new\nmodalities consistently improves performance? This paper proposes the\nMultimodal Competition Regularizer (MCR), a new loss component inspired by\nmutual information (MI) decomposition designed to prevent the adverse effects\nof competition in multimodal training. Our key contributions are: 1)\nIntroducing game-theoretic principles in multimodal learning, where each\nmodality acts as a player competing to maximize its influence on the final\noutcome, enabling automatic balancing of the MI terms. 2) Refining lower and\nupper bounds for each MI term to enhance the extraction of task-relevant unique\nand shared information across modalities. 
3) Suggesting latent space\npermutations for conditional MI estimation, significantly improving\ncomputational efficiency. MCR outperforms all previously suggested training\nstrategies and is the first to consistently improve multimodal learning beyond\nthe ensemble baseline, clearly demonstrating that combining modalities leads to\nsignificant performance gains on both synthetic and large real-world datasets.\n","authors":["Konstantinos Kontras","Thomas Strypsteen","Christos Chatzichristos","Paul P. Liang","Matthew Blaschko","Maarten De Vos"],"pdf_url":"https://arxiv.org/pdf/2411.07335v1.pdf","comment":"21 pages, 6 figures, 4 tables, 1 algorithm"},{"id":"http://arxiv.org/abs/2411.07155v1","updated":"2024-11-11T17:32:55Z","published":"2024-11-11T17:32:55Z","title":"Low Complexity Learning-based Lossless Event-based Compression","summary":" Event cameras are a cutting-edge type of visual sensors that capture data by\ndetecting brightness changes at the pixel level asynchronously. These cameras\noffer numerous benefits over conventional cameras, including high temporal\nresolution, wide dynamic range, low latency, and lower power consumption.\nHowever, the substantial data rates they produce require efficient compression\ntechniques, while also fulfilling other typical application requirements, such\nas the ability to respond to visual changes in real-time or near real-time.\nAdditionally, many event-based applications demand high accuracy, making\nlossless coding desirable, as it retains the full detail of the sensor data.\nLearning-based methods show great potential due to their ability to model the\nunique characteristics of event data thus allowing to achieve high compression\nrates. This paper proposes a low-complexity lossless coding solution based on\nthe quadtree representation that outperforms traditional compression algorithms\nin efficiency and speed, ensuring low computational complexity and minimal\ndelay for real-time applications. 
Experimental results show that the proposed\nmethod delivers better compression ratios, i.e., with fewer bits per event, and\nlower computational complexity compared to current lossless data compression\nmethods.\n","authors":["Ahmadreza Sezavar","Catarina Brites","Joao Ascenso"],"pdf_url":"https://arxiv.org/pdf/2411.07155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06976v1","updated":"2024-11-11T13:34:24Z","published":"2024-11-11T13:34:24Z","title":"A Hierarchical Compression Technique for 3D Gaussian Splatting\n Compression","summary":" 3D Gaussian Splatting (GS) demonstrates excellent rendering quality and\ngeneration speed in novel view synthesis. However, substantial data size poses\nchallenges for storage and transmission, making 3D GS compression an essential\ntechnology. Current 3D GS compression research primarily focuses on developing\nmore compact scene representations, such as converting explicit 3D GS data into\nimplicit forms. In contrast, compression of the GS data itself has hardly been\nexplored. To address this gap, we propose a Hierarchical GS Compression (HGSC)\ntechnique. Initially, we prune unimportant Gaussians based on importance scores\nderived from both global and local significance, effectively reducing\nredundancy while maintaining visual quality. An Octree structure is used to\ncompress 3D positions. Based on the 3D GS Octree, we implement a hierarchical\nattribute compression strategy by employing a KD-tree to partition the 3D GS\ninto multiple blocks. We apply farthest point sampling to select anchor\nprimitives within each block and others as non-anchor primitives with varying\nLevels of Details (LoDs). Anchor primitives serve as reference points for\npredicting non-anchor primitives across different LoDs to reduce spatial\nredundancy. For anchor primitives, we use the region adaptive hierarchical\ntransform to achieve near-lossless compression of various attributes. 
For\nnon-anchor primitives, each is predicted based on the k-nearest anchor\nprimitives. To further minimize prediction errors, the reconstructed LoD and\nanchor primitives are combined to form new anchor primitives to predict the\nnext LoD. Our method notably achieves superior compression quality and a\nsignificant data size reduction of over 4.5 times compared to the\nstate-of-the-art compression method on small scenes datasets.\n","authors":["He Huang","Wenjie Huang","Qi Yang","Yiling Xu","Zhu li"],"pdf_url":"https://arxiv.org/pdf/2411.06976v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06810v1","updated":"2024-11-11T09:11:01Z","published":"2024-11-11T09:11:01Z","title":"JPEG AI Image Compression Visual Artifacts: Detection Methods and\n Dataset","summary":" Learning-based image compression methods have improved in recent years and\nstarted to outperform traditional codecs. However, neural-network approaches\ncan unexpectedly introduce visual artifacts in some images. We therefore\npropose methods to separately detect three types of artifacts (texture and\nboundary degradation, color change, and text corruption), to localize the\naffected regions, and to quantify the artifact strength. We consider only those\nregions that exhibit distortion due solely to the neural compression but that a\ntraditional codec recovers successfully at a comparable bitrate. We employed\nour methods to collect artifacts for the JPEG AI verification model with\nrespect to HM-18.0, the H.265 reference software. We processed about 350,000\nunique images from the Open Images dataset using different compression-quality\nparameters; the result is a dataset of 46,440 artifacts validated through\ncrowd-sourced subjective assessment. Our proposed dataset and methods are\nvaluable for testing neural-network-based image codecs, identifying bugs in\nthese codecs, and enhancing their performance. 
We make source code of the\nmethods and the dataset publicly available.\n","authors":["Daria Tsereh","Mark Mirgaleev","Ivan Molodetskikh","Roman Kazantsev","Dmitriy Vatolin"],"pdf_url":"https://arxiv.org/pdf/2411.06810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.04097v2","updated":"2024-11-11T09:05:15Z","published":"2024-05-07T07:57:15Z","title":"Unmasking Illusions: Understanding Human Perception of Audiovisual\n Deepfakes","summary":" The emergence of contemporary deepfakes has attracted significant attention\nin machine learning research, as artificial intelligence (AI) generated\nsynthetic media increases the incidence of misinterpretation and is difficult\nto distinguish from genuine content. Currently, machine learning techniques\nhave been extensively studied for automatically detecting deepfakes. However,\nhuman perception has been less explored. Malicious deepfakes could ultimately\ncause public and social problems. Can we humans correctly perceive the\nauthenticity of the content of the videos we watch? The answer is obviously\nuncertain; therefore, this paper aims to evaluate the human ability to discern\ndeepfake videos through a subjective study. We present our findings by\ncomparing human observers to five state-ofthe-art audiovisual deepfake\ndetection models. To this end, we used gamification concepts to provide 110\nparticipants (55 native English speakers and 55 non-native English speakers)\nwith a webbased platform where they could access a series of 40 videos (20 real\nand 20 fake) to determine their authenticity. Each participant performed the\nexperiment twice with the same 40 videos in different random orders. The videos\nare manually selected from the FakeAVCeleb dataset. We found that all AI models\nperformed better than humans when evaluated on the same 40 videos. The study\nalso reveals that while deception is not impossible, humans tend to\noverestimate their detection capabilities. 
Our experimental results may help\nbenchmark human versus machine performance, advance forensics analysis, and\nenable adaptive countermeasures.\n","authors":["Ammarah Hashmi","Sahibzada Adil Shahzad","Chia-Wen Lin","Yu Tsao","Hsin-Min Wang"],"pdf_url":"https://arxiv.org/pdf/2405.04097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06742v1","updated":"2024-11-11T06:40:06Z","published":"2024-11-11T06:40:06Z","title":"Loss-tolerant neural video codec aware congestion control for real time\n video communication","summary":" Because of reinforcement learning's (RL) ability to automatically create more\nadaptive controlling logics beyond the hand-crafted heuristics, numerous effort\nhas been made to apply RL to congestion control (CC) design for real time video\ncommunication (RTC) applications and has successfully shown promising benefits\nover the rule-based RTC CCs. Online reinforcement learning is often adopted to\ntrain the RL models so the models can directly adapt to real network\nenvironments. However, its trail-and-error manner can also cause catastrophic\ndegradation of the quality of experience (QoE) of RTC application at run time.\nThus, safeguard strategies such as falling back to hand-crafted heuristics can\nbe used to run along with RL models to guarantee the actions explored in the\ntraining sensible, despite that these safeguard strategies interrupt the\nlearning process and make it more challenging to discover optimal RL policies.\n The recent emergence of loss-tolerant neural video codecs (NVC) naturally\nprovides a layer of protection for the online learning of RL-based congestion\ncontrol because of its resilience to packet losses, but such packet loss\nresilience have not been fully exploited in prior works yet. In this paper, we\npresent a reinforcement learning (RL) based congestion control which can be\naware of and takes advantage of packet loss tolerance characteristic of NVCs\nvia reward in online RL learning. 
Through extensive evaluation on various\nvideos and network traces in a simulated environment, we demonstrate that our\nNVC-aware CC running with the loss-tolerant NVC reduces the training time by\n41\\% compared to other prior RL-based CCs. It also boosts the mean video\nquality by 0.3 to 1.6dB, lower the tail frame delay by 3 to 200ms, and reduces\nthe video stalls by 20\\% to 77\\% in comparison with other baseline RTC CCs.\n","authors":["Zhengxu Xia","Hanchen Li","Junchen Jiang"],"pdf_url":"https://arxiv.org/pdf/2411.06742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02188v2","updated":"2024-11-11T05:14:15Z","published":"2023-12-01T23:56:00Z","title":"Video Summarization: Towards Entity-Aware Captions","summary":" Existing popular video captioning benchmarks and models deal with generic\ncaptions devoid of specific person, place or organization named entities. In\ncontrast, news videos present a challenging setting where the caption requires\nsuch named entities for meaningful summarization. As such, we propose the task\nof summarizing news video directly to entity-aware captions. We also release a\nlarge-scale dataset, VIEWS (VIdeo NEWS), to support research on this task.\nFurther, we propose a method that augments visual information from videos with\ncontext retrieved from external world knowledge to generate entity-aware\ncaptions. We demonstrate the effectiveness of our approach on three video\ncaptioning models. We also show that our approach generalizes to existing news\nimage captions dataset. With all the extensive experiments and insights, we\nbelieve we establish a solid basis for future research on this challenging\ntask.\n","authors":["Hammad A. 
Ayyubi","Tianqi Liu","Arsha Nagrani","Xudong Lin","Mingda Zhang","Anurag Arnab","Feng Han","Yukun Zhu","Jialu Liu","Shih-Fu Chang"],"pdf_url":"https://arxiv.org/pdf/2312.02188v2.pdf","comment":null}]},"2024-11-10T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.11361v3","updated":"2024-11-10T23:58:53Z","published":"2023-12-18T17:18:04Z","title":"\"Knowing When You Don't Know\": A Multilingual Relevance Assessment\n Dataset for Robust Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) grounds Large Language Model (LLM)\noutput by leveraging external knowledge sources to reduce factual\nhallucinations. However, prior work lacks a comprehensive evaluation of\ndifferent language families, making it challenging to evaluate LLM robustness\nagainst errors in external retrieved knowledge. To overcome this, we establish\nNoMIRACL, a human-annotated dataset for evaluating LLM robustness in RAG across\n18 typologically diverse languages. NoMIRACL includes both a non-relevant and a\nrelevant subset. Queries in the non-relevant subset contain passages judged as\nnon-relevant, whereas queries in the relevant subset include at least a single\njudged relevant passage. We measure relevance assessment using: (i)\nhallucination rate, measuring model tendency to hallucinate, when the answer is\nnot present in passages in the non-relevant subset, and (ii) error rate,\nmeasuring model inaccuracy to recognize relevant passages in the relevant\nsubset.In our work, we observe that most models struggle to balance the two\ncapacities. Models such as LLAMA-2 and Orca-2 achieve over 88% hallucination\nrate on the non-relevant subset. Mistral and LLAMA-3 hallucinate less but can\nachieve up to a 74.9% error rate on the relevant subset. Overall, GPT-4 is\nobserved to provide the best tradeoff on both subsets, highlighting future work\nnecessary to improve LLM robustness. 
NoMIRACL dataset and evaluation code are\navailable at: https://github.com/project-miracl/nomiracl.\n","authors":["Nandan Thakur","Luiz Bonifacio","Xinyu Zhang","Odunayo Ogundepo","Ehsan Kamalloo","David Alfonso-Hermelo","Xiaoguang Li","Qun Liu","Boxing Chen","Mehdi Rezagholizadeh","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.11361v3.pdf","comment":"EMNLP 2024 (Findings)"},{"id":"http://arxiv.org/abs/2406.10593v2","updated":"2024-11-10T13:45:48Z","published":"2024-06-15T10:54:54Z","title":"QDA-SQL: Questions Enhanced Dialogue Augmentation for Multi-Turn\n Text-to-SQL","summary":" Fine-tuning large language models (LLMs) for specific domain tasks has\nachieved great success in Text-to-SQL tasks. However, these fine-tuned models\noften face challenges with multi-turn Text-to-SQL tasks caused by ambiguous or\nunanswerable questions. It is desired to enhance LLMs to handle multiple types\nof questions in multi-turn Text-to-SQL tasks. To address this, we propose a\nnovel data augmentation method, called QDA-SQL, which generates multiple types\nof multi-turn Q\\&A pairs using LLMs. In QDA-SQL, we introduce a method\nincorporating validation and correction mechanisms to handle complex multi-turn\nText-to-SQL tasks. Experimental results demonstrate that QDA-SQL enables\nfine-tuned models to exhibit higher performance on SQL statement accuracy and\nenhances their ability to handle complex, unanswerable questions in multi-turn\nText-to-SQL tasks. 
The generation script and test set are released at\nhttps://github.com/mcxiaoxiao/QDA-SQL\n","authors":["Yinggang Sun","Ziming Guo","Haining Yu","Chuanyi Liu","Xiang Li","Bingxuan Wang","Xiangzhan Yu","Tiancheng Zhao"],"pdf_url":"https://arxiv.org/pdf/2406.10593v2.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2411.06420v1","updated":"2024-11-10T10:49:13Z","published":"2024-11-10T10:49:13Z","title":"Generating Mixcode Popular Songs with Artificial Intelligence: Concepts,\n Plans, and Speculations","summary":" Music is a potent form of expression that can communicate, accentuate or even\ncreate the emotions of an individual or a collective. Both historically and in\ncontemporary experiences, musical expression was and is commonly\ninstrumentalized for social, political and/or economic purposes. Generative\nartificial intelligence provides a wealth of both opportunities and challenges\nwith regard to music and its role in society. This paper discusses a proposed\nproject integrating artificial intelligence and popular music, with the\nultimate goal of creating a powerful tool for implementing music for social\ntransformation, education, healthcare, and emotional well-being. Given that it\nis being presented at the outset of a collaboration between a computer\nscientist/data analyst and an ethnomusicologist/social anthropologist. 
it is\nmainly conceptual and somewhat speculative in nature.\n","authors":["Abhishek Kaushik","Kayla Rush"],"pdf_url":"https://arxiv.org/pdf/2411.06420v1.pdf","comment":"Link to the paper:https://aimc2024.pubpub.org/pub/rdulfbve/release/1\n Published in The International Conference on AI and Musical Creativity at the\n University of Oxford (2024) https://aimc2024.pubpub.org/"},{"id":"http://arxiv.org/abs/2410.13293v2","updated":"2024-11-10T08:53:38Z","published":"2024-10-17T07:46:49Z","title":"SBI-RAG: Enhancing Math Word Problem Solving for Students through\n Schema-Based Instruction and Retrieval-Augmented Generation","summary":" Many students struggle with math word problems (MWPs), often finding it\ndifficult to identify key information and select the appropriate mathematical\noperations. Schema-based instruction (SBI) is an evidence-based strategy that\nhelps students categorize problems based on their structure, improving\nproblem-solving accuracy. Building on this, we propose a Schema-Based\nInstruction Retrieval-Augmented Generation (SBI-RAG) framework that\nincorporates a large language model (LLM). Our approach emphasizes step-by-step\nreasoning by leveraging schemas to guide solution generation. We evaluate its\nperformance on the GSM8K dataset, comparing it with GPT-4 and GPT-3.5 Turbo,\nand introduce a \"reasoning score\" metric to assess solution quality. 
Our\nfindings suggest that SBI-RAG enhances reasoning clarity and facilitates a more\nstructured problem-solving process potentially providing educational benefits\nfor students.\n","authors":["Prakhar Dixit","Tim Oates"],"pdf_url":"https://arxiv.org/pdf/2410.13293v2.pdf","comment":"Accepted to the 4th MATH-AI Workshop at NeurIPS'24"},{"id":"http://arxiv.org/abs/2411.06374v1","updated":"2024-11-10T06:46:44Z","published":"2024-11-10T06:46:44Z","title":"Metric Learning for Tag Recommendation: Tackling Data Sparsity and Cold\n Start Issues","summary":" With the rapid growth of digital information, personalized recommendation\nsystems have become an indispensable part of Internet services, especially in\nthe fields of e-commerce, social media, and online entertainment. However,\ntraditional collaborative filtering and content-based recommendation methods\nhave limitations in dealing with data sparsity and cold start problems,\nespecially in the face of largescale heterogeneous data, which makes it\ndifficult to meet user expectations. This paper proposes a new label\nrecommendation algorithm based on metric learning, which aims to overcome the\nchallenges of traditional recommendation systems by learning effective distance\nor similarity metrics to capture the subtle differences between user\npreferences and item features. Experimental results show that the algorithm\noutperforms baseline methods including local response metric learning (LRML),\ncollaborative metric learning (CML), and adaptive tensor factorization (ATF)\nbased on adversarial learning on multiple evaluation metrics. 
In particular, it\nperforms particularly well in the accuracy of the first few recommended items,\nwhile maintaining high robustness and maintaining high recommendation accuracy.\n","authors":["Yuanshuai Luo","Rui Wang","Yaxin Liang","Ankai Liang","Wenyi Liu"],"pdf_url":"https://arxiv.org/pdf/2411.06374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18957v3","updated":"2024-11-10T03:45:30Z","published":"2024-09-27T17:58:50Z","title":"LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction","summary":" Classification tasks are typically handled using Machine Learning (ML)\nmodels, which lack a balance between accuracy and interpretability. This paper\nintroduces a new approach for classification tasks using Large Language Models\n(LLMs) in an explainable method. Unlike ML models, which rely heavily on data\ncleaning and feature engineering, this method streamlines the process using\nLLMs. This paper proposes a method called \"Language Model Learning (LML)\"\npowered by a new method called \"Data-Augmented Prediction (DAP).\" The\nclassification is performed by LLMs using a method similar to that used by\nhumans who manually explore and understand the data to decide classifications.\nIn the process of LML, a dataset is summarized and evaluated to determine the\nfeatures leading to each label the most. In the DAP process, the system uses\nthe data summary and a row of the testing dataset to automatically generate a\nquery to retrieve relevant rows from the dataset for context-aware\nclassification. LML and DAP unlock new possibilities in areas that require\nexplainable and context-aware decisions by ensuring satisfactory accuracy even\nwith complex data. The system scored an accuracy above 90% in some test cases,\nconfirming the effectiveness and potential of the system to outperform ML\nmodels in various scenarios. 
The source code is available at\nhttps://github.com/Pro-GenAI/LML-DAP\n","authors":["Praneeth Vadlapati"],"pdf_url":"https://arxiv.org/pdf/2409.18957v3.pdf","comment":"Made the abstract and the content clearer"}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.09381v8","updated":"2024-11-10T06:55:48Z","published":"2023-05-16T12:09:30Z","title":"AMD: Autoregressive Motion Diffusion","summary":" Human motion generation aims to produce plausible human motion sequences\naccording to various conditional inputs, such as text or audio. Despite the\nfeasibility of existing methods in generating motion based on short prompts and\nsimple motion patterns, they encounter difficulties when dealing with long\nprompts or complex motions. The challenges are two-fold: 1) the scarcity of\nhuman motion-captured data for long prompts and complex motions. 2) the high\ndiversity of human motions in the temporal domain and the substantial\ndivergence of distributions from conditional modalities, leading to a\nmany-to-many mapping problem when generating motion with complex and long\ntexts. In this work, we address these gaps by 1) elaborating the first dataset\npairing long textual descriptions and 3D complex motions (HumanLong3D), and 2)\nproposing an autoregressive motion diffusion model (AMD). Specifically, AMD\nintegrates the text prompt at the current timestep with the text prompt and\naction sequences at the previous timestep as conditional information to predict\nthe current action sequences in an iterative manner. Furthermore, we present\nits generalization for X-to-Motion with \"No Modality Left Behind\", enabling the\ngeneration of high-definition and high-fidelity human motions based on\nuser-defined modality input.\n","authors":["Bo Han","Hao Peng","Minjing Dong","Yi Ren","Yixuan Shen","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2305.09381v8.pdf","comment":"accepted by AAAI2024. 
Official Code:\n https://github.com/fluide1022/AMD"}]},"2024-11-09T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.06264v1","updated":"2024-11-09T19:32:26Z","published":"2024-11-09T19:32:26Z","title":"GuidelineGuard: An Agentic Framework for Medical Note Evaluation with\n Guideline Adherence","summary":" Although rapid advancements in Large Language Models (LLMs) are facilitating\nthe integration of artificial intelligence-based applications and services in\nhealthcare, limited research has focused on the systematic evaluation of\nmedical notes for guideline adherence. This paper introduces GuidelineGuard, an\nagentic framework powered by LLMs that autonomously analyzes medical notes,\nsuch as hospital discharge and office visit notes, to ensure compliance with\nestablished healthcare guidelines. By identifying deviations from recommended\npractices and providing evidence-based suggestions, GuidelineGuard helps\nclinicians adhere to the latest standards from organizations like the WHO and\nCDC. This framework offers a novel approach to improving documentation quality\nand reducing clinical errors.\n","authors":["MD Ragib Shahriyear"],"pdf_url":"https://arxiv.org/pdf/2411.06264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06256v1","updated":"2024-11-09T19:07:58Z","published":"2024-11-09T19:07:58Z","title":"Annotative Indexing","summary":" This paper introduces annotative indexing, a novel framework that unifies and\ngeneralizes traditional inverted indexes, column stores, object stores, and\ngraph databases. As a result, annotative indexing can provide the underlying\nindexing framework for databases that support knowledge graphs, entity\nretrieval, semi-structured data, and ranked retrieval. 
While we primarily focus\non human language data in the form of text, annotative indexing is sufficiently\ngeneral to support a range of other datatypes, and we provide examples of\nSQL-like queries over a JSON store that includes numbers and dates. Taking\nadvantage of the flexibility of annotative indexing, we also demonstrate a\nfully dynamic annotative index incorporating support for ACID properties of\ntransactions with hundreds of multiple concurrent readers and writers.\n","authors":["Charles L. A. Clarke"],"pdf_url":"https://arxiv.org/pdf/2411.06256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06254v1","updated":"2024-11-09T19:03:56Z","published":"2024-11-09T19:03:56Z","title":"KeyB2: Selecting Key Blocks is Also Important for Long Document Ranking\n with Large Language Models","summary":" The rapid development of large language models (LLMs) like Llama has\nsignificantly advanced information retrieval (IR) systems. However, using LLMs\nfor long documents, as in RankLLaMA, remains challenging due to computational\ncomplexity, especially concerning input token length. Furthermore, the internal\nmechanisms of LLMs during ranking are still not fully understood. In this\npaper, we first explore the internal workings of LLMs during relevance\njudgement and identify that specific attention heads play a crucial role in\naligning relevant tokens. This observation inspires us to revisit the block\npre-ranking strategy used in KeyB, which remains state-of-the-art (SOTA) on the\nTREC 2019 DL document ranking dataset. Building on these insights, we develop\nKeyB2, an advanced long document IR approach that integrates block pre-ranking\nwith the performance of LLMs. KeyB2 efficiently identifies and processes the\nmost relevant blocks, reducing computational costs and improving ranking\neffectiveness. Additionally, we introduce a new bi-encoder block matching\nstrategy for KeyB2. 
Comprehensive experiments on long-document datasets,\nincluding TREC 2019 DL, Robust04, and MLDR-zh, show that KeyB2 outperforms\nbaselines like RankLLaMA and KeyB by reducing reranking time and GPU memory\nusage while enhancing retrieval performance, achieving new SOTA results on TREC\n2019 DL with higher NDCG@10 and MAP scores.\n","authors":["Minghan Li","Eric Gaussier","Juntao Li","Guodong Zhou"],"pdf_url":"https://arxiv.org/pdf/2411.06254v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.10978v2","updated":"2024-11-09T18:46:07Z","published":"2024-03-16T17:21:58Z","title":"Lambda: Learning Matchable Prior For Entity Alignment with Unlabeled\n Dangling Cases","summary":" We investigate the entity alignment (EA) problem with unlabeled dangling\ncases, meaning that partial entities have no counterparts in the other\nknowledge graph (KG), and this type of entity remains unlabeled. To address\nthis challenge, we propose the framework \\textit{Lambda} for dangling detection\nand then entity alignment. Lambda features a GNN-based encoder called KEESA\nwith spectral contrastive learning for EA and a positive-unlabeled learning\nalgorithm for dangling detection called iPULE. 
iPULE offers theoretical\nguarantees of unbiasedness, uniform deviation bounds, and convergence.\nExperimental results demonstrate that each component contributes to overall\nperformances that are superior to baselines, even when baselines additionally\nexploit 30\\% of dangling entities labeled for training.\n","authors":["Hang Yin","Liyao Xiang","Dong Ding","Yuheng He","Yihan Wu","Xinbing Wang","Chenghu Zhou"],"pdf_url":"https://arxiv.org/pdf/2403.10978v2.pdf","comment":"Accepted in NeurIPS 2024 as a poster"},{"id":"http://arxiv.org/abs/2411.06237v1","updated":"2024-11-09T17:38:01Z","published":"2024-11-09T17:38:01Z","title":"Leveraging Retrieval-Augmented Generation for University Knowledge\n Retrieval","summary":" This paper introduces an innovative approach using Retrieval-Augmented\nGeneration (RAG) pipelines with Large Language Models (LLMs) to enhance\ninformation retrieval and query response systems for university-related\nquestion answering. By systematically extracting data from the university\nofficial webpage and employing advanced prompt engineering techniques, we\ngenerate accurate, contextually relevant responses to user queries.\n We developed a comprehensive university benchmark, UniversityQuestionBench\n(UQB), to rigorously evaluate our system performance, based on common key\nmetrics in the filed of RAG pipelines, assessing accuracy and reliability\nthrough various metrics and real-world scenarios. Our experimental results\ndemonstrate significant improvements in the precision and relevance of\ngenerated responses, enhancing user experience and reducing the time required\nto obtain relevant answers. 
In summary, this paper presents a novel application\nof RAG pipelines and LLMs, supported by a meticulously prepared university\nbenchmark, offering valuable insights into advanced AI techniques for academic\ndata retrieval and setting the stage for future research in this domain.\n","authors":["Arshia Hemmat","Kianoosh Vadaei","Mohammad Hassan Heydari","Afsaneh Fatemi"],"pdf_url":"https://arxiv.org/pdf/2411.06237v1.pdf","comment":"6 pages, 2 figures, 1 table, Submitted to 15th IKT conference"},{"id":"http://arxiv.org/abs/2411.06112v1","updated":"2024-11-09T08:22:31Z","published":"2024-11-09T08:22:31Z","title":"Interpret the Internal States of Recommendation Model with Sparse\n Autoencoder","summary":" Explainable recommendation systems are important to enhance transparency,\naccuracy, and fairness. Beyond result-level explanations, model-level\ninterpretations can provide valuable insights that allow developers to optimize\nsystem designs and implement targeted improvements. However, most current\napproaches depend on specialized model designs, which often lack generalization\ncapabilities. Given the various kinds of recommendation models, existing\nmethods have limited ability to effectively interpret them. To address this\nissue, we propose RecSAE, an automatic, generalizable probing method for\ninterpreting the internal states of Recommendation models with Sparse\nAutoEncoder. RecSAE serves as a plug-in module that does not affect original\nmodels during interpretations, while also enabling predictable modifications to\ntheir behaviors based on interpretation results. Firstly, we train an\nautoencoder with sparsity constraints to reconstruct internal activations of\nrecommendation models, making the RecSAE latents more interpretable and\nmonosemantic than the original neuron activations. Secondly, we automated the\nconstruction of concept dictionaries based on the relationship between latent\nactivations and input item sequences. 
Thirdly, RecSAE validates these\ninterpretations by predicting latent activations on new item sequences using\nthe concept dictionary and deriving interpretation confidence scores from\nprecision and recall. We demonstrate RecSAE's effectiveness on two datasets,\nidentifying hundreds of highly interpretable concepts from pure ID-based\nmodels. Latent ablation studies further confirm that manipulating latent\nconcepts produces corresponding changes in model output behavior, underscoring\nRecSAE's utility for both understanding and targeted tuning recommendation\nmodels. Code and data are publicly available at\nhttps://github.com/Alice1998/RecSAE.\n","authors":["Jiayin Wang","Xiaoyu Zhang","Weizhi Ma","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2411.06112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.06064v1","updated":"2024-11-09T04:23:58Z","published":"2024-11-09T04:23:58Z","title":"Snippet-based Conversational Recommender System","summary":" Conversational Recommender Systems (CRS) engage users in interactive\ndialogues to gather preferences and provide personalized recommendations.\nTraditionally, CRS rely on pre-defined attributes or expensive, domain-specific\nannotated datasets to guide conversations, which limits flexibility and\nadaptability across domains. In this work, we introduce SnipRec, a novel CRS\nthat enhances dialogues and recommendations by extracting diverse expressions\nand preferences from user-generated content (UGC) like customer reviews. Using\nlarge language models, SnipRec maps user responses and UGC to concise snippets,\nwhich are used to generate clarification questions and retrieve relevant items.\nOur approach eliminates the need for domain-specific training, making it\nadaptable to new domains and effective without prior knowledge of user\npreferences. Extensive experiments on the Yelp dataset demonstrate the\neffectiveness of snippet-based representations against document and\nsentence-based representations. 
Additionally, SnipRec is able to improve\nHits@10 by 0.25 over the course of five conversational turns, underscoring the\nefficiency of SnipRec in capturing user preferences through multi-turn\nconversations.\n","authors":["Haibo Sun","Naoki Otani","Hannah Kim","Dan Zhang","Nikita Bhutani"],"pdf_url":"https://arxiv.org/pdf/2411.06064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05975v5","updated":"2024-11-09T02:41:43Z","published":"2024-01-11T15:22:55Z","title":"End-to-end Learnable Clustering for Intent Learning in Recommendation","summary":" Intent learning, which aims to learn users' intents for user understanding\nand item recommendation, has become a hot research spot in recent years.\nHowever, existing methods suffer from complex and cumbersome alternating\noptimization, limiting performance and scalability. To this end, we propose a\nnovel intent learning method termed \\underline{ELCRec}, by unifying behavior\nrepresentation learning into an \\underline{E}nd-to-end \\underline{L}earnable\n\\underline{C}lustering framework, for effective and efficient\n\\underline{Rec}ommendation. Concretely, we encode user behavior sequences and\ninitialize the cluster centers (latent intents) as learnable neurons. Then, we\ndesign a novel learnable clustering module to separate different cluster\ncenters, thus decoupling users' complex intents. Meanwhile, it guides the\nnetwork to learn intents from behaviors by forcing behavior embeddings close to\ncluster centers. This allows simultaneous optimization of recommendation and\nclustering via mini-batch data. Moreover, we propose intent-assisted\ncontrastive learning by using cluster centers as self-supervision signals,\nfurther enhancing mutual promotion. Both experimental results and theoretical\nanalyses demonstrate the superiority of ELCRec from six perspectives. Compared\nto the runner-up, ELCRec improves NDCG@5 by 8.9\\% and reduces computational\ncosts by 22.5\\% on the Beauty dataset. 
Furthermore, due to the scalability and\nuniversal applicability, we deploy this method on the industrial recommendation\nsystem with 130 million page views and achieve promising results. The codes are\navailable on GitHub (https://github.com/yueliu1999/ELCRec). A collection\n(papers, codes, datasets) of deep group recommendation/intent learning methods\nis available on GitHub\n(https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation).\n","authors":["Yue Liu","Shihao Zhu","Jun Xia","Yingwei Ma","Jian Ma","Xinwang Liu","Shengju Yu","Kejun Zhang","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.05975v5.pdf","comment":"37 pages"}]},"2024-11-08T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2411.07264v1","updated":"2024-11-08T21:03:54Z","published":"2024-11-08T21:03:54Z","title":"Multi-Document Financial Question Answering using LLMs","summary":" We propose two new methods for multi-document financial question answering.\nFirst, a method that uses semantic tagging, and then, queries the index to get\nthe context (RAG_SEM). And second, a Knowledge Graph (KG_RAG) based method that\nuses semantic tagging, and, retrieves knowledge graph triples from a graph\ndatabase, as context. KG_RAG uses knowledge graphs constructed using a small\nmodel that is fine-tuned using knowledge distillation using a large teacher\nmodel. The data consists of 18 10K reports of Apple, Microsoft, Alphabet,\nNVIDIA, Amazon and Tesla for the years 2021, 2022 and 2023. The list of\nquestions in the data consists of 111 complex questions including many esoteric\nquestions that are difficult to answer and the answers are not completely\nobvious. As evaluation metrics, we use overall scores as well as segmented\nscores for measurement including the faithfulness, relevance, correctness,\nsimilarity, an LLM based overall score and the rouge scores as well as a\nsimilarity of embeddings. We find that both methods outperform plain RAG\nsignificantly. 
KG_RAG outperforms RAG_SEM in four out of nine metrics.\n","authors":["Shalin Shah","Srikanth Ryali","Ramasubbu Venkatesh"],"pdf_url":"https://arxiv.org/pdf/2411.07264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05937v1","updated":"2024-11-08T19:52:57Z","published":"2024-11-08T19:52:57Z","title":"The effect of different feature selection methods on models created with\n XGBoost","summary":" This study examines the effect that different feature selection methods have\non models created with XGBoost, a popular machine learning algorithm with\nsuperb regularization methods. It shows that three different ways for reducing\nthe dimensionality of features produces no statistically significant change in\nthe prediction accuracy of the model. This suggests that the traditional idea\nof removing the noisy training data to make sure models do not overfit may not\napply to XGBoost. But it may still be viable in order to reduce computational\ncomplexity.\n","authors":["Jorge Neyra","Vishal B. Siramshetty","Huthaifa I. Ashqar"],"pdf_url":"https://arxiv.org/pdf/2411.05937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05936v1","updated":"2024-11-08T19:47:02Z","published":"2024-11-08T19:47:02Z","title":"Mitigating Hallucination with ZeroG: An Advanced Knowledge Management\n Engine","summary":" The growth of digital documents presents significant challenges in efficient\nmanagement and knowledge extraction. Traditional methods often struggle with\ncomplex documents, leading to issues such as hallucinations and high latency in\nresponses from Large Language Models (LLMs). 
ZeroG, an innovative approach,\nsignificantly mitigates these challenges by leveraging knowledge distillation\nand prompt tuning to enhance model performance.\n ZeroG utilizes a smaller model that replicates the behavior of a larger\nteacher model, ensuring contextually relevant and grounded responses, by\nemploying a black-box distillation approach, it creates a distilled dataset\nwithout relying on intermediate features, optimizing computational efficiency.\nThis method significantly enhances accuracy and reduces response times,\nproviding a balanced solution for modern document management.\n Incorporating advanced techniques for document ingestion and metadata\nutilization, ZeroG improves the accuracy of question-and-answer systems. The\nintegration of graph databases and robust metadata management further\nstreamlines information retrieval, allowing for precise and context-aware\nresponses. By transforming how organizations interact with complex data, ZeroG\nenhances productivity and user experience, offering a scalable solution for the\ngrowing demands of digital document management.\n","authors":["Anantha Sharma","Sheeba Elizabeth John","Fatemeh Rezapoor Nikroo","Krupali Bhatt","Mrunal Zambre","Aditi Wikhe"],"pdf_url":"https://arxiv.org/pdf/2411.05936v1.pdf","comment":"10 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2411.05930v1","updated":"2024-11-08T19:31:19Z","published":"2024-11-08T19:31:19Z","title":"BERTrend: Neural Topic Modeling for Emerging Trends Detection","summary":" Detecting and tracking emerging trends and weak signals in large, evolving\ntext corpora is vital for applications such as monitoring scientific\nliterature, managing brand reputation, surveilling critical infrastructure and\nmore generally to any kind of text-based event detection. Existing solutions\noften fail to capture the nuanced context or dynamically track evolving\npatterns over time. 
BERTrend, a novel method, addresses these limitations using\nneural topic modeling in an online setting. It introduces a new metric to\nquantify topic popularity over time by considering both the number of documents\nand update frequency. This metric classifies topics as noise, weak, or strong\nsignals, flagging emerging, rapidly growing topics for further investigation.\nExperimentation on two large real-world datasets demonstrates BERTrend's\nability to accurately detect and track meaningful weak signals while filtering\nout noise, offering a comprehensive solution for monitoring emerging trends in\nlarge-scale, evolving text corpora. The method can also be used for\nretrospective analysis of past events. In addition, the use of Large Language\nModels together with BERTrend offers efficient means for the interpretability\nof trends of events.\n","authors":["Allaa Boutaleb","Jerome Picault","Guillaume Grosjean"],"pdf_url":"https://arxiv.org/pdf/2411.05930v1.pdf","comment":"17 pages, 12 figures, FuturED 2024: Workshop on Future of Event\n Detection (CoLocated with EMNLP 2024)"},{"id":"http://arxiv.org/abs/2404.04264v4","updated":"2024-11-08T18:35:49Z","published":"2024-03-17T17:01:45Z","title":"Logic Query of Thoughts: Guiding Large Language Models to Answer Complex\n Logic Queries with Knowledge Graphs","summary":" Despite the superb performance in many tasks, large language models (LLMs)\nbear the risk of generating hallucination or even wrong answers when confronted\nwith tasks that demand the accuracy of knowledge. The issue becomes even more\nnoticeable when addressing logic queries that require multiple logic reasoning\nsteps. On the other hand, knowledge graph (KG) based question answering methods\nare capable of accurately identifying the correct answers with the help of\nknowledge graph, yet its accuracy could quickly deteriorate when the knowledge\ngraph itself is sparse and incomplete. 
It remains a critical challenge on how\nto integrate knowledge graph reasoning with LLMs in a mutually beneficial way\nso as to mitigate both the hallucination problem of LLMs as well as the\nincompleteness issue of knowledge graphs. In this paper, we propose\n'Logic-Query-of-Thoughts' (LGOT) which is the first of its kind to combine LLMs\nwith knowledge graph based logic query reasoning. LGOT seamlessly combines\nknowledge graph reasoning and LLMs, effectively breaking down complex logic\nqueries into easy to answer subquestions. Through the utilization of both\nknowledge graph reasoning and LLMs, it successfully derives answers for each\nsubquestion. By aggregating these results and selecting the highest quality\ncandidate answers for each step, LGOT achieves accurate results to complex\nquestions. Our experimental findings demonstrate substantial performance\nenhancements, with up to 20% improvement over ChatGPT.\n","authors":["Lihui Liu","Zihao Wang","Ruizhong Qiu","Yikun Ban","Eunice Chan","Yangqiu Song","Jingrui He","Hanghang Tong"],"pdf_url":"https://arxiv.org/pdf/2404.04264v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05649v1","updated":"2024-11-08T15:45:33Z","published":"2024-11-08T15:45:33Z","title":"Harnessing High-Level Song Descriptors towards Natural Language-Based\n Music Recommendation","summary":" Recommender systems relying on Language Models (LMs) have gained popularity\nin assisting users to navigate large catalogs. LMs often exploit item\nhigh-level descriptors, i.e. categories or consumption contexts, from training\ndata or user preferences. This has been proven effective in domains like movies\nor products. However, in the music domain, understanding how effectively LMs\nutilize song descriptors for natural language-based music recommendation is\nrelatively limited. 
In this paper, we assess LMs effectiveness in recommending\nsongs based on user natural language descriptions and items with descriptors\nlike genres, moods, and listening contexts. We formulate the recommendation\ntask as a dense retrieval problem and assess LMs as they become increasingly\nfamiliar with data pertinent to the task and domain. Our findings reveal\nimproved performance as LMs are fine-tuned for general language similarity,\ninformation retrieval, and mapping longer descriptions to shorter, high-level\ndescriptors in music.\n","authors":["Elena V. Epure","Gabriel Meseguer Brocal","Darius Afchar","Romain Hennequin"],"pdf_url":"https://arxiv.org/pdf/2411.05649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05572v1","updated":"2024-11-08T13:51:37Z","published":"2024-11-08T13:51:37Z","title":"Why These Documents? Explainable Generative Retrieval with Hierarchical\n Category Paths","summary":" Generative retrieval has recently emerged as a new alternative of traditional\ninformation retrieval approaches. However, existing generative retrieval\nmethods directly decode docid when a query is given, making it impossible to\nprovide users with explanations as an answer for \"Why this document is\nretrieved?\". To address this limitation, we propose Hierarchical Category\nPath-Enhanced Generative Retrieval(HyPE), which enhances explainability by\ngenerating hierarchical category paths step-by-step before decoding docid. HyPE\nleverages hierarchical category paths as explanation, progressing from broad to\nspecific semantic categories. This approach enables diverse explanations for\nthe same document depending on the query by using shared category paths between\nthe query and the document, and provides reasonable explanation by reflecting\nthe document's semantic structure through a coarse-to-fine manner. 
HyPE\nconstructs category paths with external high-quality semantic hierarchy,\nleverages LLM to select appropriate candidate paths for each document, and\noptimizes the generative retrieval model with path-augmented dataset. During\ninference, HyPE utilizes path-aware reranking strategy to aggregate diverse\ntopic information, allowing the most relevant documents to be prioritized in\nthe final ranked list of docids. Our extensive experiments demonstrate that\nHyPE not only offers a high level of explainability but also improves the\nretrieval performance in the document retrieval task.\n","authors":["Sangam Lee","Ryang Heo","SeongKu Kang","Susik Yoon","Jinyoung Yeo","Dongha Lee"],"pdf_url":"https://arxiv.org/pdf/2411.05572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.10321v2","updated":"2024-11-08T13:29:47Z","published":"2024-04-16T07:05:16Z","title":"Cluster-based Graph Collaborative Filtering","summary":" Graph Convolution Networks (GCNs) have significantly succeeded in learning\nuser and item representations for recommendation systems. The core of their\nefficacy is the ability to explicitly exploit the collaborative signals from\nboth the first- and high-order neighboring nodes. However, most existing\nGCN-based methods overlook the multiple interests of users while performing\nhigh-order graph convolution. Thus, the noisy information from unreliable\nneighbor nodes (e.g., users with dissimilar interests) negatively impacts the\nrepresentation learning of the target node. Additionally, conducting graph\nconvolution operations without differentiating high-order neighbors suffers the\nover-smoothing issue when stacking more layers, resulting in performance\ndegradation. In this paper, we aim to capture more valuable information from\nhigh-order neighboring nodes while avoiding noise for better representation\nlearning of the target node. 
To achieve this goal, we propose a novel GCN-based\nrecommendation model, termed Cluster-based Graph Collaborative Filtering\n(ClusterGCF). This model performs high-order graph convolution on\ncluster-specific graphs, which are constructed by capturing the multiple\ninterests of users and identifying the common interests among them.\nSpecifically, we design an unsupervised and optimizable soft node clustering\napproach to classify user and item nodes into multiple clusters. Based on the\nsoft node clustering results and the topology of the user-item interaction\ngraph, we assign the nodes with probabilities for different clusters to\nconstruct the cluster-specific graphs. To evaluate the effectiveness of\nClusterGCF, we conducted extensive experiments on four publicly available\ndatasets. Experimental results demonstrate that our model can significantly\nimprove recommendation performance.\n","authors":["Fan Liu","Shuai Zhao","Zhiyong Cheng","Liqiang Nie","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2404.10321v2.pdf","comment":"Accepted by ACM TOIS"},{"id":"http://arxiv.org/abs/2411.05892v1","updated":"2024-11-08T12:38:10Z","published":"2024-11-08T12:38:10Z","title":"Identifying and Decomposing Compound Ingredients in Meal Plans Using\n Large Language Models","summary":" This study explores the effectiveness of Large Language Models in meal\nplanning, focusing on their ability to identify and decompose compound\ningredients. We evaluated three models-GPT-4o, Llama-3 (70b), and Mixtral\n(8x7b)-to assess their proficiency in recognizing and breaking down complex\ningredient combinations. Preliminary results indicate that while Llama-3 (70b)\nand GPT-4o excels in accurate decomposition, all models encounter difficulties\nwith identifying essential elements like seasonings and oils. Despite strong\noverall performance, variations in accuracy and completeness were observed\nacross models. 
These findings underscore LLMs' potential to enhance\npersonalized nutrition but highlight the need for further refinement in\ningredient decomposition. Future research should address these limitations to\nimprove nutritional recommendations and health outcomes.\n","authors":["Leon Kopitar","Leon Bedrac","Larissa J Strath","Jiang Bian","Gregor Stiglic"],"pdf_url":"https://arxiv.org/pdf/2411.05892v1.pdf","comment":"Comments: Presented at NeLaMKRR@KR, 2024 (arXiv:2410.05339)"},{"id":"http://arxiv.org/abs/2411.05442v1","updated":"2024-11-08T09:40:53Z","published":"2024-11-08T09:40:53Z","title":"IntellBot: Retrieval Augmented LLM Chatbot for Cyber Threat Knowledge\n Delivery","summary":" In the rapidly evolving landscape of cyber security, intelligent chatbots are\ngaining prominence. Artificial Intelligence, Machine Learning, and Natural\nLanguage Processing empower these chatbots to handle user inquiries and deliver\nthreat intelligence. This helps cyber security knowledge readily available to\nboth professionals and the public. Traditional rule-based chatbots often lack\nflexibility and struggle to adapt to user interactions. In contrast, Large\nLanguage Model-based chatbots offer contextually relevant information across\nmultiple domains and adapt to evolving conversational contexts. In this work,\nwe develop IntellBot, an advanced cyber security Chatbot built on top of\ncutting-edge technologies like Large Language Models and Langchain alongside a\nRetrieval-Augmented Generation model to deliver superior capabilities. This\nchatbot gathers information from diverse data sources to create a comprehensive\nknowledge base covering known vulnerabilities, recent cyber attacks, and\nemerging threats. It delivers tailored responses, serving as a primary hub for\ncyber security insights. 
By providing instant access to relevant information\nand resources, this IntellBot enhances threat intelligence, incident response,\nand overall security posture, saving time and empowering users with knowledge\nof cyber security best practices. Moreover, we analyzed the performance of our\ncopilot using a two-stage evaluation strategy. We achieved BERT score above 0.8\nby indirect approach and a cosine similarity score ranging from 0.8 to 1, which\naffirms the accuracy of our copilot. Additionally, we utilized RAGAS to\nevaluate the RAG model, and all evaluation metrics consistently produced scores\nabove 0.77, highlighting the efficacy of our system.\n","authors":["Dincy R. Arikkat","Abhinav M.","Navya Binu","Parvathi M.","Navya Biju","K. S. Arunima","Vinod P.","Rafidha Rehiman K. A.","Mauro Conti"],"pdf_url":"https://arxiv.org/pdf/2411.05442v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05375v1","updated":"2024-11-08T07:05:06Z","published":"2024-11-08T07:05:06Z","title":"Ev2R: Evaluating Evidence Retrieval in Automated Fact-Checking","summary":" Current automated fact-checking (AFC) approaches commonly evaluate evidence\neither implicitly via the predicted verdicts or by comparing retrieved evidence\nwith a predefined closed knowledge source, such as Wikipedia. However, these\nmethods suffer from limitations, resulting from their reliance on evaluation\nmetrics developed for different purposes and constraints imposed by closed\nknowledge sources. Recent advances in natural language generation (NLG)\nevaluation offer new possibilities for evidence assessment. In this work, we\nintroduce Ev2R, an evaluation framework for AFC that comprises three types of\napproaches for evidence evaluation: reference-based, proxy-reference, and\nreference-less. 
We evaluate their effectiveness through agreement with human\nratings and adversarial tests, and demonstrate that prompt-based scorers,\nparticularly those leveraging LLMs and reference evidence, outperform\ntraditional evaluation approaches.\n","authors":["Mubashara Akhtar","Michael Schlichtkrull","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2411.05375v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2411.05340v1","updated":"2024-11-08T05:43:40Z","published":"2024-11-08T05:43:40Z","title":"Improving Multi-Domain Task-Oriented Dialogue System with Offline\n Reinforcement Learning","summary":" Task-oriented dialogue (TOD) system is designed to accomplish user-defined\ntasks through dialogues. The TOD system has progressed towards end-to-end\nmodeling by leveraging pre-trained large language models. Fine-tuning the\npre-trained language models using only supervised learning leads to the\nexposure bias and token loss problem and it deviates the models from completing\nthe user's task. To address these issues, we propose a TOD system that\nleverages a unified pre-trained language model, GPT2, as a base model. It is\noptimized using supervised learning and reinforcement learning (RL). The issues\nin the TOD system are mitigated using a non-differentiable reward function. The\nreward is calculated using the weighted sum of the success rate and BLEU\nevaluation metrics. The success rate and BLEU metrics in reward calculation\nguide the language model for user task completion while ensuring a coherent and\nfluent response. Our model is acquired by fine-tuning a pre-trained model on\nthe dialogue-session level which comprises user utterance, belief state, system\nact, and system response. 
Experimental results on MultiWOZ2.1 demonstrate that\nour model increases the inform rate by 1.60% and the success rate by 3.17%\ncompared to the baseline.\n","authors":["Dharmendra Prajapat","Durga Toshniwal"],"pdf_url":"https://arxiv.org/pdf/2411.05340v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10108v3","updated":"2024-11-08T03:58:00Z","published":"2023-10-16T06:41:16Z","title":"On Generative Agents in Recommendation","summary":" Recommender systems are the cornerstone of today's information dissemination,\nyet a disconnect between offline metrics and online performance greatly hinders\ntheir development. Addressing this challenge, we envision a recommendation\nsimulator, capitalizing on recent breakthroughs in human-level intelligence\nexhibited by Large Language Models (LLMs). We propose Agent4Rec, a user\nsimulator in recommendation, leveraging LLM-empowered generative agents\nequipped with user profile, memory, and actions modules specifically tailored\nfor the recommender system. In particular, these agents' profile modules are\ninitialized using real-world datasets (e.g. MovieLens, Steam, Amazon-Book),\ncapturing users' unique tastes and social traits; memory modules log both\nfactual and emotional memories and are integrated with an emotion-driven\nreflection mechanism; action modules support a wide variety of behaviors,\nspanning both taste-driven and emotion-driven actions. Each agent interacts\nwith personalized recommender models in a page-by-page manner, relying on a\npre-implemented collaborative filtering-based recommendation algorithm. We\ndelve into both the capabilities and limitations of Agent4Rec, aiming to\nexplore an essential research question: ``To what extent can LLM-empowered\ngenerative agents faithfully simulate the behavior of real, autonomous humans\nin recommender systems?'' Extensive and multi-faceted evaluations of Agent4Rec\nhighlight both the alignment and deviation between agents and user-personalized\npreferences. 
Beyond mere performance comparison, we explore insightful\nexperiments, such as emulating the filter bubble effect and discovering the\nunderlying causal relationships in recommendation tasks. Our codes are\navailable at https://github.com/LehengTHU/Agent4Rec.\n","authors":["An Zhang","Yuxin Chen","Leheng Sheng","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.10108v3.pdf","comment":"SIGIR 2024 perspective paper"},{"id":"http://arxiv.org/abs/2308.02580v3","updated":"2024-11-08T02:21:38Z","published":"2023-08-03T16:13:46Z","title":"Feature Noise Resilient for QoS Prediction with Probabilistic Deep\n Supervision","summary":" Accurate Quality of Service (QoS) prediction is essential for enhancing user\nsatisfaction in web recommendation systems, yet existing prediction models\noften overlook feature noise, focusing predominantly on label noise. In this\npaper, we present the Probabilistic Deep Supervision Network (PDS-Net), a\nrobust framework designed to effectively identify and mitigate feature noise,\nthereby improving QoS prediction accuracy. PDS-Net operates with a dual-branch\narchitecture: the main branch utilizes a decoder network to learn a\nGaussian-based prior distribution from known features, while the second branch\nderives a posterior distribution based on true labels. A key innovation of\nPDS-Net is its condition-based noise recognition loss function, which enables\nprecise identification of noisy features in objects (users or services). Once\nnoisy features are identified, PDS-Net refines the feature's prior\ndistribution, aligning it with the posterior distribution, and propagates this\nadjusted distribution to intermediate layers, effectively reducing noise\ninterference. Extensive experiments conducted on two real-world QoS datasets\ndemonstrate that PDS-Net consistently outperforms existing models, achieving an\naverage improvement of 8.91% in MAE on Dataset D1 and 8.32% on Dataset D2\ncompared to the ate-of-the-art. 
These results highlight PDS-Net's ability to\naccurately capture complex user-service relationships and handle feature noise,\nunderscoring its robustness and versatility across diverse QoS prediction\nenvironments.\n","authors":["Ziliang Wang","Xiaohong Zhang","Ze Shi Li","Sheng Huang","Meng Yan"],"pdf_url":"https://arxiv.org/pdf/2308.02580v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.16156v2","updated":"2024-11-08T00:40:05Z","published":"2024-10-21T16:21:45Z","title":"Limpeh ga li gong: Challenges in Singlish Annotations","summary":" Singlish, or Colloquial Singapore English, is a language formed from oral and\nsocial communication within multicultural Singapore. In this work, we work on a\nfundamental Natural Language Processing (NLP) task: Parts-Of-Speech (POS)\ntagging of Singlish sentences. For our analysis, we build a parallel Singlish\ndataset containing direct English translations and POS tags, with translation\nand POS annotation done by native Singlish speakers. Our experiments show that\nautomatic transition- and transformer- based taggers perform with only $\\sim\n80\\%$ accuracy when evaluated against human-annotated POS labels, suggesting\nthat there is indeed room for improvement on computation analysis of the\nlanguage. We provide an exposition of challenges in Singlish annotation: its\ninconsistencies in form and semantics, the highly context-dependent particles\nof the language, its structural unique expressions, and the variation of the\nlanguage on different mediums. 
Our task definition, resultant labels and\nresults reflects the challenges in analysing colloquial languages formulated\nfrom a variety of dialects, and paves the way for future studies beyond POS\ntagging.\n","authors":["Luo Qi Chan","Lynnette Hui Xian Ng"],"pdf_url":"https://arxiv.org/pdf/2410.16156v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.05715v1","updated":"2024-11-08T17:16:27Z","published":"2024-11-08T17:16:27Z","title":"On the Role of Noise in AudioVisual Integration: Evidence from\n Artificial Neural Networks that Exhibit the McGurk Effect","summary":" Humans are able to fuse information from both auditory and visual modalities\nto help with understanding speech. This is frequently demonstrated through an\nphenomenon known as the McGurk Effect, during which a listener is presented\nwith incongruent auditory and visual speech that fuse together into the percept\nof an illusory intermediate phoneme. Building on a recent framework that\nproposes how to address developmental 'why' questions using artificial neural\nnetworks, we evaluated a set of recent artificial neural networks trained on\naudiovisual speech by testing them with audiovisually incongruent words\ndesigned to elicit the McGurk effect. We compared networks trained on clean\nspeech to those trained on noisy speech, and discovered that training with\nnoisy speech led to an increase in both visual responses and McGurk responses\nacross all models. Furthermore, we observed that systematically increasing the\nlevel of auditory noise during ANN training also increased the amount of\naudiovisual integration up to a point, but at extreme noise levels, this\nintegration failed to develop. These results suggest that excessive noise\nexposure during critical periods of audiovisual learning may negatively\ninfluence the development of audiovisual speech integration. 
This work also\ndemonstrates that the McGurk effect reliably emerges untrained from the\nbehaviour of both supervised and unsupervised networks. This supports the\nnotion that artificial neural networks might be useful models for certain\naspects of perception and cognition.\n","authors":["Lukas Grasse","Matthew S. Tata"],"pdf_url":"https://arxiv.org/pdf/2411.05715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.05374v1","updated":"2024-11-08T07:04:00Z","published":"2024-11-08T07:04:00Z","title":"Interdisciplinary Translations: Sensory Perception as a Universal\n Language","summary":" This paper investigates sensory perception's pivotal role as a universal\ncommunicative bridge across varied cultures and disciplines, and how it\nmanifests its value in the study of media art, human computer interaction and\nartificial intelligence. By analyzing its function in non-verbal communication\nthrough interactive systems, and drawing on the interpretive model in\ntranslation studies where \"sense\" acts as a mediation between two languages,\nthis paper illustrates how interdisciplinary communication in media art and\nhuman-computer interaction is afforded by the abstract language of human\nsensory perception. Specific examples from traditional art, interactive media\nart, HCI, communication, and translation studies demonstrate how sensory\nfeedback translates and conveys meaning across diverse modalities of expression\nand how it fosters connections between humans, art, and technology. Pertaining\nto this topic, this paper analyzes the impact of sensory feedback systems in\ndesigning interactive experiences, and reveals the guiding role of sensory\nperception in the design philosophy of AI systems. 
Overall, the study aims to\nbroaden the understanding of sensory perception's role in communication,\nhighlighting its significance in the evolution of interactive experiences and\nits capacity to unify art, science, and the human experience.\n","authors":["Xindi Kang","Xuanyang Huang","Mingdong Song","Varvara Guljajeva","JoAnn Kuchera-Morin"],"pdf_url":"https://arxiv.org/pdf/2411.05374v1.pdf","comment":"This paper has been accepted to the International Symposium of\n Electronic Arts 2024, and the proceedings version will be available at\n https://isea-archives.siggraph.org/publications/ with DOI to be added once\n published"},{"id":"http://arxiv.org/abs/2411.05322v1","updated":"2024-11-08T04:29:14Z","published":"2024-11-08T04:29:14Z","title":"Rate-aware Compression for NeRF-based Volumetric Video","summary":" The neural radiance fields (NeRF) have advanced the development of 3D\nvolumetric video technology, but the large data volumes they involve pose\nsignificant challenges for storage and transmission. To address these problems,\nthe existing solutions typically compress these NeRF representations after the\ntraining stage, leading to a separation between representation training and\ncompression. In this paper, we try to directly learn a compact NeRF\nrepresentation for volumetric video in the training stage based on the proposed\nrate-aware compression framework. Specifically, for volumetric video, we use a\nsimple yet effective modeling strategy to reduce temporal redundancy for the\nNeRF representation. Then, during the training phase, an implicit entropy model\nis utilized to estimate the bitrate of the NeRF representation. This entropy\nmodel is then encoded into the bitstream to assist in the decoding of the NeRF\nrepresentation. This approach enables precise bitrate estimation, thereby\nleading to a compact NeRF representation. Furthermore, we propose an adaptive\nquantization strategy and learn the optimal quantization step for the NeRF\nrepresentations. 
Finally, the NeRF representation can be optimized by using the\nrate-distortion trade-off. Our proposed compression framework can be used for\ndifferent representations and experimental results demonstrate that our\napproach significantly reduces the storage size with marginal distortion and\nachieves state-of-the-art rate-distortion performance for volumetric video on\nthe HumanRF and ReRF datasets. Compared to the previous state-of-the-art method\nTeTriRF, we achieved an approximately -80% BD-rate on the HumanRF dataset and\n-60% BD-rate on the ReRF dataset.\n","authors":["Zhiyu Zhang","Guo Lu","Huanxiong Liang","Zhengxue Cheng","Anni Tang","Li Song"],"pdf_url":"https://arxiv.org/pdf/2411.05322v1.pdf","comment":"Accepted by ACM MM 2024 (Oral)"},{"id":"http://arxiv.org/abs/2411.05295v1","updated":"2024-11-08T02:57:23Z","published":"2024-11-08T02:57:23Z","title":"Content-Adaptive Rate-Quality Curve Prediction Model in Media Processing\n System","summary":" In streaming media services, video transcoding is a common practice to\nalleviate bandwidth demands. Unfortunately, traditional methods employing a\nuniform rate factor (RF) across all videos often result in significant\ninefficiencies. Content-adaptive encoding (CAE) techniques address this by\ndynamically adjusting encoding parameters based on video content\ncharacteristics. However, existing CAE methods are often tightly coupled with\nspecific encoding strategies, leading to inflexibility. In this paper, we\npropose a model that predicts both RF-quality and RF-bitrate curves, which can\nbe utilized to derive a comprehensive bitrate-quality curve. This approach\nfacilitates flexible adjustments to the encoding strategy without necessitating\nmodel retraining. The model leverages codec features, content features, and\nanchor features to predict the bitrate-quality curve accurately. 
Additionally,\nwe introduce an anchor suspension method to enhance prediction accuracy.\nExperiments confirm that the actual quality metric (VMAF) of the compressed\nvideo stays within 1 of the target, achieving an accuracy of 99.14%. By\nincorporating our quality improvement strategy with the rate-quality curve\nprediction model, we conducted online A/B tests, obtaining both +0.107%\nimprovements in video views and video completions and +0.064% app duration\ntime. Our model has been deployed on the Xiaohongshu App.\n","authors":["Shibo Yin","Zhiyu Zhang","Peirong Ning","Qiubo Chen","Jing Chen","Quan Zhou","Li Song"],"pdf_url":"https://arxiv.org/pdf/2411.05295v1.pdf","comment":"Accepted by IEEE VCIP 2024 (Oral)"},{"id":"http://arxiv.org/abs/2411.03109v2","updated":"2024-11-08T02:51:57Z","published":"2024-11-05T13:56:44Z","title":"pTSE-T: Presentation Target Speaker Extraction using Unaligned Text Cues","summary":" TSE(Target Speaker Extraction) aims to extract the clean speech of the target\nspeaker in an audio mixture, thus eliminating irrelevant background noise and\nspeech. While prior work has explored various auxiliary cues including\npre-recorded speech, visual information (e.g., lip motions and gestures), and\nspatial information, the acquisition and selection of such strong cues are\ninfeasible in many practical scenarios. Unlike all existing work, in this\npaper, we condition the TSE algorithm on semantic cues extracted from limited\nand unaligned text content, such as condensed points from a presentation slide.\nThis method is particularly useful in scenarios like meetings, poster sessions,\nor lecture presentations, where acquiring other cues in real-time is\nchallenging. To this end, we design two different networks. 
Specifically, our\nproposed TPE fuses audio features with content-based semantic cues to\nfacilitate time-frequency mask generation to filter out extraneous noise, while\nanother proposal, namely TSR, employs the contrastive learning technique to\nassociate blindly separated speech signals with semantic cues. The experimental\nresults show the efficacy in accurately identifying the target speaker by\nutilizing semantic cues derived from limited and unaligned text, resulting in\nSI-SDRi of 12.16 dB, SDRi of 12.66 dB, PESQi of 0.830 and STOIi of 0.150,\nrespectively. Dataset and source code will be publicly available. Project demo\npage: https://slideTSE.github.io/.\n","authors":["Ziyang Jiang","Xinyuan Qian","Jiahe Lei","Zexu Pan","Wei Xue","Xu-cheng Yin"],"pdf_url":"https://arxiv.org/pdf/2411.03109v2.pdf","comment":null}]},"2024-11-07T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.18097v2","updated":"2024-11-07T21:23:15Z","published":"2024-10-08T11:28:06Z","title":"RRADistill: Distilling LLMs' Passage Ranking Ability for Document\n Re-Ranking of Long-Tail Queries in a Search Engine","summary":" Large Language Models (LLMs) excel at understanding the semantic\nrelationships between queries and documents, even with lengthy and complex\nlong-tail queries. These queries are challenging for feedback-based rankings\ndue to sparse user engagement and limited feedback, making LLMs' ranking\nability highly valuable. However, the large size and slow inference of LLMs\nnecessitate the development of smaller, more efficient models (sLLMs).\nRecently, integrating ranking label generation into distillation techniques has\nbecome crucial, but existing methods underutilize LLMs' capabilities and are\ncumbersome. Our research, RRADistill: Re-Ranking Ability Distillation, propose\nan efficient label generation pipeline and novel sLLM training methods for both\nencoder and decoder models. 
We introduce an encoder-based method using a Term\nControl Layer to capture term matching signals and a decoder-based model with a\nranking layer for enhanced understanding. A/B testing on a Korean-based search\nplatform, validates the effectiveness of our approach in improving re-ranking\nfor long-tail queries.\n","authors":["Nayoung Choi","Youngjune Lee","Gyu-Hwung Cho","Haeyu Jeong","Jungmin Kong","Saehun Kim","Keunchan Park","Jaeho Choi","Sarah Cho","Inchang Jeong","Gyohee Nam","Sunghoon Han","Wonil Yang"],"pdf_url":"https://arxiv.org/pdf/2410.18097v2.pdf","comment":"Accepted to EMNLP 2024 Industry Track. First two authors contributed\n equally"},{"id":"http://arxiv.org/abs/2406.09215v3","updated":"2024-11-07T18:30:53Z","published":"2024-06-13T15:16:11Z","title":"On Softmax Direct Preference Optimization for Recommendation","summary":" Recommender systems aim to predict personalized rankings based on user\npreference data. With the rise of Language Models (LMs), LM-based recommenders\nhave been widely explored due to their extensive world knowledge and powerful\nreasoning abilities. Most of the LM-based recommenders convert historical\ninteractions into language prompts, pairing with a positive item as the target\nresponse and fine-tuning LM with a language modeling loss. However, the current\nobjective fails to fully leverage preference data and is not optimized for\npersonalized ranking tasks, which hinders the performance of LM-based\nrecommenders. Inspired by the current advancement of Direct Preference\nOptimization (DPO) in human preference alignment and the success of softmax\nloss in recommendations, we propose Softmax-DPO (S-DPO) to instill ranking\ninformation into the LM to help LM-based recommenders distinguish preferred\nitems from negatives, rather than solely focusing on positives. 
Specifically,\nwe incorporate multiple negatives in user preference data and devise an\nalternative version of DPO loss tailored for LM-based recommenders, which is\nextended from the traditional full-ranking Plackett-Luce (PL) model to partial\nrankings and connected to softmax sampling strategies. Theoretically, we bridge\nS-DPO with the softmax loss over negative sampling and find that it has an\ninherent benefit of mining hard negatives, which assures its exceptional\ncapabilities in recommendation tasks. Empirically, extensive experiments\nconducted on three real-world datasets demonstrate the superiority of S-DPO to\neffectively model user preference and further boost recommendation performance\nwhile providing better rewards for preferred items. Our codes are available at\nhttps://github.com/chenyuxin1999/S-DPO.\n","authors":["Yuxin Chen","Junfei Tan","An Zhang","Zhengyi Yang","Leheng Sheng","Enzhi Zhang","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2406.09215v3.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2406.14894v2","updated":"2024-11-07T18:15:23Z","published":"2024-06-21T06:30:16Z","title":"Talking the Talk Does Not Entail Walking the Walk: On the Limits of\n Large Language Models in Lexical Entailment Recognition","summary":" Verbs form the backbone of language, providing the structure and meaning to\nsentences. Yet, their intricate semantic nuances pose a longstanding challenge.\nUnderstanding verb relations through the concept of lexical entailment is\ncrucial for comprehending sentence meanings and grasping verb dynamics. This\nwork investigates the capabilities of eight Large Language Models in\nrecognizing lexical entailment relations among verbs through differently\ndevised prompting strategies and zero-/few-shot settings over verb pairs from\ntwo lexical databases, namely WordNet and HyperLex. 
Our findings unveil that\nthe models can tackle the lexical entailment recognition task with moderately\ngood performance, although at varying degree of effectiveness and under\ndifferent conditions. Also, utilizing few-shot prompting can enhance the\nmodels' performance. However, perfectly solving the task arises as an unmet\nchallenge for all examined LLMs, which raises an emergence for further research\ndevelopments on this topic.\n","authors":["Candida M. Greco","Lucio La Cava","Andrea Tagarelli"],"pdf_url":"https://arxiv.org/pdf/2406.14894v2.pdf","comment":"Accepted for publication at The 2024 Conference on Empirical Methods\n in Natural Language Processing (EMNLP-2024) - Findings"},{"id":"http://arxiv.org/abs/2411.04798v1","updated":"2024-11-07T15:38:14Z","published":"2024-11-07T15:38:14Z","title":"Orbit: A Framework for Designing and Evaluating Multi-objective Rankers","summary":" Machine learning in production needs to balance multiple objectives: This is\nparticularly evident in ranking or recommendation models, where conflicting\nobjectives such as user engagement, satisfaction, diversity, and novelty must\nbe considered at the same time. However, designing multi-objective rankers is\ninherently a dynamic wicked problem -- there is no single optimal solution, and\nthe needs evolve over time. Effective design requires collaboration between\ncross-functional teams and careful analysis of a wide range of information. In\nthis work, we introduce Orbit, a conceptual framework for Objective-centric\nRanker Building and Iteration. The framework places objectives at the center of\nthe design process, to serve as boundary objects for communication and guide\npractitioners for design and evaluation. We implement Orbit as an interactive\nsystem, which enables stakeholders to interact with objective spaces directly\nand supports real-time exploration and evaluation of design trade-offs. 
We\nevaluate Orbit through a user study involving twelve industry practitioners,\nshowing that it supports efficient design space exploration, leads to more\ninformed decision-making, and enhances awareness of the inherent trade-offs of\nmultiple objectives. Orbit (1) opens up new opportunities of an\nobjective-centric design process for any multi-objective ML models, as well as\n(2) sheds light on future designs that push practitioners to go beyond a narrow\nmetric-centric or example-centric mindset.\n","authors":["Chenyang Yang","Tesi Xiao","Michael Shavlovsky","Christian Kästner","Tongshuang Wu"],"pdf_url":"https://arxiv.org/pdf/2411.04798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2410.14331v2","updated":"2024-11-07T14:48:18Z","published":"2024-10-18T09:43:30Z","title":"ChartifyText: Automated Chart Generation from Data-Involved Texts via\n LLM","summary":" Text documents with numerical values involved are widely used in various\napplications such as scientific research, economy, public health and\njournalism. However, it is difficult for readers to quickly interpret such\ndata-involved texts and gain deep insights. To fill this research gap, this\nwork aims to automatically generate charts to accurately convey the underlying\ndata and ideas to readers, which is essentially a challenging task. The\nchallenges originate from text ambiguities, intrinsic sparsity and uncertainty\nof data in text documents, and subjective sentiment differences. Specifically,\nwe propose ChartifyText, a novel fully-automated approach that leverages Large\nLanguage Models (LLMs) to convert complex data-involved texts to expressive\ncharts. It consists of two major modules: tabular data inference and expressive\nchart generation. The tabular data inference module employs systematic prompt\nengineering to guide the LLM (e.g., GPT-4) to infer table data, where data\nranges, uncertainties, missing data values and corresponding subjective\nsentiments are explicitly considered. 
The expressive chart generation module\naugments standard charts with intuitive visual encodings and concise texts to\naccurately convey the underlying data and insights. We extensively evaluate the\neffectiveness of ChartifyText on real-world data-involved text documents\nthrough case studies, in-depth interviews with three visualization experts, and\na carefully-designed user study with 15 participants. The results demonstrate\nthe usefulness and effectiveness of ChartifyText in helping readers efficiently\nand effectively make sense of data-involved texts.\n","authors":["Songheng Zhang","Lei Wang","Toby Jia-Jun Li","Qiaomu Shen","Yixin Cao","Yong Wang"],"pdf_url":"https://arxiv.org/pdf/2410.14331v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04677v1","updated":"2024-11-07T13:03:21Z","published":"2024-11-07T13:03:21Z","title":"Lightning IR: Straightforward Fine-tuning and Inference of\n Transformer-based Language Models for Information Retrieval","summary":" A wide range of transformer-based language models have been proposed for\ninformation retrieval tasks. However, fine-tuning and inference of these models\nis often complex and requires substantial engineering effort. This paper\nintroduces Lightning IR, a PyTorch Lightning-based framework for fine-tuning\nand inference of transformer-based language models for information retrieval.\nLightning IR provides a modular and extensible architecture that supports all\nstages of an information retrieval pipeline: from fine-tuning and indexing to\nsearching and re-ranking. It is designed to be straightforward to use,\nscalable, and reproducible. 
Lightning IR is available as open-source:\nhttps://github.com/webis-de/lightning-ir.\n","authors":["Ferdinand Schlatt","Maik Fröbe","Matthias Hagen"],"pdf_url":"https://arxiv.org/pdf/2411.04677v1.pdf","comment":"Accepted as a demo at WSDM'25"},{"id":"http://arxiv.org/abs/2410.05779v2","updated":"2024-11-07T10:44:59Z","published":"2024-10-08T08:00:12Z","title":"LightRAG: Simple and Fast Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) systems enhance large language models\n(LLMs) by integrating external knowledge sources, enabling more accurate and\ncontextually relevant responses tailored to user needs. However, existing RAG\nsystems have significant limitations, including reliance on flat data\nrepresentations and inadequate contextual awareness, which can lead to\nfragmented answers that fail to capture complex inter-dependencies. To address\nthese challenges, we propose LightRAG, which incorporates graph structures into\ntext indexing and retrieval processes. This innovative framework employs a\ndual-level retrieval system that enhances comprehensive information retrieval\nfrom both low-level and high-level knowledge discovery. Additionally, the\nintegration of graph structures with vector representations facilitates\nefficient retrieval of related entities and their relationships, significantly\nimproving response times while maintaining contextual relevance. This\ncapability is further enhanced by an incremental update algorithm that ensures\nthe timely integration of new data, allowing the system to remain effective and\nresponsive in rapidly changing data environments. Extensive experimental\nvalidation demonstrates considerable improvements in retrieval accuracy and\nefficiency compared to existing approaches. 
We have made our LightRAG\nopen-source and available at the link: https://github.com/HKUDS/LightRAG.\n","authors":["Zirui Guo","Lianghao Xia","Yanhua Yu","Tu Ao","Chao Huang"],"pdf_url":"https://arxiv.org/pdf/2410.05779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04602v1","updated":"2024-11-07T10:31:31Z","published":"2024-11-07T10:31:31Z","title":"Self-Calibrated Listwise Reranking with Large Language Models","summary":" Large language models (LLMs), with advanced linguistic capabilities, have\nbeen employed in reranking tasks through a sequence-to-sequence approach. In\nthis paradigm, multiple passages are reranked in a listwise manner and a\ntextual reranked permutation is generated. However, due to the limited context\nwindow of LLMs, this reranking paradigm requires a sliding window strategy to\niteratively handle larger candidate sets. This not only increases computational\ncosts but also restricts the LLM from fully capturing all the comparison\ninformation for all candidates. To address these challenges, we propose a novel\nself-calibrated listwise reranking method, which aims to leverage LLMs to\nproduce global relevance scores for ranking. To achieve it, we first propose\nthe relevance-aware listwise reranking framework, which incorporates explicit\nlist-view relevance scores to improve reranking efficiency and enable global\ncomparison across the entire candidate set. Second, to ensure the comparability\nof the computed scores, we propose self-calibrated training that uses\npoint-view relevance assessments generated internally by the LLM itself to\ncalibrate the list-view relevance assessments. 
Extensive experiments and\ncomprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks\ndemonstrate the effectiveness and efficiency of our proposed method.\n","authors":["Ruiyang Ren","Yuhao Wang","Kun Zhou","Wayne Xin Zhao","Wenjie Wang","Jing Liu","Ji-Rong Wen","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2411.04602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04539v1","updated":"2024-11-07T08:54:46Z","published":"2024-11-07T08:54:46Z","title":"Best Practices for Distilling Large Language Models into BERT for Web\n Search Ranking","summary":" Recent studies have highlighted the significant potential of Large Language\nModels (LLMs) as zero-shot relevance rankers. These methods predominantly\nutilize prompt learning to assess the relevance between queries and documents\nby generating a ranked list of potential documents. Despite their promise, the\nsubstantial costs associated with LLMs pose a significant challenge for their\ndirect implementation in commercial search systems. To overcome this barrier\nand fully exploit the capabilities of LLMs for text ranking, we explore\ntechniques to transfer the ranking expertise of LLMs to a more compact model\nsimilar to BERT, using a ranking loss to enable the deployment of less\nresource-intensive models. Specifically, we enhance the training of LLMs\nthrough Continued Pre-Training, taking the query as input and the clicked title\nand summary as output. We then proceed with supervised fine-tuning of the LLM\nusing a rank loss, assigning the final token as a representative of the entire\nsentence. Given the inherent characteristics of autoregressive language models,\nonly the final token can encapsulate all preceding tokens. Additionally,\nwe introduce a hybrid point-wise and margin MSE loss to transfer the ranking\nknowledge from LLMs to smaller models like BERT. This method creates a viable\nsolution for environments with strict resource constraints. 
Both offline and\nonline evaluations have confirmed the efficacy of our approach, and our model\nhas been successfully integrated into a commercial web search engine as of\nFebruary 2024.\n","authors":["Dezhi Ye","Junwei Hu","Jiabin Fan","Bowen Tian","Jie Liu","Haijin Liang","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2411.04539v1.pdf","comment":"Arxiv Version"},{"id":"http://arxiv.org/abs/2411.05048v1","updated":"2024-11-07T03:58:38Z","published":"2024-11-07T03:58:38Z","title":"Leveraging LLMs to Enable Natural Language Search on Go-to-market\n Platforms","summary":" Enterprise searches require users to have complex knowledge of queries,\nconfigurations, and metadata, rendering it difficult for them to access\ninformation as needed. Most go-to-market (GTM) platforms utilize advanced\nsearch, an interface that enables users to filter queries by various fields\nusing categories or keywords, which, historically, however, has proven to be\nexceedingly cumbersome, as users are faced with seemingly hundreds of options,\nfields, and buttons. Consequently, querying with natural language has long been\nideal, a notion further empowered by Large Language Models (LLMs).\n In this paper, we implement and evaluate a solution for the Zoominfo product\nfor sellers, which prompts the LLM with natural language, producing search\nfields through entity extraction that are then converted into a search query.\nThe intermediary search fields offer numerous advantages for each query,\nincluding the elimination of syntax errors, simpler ground truths, and an\nintuitive format for the LLM to interpret.\n We paired this pipeline with many advanced prompt engineering strategies,\nfeaturing an intricate system message, few-shot prompting, chain-of-thought\n(CoT) reasoning, and execution refinement. 
Furthermore, we manually created the\nground truth for 500+ natural language queries, enabling the supervised\nfine-tuning of Llama-3-8B-Instruct and the introduction of sophisticated\nnumerical metrics.\n Comprehensive experiments with closed, open source, and fine-tuned LLM models\nwere conducted through exact, Jaccard, cosine, and semantic similarity on\nindividual search entities to demonstrate the efficacy of our approach.\nOverall, the most accurate closed model had an average accuracy of 97% per\nquery, with only one field performing under 90%, with comparable results\nobserved from the fine-tuned models.\n","authors":["Jesse Yao","Saurav Acharya","Priyaranjan Parida","Srinivas Attipalli","Ali Dasdan"],"pdf_url":"https://arxiv.org/pdf/2411.05048v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2411.04403v1","updated":"2024-11-07T03:46:43Z","published":"2024-11-07T03:46:43Z","title":"Towards Competitive Search Relevance For Inference-Free Learned Sparse\n Retrievers","summary":" Learned sparse retrieval, which can efficiently perform retrieval through\nmature inverted-index engines, has garnered growing attention in recent years.\nParticularly, the inference-free sparse retrievers are attractive as they\neliminate online model inference in the retrieval phase thereby avoids huge\ncomputational cost, offering reasonable throughput and latency. However, even\nthe state-of-the-art (SOTA) inference-free sparse models lag far behind in\nterms of search relevance when compared to both sparse and dense siamese\nmodels. Towards competitive search relevance for inference-free sparse\nretrievers, we argue that they deserve dedicated training methods other than\nusing same ones with siamese encoders. In this paper, we propose two different\napproaches for performance improvement. First, we introduce the IDF-aware FLOPS\nloss, which introduces Inverted Document Frequency (IDF) to the sparsification\nof representations. 
We find that it mitigates the negative impact of the FLOPS\nregularization on search relevance, allowing the model to achieve a better\nbalance between accuracy and efficiency. Moreover, we propose a heterogeneous\nensemble knowledge distillation framework that combines siamese dense and\nsparse retrievers to generate supervisory signals during the pre-training\nphase. The ensemble framework of dense and sparse retriever capitalizes on\ntheir strengths respectively, providing a strong upper bound for knowledge\ndistillation. To concur the diverse feedback from heterogeneous supervisors, we\nnormalize and then aggregate the outputs of the teacher models to eliminate\nscore scale differences. On the BEIR benchmark, our model outperforms existing\nSOTA inference-free sparse model by \\textbf{3.3 NDCG@10 score}. It exhibits\nsearch relevance comparable to siamese sparse retrievers and client-side\nlatency only \\textbf{1.1x that of BM25}.\n","authors":["Zhichao Geng","Dongyu Ru","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2411.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04366v1","updated":"2024-11-07T01:52:46Z","published":"2024-11-07T01:52:46Z","title":"The Concatenator: A Bayesian Approach To Real Time Concatenative\n Musaicing","summary":" We present ``The Concatenator,'' a real time system for audio-guided\nconcatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or\n``audio mosaicing'') technique, we concatenate a set number of windows within a\ncorpus of audio to re-create the harmonic and percussive aspects of a target\naudio stream. Unlike Driedger's NMF-based technique, however, we instead use an\nexplicitly Bayesian point of view, where corpus window indices are hidden\nstates and the target audio stream is an observation. We use a particle filter\nto infer the best hidden corpus states in real-time. 
Our transition model\nincludes a tunable parameter to control the time-continuity of corpus grains,\nand our observation model allows users to prioritize how quickly windows change\nto match the target. Because the computational complexity of the system is\nindependent of the corpus size, our system scales to corpora that are hours\nlong, which is an important feature in the age of vast audio data collections.\nWithin The Concatenator module itself, composers can vary grain length, fit to\ntarget, and pitch shift in real time while reacting to the sounds they hear,\nenabling them to rapidly iterate ideas. To conclude our work, we evaluate our\nsystem with extensive quantitative tests of the effects of parameters, as well\nas a qualitative evaluation with artistic insights. Based on the quality of the\nresults, we believe the real-time capability unlocks new avenues for musical\nexpression and control, suitable for live performance and modular synthesis\nintegration, which furthermore represents an essential breakthrough in\nconcatenative synthesis technology.\n","authors":["Christopher Tralie","Ben Cantil"],"pdf_url":"https://arxiv.org/pdf/2411.04366v1.pdf","comment":"12 pages, 6 figures, Accepted for Publication in The International\n Society for Music Information Retrieval Proceedings, 2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.04859v1","updated":"2024-11-07T16:49:25Z","published":"2024-11-07T16:49:25Z","title":"A multi-purpose automatic editing system based on lecture semantics for\n remote education","summary":" Remote teaching has become popular recently due to its convenience and\nsafety, especially under extreme circumstances like a pandemic. However, online\nstudents usually have a poor experience since the information acquired from the\nviews provided by the broadcast platforms is limited. One potential solution is\nto show more camera views simultaneously, but it is technically challenging and\ndistracting for the viewers. 
Therefore, an automatic multi-camera\ndirecting/editing system, which aims at selecting the most concerned view at\neach time instance to guide the attention of online students, is in urgent\ndemand. However, existing systems mostly make simple assumptions and focus on\ntracking the position of the speaker instead of the real lecture semantics, and\ntherefore have limited capacities to deliver optimal information flow. To this\nend, this paper proposes an automatic multi-purpose editing system based on the\nlecture semantics, which can both direct the multiple video streams for\nreal-time broadcasting and edit the optimal video offline for review purposes.\nOur system directs the views by semantically analyzing the class events while\nfollowing the professional directing rules, mimicking a human director to\ncapture the regions of interest from the viewpoint of the onsite students. We\nconduct both qualitative and quantitative analyses to verify the effectiveness\nof the proposed system and its components.\n","authors":["Panwen Hu","Rui Huang"],"pdf_url":"https://arxiv.org/pdf/2411.04859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.04517v1","updated":"2024-11-07T08:19:39Z","published":"2024-11-07T08:19:39Z","title":"Continuous Sign Language Recognition System using Deep Learning with\n MediaPipe Holistic","summary":" Sign languages are the language of hearing-impaired people who use visuals\nlike the hand, facial, and body movements for communication. There are\ndifferent signs and gestures representing alphabets, words, and phrases.\nNowadays approximately 300 sign languages are being practiced worldwide such as\nAmerican Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language\n(ISL), and many more. Sign languages are dependent on the vocal language of a\nplace. Unlike vocal or spoken languages, there are no helping words in sign\nlanguage like is, am, are, was, were, will, be, etc. 
As only a limited\npopulation is well-versed in sign language, this lack of familiarity of sign\nlanguage hinders hearing-impaired people from communicating freely and easily\nwith everyone. This issue can be addressed by a sign language recognition (SLR)\nsystem which has the capability to translate the sign language into vocal\nlanguage. In this paper, a continuous SLR system is proposed using a deep\nlearning model employing Long Short-Term Memory (LSTM), trained and tested on\nan ISL primary dataset. This dataset is created using MediaPipe Holistic\npipeline for tracking face, hand, and body movements and collecting landmarks.\nThe system recognizes the signs and gestures in real-time with 88.23% accuracy.\n","authors":["Sharvani Srivastava","Sudhakar Singh"," Pooja","Shiv Prakash"],"pdf_url":"https://arxiv.org/pdf/2411.04517v1.pdf","comment":"14 pages, 4 figures, Wireless Pers Commun"},{"id":"http://arxiv.org/abs/2411.02551v2","updated":"2024-11-07T07:18:51Z","published":"2024-11-04T19:34:13Z","title":"PIAST: A Multimodal Piano Dataset with Audio, Symbolic and Text","summary":" While piano music has become a significant area of study in Music Information\nRetrieval (MIR), there is a notable lack of datasets for piano solo music with\ntext labels. To address this gap, we present PIAST (PIano dataset with Audio,\nSymbolic, and Text), a piano music dataset. Utilizing a piano-specific taxonomy\nof semantic tags, we collected 9,673 tracks from YouTube and added human\nannotations for 2,023 tracks by music experts, resulting in two subsets:\nPIAST-YT and PIAST-AT. Both include audio, text, tag annotations, and\ntranscribed MIDI utilizing state-of-the-art piano transcription and beat\ntracking models. 
Among many possible tasks with the multi-modal dataset, we\nconduct music tagging and retrieval using both audio and MIDI data and report\nbaseline performances to demonstrate its potential as a valuable resource for\nMIR research.\n","authors":["Hayeon Bang","Eunjin Choi","Megan Finch","Seungheon Doh","Seolhee Lee","Gyeong-Hoon Lee","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2411.02551v2.pdf","comment":"Accepted for publication at the 3rd Workshop on NLP for Music and\n Audio (NLP4MusA 2024)"},{"id":"http://arxiv.org/abs/2411.04366v1","updated":"2024-11-07T01:52:46Z","published":"2024-11-07T01:52:46Z","title":"The Concatenator: A Bayesian Approach To Real Time Concatenative\n Musaicing","summary":" We present ``The Concatenator,'' a real time system for audio-guided\nconcatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or\n``audio mosaicing'') technique, we concatenate a set number of windows within a\ncorpus of audio to re-create the harmonic and percussive aspects of a target\naudio stream. Unlike Driedger's NMF-based technique, however, we instead use an\nexplicitly Bayesian point of view, where corpus window indices are hidden\nstates and the target audio stream is an observation. We use a particle filter\nto infer the best hidden corpus states in real-time. Our transition model\nincludes a tunable parameter to control the time-continuity of corpus grains,\nand our observation model allows users to prioritize how quickly windows change\nto match the target. Because the computational complexity of the system is\nindependent of the corpus size, our system scales to corpora that are hours\nlong, which is an important feature in the age of vast audio data collections.\nWithin The Concatenator module itself, composers can vary grain length, fit to\ntarget, and pitch shift in real time while reacting to the sounds they hear,\nenabling them to rapidly iterate ideas. 
To conclude our work, we evaluate our\nsystem with extensive quantitative tests of the effects of parameters, as well\nas a qualitative evaluation with artistic insights. Based on the quality of the\nresults, we believe the real-time capability unlocks new avenues for musical\nexpression and control, suitable for live performance and modular synthesis\nintegration, which furthermore represents an essential breakthrough in\nconcatenative synthesis technology.\n","authors":["Christopher Tralie","Ben Cantil"],"pdf_url":"https://arxiv.org/pdf/2411.04366v1.pdf","comment":"12 pages, 6 figures, Accepted for Publication in The International\n Society for Music Information Retrieval Proceedings, 2024"}]},"2024-11-06T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2410.22233v2","updated":"2024-11-06T19:52:58Z","published":"2024-10-29T17:01:05Z","title":"ContextIQ: A Multimodal Expert-Based Video Retrieval System for\n Contextual Advertising","summary":" Contextual advertising serves ads that are aligned to the content that the\nuser is viewing. The rapid growth of video content on social platforms and\nstreaming services, along with privacy concerns, has increased the need for\ncontextual advertising. Placing the right ad in the right context creates a\nseamless and pleasant ad viewing experience, resulting in higher audience\nengagement and, ultimately, better ad monetization. From a technology\nstandpoint, effective contextual advertising requires a video retrieval system\ncapable of understanding complex video content at a very granular level.\nCurrent text-to-video retrieval models based on joint multimodal training\ndemand large datasets and computational resources, limiting their practicality\nand lacking the key functionalities required for ad ecosystem integration. We\nintroduce ContextIQ, a multimodal expert-based video retrieval system designed\nspecifically for contextual advertising. 
ContextIQ utilizes modality-specific\nexperts-video, audio, transcript (captions), and metadata such as objects,\nactions, emotion, etc.-to create semantically rich video representations. We\nshow that our system, without joint training, achieves better or comparable\nresults to state-of-the-art models and commercial solutions on multiple\ntext-to-video retrieval benchmarks. Our ablation studies highlight the benefits\nof leveraging multiple modalities for enhanced video retrieval accuracy instead\nof using a vision-language model alone. Furthermore, we show how video\nretrieval systems such as ContextIQ can be used for contextual advertising in\nan ad ecosystem while also addressing concerns related to brand safety and\nfiltering inappropriate content.\n","authors":["Ashutosh Chaubey","Anoubhav Agarwaal","Sartaki Sinha Roy","Aayush Agrawal","Susmita Ghose"],"pdf_url":"https://arxiv.org/pdf/2410.22233v2.pdf","comment":"Accepted at WACV 2025"},{"id":"http://arxiv.org/abs/2411.04228v1","updated":"2024-11-06T19:50:00Z","published":"2024-11-06T19:50:00Z","title":"dsld: A Socially Relevant Tool for Teaching Statistics","summary":" The growing power of data science can play a crucial role in addressing\nsocial discrimination, necessitating nuanced understanding and effective\nmitigation strategies of potential biases. Data Science Looks At Discrimination\n(dsld) is an R and Python package designed to provide users with a\ncomprehensive toolkit of statistical and graphical methods for assessing\npossible discrimination related to protected groups, such as race, gender, and\nage. Our software offers techniques for discrimination analysis by identifying\nand mitigating confounding variables, along with methods for reducing bias in\npredictive models.\n In educational settings, dsld offers instructors powerful tools to teach\nimportant statistical principles through motivating real world examples of\ndiscrimination analysis. 
The inclusion of an 80-page Quarto book further\nsupports users, from statistics educators to legal professionals, in\neffectively applying these analytical tools to real world scenarios.\n","authors":["Taha Abdullah","Arjun Ashok","Brandon Estrada","Norman Matloff","Aditya Mittal"],"pdf_url":"https://arxiv.org/pdf/2411.04228v1.pdf","comment":"To be submitted to the Journal of Statistics and Data Science\n Education"},{"id":"http://arxiv.org/abs/2411.04051v1","updated":"2024-11-06T16:57:55Z","published":"2024-11-06T16:57:55Z","title":"Reproducible Hybrid Time-Travel Retrieval in Evolving Corpora","summary":" There are settings in which reproducibility of ranked lists is desirable,\nsuch as when extracting a subset of an evolving document corpus for downstream\nresearch tasks or in domains such as patent retrieval or in medical systematic\nreviews, with high reproducibility expectations. However, as global term\nstatistics change when documents change or are added to a corpus, queries using\ntypical ranked retrieval models are not even reproducible for the parts of the\ndocument corpus that have not changed. Thus, Boolean retrieval frequently\nremains the mechanism of choice in such settings.\n We present a hybrid retrieval system combining Lucene for fast retrieval with\na column-store-based retrieval system maintaining a versioned and time-stamped\nindex. The latter component allows re-execution of previously posed queries\nresulting in the same ranked list and further allows for time-travel queries\nover evolving collection, as web archives, while maintaining the original\nranking. 
Thus, retrieval results in evolving document collections are fully\nreproducible even when document collections and thus term statistics change.\n","authors":["Moritz Staudinger","Florina Piroi","Andreas Rauber"],"pdf_url":"https://arxiv.org/pdf/2411.04051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03957v1","updated":"2024-11-06T14:42:39Z","published":"2024-11-06T14:42:39Z","title":"Fine-Grained Guidance for Retrievers: Leveraging LLMs' Feedback in\n Retrieval-Augmented Generation","summary":" Retrieval-Augmented Generation (RAG) has proven to be an effective method for\nmitigating hallucination issues inherent in large language models (LLMs).\nPrevious approaches typically train retrievers based on semantic similarity,\nlacking optimization for RAG. More recent works have proposed aligning\nretrievers with the preference signals of LLMs. However, these preference\nsignals are often difficult for dense retrievers, which typically have weaker\nlanguage capabilities, to understand and learn effectively. Drawing inspiration\nfrom pedagogical theories like Guided Discovery Learning, we propose a novel\nframework, FiGRet (Fine-grained Guidance for Retrievers), which leverages the\nlanguage capabilities of LLMs to construct examples from a more granular,\ninformation-centric perspective to guide the learning of retrievers.\nSpecifically, our method utilizes LLMs to construct easy-to-understand examples\nfrom samples where the retriever performs poorly, focusing on three learning\nobjectives highly relevant to the RAG scenario: relevance, comprehensiveness,\nand purity. These examples serve as scaffolding to ultimately align the\nretriever with the LLM's preferences. Furthermore, we employ a dual curriculum\nlearning strategy and leverage the reciprocal feedback between LLM and\nretriever to further enhance the performance of the RAG system. 
A series of\nexperiments demonstrate that our proposed framework enhances the performance of\nRAG systems equipped with different retrievers and is applicable to various\nLLMs.\n","authors":["Yuhang Liu","Xueyu Hu","Shengyu Zhang","Jingyuan Chen","Fan Wu","Fei Wu"],"pdf_url":"https://arxiv.org/pdf/2411.03957v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2411.03906v1","updated":"2024-11-06T13:37:28Z","published":"2024-11-06T13:37:28Z","title":"Lexicalization Is All You Need: Examining the Impact of Lexical\n Knowledge in a Compositional QALD System","summary":" In this paper, we examine the impact of lexicalization on Question Answering\nover Linked Data (QALD). It is well known that one of the key challenges in\ninterpreting natural language questions with respect to SPARQL lies in bridging\nthe lexical gap, that is mapping the words in the query to the correct\nvocabulary elements. We argue in this paper that lexicalization, that is\nexplicit knowledge about the potential interpretations of a word with respect\nto the given vocabulary, significantly eases the task and increases the\nperformance of QA systems. Towards this goal, we present a compositional QA\nsystem that can leverage explicit lexical knowledge in a compositional manner\nto infer the meaning of a question in terms of a SPARQL query. We show that\nsuch a system, given lexical knowledge, has a performance well beyond current\nQA systems, achieving up to a $35.8\\%$ increase in the micro $F_1$ score\ncompared to the best QA system on QALD-9. This shows the importance and\npotential of including explicit lexical knowledge. In contrast, we show that\nLLMs have limited abilities to exploit lexical knowledge, with only marginal\nimprovements compared to a version without lexical knowledge. This shows that\nLLMs have no ability to compositionally interpret a question on the basis of\nthe meaning of its parts, a key feature of compositional approaches. 
Taken\ntogether, our work shows new avenues for QALD research, emphasizing the\nimportance of lexicalization and compositionality.\n","authors":["David Maria Schmidt","Mohammad Fazleh Elahi","Philipp Cimiano"],"pdf_url":"https://arxiv.org/pdf/2411.03906v1.pdf","comment":"24th International Conference on Knowledge Engineering and Knowledge\n Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands"},{"id":"http://arxiv.org/abs/2411.03881v1","updated":"2024-11-06T12:54:27Z","published":"2024-11-06T12:54:27Z","title":"Data Fusion of Synthetic Query Variants With Generative Large Language\n Models","summary":" Considering query variance in information retrieval (IR) experiments is\nbeneficial for retrieval effectiveness. Especially ranking ensembles based on\ndifferent topically related queries retrieve better results than rankings based\non a single query alone. Recently, generative instruction-tuned Large Language\nModels (LLMs) improved on a variety of different tasks in capturing human\nlanguage. To this end, this work explores the feasibility of using synthetic\nquery variants generated by instruction-tuned LLMs in data fusion experiments.\nMore specifically, we introduce a lightweight, unsupervised, and cost-efficient\napproach that exploits principled prompting and data fusion techniques. In our\nexperiments, LLMs produce more effective queries when provided with additional\ncontext information on the topic. Furthermore, our analysis based on four TREC\nnewswire benchmarks shows that data fusion based on synthetic query variants is\nsignificantly better than baselines with single queries and also outperforms\npseudo-relevance feedback methods. 
We publicly share the code and query\ndatasets with the community as resources for follow-up studies.\n","authors":["Timo Breuer"],"pdf_url":"https://arxiv.org/pdf/2411.03881v1.pdf","comment":"The definitive version of record was published in SIGIR-AP '24"},{"id":"http://arxiv.org/abs/2411.02832v2","updated":"2024-11-06T11:19:42Z","published":"2024-11-05T06:11:17Z","title":"PersianRAG: A Retrieval-Augmented Generation System for Persian Language","summary":" Retrieval augmented generation (RAG) models, which integrate large-scale\npre-trained generative models with external retrieval mechanisms, have shown\nsignificant success in various natural language processing (NLP) tasks.\nHowever, applying RAG models in Persian language as a low-resource language,\nposes distinct challenges. These challenges primarily involve the\npreprocessing, embedding, retrieval, prompt construction, language modeling,\nand response evaluation of the system. In this paper, we address the challenges\ntowards implementing a real-world RAG system for Persian language called\nPersianRAG. We propose novel solutions to overcome these obstacles and evaluate\nour approach using several Persian benchmark datasets. Our experimental results\ndemonstrate the capability of the PersianRAG framework to enhance question\nanswering task in Persian.\n","authors":["Hossein Hosseini","Mohammad Sobhan Zare","Amir Hossein Mohammadi","Arefeh Kazemi","Zahra Zojaji","Mohammad Ali Nematbakhsh"],"pdf_url":"https://arxiv.org/pdf/2411.02832v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03039v2","updated":"2024-11-06T09:28:25Z","published":"2024-11-05T12:22:51Z","title":"Self-Compositional Data Augmentation for Scientific Keyphrase Generation","summary":" State-of-the-art models for keyphrase generation require large amounts of\ntraining data to achieve good performance. However, obtaining keyphrase-labeled\ndocuments can be challenging and costly. 
To address this issue, we present a\nself-compositional data augmentation method. More specifically, we measure the\nrelatedness of training documents based on their shared keyphrases, and combine\nsimilar documents to generate synthetic samples. The advantage of our method\nlies in its ability to create additional training samples that keep domain\ncoherence, without relying on external data or resources. Our results on\nmultiple datasets spanning three different domains, demonstrate that our method\nconsistently improves keyphrase generation. A qualitative analysis of the\ngenerated keyphrases for the Computer Science domain confirms this improvement\ntowards their representativity property.\n","authors":["Mael Houbre","Florian Boudin","Beatrice Daille","Akiko Aizawa"],"pdf_url":"https://arxiv.org/pdf/2411.03039v2.pdf","comment":"Accepted to JCDL 2024. This is the author's version of the work. It\n is posted here for your personal use. Not for redistribution. The definitive\n version was published in the proceedings of the 2024 ACM/IEEE Joint\n Conference on Digital Libraries (JCDL 24)\n https://doi.org/10.1145/3677389.3702504"},{"id":"http://arxiv.org/abs/2411.03701v1","updated":"2024-11-06T06:56:22Z","published":"2024-11-06T06:56:22Z","title":"The Essence of the Essence from the Web:The Metasearch Engine","summary":" The exponential growth of information source on the web and in turn\ncontinuing technological progress of searching the information by using tools\nlike Search Engines gives rise to many problems for the user to know which tool\nis best for their query and which tool is not. At this time Metasearch Engine\ncomes into play by reducing the user burden by dispatching queries to multiple\nsearch engines in parallel and refining the results of these search engines to\ngive the best out of best by doing superior job on their side. 
These engines do\nnot own a database of Web pages rather they send search terms to the databases\nmaintained by the search engine companies, get back results from all the search\nengines queried and then compile the results to be presented to the user. In\nthis paper, we describe the working of a typical metasearch engine and then\npresent a comparative study of traditional search engines and metasearch\nengines on the basis of different parameters and show how metasearch engines\nare better than the other search engines.\n","authors":["Rajender Nath","Satinder Bal"],"pdf_url":"https://arxiv.org/pdf/2411.03701v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2401.11505v2","updated":"2024-11-06T04:11:14Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. 
Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. Code and\nmodels are available at https://github.com/Soombit-ai/CheXGPT.\n","authors":["Jawook Gu","Kihyun You","Han-Cheol Cho","Jiho Kim","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v2.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2411.03624v1","updated":"2024-11-06T02:45:16Z","published":"2024-11-06T02:45:16Z","title":"SEGMN: A Structure-Enhanced Graph Matching Network for Graph Similarity\n Learning","summary":" Graph similarity computation (GSC) aims to quantify the similarity score\nbetween two graphs. Although recent GSC methods based on graph neural networks\n(GNNs) take advantage of intra-graph structures in message passing, few of them\nfully utilize the structures presented by edges to boost the representation of\ntheir connected nodes. Moreover, previous cross-graph node embedding matching\nlacks the perception of the overall structure of the graph pair, due to the\nfact that the node representations from GNNs are confined to the intra-graph\nstructure, causing the unreasonable similarity score. Intuitively, the\ncross-graph structure represented in the assignment graph is helpful to rectify\nthe inappropriate matching. Therefore, we propose a structure-enhanced graph\nmatching network (SEGMN). Equipped with a dual embedding learning module and a\nstructure perception matching module, SEGMN achieves structure enhancement in\nboth embedding learning and cross-graph matching. The dual embedding learning\nmodule incorporates adjacent edge representation into each node to achieve a\nstructure-enhanced representation. The structure perception matching module\nachieves cross-graph structure enhancement through assignment graph\nconvolution. 
The similarity score of each cross-graph node pair can be\nrectified by aggregating messages from structurally relevant node pairs.\nExperimental results on benchmark datasets demonstrate that SEGMN outperforms\nthe state-of-the-art GSC methods in the GED regression task, and the structure\nperception matching module is plug-and-play, which can further improve the\nperformance of the baselines by up to 25%.\n","authors":["Wenjun Wang","Jiacheng Lu","Kejia Chen","Zheng Liu","Shilong Sang"],"pdf_url":"https://arxiv.org/pdf/2411.03624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.02937v2","updated":"2024-11-06T02:36:02Z","published":"2024-08-06T03:44:06Z","title":"A Real-Time Adaptive Multi-Stream GPU System for Online Approximate\n Nearest Neighborhood Search","summary":" In recent years, Approximate Nearest Neighbor Search (ANNS) has played a\npivotal role in modern search and recommendation systems, especially in\nemerging LLM applications like Retrieval-Augmented Generation. There is a\ngrowing exploration into harnessing the parallel computing capabilities of GPUs\nto meet the substantial demands of ANNS. However, existing systems primarily\nfocus on offline scenarios, overlooking the distinct requirements of online\napplications that necessitate real-time insertion of new vectors. This\nlimitation renders such systems inefficient for real-world scenarios. Moreover,\nprevious architectures struggled to effectively support real-time insertion due\nto their reliance on serial execution streams. In this paper, we introduce a\nnovel Real-Time Adaptive Multi-Stream GPU ANNS System (RTAMS-GANNS). Our\narchitecture achieves its objectives through three key advancements: 1) We\ninitially examined the real-time insertion mechanisms in existing GPU ANNS\nsystems and discovered their reliance on repetitive copying and memory\nallocation, which significantly hinders real-time effectiveness on GPUs. 
As a\nsolution, we introduce a dynamic vector insertion algorithm based on memory\nblocks, which includes in-place rearrangement. 2) To enable real-time vector\ninsertion in parallel, we introduce a multi-stream parallel execution mode,\nwhich differs from existing systems that operate serially within a single\nstream. Our system utilizes a dynamic resource pool, allowing multiple streams\nto execute concurrently without additional execution blocking. 3) Through\nextensive experiments and comparisons, our approach effectively handles varying\nQPS levels across different datasets, reducing latency by up to 40%-80%. The\nproposed system has also been deployed in real-world industrial search and\nrecommendation systems, serving hundreds of millions of users daily, and has\nachieved good results.\n","authors":["Yiping Sun","Yang Shi","Jiaolong Du"],"pdf_url":"https://arxiv.org/pdf/2408.02937v2.pdf","comment":"Accepted by CIKM'24, V2 fixes some typos"},{"id":"http://arxiv.org/abs/2408.09380v3","updated":"2024-11-06T02:26:07Z","published":"2024-08-18T06:41:46Z","title":"ELASTIC: Efficient Linear Attention for Sequential Interest Compression","summary":" State-of-the-art sequential recommendation models heavily rely on\ntransformer's attention mechanism. However, the quadratic computational and\nmemory complexities of self attention have limited its scalability for modeling\nusers' long range behaviour sequences. To address this problem, we propose\nELASTIC, an Efficient Linear Attention for SequenTial Interest Compression,\nrequiring only linear time complexity and decoupling model capacity from\ncomputational cost. Specifically, ELASTIC introduces a fixed length interest\nexperts with linear dispatcher attention mechanism which compresses the\nlong-term behaviour sequences to a significantly more compact representation\nwhich reduces up to 90% GPU memory usage with x2.7 inference speed up. 
The\nproposed linear dispatcher attention mechanism significantly reduces the\nquadratic complexity and makes the model feasible for adequately modeling\nextremely long sequences. Moreover, in order to retain the capacity for\nmodeling various user interests, ELASTIC initializes a vast learnable interest\nmemory bank and sparsely retrieves compressed user's interests from the memory\nwith a negligible computational overhead. The proposed interest memory\nretrieval technique significantly expands the cardinality of available interest\nspace while keeping the same computational cost, thereby striking a trade-off\nbetween recommendation accuracy and efficiency. To validate the effectiveness\nof our proposed ELASTIC, we conduct extensive experiments on various public\ndatasets and compare it with several strong sequential recommenders.\nExperimental results demonstrate that ELASTIC consistently outperforms\nbaselines by a significant margin and also highlight the computational\nefficiency of ELASTIC when modeling long sequences. We will make our\nimplementation code publicly available.\n","authors":["Jiaxin Deng","Shiyao Wang","Song Lu","Yinfeng Li","Xinchen Luo","Yuanjun Liu","Peixing Xu","Guorui Zhou"],"pdf_url":"https://arxiv.org/pdf/2408.09380v3.pdf","comment":"We hereby withdraw this paper from arXiv due to incomplete\n experiments. Upon further review, we have determined that additional\n experimental work is necessary to fully validate our findings and conclusions"},{"id":"http://arxiv.org/abs/2411.03572v1","updated":"2024-11-06T00:23:55Z","published":"2024-11-06T00:23:55Z","title":"Advanced RAG Models with Graph Structures: Optimizing Complex Knowledge\n Reasoning and Text Generation","summary":" This study aims to optimize the existing retrieval-augmented generation model\n(RAG) by introducing a graph structure to improve the performance of the model\nin dealing with complex knowledge reasoning tasks. 
The traditional RAG model\nhas the problem of insufficient processing efficiency when facing complex graph\nstructure information (such as knowledge graphs, hierarchical relationships,\netc.), which affects the quality and consistency of the generated results. This\nstudy proposes a scheme to process graph structure data by combining graph\nneural network (GNN), so that the model can capture the complex relationship\nbetween entities, thereby improving the knowledge consistency and reasoning\nability of the generated text. The experiment used the Natural Questions (NQ)\ndataset and compared it with multiple existing generation models. The results\nshow that the graph-based RAG model proposed in this paper is superior to the\ntraditional generation model in terms of quality, knowledge consistency, and\nreasoning ability, especially when dealing with tasks that require\nmulti-dimensional reasoning. Through the combination of the enhancement of the\nretrieval module and the graph neural network, the model in this study can\nbetter handle complex knowledge background information and has broad potential\nvalue in multiple practical application scenarios.\n","authors":["Yuxin Dong","Shuo Wang","Hongye Zheng","Jiajing Chen","Zhenhong Zhang","Chihang Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03572v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2411.05854v1","updated":"2024-11-06T23:48:30Z","published":"2024-11-06T23:48:30Z","title":"Harmful YouTube Video Detection: A Taxonomy of Online Harm and MLLMs as\n Alternative Annotators","summary":" Short video platforms, such as YouTube, Instagram, or TikTok, are used by\nbillions of users globally. These platforms expose users to harmful content,\nranging from clickbait or physical harms to misinformation or online hate. Yet,\ndetecting harmful videos remains challenging due to an inconsistent\nunderstanding of what constitutes harm and limited resources and mental tolls\ninvolved in human annotation. 
As such, this study advances measures and methods\nto detect harm in video content. First, we develop a comprehensive taxonomy for\nonline harm on video platforms, categorizing it into six categories:\nInformation, Hate and harassment, Addictive, Clickbait, Sexual, and Physical\nharms. Next, we establish multimodal large language models as reliable\nannotators of harmful videos. We analyze 19,422 YouTube videos using 14 image\nframes, 1 thumbnail, and text metadata, comparing the accuracy of crowdworkers\n(Mturk) and GPT-4-Turbo with domain expert annotations serving as the gold\nstandard. Our results demonstrate that GPT-4-Turbo outperforms crowdworkers in\nboth binary classification (harmful vs. harmless) and multi-label harm\ncategorization tasks. Methodologically, this study extends the application of\nLLMs to multi-label and multi-modal contexts beyond text annotation and binary\nclassification. Practically, our study contributes to online harm mitigation by\nguiding the definitions and identification of harmful content on video\nplatforms.\n","authors":["Claire Wonjeong Jo","Miki Wesołowska","Magdalena Wojcieszak"],"pdf_url":"https://arxiv.org/pdf/2411.05854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04901v2","updated":"2024-11-06T23:32:27Z","published":"2023-04-11T00:17:28Z","title":"Efficiently Collecting Training Dataset for 2D Object Detection by\n Online Visual Feedback","summary":" Training deep-learning-based vision systems require the manual annotation of\na significant number of images. Such manual annotation is highly time-consuming\nand labor-intensive. Although previous studies have attempted to eliminate the\neffort required for annotation, the effort required for image collection was\nretained. To address this, we propose a human-in-the-loop dataset collection\nmethod that uses a web application. 
To counterbalance the workload and\nperformance by encouraging the collection of multi-view object image datasets\nin an enjoyable manner, thereby amplifying motivation, we propose three types\nof online visual feedback features to track the progress of the collection\nstatus. Our experiments thoroughly investigated the impact of each feature on\ncollection performance and quality of operation. The results suggested the\nfeasibility of annotation and object detection.\n","authors":["Takuya Kiyokawa","Naoki Shirakura","Hiroki Katayama","Keita Tomochika","Jun Takamatsu"],"pdf_url":"https://arxiv.org/pdf/2304.04901v2.pdf","comment":"13 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.03115v2","updated":"2024-11-06T22:09:09Z","published":"2024-01-06T03:03:28Z","title":"Transferable Learned Image Compression-Resistant Adversarial\n Perturbations","summary":" Adversarial attacks can readily disrupt the image classification system,\nrevealing the vulnerability of DNN-based recognition tasks. While existing\nadversarial perturbations are primarily applied to uncompressed images or\ncompressed images by the traditional image compression method, i.e., JPEG,\nlimited studies have investigated the robustness of models for image\nclassification in the context of DNN-based image compression. With the rapid\nevolution of advanced image compression, DNN-based learned image compression\nhas emerged as the promising approach for transmitting images in many\nsecurity-critical applications, such as cloud-based face recognition and\nautonomous driving, due to its superior performance over traditional\ncompression. Therefore, there is a pressing need to fully investigate the\nrobustness of a classification system post-processed by learned image\ncompression. To bridge this research gap, we explore the adversarial attack on\na new pipeline that targets image classification models that utilize learned\nimage compressors as pre-processing modules. 
Furthermore, to enhance the\ntransferability of perturbations across various quality levels and\narchitectures of learned image compression models, we introduce a saliency\nscore-based sampling method to enable the fast generation of transferable\nperturbation. Extensive experiments with popular attack methods demonstrate the\nenhanced transferability of our proposed method when attacking images that have\nbeen post-processed with different learned image compression models.\n","authors":["Yang Sui","Zhuohang Li","Ding Ding","Xiang Pan","Xiaozhong Xu","Shan Liu","Zhenzhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03115v2.pdf","comment":"Accepted by BMVC 2024"},{"id":"http://arxiv.org/abs/2407.14093v2","updated":"2024-11-06T16:45:17Z","published":"2024-07-19T07:57:48Z","title":"Routing Experts: Learning to Route Dynamic Experts in Multi-modal Large\n Language Models","summary":" Recently, mixture of experts (MoE) has become a popular paradigm for\nachieving the trade-off between modal capacity and efficiency of multi-modal\nlarge language models (MLLMs). Different from previous efforts, we are\ndedicated to exploring the dynamic expert path in an already exist MLLM and\nshow that a standard MLLM can be also a mixture of experts. To approach this\ntarget, we propose a novel dynamic expert scheme for MLLMs, termed Routing\nExperts (RoE), which can achieve example-dependent optimal path routing without\nobvious structure tweaks. Meanwhile, a new regularization of structure sparsity\nis also introduced to enforce MLLMs to learn more short-cut inference, ensuring\nthe efficiency. In addition, we also realize the first attempt of aligning the\ntraining and inference schemes of MLLMs in terms of network routing. To\nvalidate RoE, we apply it to a set of latest MLLMs, including LLaVA-1.5,\nLLaVA-HR and VILA, and conduct extensive experiments on a bunch of VL\nbenchmarks. 
The experiment results not only show the great advantages of our\nRoE in improving MLLMs' efficiency, but also yield obvious advantages than\nMoE-LLaVA in both performance and speed, e.g., an average performance gain of\n3.3% on 5 benchmarks while being faster.\n","authors":["Qiong Wu","Zhaoxi Ke","Yiyi Zhou","Gen Luo","Xiaoshuai Sun","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2407.14093v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03948v1","updated":"2024-11-06T14:29:49Z","published":"2024-11-06T14:29:49Z","title":"Long-Form Text-to-Music Generation with Adaptive Prompts: A Case of\n Study in Tabletop Role-Playing Games Soundtracks","summary":" This paper investigates the capabilities of text-to-audio music generation\nmodels in producing long-form music with prompts that change over time,\nfocusing on soundtrack generation for Tabletop Role-Playing Games (TRPGs). We\nintroduce Babel Bardo, a system that uses Large Language Models (LLMs) to\ntransform speech transcriptions into music descriptions for controlling a\ntext-to-music model. Four versions of Babel Bardo were compared in two TRPG\ncampaigns: a baseline using direct speech transcriptions, and three LLM-based\nversions with varying approaches to music description generation. Evaluations\nconsidered audio quality, story alignment, and transition smoothness. Results\nindicate that detailed music descriptions improve audio quality while\nmaintaining consistency across consecutive descriptions enhances story\nalignment and transition smoothness.\n","authors":["Felipe Marra","Lucas N. 
Ferreira"],"pdf_url":"https://arxiv.org/pdf/2411.03948v1.pdf","comment":"Paper accepted at the LAMIR 2024 workshop"},{"id":"http://arxiv.org/abs/2411.03921v1","updated":"2024-11-06T13:52:49Z","published":"2024-11-06T13:52:49Z","title":"Inter-Frame Coding for Dynamic Meshes via Coarse-to-Fine Anchor Mesh\n Generation","summary":" In the current Video-based Dynamic Mesh Coding (V-DMC) standard, inter-frame\ncoding is restricted to mesh frames with constant topology. Consequently,\ntemporal redundancy is not fully leveraged, resulting in suboptimal compression\nefficacy. To address this limitation, this paper introduces a novel\ncoarse-to-fine scheme to generate anchor meshes for frames with time-varying\ntopology. Initially, we generate a coarse anchor mesh using an octree-based\nnearest neighbor search. Motion estimation compensates for regions with\nsignificant motion changes during this process. However, the quality of the\ncoarse mesh is low due to its suboptimal vertices. To enhance details, the fine\nanchor mesh is further optimized using the Quadric Error Metrics (QEM)\nalgorithm to calculate more precise anchor points. The inter-frame anchor mesh\ngenerated herein retains the connectivity of the reference base mesh, while\nconcurrently preserving superior quality. Experimental results show that our\nmethod achieves 7.2% ~ 10.3% BD-rate gain compared to the existing V-DMC test\nmodel version 7.\n","authors":["He Huang","Lizhi Hou","Qi Yang","Yiling Xu"],"pdf_url":"https://arxiv.org/pdf/2411.03921v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2411.03823v1","updated":"2024-11-06T10:44:15Z","published":"2024-11-06T10:44:15Z","title":"Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM\n Data Contamination","summary":" The rapid progression of multimodal large language models (MLLMs) has\ndemonstrated superior performance on various multimodal benchmarks. 
However,\nthe issue of data contamination during training creates challenges in\nperformance evaluation and comparison. While numerous methods exist for\ndetecting dataset contamination in large language models (LLMs), they are less\neffective for MLLMs due to their various modalities and multiple training\nphases. In this study, we introduce a multimodal data contamination detection\nframework, MM-Detect, designed for MLLMs. Our experimental results indicate\nthat MM-Detect is sensitive to varying degrees of contamination and can\nhighlight significant performance improvements due to leakage of the training\nset of multimodal benchmarks. Furthermore, We also explore the possibility of\ncontamination originating from the pre-training phase of LLMs used by MLLMs and\nthe fine-tuning phase of MLLMs, offering new insights into the stages at which\ncontamination may be introduced.\n","authors":["Dingjie Song","Sicheng Lai","Shunian Chen","Lichao Sun","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2411.03823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.18680v3","updated":"2024-11-06T10:27:05Z","published":"2024-09-27T12:06:53Z","title":"Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large\n Language Models","summary":" Various audio-LLMs (ALLMs) have been explored recently for tackling different\naudio tasks simultaneously using a single, unified model. While existing\nevaluations of ALLMs primarily focus on single-audio tasks, real-world\napplications often involve processing multiple audio streams simultaneously. To\nbridge this gap, we propose the first multi-audio evaluation (MAE) benchmark\nthat consists of 20 datasets from 11 multi-audio tasks encompassing both speech\nand sound scenarios. Comprehensive experiments on MAE demonstrate that the\nexisting ALLMs, while being powerful in comprehending primary audio elements in\nindividual audio inputs, struggling to handle multi-audio scenarios. 
To this\nend, we propose a novel multi-audio-LLM (MALLM) to capture audio context among\nmultiple similar audios using discriminative learning on our proposed synthetic\ndata. The results demonstrate that the proposed MALLM outperforms all baselines\nand achieves high data efficiency using synthetic data without requiring human\nannotations. The proposed MALLM opens the door for ALLMs towards multi-audio\nprocessing era and brings us closer to replicating human auditory capabilities\nin machines.\n","authors":["Yiming Chen","Xianghu Yue","Xiaoxue Gao","Chen Zhang","Luis Fernando D'Haro","Robby T. Tan","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2409.18680v3.pdf","comment":"EMNLP24 Findings. Data available at\n https://github.com/MatthewCYM/MALLM"},{"id":"http://arxiv.org/abs/2411.05832v1","updated":"2024-11-06T04:30:04Z","published":"2024-11-06T04:30:04Z","title":"Diversify, Contextualize, and Adapt: Efficient Entropy Modeling for\n Neural Image Codec","summary":" Designing a fast and effective entropy model is challenging but essential for\npractical application of neural codecs. Beyond spatial autoregressive entropy\nmodels, more efficient backward adaptation-based entropy models have been\nrecently developed. They not only reduce decoding time by using smaller number\nof modeling steps but also maintain or even improve rate--distortion\nperformance by leveraging more diverse contexts for backward adaptation.\nDespite their significant progress, we argue that their performance has been\nlimited by the simple adoption of the design convention for forward adaptation:\nusing only a single type of hyper latent representation, which does not provide\nsufficient contextual information, especially in the first modeling step. In\nthis paper, we propose a simple yet effective entropy modeling framework that\nleverages sufficient contexts for forward adaptation without compromising on\nbit-rate. 
Specifically, we introduce a strategy of diversifying hyper latent\nrepresentations for forward adaptation, i.e., using two additional types of\ncontexts along with the existing single type of context. In addition, we\npresent a method to effectively use the diverse contexts for contextualizing\nthe current elements to be encoded/decoded. By addressing the limitation of the\nprevious approach, our proposed framework leads to significant performance\nimprovements. Experimental results on popular datasets show that our proposed\nframework consistently improves rate--distortion performance across various\nbit-rate regions, e.g., 3.73% BD-rate gain over the state-of-the-art baseline\non the Kodak dataset.\n","authors":["Jun-Hyuk Kim","Seungeon Kim","Won-Hee Lee","Dokwan Oh"],"pdf_url":"https://arxiv.org/pdf/2411.05832v1.pdf","comment":"NeurIPS 2024"},{"id":"http://arxiv.org/abs/2411.03595v1","updated":"2024-11-06T01:14:42Z","published":"2024-11-06T01:14:42Z","title":"Investigating Conceptual Blending of a Diffusion Model for Improving\n Nonword-to-Image Generation","summary":" Text-to-image diffusion models sometimes depict blended concepts in the\ngenerated images. One promising use case of this effect would be the\nnonword-to-image generation task which attempts to generate images intuitively\nimaginable from a non-existing word (nonword). To realize nonword-to-image\ngeneration, an existing study focused on associating nonwords with\nsimilar-sounding words. Since each nonword can have multiple similar-sounding\nwords, generating images containing their blended concepts would increase\nintuitiveness, facilitating creative activities and promoting computational\npsycholinguistics. Nevertheless, no existing study has quantitatively evaluated\nthis effect in either diffusion models or the nonword-to-image generation\nparadigm. Therefore, this paper first analyzes the conceptual blending in a\npretrained diffusion model, Stable Diffusion. 
The analysis reveals that a high\npercentage of generated images depict blended concepts when inputting an\nembedding interpolating between the text embeddings of two text prompts\nreferring to different concepts. Next, this paper explores the best text\nembedding space conversion method of an existing nonword-to-image generation\nframework to ensure both the occurrence of conceptual blending and image\ngeneration quality. We compare the conventional direct prediction approach with\nthe proposed method that combines $k$-nearest neighbor search and linear\nregression. Evaluation reveals that the enhanced accuracy of the embedding\nspace conversion by the proposed method improves the image generation quality,\nwhile the emergence of conceptual blending could be attributed mainly to the\nspecific dimensions of the high-dimensional text embedding space.\n","authors":["Chihaya Matsuhira","Marc A. Kastner","Takahiro Komamizu","Takatsugu Hirayama","Ichiro Ide"],"pdf_url":"https://arxiv.org/pdf/2411.03595v1.pdf","comment":"Paper accepted at ACM MM 2024 (doi: 10.1145/3664647.3681202) with\n supplementary materials concatenated"},{"id":"http://arxiv.org/abs/2410.21169v3","updated":"2024-11-06T00:11:08Z","published":"2024-10-28T16:11:35Z","title":"Document Parsing Unveiled: Techniques, Challenges, and Prospects for\n Structured Information Extraction","summary":" Document parsing is essential for converting unstructured and semi-structured\ndocuments-such as contracts, academic papers, and invoices-into structured,\nmachine-readable data. Document parsing extract reliable structured data from\nunstructured inputs, providing huge convenience for numerous applications.\nEspecially with recent achievements in Large Language Models, document parsing\nplays an indispensable role in both knowledge base construction and training\ndata generation. 
This survey presents a comprehensive review of the current\nstate of document parsing, covering key methodologies, from modular pipeline\nsystems to end-to-end models driven by large vision-language models. Core\ncomponents such as layout detection, content extraction (including text,\ntables, and mathematical expressions), and multi-modal data integration are\nexamined in detail. Additionally, this paper discusses the challenges faced by\nmodular document parsing systems and vision-language models in handling complex\nlayouts, integrating multiple modules, and recognizing high-density text. It\nemphasizes the importance of developing larger and more diverse datasets and\noutlines future research directions.\n","authors":["Qintong Zhang","Victor Shea-Jay Huang","Bin Wang","Junyuan Zhang","Zhengren Wang","Hao Liang","Shawn Wang","Matthieu Lin","Conghui He","Wentao Zhang"],"pdf_url":"https://arxiv.org/pdf/2410.21169v3.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + 
--header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + 
--header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + 
font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + 
font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..b0cf53cd --- /dev/null +++ b/index.html @@ -0,0 +1,22917 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 53 + +
+
+
+ + ☆ The Limited Impact of Medical Adaptation of Large Language and + Vision-Language Models EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare ten +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting and supervised fine-tuning regimes for medical question-answering +(QA). For instance, across all tasks and model pairs we consider in the 3-shot +setting, medical LLMs only outperform their base models in 22.7% of cases, +reach a (statistical) tie in 36.8% of cases, and are significantly worse than +their base models in the remaining 40.5% of cases. Our conclusions are based on +(i) comparing each medical model head-to-head, directly against the +corresponding base model; (ii) optimizing the prompts for each model separately +in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty +in comparisons. While these basic practices are not consistently adopted in the +literature, our ablations show that they substantially impact conclusions. +Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs +can show performance improvements, but the benefits do not carry over to tasks +based on clinical notes. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes + additional results on clinical note QA tasks and supervised fine-tuning + evaluations +
+
+
+
+
+ + ☆ CamemBERT 2.0: A Smarter French Language Model Aged to Perfection + + +
+ French language models, such as CamemBERT, have been widely adopted across +industries for natural language processing (NLP) tasks, with models like +CamemBERT seeing over 4 million downloads per month. However, these models face +challenges due to temporal concept drift, where outdated training data leads to +a decline in performance, especially when encountering new topics and +terminology. This issue emphasizes the need for updated models that reflect +current linguistic trends. In this paper, we introduce two new versions of the +CamemBERT base model-CamemBERTav2 and CamemBERTv2-designed to address these +challenges. CamemBERTav2 is based on the DeBERTaV3 architecture and makes use +of the Replaced Token Detection (RTD) objective for better contextual +understanding, while CamemBERTv2 is built on RoBERTa, which uses the Masked +Language Modeling (MLM) objective. Both models are trained on a significantly +larger and more recent dataset with longer context length and an updated +tokenizer that enhances tokenization performance for French. We evaluate the +performance of these models on both general-domain NLP tasks and +domain-specific applications, such as medical field tasks, demonstrating their +versatility and effectiveness across a range of use cases. Our results show +that these updated models vastly outperform their predecessors, making them +valuable tools for modern NLP systems. All our new models, as well as +intermediate checkpoints, are made openly available on Huggingface. + +
+
+
+
+
+ + ☆ Can sparse autoencoders be used to decompose and interpret steering + vectors? + + +
+ Steering vectors are a promising approach to control the behaviour of large +language models. However, their underlying mechanisms remain poorly understood. +While sparse autoencoders (SAEs) may offer a potential method to interpret +steering vectors, recent findings show that SAE-reconstructed vectors often +lack the steering properties of the original vectors. This paper investigates +why directly applying SAEs to steering vectors yields misleading +decompositions, identifying two reasons: (1) steering vectors fall outside the +input distribution for which SAEs are designed, and (2) steering vectors can +have meaningful negative projections in feature directions, which SAEs are not +designed to accommodate. These limitations hinder the direct use of SAEs for +interpreting steering vectors. + +
+
+
+
+
+ + ☆ Zero-shot Cross-lingual Transfer Learning with Multiple Source and + Target Languages for Information Extraction: Language Selection and + Adversarial Training + + +
+ The majority of previous researches addressing multi-lingual IE are limited +to zero-shot cross-lingual single-transfer (one-to-one) setting, with +high-resource languages predominantly as source training data. As a result, +these works provide little understanding and benefit for the realistic goal of +developing a multi-lingual IE system that can generalize to as many languages +as possible. Our study aims to fill this gap by providing a detailed analysis +on Cross-Lingual Multi-Transferability (many-to-many transfer learning), for +the recent IE corpora that cover a diverse set of languages. Specifically, we +first determine the correlation between single-transfer performance and a wide +range of linguistic-based distances. From the obtained insights, a combined +language distance metric can be developed that is not only highly correlated +but also robust across different tasks and model scales. Next, we investigate +the more general zero-shot multi-lingual transfer settings where multiple +languages are involved in the training and evaluation processes. Language +clustering based on the newly defined distance can provide directions for +achieving the optimal cost-performance trade-off in data (languages) selection +problem. Finally, a relational-transfer setting is proposed to further +incorporate multi-lingual unlabeled data based on adversarial training using +the relation induced from the above linguistic distance. + +
+
+
+
+
+ + ☆ Multi-Perspective Stance Detection + + +
+ Subjective NLP tasks usually rely on human annotations provided by multiple +annotators, whose judgments may vary due to their diverse backgrounds and life +experiences. Traditional methods often aggregate multiple annotations into a +single ground truth, disregarding the diversity in perspectives that arises +from annotator disagreement. In this preliminary study, we examine the effect +of including multiple annotations on model accuracy in classification. Our +methodology investigates the performance of perspective-aware classification +models in stance detection task and further inspects if annotator disagreement +affects the model confidence. The results show that multi-perspective approach +yields better classification performance outperforming the baseline which uses +the single label. This entails that designing more inclusive perspective-aware +AI models is not only an essential first step in implementing responsible and +ethical AI, but it can also achieve superior results than using the traditional +approaches. + +
+
+
+
+
+ + ☆ Separating Tongue from Thought: Activation Patching Reveals + Language-Agnostic Concept Representations in Transformers ICML 2024 + + +
+ A central question in multilingual language modeling is whether large +language models (LLMs) develop a universal concept representation, disentangled +from specific languages. In this paper, we address this question by analyzing +latent representations (latents) during a word translation task in +transformer-based LLMs. We strategically extract latents from a source +translation prompt and insert them into the forward pass on a target +translation prompt. By doing so, we find that the output language is encoded in +the latent at an earlier layer than the concept to be translated. Building on +this insight, we conduct two key experiments. First, we demonstrate that we can +change the concept without changing the language and vice versa through +activation patching alone. Second, we show that patching with the mean over +latents across different languages does not impair and instead improves the +models' performance in translating the concept. Our results provide evidence +for the existence of language-agnostic concept representations within the +investigated models. + +
+
+ comment: 12 pages, 10 figures, previously published under the title "How Do + Llamas Process Multilingual Text? A Latent Exploration through Activation + Patching" at the ICML 2024 mechanistic interpretability workshop + https://openreview.net/forum?id=0ku2hIm4BS +
+
+
+
+
+ + ☆ A Comparative Study of Discrete Speech Tokens for Semantic-Related Tasks + with Large Language Models + + +
+ With the rise of Speech Large Language Models (Speech LLMs), there has been +growing interest in discrete speech tokens for their ability to integrate with +text-based tokens seamlessly. Compared to most studies that focus on continuous +speech features, although discrete-token based LLMs have shown promising +results on certain tasks, the performance gap between these two paradigms is +rarely explored. In this paper, we present a fair and thorough comparison +between discrete and continuous features across a variety of semantic-related +tasks using a light-weight LLM (Qwen1.5-0.5B). Our findings reveal that +continuous features generally outperform discrete tokens, particularly in tasks +requiring fine-grained semantic understanding. Moreover, this study goes beyond +surface-level comparison by identifying key factors behind the +under-performance of discrete tokens, such as limited token granularity and +inefficient information retention. To enhance the performance of discrete +tokens, we explore potential aspects based on our analysis. We hope our results +can offer new insights into the opportunities for advancing discrete speech +tokens in Speech LLMs. + +
+
+ comment: 5 tables, 4 figures +
+
+
+
+
+ + ☆ Dynamic Rewarding with Prompt Optimization Enables Tuning-free + Self-Alignment of Language Models EMNLP 2024 + + +
+ Aligning Large Language Models (LLMs) traditionally relies on costly training +and human preference annotations. Self-alignment seeks to reduce these expenses +by enabling models to align themselves. To further lower costs and achieve +alignment without any expensive tuning or annotations, we introduce a new +tuning-free approach for self-alignment, Dynamic Rewarding with Prompt +Optimization (DRPO). Our approach leverages a search-based optimization +framework that allows LLMs to iteratively self-improve and craft the optimal +alignment instructions, all without additional training or human intervention. +The core of DRPO is a dynamic rewarding mechanism, which identifies and +rectifies model-specific alignment weaknesses, allowing LLMs to adapt +efficiently to diverse alignment challenges. Empirical evaluations on eight +recent LLMs, both open- and closed-sourced, demonstrate that DRPO +significantly enhances alignment performance, with base models outperforming +their SFT/RLHF-tuned counterparts. Moreover, the prompts automatically +optimized by DRPO surpass those curated by human experts, further validating +the effectiveness of our approach. Our findings highlight the great potential +of current LLMs to achieve adaptive self-alignment through inference-time +optimization, complementing tuning-based alignment methods. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ☆ Analyst Reports and Stock Performance: Evidence from the Chinese Market + + +
+ This article applies natural language processing (NLP) to extract and +quantify textual information to predict stock performance. Using an extensive +dataset of Chinese analyst reports and employing a customized BERT deep +learning model for Chinese text, this study categorizes the sentiment of the +reports as positive, neutral, or negative. The findings underscore the +predictive capacity of this sentiment indicator for stock volatility, excess +returns, and trading volume. Specifically, analyst reports with strong positive +sentiment will increase excess return and intraday volatility, and vice versa, +reports with strong negative sentiment also increase volatility and trading +volume, but decrease future excess return. The magnitude of this effect is +greater for positive sentiment reports than for negative sentiment reports. +This article contributes to the empirical literature on sentiment analysis and +the response of the stock market to news in the Chinese stock market. + +
+
+
+
+
+ + ☆ Are Triggers Needed for Document-Level Event Extraction? + + +
+ Most existing work on event extraction has focused on sentence-level texts +and presumes the identification of a trigger-span -- a word or phrase in the +input that evokes the occurrence of an event of interest. Event arguments are +then extracted with respect to the trigger. Indeed, triggers are treated as +integral to, and trigger detection as an essential component of, event +extraction. In this paper, we provide the first investigation of the role of +triggers for the more difficult and much less studied task of document-level +event extraction. We analyze their usefulness in multiple end-to-end and +pipelined neural event extraction models for three document-level event +extraction datasets, measuring performance using triggers of varying quality +(human-annotated, LLM-generated, keyword-based, and random). Our research shows +that trigger effectiveness varies based on the extraction task's +characteristics and data quality, with basic, automatically-generated triggers +serving as a viable alternative to human-annotated ones. Furthermore, providing +detailed event descriptions to the extraction model helps maintain robust +performance even when trigger quality degrades. Perhaps surprisingly, we also +find that the mere existence of trigger input, even random ones, is important +for prompt-based LLM approaches to the task. + +
+
+
+
+
+ + ☆ Theoretical Analysis of Byte-Pair Encoding + + +
+ Byte-Pair Encoding (BPE) is a widely used method for subword tokenization, +with origins in grammar-based text compression. It is employed in a variety of +language processing tasks such as machine translation or large language model +(LLM) pretraining, to create a token dictionary of a prescribed size. Most +evaluations of BPE to date are empirical, and the reasons for its good +practical performance are not well understood. + In this paper we focus on the optimization problem underlying BPE: finding a +pair encoding that achieves optimal compression utility. We show that this +problem is APX-complete, indicating that it is unlikely to admit a +polynomial-time approximation scheme. This answers, in a stronger form, a +question recently raised by Zouhar et al. + On the positive side, we show that BPE approximates the compression utility +of the optimal pair encoding to a worst-case factor between $0.333$ and +$0.625$. Our results aim to explain the ongoing success of BPE and are, to our +knowledge, the first rigorous guarantees on its compression utility that hold +for all inputs. + +
+
+
+
+
+ + ☆ Dynamic Subset Tuning: Expanding the Operational Range of + Parameter-Efficient Training for Large Language Models NeurIPS 2024 + + +
+ We propose a novel parameter-efficient training (PET) method for large +language models that adapts models to downstream tasks by optimizing a small +subset of the existing model parameters. Unlike prior methods, this subset is +not fixed in location but rather which parameters are modified evolves over the +course of training. This dynamic parameter selection can yield good performance +with many fewer parameters than extant methods. Our method enables a seamless +scaling of the subset size across an arbitrary proportion of the total model +size, while popular PET approaches like prompt tuning and LoRA cover only a +small part of this spectrum. We match or outperform prompt tuning and LoRA in +most cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given +parameter budget across different model families and sizes. + +
+
+ comment: NeurIPS 2024 Workshop on Adaptive Foundation Models +
+
+
+
+
+ + ☆ XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL + + +
+ To tackle the challenges of large language model performance in natural +language to SQL tasks, we introduce XiYan-SQL, an innovative framework that +employs a multi-generator ensemble strategy to improve candidate generation. We +introduce M-Schema, a semi-structured schema representation method designed to +enhance the understanding of database structures. To enhance the quality and +diversity of generated candidate SQL queries, XiYan-SQL integrates the +significant potential of in-context learning (ICL) with the precise control of +supervised fine-tuning. On one hand, we propose a series of training strategies +to fine-tune models to generate high-quality candidates with diverse +preferences. On the other hand, we implement the ICL approach with an example +selection method based on named entity recognition to prevent overemphasis on +entities. The refiner optimizes each candidate by correcting logical or +syntactical errors. To address the challenge of identifying the best candidate, +we fine-tune a selection model to distinguish nuances of candidate SQL queries. +The experimental results on multiple dialect datasets demonstrate the +robustness of XiYan-SQL in addressing challenges across different scenarios. +Overall, our proposed XiYan-SQL achieves the state-of-the-art execution +accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on +NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. +The proposed framework not only enhances the quality and diversity of SQL +queries but also outperforms previous methods. + +
+
+
+
+
+ + ☆ CorrSynth -- A Correlated Sampling Method for Diverse Dataset Generation + from LLMs EMNLP 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable performance in +diverse tasks using zero-shot and few-shot prompting. Even though their +capabilities of data synthesis have been studied well in recent years, the +generated data suffers from a lack of diversity, less adherence to the prompt, +and potential biases that creep into the data from the generator model. In this +work, we tackle the challenge of generating datasets with high diversity, upon +which a student model is trained for downstream tasks. Taking the route of +decoding-time guidance-based approaches, we propose CorrSynth, which generates +data that is more diverse and faithful to the input prompt using a correlated +sampling strategy. Further, our method overcomes the complexity drawbacks of +some other guidance-based techniques like classifier-based guidance. With +extensive experiments, we show the effectiveness of our approach and +substantiate our claims. In particular, we perform intrinsic evaluation to show +the improvements in diversity. Our experiments show that CorrSynth improves +both student metrics and intrinsic metrics upon competitive baselines across +four datasets, showing the innate advantage of our method. + +
+
+ comment: Published as a main conference paper at EMNLP 2024; First two authors + contributed equally +
+
+
+
+
+ + ☆ Neural Topic Modeling with Large Language Models in the Loop + + +
+ Topic modeling is a fundamental task in natural language processing, allowing +the discovery of latent thematic structures in text corpora. While Large +Language Models (LLMs) have demonstrated promising capabilities in topic +discovery, their direct application to topic modeling suffers from issues such +as incomplete topic coverage, misalignment of topics, and inefficiency. To +address these limitations, we propose LLM-ITL, a novel LLM-in-the-loop +framework that integrates LLMs with many existing Neural Topic Models (NTMs). +In LLM-ITL, global topics and document representations are learned through the +NTM, while an LLM refines the topics via a confidence-weighted Optimal +Transport (OT)-based alignment objective. This process enhances the +interpretability and coherence of the learned topics, while maintaining the +efficiency of NTMs. Extensive experiments demonstrate that LLM-ITL can help +NTMs significantly improve their topic interpretability while maintaining the +quality of document representation. + +
+
+
+
+
+ + ☆ Tree-of-Table: Unleashing the Power of LLMs for Enhanced Large-Scale + Table Understanding + + +
+ The ubiquity and value of tables as semi-structured data across various +domains necessitate advanced methods for understanding their complexity and +vast amounts of information. Despite the impressive capabilities of large +language models (LLMs) in advancing the natural language understanding +frontier, their application to large-scale tabular data presents significant +challenges, specifically regarding table size and complex intricate +relationships. Existing works have shown promise with small-scale tables but +often flounder when tasked with the complex reasoning required by larger, +interconnected tables found in real-world scenarios. To address this gap, we +introduce "Tree-of-Table", a novel approach designed to enhance LLMs' reasoning +capabilities over large and complex tables. Our method employs Table +Condensation and Decomposition to distill and reorganize relevant data into a +manageable format, followed by the construction of a hierarchical Table-Tree +that facilitates tree-structured reasoning. Through a meticulous Table-Tree +Execution process, we systematically unravel the tree-structured reasoning +chain to derive the solutions. Experiments across diverse datasets, including +WikiTQ, TableFact, FeTaQA, and BIRD, demonstrate that Tree-of-Table sets a new +benchmark with superior performance, showcasing remarkable efficiency and +generalization capabilities in large-scale table reasoning. + +
+
+
+
+
+ + ☆ An Information Theoretic Approach to Operationalize Right to Data + Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ☆ Towards Objective and Unbiased Decision Assessments with LLM-Enhanced + Hierarchical Attention Networks + + +
+ How objective and unbiased are we while making decisions? This work +investigates cognitive bias identification in the high-stakes decision-making +process of human experts, questioning its effectiveness in real-world settings, +such as candidate assessments for university admission. We begin with a +statistical analysis assessing correlations among different decision points +in the current process, which reveals discrepancies that imply +cognitive bias and inconsistency in decisions. This motivates our exploration +of a bias-aware AI-augmented workflow that surpasses human judgment. We propose +BGM-HAN, a hierarchical attention network enhanced by byte-pair encoding, +multi-head attention and gated residual connections. Using it as the backbone model, +we further propose a Shortlist-Analyse-Recommend (SAR) agentic workflow, which +simulates real-world decision-making. In our experiments, both the proposed +model and the agentic workflow significantly improve on both human judgment +and alternative models, validated with real-world data. + +
+
+
+
+
+ + ☆ Towards Evaluating Large Language Models for Graph Query Generation SC + + +
+ Large Language Models (LLMs) are revolutionizing the landscape of Generative +Artificial Intelligence (GenAI), with innovative LLM-backed solutions emerging +rapidly. However, when applied to database technologies, specifically query +generation for graph databases and Knowledge Graphs (KGs), LLMs still face +significant challenges. While research on LLM-driven query generation for +Structured Query Language (SQL) exists, similar systems for graph databases +remain underdeveloped. This paper presents a comparative study addressing the +challenge of generating queries in Cypher, a powerful language for interacting with +graph databases, using open-access LLMs. We rigorously evaluate several LLM +agents (OpenAI ChatGPT 4o, Claude Sonnet 3.5, Google Gemini Pro 1.5, and a +locally deployed Llama 3.1 8B) using a designed few-shot learning prompt and +Retrieval Augmented Generation (RAG) backed by Chain-of-Thoughts (CoT) +reasoning. Our empirical analysis of query generation accuracy reveals that +Claude Sonnet 3.5 outperforms its counterparts in this specific domain. +Further, we highlight promising future research directions to address the +identified limitations and advance LLM-driven query generation for graph +databases. + +
+
+ comment: Paper accepted and will be presented at CSCI2024 in December 2024, + Later will be published at Springer LNCS +
+
+
+
+
+ + ☆ One STEP at a time: Language Agents are Stepwise Planners + + +
+ Language agents have shown promising adaptability in dynamic environments to +perform complex tasks. However, despite the versatile knowledge embedded in +large language models, these agents still fall short when it comes to tasks +that require planning. We introduce STEP, a novel framework designed to +efficiently learn from previous experiences to enhance the planning +capabilities of language agents in future steps. Concretely, STEP functions +through four interconnected components. First, the Planner takes on the task, +breaks it down into subtasks and provides relevant insights. Then the Executor +generates action candidates, while the Evaluator ensures the actions align with +learned rules from previous experiences. Lastly, Memory stores experiences to +inform future decisions. In the ScienceWorld benchmark, our results show that +STEP consistently outperforms state-of-the-art models, achieving an overall +score of 67.4 and successfully completing 12 out of 18 tasks. These findings +highlight STEP's potential as a framework for enhancing planning capabilities +in language agents, paving the way for more sophisticated task-solving in +dynamic environments. + +
+
+
+
+
+ + ☆ CLaSP: Learning Concepts for Time-Series Signals from Natural Language + Supervision + + +
+ This paper proposes a foundation model called "CLaSP" that can search time +series signals using natural language that describes the characteristics of the +signals as queries. Previous efforts to represent time series signal data in +natural language have had challenges in designing a conventional class of time +series signal characteristics, formulating their quantification, and creating a +dictionary of synonyms. To overcome these limitations, the proposed method +introduces a neural network based on contrastive learning. This network is +first trained using the datasets TRUCE and SUSHI, which consist of time series +signals and their corresponding natural language descriptions. Previous studies +have proposed vocabularies that data analysts use to describe signal +characteristics, and SUSHI was designed to cover these terms. We believe that a +neural network trained on these datasets will enable data analysts to search +using natural language vocabulary. Furthermore, our method does not require a +dictionary of predefined synonyms, and it leverages common sense knowledge +embedded in a large-scale language model (LLM). Experimental results +demonstrate that CLaSP enables natural language search of time series signal +data and can accurately learn the points at which signal data changes. + +
+
+
+
+
+ + ☆ Interpretable Syntactic Representations Enable Hierarchical Word Vectors + + +
+ The distributed representations currently used are dense and uninterpretable, +leading to interpretations that themselves are relative, overcomplete, and hard +to interpret. We propose a method that transforms these word vectors into +reduced syntactic representations. The resulting representations are compact +and interpretable allowing better visualization and comparison of the word +vectors and we successively demonstrate that the drawn interpretations are in +line with human judgment. The syntactic representations are then used to create +hierarchical word vectors using an incremental learning approach similar to the +hierarchical aspect of human learning. As these representations are drawn from +pre-trained vectors, the generation process and learning approach are +computationally efficient. Most importantly, we find out that syntactic +representations provide a plausible interpretation of the vectors and +subsequent hierarchical vectors outperform the original vectors in benchmark +tests. + +
+
+
+
+
+ + ☆ Refining Translations with LLMs: A Constraint-Aware Iterative Prompting + Approach + + +
+ Large language models (LLMs) have demonstrated remarkable proficiency in +machine translation (MT), even without specific training on the languages in +question. However, translating rare words in low-resource or domain-specific +contexts remains challenging for LLMs. To address this issue, we propose a +multi-step prompt chain that enhances translation faithfulness by prioritizing +key terms crucial for semantic accuracy. Our method first identifies these +keywords and retrieves their translations from a bilingual dictionary, +integrating them into the LLM's context using Retrieval-Augmented Generation +(RAG). We further mitigate potential output hallucinations caused by long +prompts through an iterative self-checking mechanism, where the LLM refines its +translations based on lexical and semantic constraints. Experiments using Llama +and Qwen as base models on the FLORES-200 and WMT datasets demonstrate +significant improvements over baselines, highlighting the effectiveness of our +approach in enhancing translation faithfulness and robustness, particularly in +low-resource scenarios. + +
+
+
+
+
+ + ☆ A Chinese Multi-label Affective Computing Dataset Based on Social Media + Network Users + + +
+ Emotion and personality are central elements in understanding human +psychological states. Emotions reflect an individual's subjective experiences, +while personality reveals relatively stable behavioral and cognitive patterns. +Existing affective computing datasets often annotate emotion and personality +traits separately, lacking fine-grained labeling of micro-emotions and emotion +intensity in both single-label and multi-label classifications. Chinese emotion +datasets are extremely scarce, and datasets capturing Chinese user personality +traits are even more limited. To address these gaps, this study collected data +from the major social media platform Weibo, screening 11,338 valid users from +over 50,000 individuals with diverse MBTI personality labels and acquiring +566,900 posts along with the users' MBTI personality tags. Using the EQN method, +we compiled a multi-label Chinese affective computing dataset that integrates +the same user's personality traits with six emotions and micro-emotions, each +annotated with intensity levels. Validation results across multiple NLP +classification models demonstrate the dataset's strong utility. This dataset is +designed to advance machine recognition of complex human emotions and provide +data support for research in psychology, education, marketing, finance, and +politics. + +
+
+
+
+
+ + ☆ Bangla Grammatical Error Detection Leveraging Transformer-based Token + Classification + + +
+ Bangla is the seventh most spoken language by a total number of speakers in +the world, and yet the development of an automated grammar checker in this +language is an understudied problem. Bangla grammatical error detection is a +task of detecting sub-strings of a Bangla text that contain grammatical, +punctuation, or spelling errors, which is crucial for developing an automated +Bangla typing assistant. Our approach involves breaking down the task as a +token classification problem and utilizing state-of-the-art transformer-based +models. Finally, we combine the output of these models and apply rule-based +post-processing to generate a more reliable and comprehensive result. Our +system is evaluated on a dataset consisting of over 25,000 texts from various +sources. Our best model achieves a Levenshtein distance score of 1.04. Finally, +we provide a detailed analysis of different components of our system. + +
+
+
+
+
+ + ☆ Are LLMs Prescient? A Continuous Evaluation using Daily News as the + Oracle + + +
+ Many existing evaluation benchmarks for Large Language Models (LLMs) quickly +become outdated due to the emergence of new models and training data. These +benchmarks also fall short in assessing how LLM performance changes over time, +as they consist of static questions without a temporal dimension. To address +these limitations, we propose using future event prediction as a continuous +evaluation method to assess LLMs' temporal generalization and forecasting +abilities. Our benchmark, Daily Oracle, automatically generates question-answer +(QA) pairs from daily news, challenging LLMs to predict "future" event +outcomes. Our findings reveal that as pre-training data becomes outdated, LLM +performance degrades over time. While Retrieval Augmented Generation (RAG) has +the potential to enhance prediction accuracy, the performance degradation +pattern persists, highlighting the need for continuous model updates. + +
+
+
+
+
+ + ☆ R3HF: Reward Redistribution for Enhancing Reinforcement Learning from + Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) provides a paradigm for +aligning large language models (LLMs) with human preferences. This involves the +initial training of a reward model based on pairwise human feedback. The reward +model is subsequently utilized in reinforcement learning to assess the scores +of each generated sentence as a whole, further guiding the optimization of +LLMs. However, current approaches have a significant shortcoming: they +allocate a single, sparse, and delayed reward to an entire sequence of output. +This may overlook some significant individual contributions of each token +towards the desired outcome. To overcome this limitation, our paper proposes a +novel reward redistribution method called R3HF, which facilitates a more +fine-grained, token-level reward allocation. Specifically, our method treats +the reward prediction task of the reward model as a regression problem. As a +result, the redistributed rewards are computed by evaluating the specific +contribution of each token to the reward model's output. This detailed approach +improves the model's understanding of language nuances, leading to more precise +enhancements in its performance. Our method is crafted to integrate seamlessly +with most current techniques while incurring minimal computational costs. +Through comprehensive experiments across diverse datasets and tasks, we have +verified the effectiveness and superiority of our approach. + +
+
+
+
+
+ + ☆ Knowledge Bases in Support of Large Language Models for Processing Web + News + + +
+ Large Language Models (LLMs) have received considerable interest in wide +applications lately. During pre-training via massive datasets, such a model +implicitly memorizes the factual knowledge of trained datasets in its hidden +parameters. However, knowledge held implicitly in parameters often makes its +use by downstream applications ineffective due to the lack of common-sense +reasoning. In this article, we introduce a general framework that permits to +build knowledge bases with an aid of LLMs, tailored for processing Web news. +The framework applies a rule-based News Information Extractor (NewsIE) to news +items for extracting their relational tuples, referred to as knowledge bases, +which are then graph-convoluted with the implicit knowledge facts of news items +obtained by LLMs, for their classification. It involves two lightweight +components: 1) NewsIE: for extracting the structural information of every news +item, in the form of relational tuples; 2) BERTGraph: for graph convoluting the +implicit knowledge facts with relational tuples extracted by NewsIE. We have +evaluated our framework under different news-related datasets for news category +classification, with promising experimental results. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ A Large-Scale Study of Relevance Assessments with Large Language Models: + An Initial Look + + +
+ The application of large language models to provide relevance assessments +presents exciting opportunities to advance information retrieval, natural +language processing, and beyond, but to date many unknowns remain. This paper +reports on the results of a large-scale evaluation (the TREC 2024 RAG Track) +where four different relevance assessment approaches were deployed in situ: the +"standard" fully manual process that NIST has implemented for decades and three +different alternatives that take advantage of LLMs to different extents using +the open-source UMBRELA tool. This setup allows us to correlate system rankings +induced by the different approaches to characterize tradeoffs between cost and +quality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system +rankings induced by automatically generated relevance assessments from UMBRELA +correlate highly with those induced by fully manual assessments across a +diverse set of 77 runs from 19 teams. Our results suggest that automatically +generated UMBRELA judgments can replace fully manual judgments to accurately +capture run-level effectiveness. Surprisingly, we find that LLM assistance does +not appear to increase correlation with fully manual assessments, suggesting +that costs associated with human-in-the-loop processes do not bring obvious +tangible benefits. Overall, human assessors appear to be stricter than UMBRELA +in applying relevance criteria. Our work validates the use of LLMs in academic +TREC-style evaluations and provides the foundation for future studies. + +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ MILU: A Multi-task Indic Language Understanding Benchmark + + +
+ Evaluating Large Language Models (LLMs) in low-resource and linguistically +diverse languages remains a significant challenge in NLP, particularly for +languages using non-Latin scripts like those spoken in India. Existing +benchmarks predominantly focus on English, leaving substantial gaps in +assessing LLM capabilities in these languages. We introduce MILU, a Multi-task +Indic Language Understanding Benchmark, a comprehensive evaluation benchmark +designed to address this gap. MILU spans 8 domains and 42 subjects across 11 +Indic languages, reflecting both general and culturally specific knowledge. +With an India-centric design, MILU incorporates material from regional and +state-level examinations, covering topics such as local history, arts, +festivals, and laws, alongside standard subjects like science and mathematics. +We evaluate over 45 LLMs, and find that current LLMs struggle with MILU, with +GPT-4o achieving the highest average accuracy at 72 percent. Open multilingual +models outperform language-specific fine-tuned models, which perform only +slightly better than random baselines. Models also perform better in high +resource languages as compared to low resource ones. Domain-wise analysis +indicates that models perform poorly in culturally relevant areas like Arts and +Humanities, Law and Governance compared to general fields like STEM. To the +best of our knowledge, MILU is the first-of-its-kind benchmark focused on Indic +languages, serving as a crucial step towards comprehensive cultural evaluation. +All code, benchmarks, and artifacts are publicly available to foster open +research. + +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from over-reliance on unimodal biases (e.g., language bias +and vision bias), leading to incorrect answers or hallucinations in complex +multimodal tasks. To investigate this issue, we propose a causal framework to +interpret the biases in Visual Question Answering (VQA) problems. Within this +framework, we conduct an in-depth causal analysis to assess the causal effect +of these biases on MLLM predictions. Based on the analysis, we introduce 1) a +novel MORE dataset with 12,000 challenging VQA instances requiring multi-hop +reasoning and overcoming unimodal biases; and 2) a causality-enhanced agent +framework CAVE that guides models to comprehensively integrate information from +different modalities and mitigate biases. Our experiments show that MLLMs +perform poorly on MORE, indicating strong unimodal biases and limited semantic +understanding. However, when integrated with our CAVE, promising improvements +in reasoning and bias mitigation can be seen. These findings provide important +insights for the development of more robust MLLMs and contribute to the broader +goal of advancing multimodal AI systems capable of deeper understanding and +reasoning. Our project page is at https://github.com/OpenCausaLab/MORE. + +
+
+
+
+
+ + ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models NeurIPS 2024 + + +
+ In the face of uncertainty, the ability to *seek information* is of +fundamental importance. In many practical applications, such as medical +diagnosis and troubleshooting, the information needed to solve the task is not +initially given and has to be actively sought by asking follow-up questions +(for example, a doctor asking a patient for more details about their symptoms). +In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to +augment large language models with the ability to actively seek information by +asking effective questions. UoT combines 1) an *uncertainty-aware simulation +approach* which enables the model to simulate possible future scenarios and how +likely they are to occur, 2) *uncertainty-based rewards* motivated by +information gain which incentivizes the model to seek information, and 3) a +*reward propagation scheme* to select the optimal question to ask in a way that +maximizes the expected reward. In experiments on medical diagnosis, +troubleshooting, and the `20 Questions` game, UoT achieves an average +performance improvement of 38.1% in the rate of successful task completion +across multiple LLMs compared with direct prompting and also improves +efficiency (i.e., the number of questions needed to complete the task). Our +code has been released [here](https://github.com/zhiyuanhubj/UoT) + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Chinese SimpleQA: A Chinese Factuality Evaluation for Large Language + Models + + +
+ New LLM evaluation benchmarks are important to align with the rapid +development of Large Language Models (LLMs). In this work, we present Chinese +SimpleQA, the first comprehensive Chinese benchmark to evaluate the factuality +ability of language models to answer short questions, and Chinese SimpleQA +mainly has five properties (i.e., Chinese, Diverse, High-quality, Static, +Easy-to-evaluate). Specifically, first, we focus on the Chinese language over 6 +major topics with 99 diverse subtopics. Second, we conduct a comprehensive +quality control process to achieve high-quality questions and answers, where +the reference answers are static and cannot be changed over time. Third, +following SimpleQA, the questions and answers are very short, and the grading +process is easy-to-evaluate based on OpenAI API. Based on Chinese SimpleQA, we +perform a comprehensive evaluation on the factuality abilities of existing +LLMs. Finally, we hope that Chinese SimpleQA could guide the developers to +better understand the Chinese factuality abilities of their models and +facilitate the growth of foundation models. + +
+
+
+
+
+ + ♻ ☆ Toxicity Detection is NOT all you Need: Measuring the Gaps to Supporting + Volunteer Content Moderators + + +
+ Extensive efforts in automated approaches for content moderation have been +focused on developing models to identify toxic, offensive, and hateful content +with the aim of lightening the load for moderators. Yet, it remains uncertain +whether improvements on those tasks have truly addressed moderators' needs in +accomplishing their work. In this paper, we surface gaps between past research +efforts that have aimed to provide automation for aspects of content moderation +and the needs of volunteer content moderators, regarding identifying violations +of various moderation rules. To do so, we conduct a model review on Hugging +Face to reveal the availability of models to cover various moderation rules and +guidelines from three exemplar forums. We further put state-of-the-art LLMs to +the test, evaluating how well these models perform in flagging violations of +platform rules from one particular forum. Finally, we conduct a user survey +study with volunteer moderators to gain insight into their perspectives on +useful moderation models. Overall, we observe a non-trivial gap, as missing +developed models and LLMs exhibit moderate to low performance on a significant +portion of the rules. Moderators' reports provide guides for future work on +developing moderation assistant models. + +
+
+
+
+
+ + ♻ ☆ Deep Learning for Economists + + +
+ Deep learning provides powerful methods to impute structured information from +large-scale, unstructured text and image datasets. For example, economists +might wish to detect the presence of economic activity in satellite images, or +to measure the topics or entities mentioned in social media, the congressional +record, or firm filings. This review introduces deep neural networks, covering +methods such as classifiers, regression models, generative AI, and embedding +models. Applications include classification, document digitization, record +linkage, and methods for data exploration in massive scale text and image +corpora. When suitable methods are used, deep learning models can be cheap to +tune and can scale affordably to problems involving millions or billions of +data points. The review is accompanied by a companion website, EconDL, with +user-friendly demo notebooks, software resources, and a knowledge base that +provides technical details and additional applications. + +
+
+
+
+
+ + ♻ ☆ No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design + Choices + + +
+ Advances in generative models have made it possible for AI-generated text, +code, and images to mirror human-generated content in many applications. +Watermarking, a technique that aims to embed information in the output of a +model to verify its source, is useful for mitigating the misuse of such +AI-generated content. However, we show that common design choices in LLM +watermarking schemes make the resulting systems surprisingly susceptible to +attack -- leading to fundamental trade-offs in robustness, utility, and +usability. To navigate these trade-offs, we rigorously study a set of simple +yet effective attacks on common watermarking systems, and propose guidelines +and defenses for LLM watermarking in practice. + +
+
+
+
+
+ + ♻ ☆ General LLMs as Instructors for Domain-Specific LLMs: A Sequential + Fusion Method to Integrate Extraction and Editing + + +
+ The substantial interest in updating Large Language Models (LLMs) without +retraining from scratch is accompanied by several challenges. This is +particularly true when updating LLMs with datasets that necessitate +domain-expert reasoning across extensive texts, despite limited samples. We +termed the scenario as the Few-Shot Domain-Expert Reasoning for Updating LLMs +(FDoR-UL). Traditional methods such as Low-Rank Adaptation (LoRA) and Retrieval +Augmented Generation (RAG) are inadequate for addressing this critical issue, +particularly evident in our exploration of a specific medical dataset that +epitomizes the distinct needs of FDoR-UL. To tackle this challenge, we +introduce a Sequential Fusion method to integrate knowledge from complex +contexts into LLMs. This method employs a two-stage framework: initially +leveraging general LLMs to perform relation extraction for knowledge +acquisition from complex texts, followed by updating domain-specific LLMs +through Knowledge Editing (KE). Employing our method, domain-specific LLMs +achieved a 71.7% accuracy (an average gain of 39.1%) in question-answering +tasks. Furthermore, we expanded our evaluation to a novel economics-management +dataset we developed, where our method achieved a 75.0% accuracy (an average +gain of 45.0%). These findings underscore the effectiveness and flexibility of +our approach in FDoR-UL across various domains. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Is Moral Self-correction An Innate Capability of Large Language Models? + A Mechanistic Analysis to Self-correction + + +
+ Despite intensive attention to the self-correction capability of Large +Language Models (LLMs), the underlying mechanism of this capability is still +under-explored. In this paper, we aim to answer two fundamental questions for +moral self-correction: (1) how different components in self-correction, such as +Chain-of-Thought (CoT) reasoning, external feedback, and instructional prompts, +interact to enable moral self-correction; and (2) is the self-correction one of +LLMs' innate capabilities? To answer the first question, we examine how +different self-correction components interact to intervene on the embedded +morality within hidden states, therefore contributing to different performance. +For the second question, we (i) evaluate the robustness of moral +self-correction by introducing natural language interventions of weak evidence +into prompts; (ii) propose a validation framework, self-distinguish, that +requires effective self-correction to enable LLMs to distinguish between +desirable and undesirable outputs. Our experimental results indicate that there +is no universally optimal self-correction method for the tasks considered, +although external feedback and CoT can contribute to additional performance +gains. However, our mechanistic analysis reveals negative interactions among +instructional prompts, CoT, and external feedback, suggesting a conflict +between internal knowledge and external feedback. The self-distinguish +experiments demonstrate that while LLMs can self-correct their responses, they +are unable to reliably distinguish between desired and undesired outputs. With +our empirical evidence, we can conclude that moral self-correction is not an +innate capability of LLMs acquired during pretraining. + +
+
+
+
+
+ + ♻ ☆ Enhancing Post-Hoc Attributions in Long Document Comprehension via + Coarse Grained Answer Decomposition + + +
+ Accurately attributing answer text to its source document is crucial for +developing a reliable question-answering system. However, attribution for long +documents remains largely unexplored. Post-hoc attribution systems are designed +to map answer text back to the source document, yet the granularity of this +mapping has not been addressed. Furthermore, a critical question arises: What +exactly should be attributed? This involves identifying the specific +information units within an answer that require grounding. In this paper, we +propose and investigate a novel approach to the factual decomposition of +generated answers for attribution, employing template-based in-context +learning. To accomplish this, we utilize the question and integrate negative +sampling during few-shot in-context learning for decomposition. This approach +enhances the semantic understanding of both abstractive and extractive answers. +We examine the impact of answer decomposition by providing a thorough +examination of various attribution approaches, ranging from retrieval-based +techniques to LLM-based attributors. + +
+
+
+
+
+ + ♻ ☆ Are Large Language Models Table-based Fact-Checkers? SC + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation +between statements and structured tables. Existing TFV methods based on +small-scaled models suffer from insufficient labeled data and weak zero-shot +ability. Recently, the appearance of Large Language Models (LLMs) has attracted +much attention in research fields. They have shown powerful zero-shot and +in-context learning abilities on several NLP tasks, but their potential on TFV +is still unknown. In this work, we conduct a preliminary study about whether +LLMs are table-based fact-checkers. In detail, we design diverse prompts to +explore how the in-context learning can help LLMs in TFV, i.e., zero-shot and +few-shot TFV capability. Besides, we carefully design and construct TFV +instructions to study the performance gain brought by the instruction tuning of +LLMs. Experimental results demonstrate that LLMs can achieve acceptable results +on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning +can stimulate the TFV capability significantly. We also make some valuable +findings about the format of zero-shot prompts and the number of in-context +examples. Finally, we analyze some possible directions to promote the accuracy +of TFV via LLMs, which is beneficial to further research of table reasoning. + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ Target-driven Attack for Large Language Models + + +
+ Current large language models (LLM) provide a strong foundation for +large-scale user-oriented natural language tasks. Many users can easily inject +adversarial text or instructions through the user interface, thus causing LLM +model security challenges like the language model not giving the correct +answer. Although there is currently a large amount of research on black-box +attacks, most of these black-box attacks use random and heuristic strategies. +It is unclear how these strategies relate to the success rate of attacks and +thus effectively improve model robustness. To solve this problem, we propose +our target-driven black-box attack method to maximize the KL divergence between +the conditional probabilities of the clean text and the attack text to redefine +the attack's goal. We transform the distance maximization problem into two +convex optimization problems based on the attack goal to solve the attack text +and estimate the covariance. Furthermore, the projected gradient descent +algorithm solves the vector corresponding to the attack text. Our target-driven +black-box attack approach includes two attack strategies: token manipulation +and misinformation attack. Experimental results on multiple Large Language +Models and datasets demonstrate the effectiveness of our attack method. + +
+
+ comment: 12 pages, 7 figures. This work is an extension of the + arXiv:2404.07234 work. We propose new methods. 27th European Conference on + Artificial Intelligence 2024 +
+
+
+
+
+ + ♻ ☆ SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation EMNLP 2024 + + +
+ It is often desirable to distill the capabilities of large language models +(LLMs) into smaller student models due to compute and memory constraints. One +way to do this for classification tasks is via dataset synthesis, which can be +accomplished by generating examples of each label from the LLM. Prior +approaches to synthesis use few-shot prompting, which relies on the LLM's +parametric knowledge to generate usable examples. However, this leads to issues +of repetition, bias towards popular entities, and stylistic differences from +human text. In this work, we propose Synthesize by Retrieval and Refinement +(SynthesizRR), which uses retrieval augmentation to introduce variety into the +dataset synthesis process: as retrieved passages vary, the LLM is seeded with +different content to generate its examples. We empirically study the synthesis +of six datasets, covering topic classification, sentiment analysis, tone +detection, and humor, requiring complex synthesis strategies. We find that +SynthesizRR greatly improves lexical and semantic diversity, similarity to +human-written text, and distillation performance, when compared to 32-shot +prompting and four prior approaches. We release our code to perform all steps +at https://github.com/amazon-science/synthesizrr + +
+
+ comment: Published as a main conference paper at EMNLP 2024. Code available at + https://github.com/amazon-science/synthesizrr +
+
+
+
+
+ + ♻ ☆ Vikhr: Constructing a State-of-the-art Bilingual Open-Source + Instruction-Following Large Language Model for Russian EMNLP-2024 + + +
+ There has been a surge in developing various Large Language Models (LLMs). +However, text generation for languages other than English often faces +significant challenges, including poor generation quality and reduced +computational performance due to the disproportionate representation of tokens +in the model's vocabulary. In this work, we address these issues by developing +a pipeline for adapting English-oriented pre-trained models to other languages +and constructing efficient bilingual LLMs. Using this pipeline, we construct +Vikhr, a state-of-the-art bilingual open-source instruction-following LLM +designed specifically for the Russian language. "Vikhr" refers to the name of +the Mistral LLM series and means a "strong gust of wind." Unlike previous +Russian-language models that typically rely on LoRA adapters on top of +English-oriented models, sacrificing performance for lower training costs, +Vikhr features an adapted tokenizer vocabulary and undergoes continued +pre-training and instruction tuning of all weights. This not only enhances the +model's performance but also significantly improves its computational and +contextual efficiency. The remarkable performance of Vikhr across various +Russian-language benchmarks can also be attributed to our efforts in expanding +instruction datasets and corpora for continued pre-training. Vikhr not only +sets a new state of the art among open-source LLMs for Russian but even +outperforms some proprietary closed-source models on certain benchmarks. The +model weights, instruction sets, and code are publicly available. + +
+
+ comment: Accepted at WMRL @ EMNLP-2024 +
+
+
+
+
+ + ♻ ☆ Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models + with Simple and Effective Scaffold Token Removal + + +
+ Byte Pair Encoding (BPE) serves as a foundation method for text tokenization +in the Natural Language Processing (NLP) field. Despite its wide adoption, the +original BPE algorithm harbors an inherent flaw: it inadvertently introduces a +frequency imbalance for tokens in the text corpus. Since BPE iteratively merges +the most frequent token pair in the text corpus to generate a new token and +keeps all generated tokens in the vocabulary, it unavoidably holds tokens that +primarily act as components of a longer token and appear infrequently on their +own. We term such tokens as Scaffold Tokens. Due to their infrequent +occurrences in the text corpus, Scaffold Tokens pose a learning imbalance +issue. To address that issue, we propose Scaffold-BPE, which incorporates a +dynamic scaffold token removal mechanism by parameter-free, computation-light, +and easy-to-implement modifications to the original BPE method. This novel +approach ensures the exclusion of low-frequency Scaffold Tokens from the token +representations for given texts, thereby mitigating the issue of frequency +imbalance and facilitating model training. On extensive experiments across +language modeling and even machine translation, Scaffold-BPE consistently +outperforms the original BPE, well demonstrating its effectiveness. + +
+
+
+
+
+ + ♻ ☆ Spin glass model of in-context learning + + +
+ Large language models show a surprising in-context learning ability -- being +able to use a prompt to form a prediction for a query, yet without additional +training, in stark contrast to old-fashioned supervised learning. Providing a +mechanistic interpretation and linking the empirical phenomenon to physics are +thus challenging and remain unsolved. We study a simple yet expressive +transformer with linear attention and map this structure to a spin glass model +with real-valued spins, where the couplings and fields explain the intrinsic +disorder in data. The spin glass model explains how the weight parameters +interact with each other during pre-training, and further clarifies why an +unseen function can be predicted by providing only a prompt yet without further +training. Our theory reveals that for single-instance learning, increasing the +task diversity leads to the emergence of in-context learning, by allowing the +Boltzmann distribution to converge to a unique correct solution of weight +parameters. Therefore the pre-trained transformer displays a prediction power +in a novel prompt setting. The proposed analytically tractable model thus +offers a promising avenue for thinking about how to interpret many intriguing +but puzzling properties of large language models. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Query Optimization for Parametric Knowledge Refinement in + Retrieval-Augmented Large Language Models + + +
+ We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel +approach designed to bridge the pre-retrieval information gap in +Retrieval-Augmented Generation (RAG) systems through query optimization +tailored to meet the specific knowledge requirements of Large Language Models +(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR +framework begins by extracting parametric knowledge from LLMs, followed by +using a specialized query optimizer for refining these queries. This process +ensures the retrieval of only the most pertinent information essential for +generating accurate responses. Moreover, to enhance flexibility and reduce +computational costs, we propose a trainable scheme for our pipeline that +utilizes a smaller, tunable model as the query optimizer, which is refined +through knowledge distillation from a larger teacher model. Our evaluations on +various question-answering (QA) datasets and with different retrieval systems +show that ERRR consistently outperforms existing baselines, proving to be a +versatile and cost-effective module for improving the utility and accuracy of +RAG systems. + +
+
+
+
+
+ + ♻ ☆ Evaluating AI-Generated Essays with GRE Analytical Writing Assessment + + +
+ The recent revolutionary advance in generative AI enables the generation of +realistic and coherent texts by large language models (LLMs). Despite many +existing evaluation metrics on the quality of the generated texts, there is +still a lack of rigorous assessment of how well LLMs perform in complex and +demanding writing assessments. This study examines essays generated by ten +leading LLMs for the analytical writing assessment of the Graduate Record Exam +(GRE). We assessed these essays using both human raters and the e-rater +automated scoring engine as used in the GRE scoring pipeline. Notably, the +top-performing Gemini and GPT-4o received an average score of 4.78 and 4.67, +respectively, falling between "generally thoughtful, well-developed analysis of +the issue and conveys meaning clearly" and "presents a competent analysis of +the issue and conveys meaning with acceptable clarity" according to the GRE +scoring guideline. We also evaluated the detection accuracy of these essays, +with detectors trained on essays generated by the same and different LLMs. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multi-IF: Benchmarking LLMs on Multi-Turn and Multilingual Instructions + Following + + +
+ Large Language Models (LLMs) have demonstrated impressive capabilities in +various tasks, including instruction following, which is crucial for aligning +model outputs with user expectations. However, evaluating LLMs' ability to +follow instructions remains challenging due to the complexity and subjectivity +of human language. Current benchmarks primarily focus on single-turn, +monolingual instructions, which do not adequately reflect the complexities of +real-world applications that require handling multi-turn and multilingual +interactions. To address this gap, we introduce Multi-IF, a new benchmark +designed to assess LLMs' proficiency in following multi-turn and multilingual +instructions. Multi-IF, which utilizes a hybrid framework combining LLM and +human annotators, expands upon the IFEval by incorporating multi-turn sequences +and translating the English prompts into another 7 languages, resulting in a +dataset of 4,501 multilingual conversations, where each has three turns. Our +evaluation of 14 state-of-the-art LLMs on Multi-IF reveals that it presents a +significantly more challenging task than existing benchmarks. All the models +tested showed a higher rate of failure in executing instructions correctly with +each additional turn. For example, o1-preview drops from 0.877 at the first +turn to 0.707 at the third turn in terms of average accuracy over all +languages. Moreover, languages with non-Latin scripts (Hindi, Russian, and +Chinese) generally exhibit higher error rates, suggesting potential limitations +in the models' multilingual capabilities. We release Multi-IF prompts and the +evaluation code base to encourage further research in this critical area. + +
+
+
+
+
+ + ♻ ☆ Experiences from Creating a Benchmark for Sentiment Classification for + Varieties of English + + +
+ Existing benchmarks often fail to account for linguistic diversity, like +language variants of English. In this paper, we share our experiences from our +ongoing project of building a sentiment classification benchmark for three +variants of English: Australian (en-AU), Indian (en-IN), and British (en-UK) +English. Using Google Places reviews, we explore the effects of various +sampling techniques based on label semantics, review length, and sentiment +proportion and report performances on three fine-tuned BERT-based models. Our +initial evaluation reveals significant performance variations influenced by +sample characteristics, label semantics, and language variety, highlighting the +need for nuanced benchmark design. We offer actionable insights for researchers +to create robust benchmarks, emphasising the importance of diverse sampling, +careful label definition, and comprehensive evaluation across linguistic +varieties. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Fair Summarization: Bridging Quality and Diversity in Extractive + Summaries NeurIPS 2024 + + +
+ Fairness in multi-document summarization of user-generated content remains a +critical challenge in natural language processing (NLP). Existing summarization +methods often fail to ensure equitable representation across different social +groups, leading to biased outputs. In this paper, we introduce two novel +methods for fair extractive summarization: FairExtract, a clustering-based +approach, and FairGPT, which leverages GPT-3.5-turbo with fairness constraints. +We evaluate these methods using Divsumm summarization dataset of White-aligned, +Hispanic, and African-American dialect tweets and compare them against relevant +baselines. The results obtained using a comprehensive set of summarization +quality metrics such as SUPERT, BLANC, SummaQA, BARTScore, and UniEval, as well +as a fairness metric F, demonstrate that FairExtract and FairGPT achieve +superior fairness while maintaining competitive summarization quality. +Additionally, we introduce composite metrics (e.g., SUPERT+F, BLANC+F) that +integrate quality and fairness into a single evaluation framework, offering a +more nuanced understanding of the trade-offs between these objectives. This +work highlights the importance of fairness in summarization and sets a +benchmark for future research in fairness-aware NLP models. + +
+
+ comment: Accepted at Algorithmic Fairness through the Lens of Metrics and + Evaluation Workshop @ NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 98 + +
+
+
+ + ☆ 4D Gaussian Splatting in the Wild with Uncertainty-Aware Regularization NeurIPS 2024 + + +
+ Novel view synthesis of dynamic scenes is becoming important in various +applications, including augmented and virtual reality. We propose a novel 4D +Gaussian Splatting (4DGS) algorithm for dynamic scenes from casually recorded +monocular videos. To overcome the overfitting problem of existing work for +these real-world videos, we introduce an uncertainty-aware regularization that +identifies uncertain regions with few observations and selectively imposes +additional priors based on diffusion models and depth smoothness on such +regions. This approach improves both the performance of novel view synthesis +and the quality of training image reconstruction. We also identify the +initialization problem of 4DGS in fast-moving dynamic regions, where the +Structure from Motion (SfM) algorithm fails to provide reliable 3D landmarks. +To initialize Gaussian primitives in such regions, we present a dynamic region +densification method using the estimated depth maps and scene flow. Our +experiments show that the proposed method improves the performance of 4DGS +reconstruction from a video captured by a handheld monocular camera and also +exhibits promising results in few-shot static scene reconstruction. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ A Short Note on Evaluating RepNet for Temporal Repetition Counting in + Videos + + +
+ We discuss some consistent issues on how RepNet has been evaluated in various +papers. As a way to mitigate these issues, we report RepNet performance results +on different datasets, and release evaluation code and the RepNet checkpoint to +obtain these results. Code URL: +https://github.com/google-research/google-research/blob/master/repnet/ + +
+
+
+
+
+ + ☆ Multimodal Instruction Tuning with Hybrid State Space Models + + +
+ Handling lengthy context is crucial for enhancing the recognition and +understanding capabilities of multimodal large language models (MLLMs) in +applications such as processing high-resolution images or high frame rate +videos. The rise in image resolution and frame rate substantially increases +computational demands due to the increased number of input tokens. This +challenge is further exacerbated by the quadratic complexity with respect to +sequence length of the self-attention mechanism. Most prior works either +pre-train models with long contexts, overlooking the efficiency problem, or +attempt to reduce the context length via downsampling (e.g., identify the key +image patches or frames) to decrease the context length, which may result in +information loss. To circumvent this issue while keeping the remarkable +effectiveness of MLLMs, we propose a novel approach using a hybrid +transformer-MAMBA model to efficiently handle long contexts in multimodal +applications. Our multimodal model can effectively process long context input +exceeding 100k tokens, outperforming existing models across various benchmarks. +Remarkably, our model enhances inference efficiency for high-resolution images +and high-frame-rate videos by about 4 times compared to current models, with +efficiency gains increasing as image resolution or video frames rise. +Furthermore, our model is the first to be trained on low-resolution images or +low-frame-rate videos while being capable of inference on high-resolution +images and high-frame-rate videos, offering flexibility for inference in +diverse scenarios. + +
+
+
+
+
+ + ☆ LUDO: Low-Latency Understanding of Highly Deformable Objects using Point + Cloud Occupancy Functions + + +
+ Accurately determining the shape and location of internal structures within +deformable objects is crucial for medical tasks that require precise targeting, +such as robotic biopsies. We introduce LUDO, a method for accurate low-latency +understanding of deformable objects. LUDO reconstructs objects in their +deformed state, including their internal structures, from a single-view point +cloud observation in under 30 ms using occupancy networks. We demonstrate +LUDO's abilities for autonomous targeting of internal regions of interest +(ROIs) in highly deformable objects. Additionally, LUDO provides uncertainty +estimates and explainability for its predictions, both of which are important +in safety-critical applications such as surgical interventions. We evaluate +LUDO in real-world robotic experiments, achieving a success rate of 98.9% for +puncturing various ROIs inside highly deformable objects. LUDO demonstrates the +potential to interact with deformable objects without the need for deformable +registration methods. + +
+
+
+
+
+ + ☆ Sharingan: Extract User Action Sequence from Desktop Recordings + + +
+ Video recordings of user activities, particularly desktop recordings, offer a +rich source of data for understanding user behaviors and automating processes. +However, despite advancements in Vision-Language Models (VLMs) and their +increasing use in video analysis, extracting user actions from desktop +recordings remains an underexplored area. This paper addresses this gap by +proposing two novel VLM-based methods for user action extraction: the Direct +Frame-Based Approach (DF), which inputs sampled frames directly into VLMs, and +the Differential Frame-Based Approach (DiffF), which incorporates explicit +frame differences detected via computer vision techniques. We evaluate these +methods using a basic self-curated dataset and an advanced benchmark adapted +from prior work. Our results show that the DF approach achieves an accuracy of +70% to 80% in identifying user actions, with the extracted action sequences +being re-playable through Robotic Process Automation. We find that while VLMs +show potential, incorporating explicit UI changes can degrade performance, +making the DF approach more reliable. This work represents the first +application of VLMs for extracting user action sequences from desktop +recordings, contributing new methods, benchmarks, and insights for future +research. + +
+
+
+
+
+ + ☆ Masked Image Modeling Boosting Semi-Supervised Semantic Segmentation + + +
+ In view of the fact that semi- and self-supervised learning share a +fundamental principle, effectively modeling knowledge from unlabeled data, +various semi-supervised semantic segmentation methods have integrated +representative self-supervised learning paradigms for further regularization. +However, the potential of the state-of-the-art generative self-supervised +paradigm, masked image modeling, has been scarcely studied. This paradigm +learns the knowledge through establishing connections between the masked and +visible parts of masked image, during the pixel reconstruction process. By +inheriting and extending this insight, we successfully leverage masked image +modeling to boost semi-supervised semantic segmentation. Specifically, we +introduce a novel class-wise masked image modeling that independently +reconstructs different image regions according to their respective classes. In +this way, the mask-induced connections are established within each class, +mitigating the semantic confusion that arises from plainly reconstructing +images in basic masked image modeling. To strengthen these intra-class +connections, we further develop a feature aggregation strategy that minimizes +the distances between features corresponding to the masked and visible parts +within the same class. Additionally, in semantic space, we explore the +application of masked image modeling to enhance regularization. Extensive +experiments conducted on well-known benchmarks demonstrate that our approach +achieves state-of-the-art performance. The code will be available at +https://github.com/haoxt/S4MIM. + +
+
+ comment: 13 pages. This work has been submitted to the IEEE for possible + publication +
+
+
+
+
+ + ☆ Weakly-Supervised Anomaly Detection in Surveillance Videos Based on + Two-Stream I3D Convolution Network + + +
+ The widespread implementation of urban surveillance systems has necessitated +more sophisticated techniques for anomaly detection to ensure enhanced public +safety. This paper presents a significant advancement in the field of anomaly +detection through the application of Two-Stream Inflated 3D (I3D) Convolutional +Networks. These networks substantially outperform traditional 3D Convolutional +Networks (C3D) by more effectively extracting spatial and temporal features +from surveillance videos, thus improving the precision of anomaly detection. +Our research advances the field by implementing a weakly supervised learning +framework based on Multiple Instance Learning (MIL), which uniquely +conceptualizes surveillance videos as collections of 'bags' that contain +instances (video clips). Each instance is innovatively processed through a +ranking mechanism that prioritizes clips based on their potential to display +anomalies. This novel strategy not only enhances the accuracy and precision of +anomaly detection but also significantly diminishes the dependency on extensive +manual annotations. Moreover, through meticulous optimization of model +settings, including the choice of optimizer, our approach not only establishes +new benchmarks in the performance of anomaly detection systems but also offers +a scalable and efficient solution for real-world surveillance applications. +This paper contributes significantly to the field of computer vision by +delivering a more adaptable, efficient, and context-aware anomaly detection +system, which is poised to redefine practices in urban surveillance. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Which Viewpoint Shows it Best? Language for Weakly Supervising View + Selection in Multi-view Videos + + +
+ Given a multi-view video, which viewpoint is most informative for a human +observer? Existing methods rely on heuristics or expensive ``best-view" +supervision to answer this question, limiting their applicability. We propose a +weakly supervised approach that leverages language accompanying an +instructional multi-view video as a means to recover its most informative +viewpoint(s). Our key hypothesis is that the more accurately an individual view +can predict a view-agnostic text summary, the more informative it is. To put +this into action, we propose a framework that uses the relative accuracy of +view-dependent caption predictions as a proxy for best view pseudo-labels. +Then, those pseudo-labels are used to train a view selector, together with an +auxiliary camera pose predictor that enhances view-sensitivity. During +inference, our model takes as input only a multi-view video -- no language or +camera poses -- and returns the best viewpoint to watch at each timestep. On +two challenging datasets comprised of diverse multi-camera setups and how-to +activities, our model consistently outperforms state-of-the-art baselines, both +with quantitative metrics and human evaluation. + +
+
+
+
+
+ + ☆ Retrieval Augmented Recipe Generation WACV + + +
+ Given the potential applications of generating recipes from food images, this +area has garnered significant attention from researchers in recent years. +Existing works for recipe generation primarily utilize a two-stage training +method, first generating ingredients and then obtaining instructions from both +the image and ingredients. Large Multi-modal Models (LMMs), which have achieved +notable success across a variety of vision and language tasks, shed light to +generating both ingredients and instructions directly from images. +Nevertheless, LMMs still face the common issue of hallucinations during recipe +generation, leading to suboptimal performance. To tackle this, we propose a +retrieval augmented large multimodal model for recipe generation. We first +introduce Stochastic Diversified Retrieval Augmentation (SDRA) to retrieve +recipes semantically related to the image from an existing datastore as a +supplement, integrating them into the prompt to add diverse and rich context to +the input image. Additionally, Self-Consistency Ensemble Voting mechanism is +proposed to determine the most confident prediction recipes as the final +output. It calculates the consistency among generated recipe candidates, which +use different retrieval recipes as context for generation. Extensive +experiments validate the effectiveness of our proposed method, which +demonstrates state-of-the-art (SOTA) performance in recipe generation tasks on +the Recipe1M dataset. + +
+
+ comment: Accepted at the IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2025 +
+
+
+
+
+ + ☆ High-resolution optical and acoustic remote sensing datasets of the Puck + Lagoon, Southern Baltic + + +
+ The very shallow marine basin of Puck Lagoon in the southern Baltic Sea, on +the Northern coast of Poland, hosts valuable benthic habitats and cultural +heritage sites. These include, among others, protected Zostera marina meadows, +one of the Baltic's major medieval harbours, a ship graveyard, and likely other +submerged features that are yet to be discovered. Prior to this project, no +comprehensive high-resolution remote sensing data were available for this area. +This article describes the first Digital Elevation Models (DEMs) derived from a +combination of airborne bathymetric LiDAR, multibeam echosounder, airborne +photogrammetry and satellite imagery. These datasets also include multibeam +echosounder backscatter and LiDAR intensity, allowing determination of the +character and properties of the seafloor. Combined, these datasets are a vital +resource for assessing and understanding seafloor morphology, benthic habitats, +cultural heritage, and submerged landscapes. Given the significance of Puck +Lagoon's hydrographical, ecological, geological, and archaeological environs, +the high-resolution bathymetry, acquired by our project, can provide the +foundation for sustainable management and informed decision-making for this +area of interest. + +
+
+
+
+
+ + ☆ TRACE: Transformer-based Risk Assessment for Clinical Evaluation + + +
+ We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation), +a novel method for clinical risk assessment based on clinical data, leveraging +the self-attention mechanism for enhanced feature interaction and result +interpretation. Our approach is able to handle different data modalities, +including continuous, categorical and multiple-choice (checkbox) attributes. +The proposed architecture features a shared representation of the clinical data +obtained by integrating specialized embeddings of each data modality, enabling +the detection of high-risk individuals using Transformer encoder layers. To +assess the effectiveness of the proposed method, a strong baseline based on +non-negative multi-layer perceptrons (MLPs) is introduced. The proposed method +outperforms various baselines widely used in the domain of clinical risk +assessment, while effectively handling missing values. In terms of +explainability, our Transformer-based method offers easily interpretable +results via attention weights, further enhancing the clinicians' +decision-making process. + +
+
+
+
+
+ + ☆ A Survey on Vision Autoregressive Model + + +
+ Autoregressive models have demonstrated great performance in natural language +processing (NLP) with impressive scalability, adaptability and +generalizability. Inspired by their notable success in the NLP field, +autoregressive models have been intensively investigated recently for computer +vision, which perform next-token predictions by representing visual data as +visual tokens and enable autoregressive modelling for a wide range of vision +tasks, ranging from visual generation and visual understanding to the very +recent multimodal generation that unifies visual generation and understanding +with a single autoregressive model. This paper provides a systematic review of +vision autoregressive models, including the development of a taxonomy of +existing methods and highlighting their major contributions, strengths, and +limitations, covering various vision tasks such as image generation, video +generation, image editing, motion generation, medical image analysis, 3D +generation, robotic manipulation, unified multimodal generation, etc. Besides, +we investigate and analyze the latest advancements in autoregressive models, +including thorough benchmarking and discussion of existing methods across +various evaluation datasets. Finally, we outline key challenges and promising +directions for future research, offering a roadmap to guide further +advancements in vision autoregressive models. + +
+
+
+
+
+ + ☆ OSMLoc: Single Image-Based Visual Localization in OpenStreetMap with + Geometric and Semantic Guidances + + +
+ OpenStreetMap (OSM), an online and versatile source of volunteered geographic +information (VGI), is widely used for human self-localization by matching +nearby visual observations with vectorized map data. However, due to the +divergence in modalities and views, image-to-OSM (I2O) matching and +localization remain challenging for robots, preventing the full utilization of +VGI data in the unmanned ground vehicles and logistic industry. Inspired by the +fact that the human brain relies on geometric and semantic understanding of +sensory information for spatial localization tasks, we propose the OSMLoc in +this paper. OSMLoc is a brain-inspired single-image visual localization method +with semantic and geometric guidance to improve accuracy, robustness, and +generalization ability. First, we equip the OSMLoc with the visual foundational +model to extract powerful image features. Second, a geometry-guided depth +distribution adapter is proposed to bridge the monocular depth estimation and +camera-to-BEV transform. Thirdly, the semantic embeddings from the OSM data are +utilized as auxiliary guidance for image-to-OSM feature matching. To validate +the proposed OSMLoc, we collect a worldwide cross-area and cross-condition (CC) +benchmark for extensive evaluation. Experiments on the MGL dataset, CC +validation benchmark, and KITTI dataset have demonstrated the superiority of +our method. Code, pre-trained models, CC validation benchmark, and additional +results are available on: https://github.com/WHU-USI3DV/OSMLoc + +
+
+ comment: 15 pages, technical report +
+
+
+
+
+ + ☆ Toward Human Understanding with Controllable Synthesis + + +
+ Training methods to perform robust 3D human pose and shape (HPS) estimation +requires diverse training images with accurate ground truth. While BEDLAM +demonstrates the potential of traditional procedural graphics to generate such +data, the training images are clearly synthetic. In contrast, generative image +models produce highly realistic images but without ground truth. Putting these +methods together seems straightforward: use a generative model with the body +ground truth as controlling signal. However, we find that, the more realistic +the generated images, the more they deviate from the ground truth, making them +inappropriate for training and evaluation. Enhancements of realistic details, +such as clothing and facial expressions, can lead to subtle yet significant +deviations from the ground truth, potentially misleading training models. We +empirically verify that this misalignment causes the accuracy of HPS networks +to decline when trained with generated images. To address this, we design a +controllable synthesis method that effectively balances image realism with +precise ground truth. We use this to create the Generative BEDLAM (Gen-B) +dataset, which improves the realism of the existing synthetic BEDLAM dataset +while preserving ground truth accuracy. We perform extensive experiments, with +various noise-conditioning strategies, to evaluate the tradeoff between visual +realism and HPS accuracy. We show, for the first time, that generative image +models can be controlled by traditional graphics methods to produce training +data that increases the accuracy of HPS methods. + +
+
+
+
+
+ + ☆ MikuDance: Animating Character Art with Mixed Motion Dynamics + + +
+ We propose MikuDance, a diffusion-based pipeline incorporating mixed motion +dynamics to animate stylized character art. MikuDance consists of two key +techniques: Mixed Motion Modeling and Mixed-Control Diffusion, to address the +challenges of high-dynamic motion and reference-guidance misalignment in +character art animation. Specifically, a Scene Motion Tracking strategy is +presented to explicitly model the dynamic camera in pixel-wise space, enabling +unified character-scene motion modeling. Building on this, the Mixed-Control +Diffusion implicitly aligns the scale and body shape of diverse characters with +motion guidance, allowing flexible control of local character motion. +Subsequently, a Motion-Adaptive Normalization module is incorporated to +effectively inject global scene motion, paving the way for comprehensive +character art animation. Through extensive experiments, we demonstrate the +effectiveness and generalizability of MikuDance across various character art +and motion guidance, consistently producing high-quality animations with +remarkable motion dynamics. + +
+
+
+
+
+ + ☆ Towards More Accurate Fake Detection on Images Generated from Advanced + Generative and Neural Rendering Models + + +
+ The remarkable progress in neural-network-driven visual data generation, +especially with neural rendering techniques like Neural Radiance Fields and 3D +Gaussian splatting, offers a powerful alternative to GANs and diffusion models. +These methods can produce high-fidelity images and lifelike avatars, +highlighting the need for robust detection methods. In response, an +unsupervised training technique is proposed that enables the model to extract +comprehensive features from the Fourier spectrum magnitude, thereby overcoming +the challenges of reconstructing the spectrum due to its centrosymmetric +properties. By leveraging the spectral domain and dynamically combining it with +spatial domain information, we create a robust multimodal detector that +demonstrates superior generalization capabilities in identifying challenging +synthetic images generated by the latest image synthesis techniques. To address +the absence of a 3D neural rendering-based fake image database, we develop a +comprehensive database that includes images generated by diverse neural +rendering techniques, providing a robust foundation for evaluating and +advancing detection methods. + +
+
+ comment: 13 pages, 8 Figures +
+
+
+
+
+ + ☆ Zero-shot capability of SAM-family models for bone segmentation in CT + scans + + +
+ The Segment Anything Model (SAM) and similar models build a family of +promptable foundation models (FMs) for image and video segmentation. The object +of interest is identified using prompts, such as bounding boxes or points. With +these FMs becoming part of medical image segmentation, extensive evaluation +studies are required to assess their strengths and weaknesses in clinical +settings. Since the performance is highly dependent on the chosen prompting +strategy, it is important to investigate different prompting techniques to +define optimal guidelines that ensure effective use in medical image +segmentation. Currently, no dedicated evaluation studies exist specifically for +bone segmentation in CT scans, leaving a gap in understanding the performance +for this task. Thus, we use non-iterative, ``optimal'' prompting strategies +composed of bounding box, points and combinations to test the zero-shot +capability of SAM-family models for bone CT segmentation on three different +skeletal regions. Our results show that the best settings depend on the model +type and size, dataset characteristics and objective to optimize. Overall, SAM +and SAM2 prompted with a bounding box in combination with the center point for +all the components of an object yield the best results across all tested +settings. As the results depend on multiple factors, we provide a guideline for +informed decision-making in 2D prompting with non-interactive, ``optimal'' +prompts. + +
+
+
+
+
+ + ☆ LG-Gaze: Learning Geometry-aware Continuous Prompts for Language-Guided + Gaze Estimation ECCV 2024 + + +
+ The ability of gaze estimation models to generalize is often significantly +hindered by various factors unrelated to gaze, especially when the training +dataset is limited. Current strategies aim to address this challenge through +different domain generalization techniques, yet they have had limited success +due to the risk of overfitting when solely relying on value labels for +regression. Recent progress in pre-trained vision-language models has motivated +us to capitalize on the abundant semantic information available. We propose a +novel approach in this paper, reframing the gaze estimation task as a +vision-language alignment issue. Our proposed framework, named Language-Guided +Gaze Estimation (LG-Gaze), learns continuous and geometry-sensitive features +for gaze estimation, benefiting from the rich prior knowledge of vision-language +models. Specifically, LG-Gaze aligns gaze features with continuous linguistic +features through our proposed multimodal contrastive regression loss, which +customizes adaptive weights for different negative samples. Furthermore, to +better adapt to the labels for the gaze estimation task, we propose a +geometry-aware interpolation method to obtain more precise gaze embeddings. +Through extensive experiments, we validate the efficacy of our framework in +four different cross-domain evaluation tasks. + +
+
+ comment: Accepted to ECCV 2024 +
+
+
+
+
+ + ☆ Generalized Pose Space Embeddings for Training In-the-Wild using + Analysis-by-Synthesis + + +
+ Modern pose estimation models are trained on large, manually-labelled +datasets which are costly and may not cover the full extent of human poses and +appearances in the real world. With advances in neural rendering, +analysis-by-synthesis and the ability to not only predict, but also render the +pose, is becoming an appealing framework, which could alleviate the need for +large scale manual labelling efforts. While recent work has shown the +feasibility of this approach, the predictions admit many flips due to a +simplistic intermediate skeleton representation, resulting in low precision and +inhibiting the acquisition of any downstream knowledge such as +three-dimensional positioning. We solve this problem with a more expressive +intermediate skeleton representation capable of capturing the semantics of the +pose (left and right), which significantly reduces flips. To successfully train +this new representation, we extend the analysis-by-synthesis framework with a +training protocol based on synthetic data. We show that our representation +results in fewer flips and more accurate predictions. Our approach outperforms +previous models trained with analysis-by-synthesis on standard benchmarks. + +
+
+
+
+
+ + ☆ Slender Object Scene Segmentation in Remote Sensing Image Based on + Learnable Morphological Skeleton with Segment Anything Model + + +
+ Morphological methods play a crucial role in remote sensing image processing, +due to their ability to capture and preserve small structural details. However, +most of the existing deep learning models for semantic segmentation are based +on the encoder-decoder architecture including U-net and Segment Anything Model +(SAM), where the downsampling process tends to discard fine details. In this +paper, we propose a new approach that integrates learnable morphological +skeleton prior into deep neural networks using the variational method. To +address the difficulty in backpropagation in neural networks caused by the +non-differentiability presented in classical morphological operations, we +provide a smooth representation of the morphological skeleton and design a +variational segmentation model integrating morphological skeleton prior by +employing operator splitting and dual methods. Then, we integrate this model +into the network architecture of SAM, which is achieved by adding a token to +mask decoder and modifying the final sigmoid layer, ensuring the final +segmentation results preserve the skeleton structure as much as possible. +Experimental results on remote sensing datasets, including buildings and roads, +demonstrate that our method outperforms the original SAM on slender object +segmentation and exhibits better generalization capability. + +
+
+
+
+
+ + ☆ NavAgent: Multi-scale Urban Street View Fusion For UAV Embodied + Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN), as a widely discussed research +direction in embodied intelligence, aims to enable embodied agents to navigate +in complicated visual environments through natural language commands. Most +existing VLN methods focus on indoor ground robot scenarios. However, when +applied to UAV VLN in outdoor urban scenes, they face two significant +challenges. First, urban scenes contain numerous objects, which makes it +challenging to match fine-grained landmarks in images with complex textual +descriptions of these landmarks. Second, overall environmental information +encompasses multiple modal dimensions, and the diversity of representations +significantly increases the complexity of the encoding process. To address +these challenges, we propose NavAgent, the first urban UAV embodied navigation +model driven by a large Vision-Language Model. NavAgent undertakes navigation +tasks by synthesizing multi-scale environmental information, including +topological maps (global), panoramas (medium), and fine-grained landmarks +(local). Specifically, we utilize GLIP to build a visual recognizer for +landmarks capable of identifying and linguisticizing fine-grained landmarks. +Subsequently, we develop a dynamically growing scene topology map that integrates +environmental information and employ Graph Convolutional Networks to encode +global environmental data. In addition, to train the visual recognizer for +landmarks, we develop NavAgent-Landmark2K, the first fine-grained landmark +dataset for real urban street scenes. In experiments conducted on the Touchdown +and Map2seq datasets, NavAgent outperforms strong baseline models. The code and +dataset will be released to the community to facilitate the exploration and +development of outdoor VLN. + +
+
+
+
+
+ + ☆ UIFormer: A Unified Transformer-based Framework for Incremental Few-Shot + Object Detection and Instance Segmentation + + +
+ This paper introduces a novel framework for unified incremental few-shot +object detection (iFSOD) and instance segmentation (iFSIS) using the +Transformer architecture. Our goal is to create an optimal solution for +situations where only a few examples of novel object classes are available, +with no access to training data for base or old classes, while maintaining high +performance across both base and novel classes. To achieve this, we extend +Mask-DINO into a two-stage incremental learning framework. Stage 1 focuses on +optimizing the model using the base dataset, while Stage 2 involves fine-tuning +the model on novel classes. Besides, we incorporate a classifier selection +strategy that assigns appropriate classifiers to the encoder and decoder +according to their distinct functions. Empirical evidence indicates that this +approach effectively mitigates over-fitting when learning novel classes. +Furthermore, we implement knowledge distillation to prevent catastrophic +forgetting of base classes. Comprehensive evaluations on the COCO and LVIS +datasets for both iFSIS and iFSOD tasks demonstrate that our method +significantly outperforms state-of-the-art approaches. + +
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Saliency Map-based Image Retrieval using Invariant Krawtchouk Moments + + +
+ With the widespread adoption of digital devices equipped with cameras and the +rapid development of Internet technology, numerous content-based image +retrieval systems and novel image feature extraction techniques have emerged in +recent years. This paper introduces a saliency map-based image retrieval +approach using invariant Krawtchouk moments (SM-IKM) to enhance retrieval speed +and accuracy. The proposed method applies a global contrast-based salient +region detection algorithm to create a saliency map that effectively isolates +the foreground from the background. It then combines multiple orders of +invariant Krawtchouk moments (IKM) with local binary patterns (LBPs) and color +histograms to comprehensively represent the foreground and background. +Additionally, it incorporates LBPs derived from the saliency map to improve +discriminative power, facilitating more precise image differentiation. A +bag-of-visual-words (BoVW) model is employed to generate a codebook for +classification and discrimination. By using compact IKMs in the BoVW framework +and integrating a range of region-based features, including color histograms, +LBPs, and saliency map-enhanced LBPs, our proposed SM-IKM achieves efficient +and accurate image retrieval. Extensive experiments on publicly available +datasets, such as Caltech 101 and Wang, demonstrate that SM-IKM outperforms +recent state-of-the-art retrieval methods. The source code for SM-IKM is +available at github.com/arnejad/SMIKM. + +
+
+
+
+
+ + ☆ APDDv2: Aesthetics of Paintings and Drawings Dataset with Artist Labeled + Scores and Comments + + +
+ Datasets play a pivotal role in training visual models, facilitating the +development of abstract understandings of visual features through diverse image +samples and multidimensional attributes. However, in the realm of aesthetic +evaluation of artistic images, datasets remain relatively scarce. Existing +painting datasets are often characterized by limited scoring dimensions and +insufficient annotations, thereby constraining the advancement and application +of automatic aesthetic evaluation methods in the domain of painting. To bridge +this gap, we introduce the Aesthetics Paintings and Drawings Dataset (APDD), +the first comprehensive collection of paintings encompassing 24 distinct +artistic categories and 10 aesthetic attributes. Building upon the initial +release of APDDv1, our ongoing research has identified opportunities for +enhancement in data scale and annotation precision. Consequently, APDDv2 boasts +an expanded image corpus and improved annotation quality, featuring detailed +language comments to better cater to the needs of both researchers and +practitioners seeking high-quality painting datasets. Furthermore, we present +an updated version of the Art Assessment Network for Specific Painting Styles, +denoted as ArtCLIP. Experimental validation demonstrates the superior +performance of this revised model in the realm of aesthetic evaluation, +surpassing its predecessor in accuracy and efficacy. The dataset and model are +available at https://github.com/BestiVictory/APDDv2.git. + +
+
+
+
+
+ + ☆ MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal + Lymphatic Vessel Segmentation ML4H 2024 + + +
+ Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste +products from the human brain. An impairment in their functionality has been +associated with aging as well as brain disorders like multiple sclerosis and +Alzheimer's disease. However, MLVs have only recently been described for the +first time in magnetic resonance imaging (MRI), and their ramified structure +renders manual segmentation particularly difficult. Further, as there is no +consistent notion of their appearance, human-annotated MLV structures contain a +high inter-rater variability that most automatic segmentation methods cannot +take into account. In this work, we propose a new rater-aware training scheme +for the popular nnU-Net model, and we explore rater-based ensembling strategies +for accurate and consistent segmentation of MLVs. This enables us to boost +nnU-Net's performance while obtaining explicit predictions in different +annotation styles and a rater-based uncertainty estimation. Our final model, +MLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to +the human reference standard. The model further matches the human inter-rater +reliability and replicates age-related associations with MLV volume. + +
+
+ comment: ML4H 2024 +
+
+
+
+
+ + ☆ Classification and Morphological Analysis of DLBCL Subtypes in + H\&E-Stained Slides + + +
+ We address the challenge of automated classification of diffuse large B-cell +lymphoma (DLBCL) into its two primary subtypes: activated B-cell-like (ABC) and +germinal center B-cell-like (GCB). Accurate classification between these +subtypes is essential for determining the appropriate therapeutic strategy, +given their distinct molecular profiles and treatment responses. Our proposed +deep learning model demonstrates robust performance, achieving an average area +under the curve (AUC) of (87.4 $\pm$ 5.7)\% during cross-validation. It shows a +high positive predictive value (PPV), highlighting its potential for clinical +application, such as triaging for molecular testing. To gain biological +insights, we performed an analysis of morphological features of ABC and GCB +subtypes. We segmented cell nuclei using a pre-trained deep neural network and +compared the statistics of geometric and color features for ABC and GCB. We +found that the distributions of these features were not very different for the +two subtypes, which suggests that the visual differences between them are more +subtle. These results underscore the potential of our method to assist in more +precise subtype classification and can contribute to improved treatment +management and outcomes for patients of DLBCL. + +
+
+
+
+
+ + ☆ Efficient Whole Slide Image Classification through Fisher Vector + Representation + + +
+ The advancement of digital pathology, particularly through computational +analysis of whole slide images (WSI), is poised to significantly enhance +diagnostic precision and efficiency. However, the large size and complexity of +WSIs make it difficult to analyze and classify them using computers. This study +introduces a novel method for WSI classification by automating the +identification and examination of the most informative patches, thus +eliminating the need to process the entire slide. Our method involves +two-stages: firstly, it extracts only a few patches from the WSIs based on +their pathological significance; and secondly, it employs Fisher vectors (FVs) +for representing features extracted from these patches, which is known for its +robustness in capturing fine-grained details. This approach not only +accentuates key pathological features within the WSI representation but also +significantly reduces computational overhead, thus making the process more +efficient and scalable. We have rigorously evaluated the proposed method across +multiple datasets to benchmark its performance against comprehensive WSI +analysis and contemporary weakly-supervised learning methodologies. The +empirical results indicate that our focused analysis of select patches, +combined with Fisher vector representation, not only aligns with, but at times +surpasses, the classification accuracy of standard practices. Moreover, this +strategy notably diminishes computational load and resource expenditure, +thereby establishing an efficient and precise framework for WSI analysis in the +realm of digital pathology. + +
+
+
+
+
+ + ☆ BillBoard Splatting (BBSplat): Learnable Textured Primitives for Novel + View Synthesis + + +
+ We present billboard Splatting (BBSplat) - a novel approach for 3D scene +representation based on textured geometric primitives. BBSplat represents the +scene as a set of optimizable textured planar primitives with learnable RGB +textures and alpha-maps to control their shape. BBSplat primitives can be used +in any Gaussian Splatting pipeline as drop-in replacements for Gaussians. Our +method's qualitative and quantitative improvements over 3D and 2D Gaussians are +most noticeable when fewer primitives are used, when BBSplat achieves over 1200 +FPS. Our novel regularization term encourages textures to have a sparser +structure, unlocking an efficient compression that leads to a reduction in +storage space of the model. Our experiments show the efficiency of BBSplat on +standard datasets of real indoor and outdoor scenes such as Tanks&Temples, DTU, +and Mip-NeRF-360. We demonstrate improvements on PSNR, SSIM, and LPIPS metrics +compared to the state-of-the-art, especially for the case when fewer primitives +are used, which, on the other hand, leads to up to 2 times inference speed +improvement for the same rendering quality. + +
+
+
+
+
+ + ☆ Impact of Iris Pigmentation on Performance Bias in Visible Iris + Verification Systems: A Comparative Study + + +
+ Iris recognition technology plays a critical role in biometric identification +systems, but their performance can be affected by variations in iris +pigmentation. In this work, we investigate the impact of iris pigmentation on +the efficacy of biometric recognition systems, focusing on a comparative +analysis of blue and dark irises. Data sets were collected using multiple +devices, including P1, P2, and P3 smartphones [4], to assess the robustness of +the systems in different capture environments [19]. Both traditional machine +learning techniques and deep learning models were used, namely Open-Iris, +ViT-b, and ResNet50, to evaluate performance metrics such as Equal Error Rate +(EER) and True Match Rate (TMR). Our results indicate that iris recognition +systems generally exhibit higher accuracy for blue irises compared to dark +irises. Furthermore, we examined the generalization capabilities of these +systems across different iris colors and devices, finding that while training +on diverse datasets enhances recognition performance, the degree of improvement +is contingent on the specific model and device used. Our analysis also +identifies inherent biases in recognition performance related to iris color and +cross-device variability. These findings underscore the need for more inclusive +dataset collection and model refinement to reduce bias and promote equitable +biometric recognition across varying iris pigmentation and device +configurations. + +
+
+ comment: 14 pages, 5 figures, 5 Tables +
+
+
+
+
+ + ☆ UNSCT-HRNet: Modeling Anatomical Uncertainty for Landmark Detection in + Total Hip Arthroplasty + + +
+ Total hip arthroplasty (THA) relies on accurate landmark detection from +radiographic images, but unstructured data caused by irregular patient postures +or occluded anatomical markers pose significant challenges for existing +methods. To address this, we propose UNSCT-HRNet (Unstructured CT - +High-Resolution Net), a deep learning-based framework that integrates a Spatial +Relationship Fusion (SRF) module and an Uncertainty Estimation (UE) module. The +SRF module, utilizing coordinate convolution and polarized attention, enhances +the model's ability to capture complex spatial relationships. Meanwhile, the UE +module, which is based on entropy, ensures predictions are anatomically relevant. +For unstructured data, the proposed method can predict landmarks without +relying on the fixed number of points, which shows higher accuracy and better +robustness compared with the existing methods. Our UNSCT-HRNet demonstrates +over a 60% improvement across multiple metrics in unstructured data. The +experimental results also reveal that our approach maintains good performance +on the structured dataset. Overall, the proposed UNSCT-HRNet has the potential +to be used as a new reliable, automated solution for THA surgical planning and +postoperative monitoring. + +
+
+
+
+
+ + ☆ Methodology for a Statistical Analysis of Influencing Factors on 3D + Object Detection Performance + + +
+ In autonomous driving, object detection is an essential task to perceive the +environment by localizing and classifying objects. Most object detection +algorithms rely on deep learning for their superior performance. However, their +black box nature makes it challenging to ensure safety. In this paper, we +propose a first-of-its-kind methodology for statistical analysis of the +influence of various factors related to the objects to detect or the +environment on the detection performance of both LiDAR- and camera-based 3D +object detectors. We perform a univariate analysis between each of the factors +and the detection error in order to compare the strength of influence. To +better identify potential sources of detection errors, we also analyze the +performance in dependency of the influencing factors and examine the +interdependencies between the different influencing factors. Recognizing the +factors that influence detection performance helps identify robustness issues +in the trained object detector and supports the safety approval of object +detection systems. + +
+
+
+
+
+ + ☆ A survey on Graph Deep Representation Learning for Facial Expression + Recognition + + +
+ This comprehensive review delves deeply into the various methodologies +applied to facial expression recognition (FER) through the lens of graph +representation learning (GRL). Initially, we introduce the task of FER and the +concepts of graph representation and GRL. Afterward, we discuss some of the +most prevalent and valuable databases for this task. We explore promising +approaches for graph representation in FER, including graph diffusion, +spatio-temporal graphs, and multi-stream architectures. Finally, we identify +future research opportunities and provide concluding remarks. + +
+
+
+
+
+ + ☆ HyperFace: Generating Synthetic Face Recognition Datasets by Exploring + Face Embedding Hypersphere NeurIPS 2024 + + +
+ Face recognition datasets are often collected by crawling Internet and +without individuals' consents, raising ethical and privacy concerns. Generating +synthetic datasets for training face recognition models has emerged as a +promising alternative. However, the generation of synthetic datasets remains +challenging as it entails adequate inter-class and intra-class variations. +While advances in generative models have made it easier to increase intra-class +variations in face datasets (such as pose, illumination, etc.), generating +sufficient inter-class variation is still a difficult task. In this paper, we +formulate the dataset generation as a packing problem on the embedding space +(represented on a hypersphere) of a face recognition model and propose a new +synthetic dataset generation approach, called HyperFace. We formalize our +packing problem as an optimization problem and solve it with a gradient +descent-based approach. Then, we use a conditional face generator model to +synthesize face images from the optimized embeddings. We use our generated +datasets to train face recognition models and evaluate the trained models on +several benchmarking real datasets. Our experimental results show that models +trained with HyperFace achieve state-of-the-art performance in training face +recognition using synthetic datasets. + +
+
+ comment: Accepted in NeurIPS 2024 Safe Generative AI Workshop +
+
+
+
+
+ + ☆ Can MLLMs Guide Weakly-Supervised Temporal Action Localization Tasks? + + +
+ Recent breakthroughs in Multimodal Large Language Models (MLLMs) have gained +significant recognition within the deep learning community, where the fusion of +the Video Foundation Models (VFMs) and Large Language Models (LLMs) has proven +instrumental in constructing robust video understanding systems, effectively +surmounting constraints associated with predefined visual tasks. These +sophisticated MLLMs exhibit remarkable proficiency in comprehending videos, +swiftly attaining unprecedented performance levels across diverse benchmarks. +However, their operation demands substantial memory and computational +resources, underscoring the continued importance of traditional models in video +comprehension tasks. In this paper, we introduce a novel learning paradigm +termed MLLM4WTAL. This paradigm harnesses the potential of MLLM to offer +temporal action key semantics and complete semantic priors for conventional +Weakly-supervised Temporal Action Localization (WTAL) methods. MLLM4WTAL +facilitates the enhancement of WTAL by leveraging MLLM guidance. It achieves +this by integrating two distinct modules: Key Semantic Matching (KSM) and +Complete Semantic Reconstruction (CSR). These modules work in tandem to +effectively address prevalent issues like incomplete and over-complete outcomes +common in WTAL methods. Rigorous experiments are conducted to validate the +efficacy of our proposed approach in augmenting the performance of various +heterogeneous WTAL models. + +
+
+
+
+
+ + ☆ Trap-MID: Trapdoor-based Defense against Model Inversion Attacks NeurIPS + + +
+ Model Inversion (MI) attacks pose a significant threat to the privacy of Deep +Neural Networks by recovering training data distribution from well-trained +models. While existing defenses often rely on regularization techniques to +reduce information leakage, they remain vulnerable to recent attacks. In this +paper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to +mislead MI attacks. A trapdoor is integrated into the model to predict a +specific label when the input is injected with the corresponding trigger. +Consequently, this trapdoor information serves as the "shortcut" for MI +attacks, leading them to extract trapdoor triggers rather than private data. We +provide theoretical insights into the impacts of trapdoor's effectiveness and +naturalness on deceiving MI attacks. In addition, empirical experiments +demonstrate the state-of-the-art defense performance of Trap-MID against +various MI attacks without the requirements for extra data or large +computational overhead. Our source code is publicly available at +https://github.com/ntuaislab/Trap-MID. + +
+
+ comment: Accepted by Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Biomass phenotyping of oilseed rape through UAV multi-view oblique + imaging with 3DGS and SAM model + + +
+ Biomass estimation of oilseed rape is crucial for optimizing crop +productivity and breeding strategies. While UAV-based imaging has advanced +high-throughput phenotyping, current methods often rely on orthophoto images, +which struggle with overlapping leaves and incomplete structural information in +complex field environments. This study integrates 3D Gaussian Splatting (3DGS) +with the Segment Anything Model (SAM) for precise 3D reconstruction and biomass +estimation of oilseed rape. UAV multi-view oblique images from 36 angles were +used to perform 3D reconstruction, with the SAM module enhancing point cloud +segmentation. The segmented point clouds were then converted into point cloud +volumes, which were fitted to ground-measured biomass using linear regression. +The results showed that 3DGS (7k and 30k iterations) provided high accuracy, +with peak signal-to-noise ratios (PSNR) of 27.43 and 29.53 and training times +of 7 and 49 minutes, respectively. This performance exceeded that of structure +from motion (SfM) and mipmap Neural Radiance Fields (Mip-NeRF), demonstrating +superior efficiency. The SAM module achieved high segmentation accuracy, with a +mean intersection over union (mIoU) of 0.961 and an F1-score of 0.980. +Additionally, a comparison of biomass extraction models found the point cloud +volume model to be the most accurate, with a determination coefficient (R2) of +0.976, root mean square error (RMSE) of 2.92 g/plant, and mean absolute +percentage error (MAPE) of 6.81%, outperforming both the plot crop volume and +individual crop volume models. This study highlights the potential of combining +3DGS with multi-view UAV imaging for improved biomass phenotyping. + +
+
+
+
+
+ + ☆ AD-DINO: Attention-Dynamic DINO for Distance-Aware Embodied Reference + Understanding + + +
+ Embodied reference understanding is crucial for intelligent agents to predict +referents based on human intention through gesture signals and language +descriptions. This paper introduces the Attention-Dynamic DINO, a novel +framework designed to mitigate misinterpretations of pointing gestures across +various interaction contexts. Our approach integrates visual and textual +features to simultaneously predict the target object's bounding box and the +attention source in pointing gestures. Leveraging the distance-aware nature of +nonverbal communication in visual perspective taking, we extend the virtual +touch line mechanism and propose an attention-dynamic touch line to represent +referring gesture based on interactive distances. The combination of this +distance-aware approach and independent prediction of the attention source, +enhances the alignment between objects and the gesture represented line. +Extensive experiments on the YouRefIt dataset demonstrate the efficacy of our +gesture information understanding method in significantly improving task +performance. Our model achieves 76.4% accuracy at the 0.25 IoU threshold and, +notably, surpasses human performance at the 0.75 IoU threshold, marking a first +in this domain. Comparative experiments with distance-unaware understanding +methods from previous research further validate the superiority of the +Attention-Dynamic Touch Line across diverse contexts. + +
+
+
+
+
+ + ☆ Machine Unlearning on Pre-trained Models by Residual Feature Alignment + Using LoRA + + +
+ Machine unlearning is a newly emerged technology that removes a subset of the +training data from a trained model without affecting the model performance on +the remaining data. This topic is becoming increasingly important in protecting +user privacy and eliminating harmful or outdated data. The key challenge lies +in effectively and efficiently unlearning specific information without +compromising the model's utility on the retained data. For the pre-trained +models, fine-tuning is an important way to achieve the unlearning target. +Previous work typically fine-tuned the entire model's parameters, which incurs +significant computation costs. In addition, the fine-tuning process may cause +shifts in the intermediate layer features, affecting the model's overall +utility. In this work, we propose a novel and efficient machine unlearning +method on pre-trained models. We term the method as Residual Feature Alignment +Unlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose +the model's intermediate features into pre-trained features and residual +features. By adjusting the residual features, we align the unlearned model with +the pre-trained model at the intermediate feature level to achieve both +unlearning and remaining targets. The method aims to learn the zero residuals +on the retained set and shifted residuals on the unlearning set. Extensive +experiments on numerous datasets validate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ A Heterogeneous Graph Neural Network Fusing Functional and Structural + Connectivity for MCI Diagnosis + + +
+ Brain connectivity alterations associated with brain disorders have been +widely reported in resting-state functional imaging (rs-fMRI) and diffusion +tensor imaging (DTI). While many dual-modal fusion methods based on graph +neural networks (GNNs) have been proposed, they generally follow homogenous +fusion ways ignoring rich heterogeneity of dual-modal information. To address +this issue, we propose a novel method that integrates functional and structural +connectivity based on heterogeneous graph neural networks (HGNNs) to better +leverage the rich heterogeneity in dual-modal images. We first use blood +oxygen level dependency and white matter structure information provided by +rs-fMRI and DTI to establish homo-meta-path, capturing node relationships +within the same modality. At the same time, we propose to establish +hetero-meta-path based on structure-function coupling and brain community +searching to capture relations among cross-modal nodes. Secondly, we further +introduce a heterogeneous graph pooling strategy that automatically balances +homo- and hetero-meta-path, effectively leveraging heterogeneous information +and preventing feature confusion after pooling. Thirdly, based on the +flexibility of heterogeneous graphs, we propose a heterogeneous graph data +augmentation approach that can conveniently address the sample imbalance issue +commonly seen in clinical diagnosis. We evaluate our method on ADNI-3 dataset +for mild cognitive impairment (MCI) diagnosis. Experimental results indicate +the proposed method is effective and superior to other algorithms, with a mean +classification accuracy of 93.3%. + +
+
+
+
+
+ + ☆ The VLLM Safety Paradox: Dual Ease in Jailbreak Attack and Defense + + +
+ The vulnerability of Vision Large Language Models (VLLMs) to jailbreak +attacks appears as no surprise. However, recent defense mechanisms against +these attacks have reached near-saturation performance on benchmarks, often +with minimal effort. This simultaneous high performance in both attack and +defense presents a perplexing paradox. Resolving it is critical for advancing +the development of trustworthy models. To address this research gap, we first +investigate why VLLMs are prone to these attacks. We then make a key +observation: existing defense mechanisms suffer from an \textbf{over-prudence} +problem, resulting in unexpected abstention even in the presence of benign +inputs. Additionally, we find that the two representative evaluation methods +for jailbreak often exhibit chance agreement. This limitation makes it +potentially misleading when evaluating attack strategies or defense mechanisms. +Beyond these empirical observations, another contribution of this work is +to repurpose the guardrails of LLMs on the shelf, as an effective alternative +detector prior to VLLM response. We believe these findings offer useful +insights to rethink the foundational development of VLLM safety with respect to +benchmark datasets, evaluation methods, and defense strategies. + +
+
+
+
+
+ + ☆ V2X-R: Cooperative LiDAR-4D Radar Fusion for 3D Object Detection with + Denoising Diffusion + + +
+ Current Vehicle-to-Everything (V2X) systems have significantly enhanced 3D +object detection using LiDAR and camera data. However, these methods suffer +from performance degradation in adverse weather conditions. The weather-robust +4D radar provides Doppler and additional geometric information, raising the +possibility of addressing this challenge. To this end, we present V2X-R, the +first simulated V2X dataset incorporating LiDAR, camera, and 4D radar. V2X-R +contains 12,079 scenarios with 37,727 frames of LiDAR and 4D radar point +clouds, 150,908 images, and 170,859 annotated 3D vehicle bounding boxes. +Subsequently, we propose a novel cooperative LiDAR-4D radar fusion pipeline for +3D object detection and implement it with various fusion strategies. To achieve +weather-robust detection, we additionally propose a Multi-modal Denoising +Diffusion (MDD) module in our fusion pipeline. MDD utilizes the weather-robust 4D +radar feature as a condition to prompt the diffusion model to denoise noisy +LiDAR features. Experiments show that our LiDAR-4D radar fusion pipeline +demonstrates superior performance in the V2X-R dataset. Over and above this, +our MDD module further improved the performance of the basic fusion model by up to +5.73%/6.70% in foggy/snowy conditions while barely disrupting normal +performance. The dataset and code will be publicly available at: +https://github.com/ylwhxht/V2X-R. + +
+
+
+
+
+ + ☆ MambaXCTrack: Mamba-based Tracker with SSM Cross-correlation and Motion + Prompt for Ultrasound Needle Tracking + + +
+ Ultrasound (US)-guided needle insertion is widely employed in percutaneous +interventions. However, providing feedback on the needle tip position via US +image presents challenges due to noise, artifacts, and the thin imaging plane +of US, which degrades needle features and leads to intermittent tip visibility. +In this paper, a Mamba-based US needle tracker MambaXCTrack utilizing +structured state space models cross-correlation (SSMX-Corr) and implicit motion +prompt is proposed, which is the first application of Mamba in US needle +tracking. The SSMX-Corr enhances cross-correlation by long-range modeling and +global searching of distant semantic features between template and search maps, +benefiting the tracking under noise and artifacts by implicitly learning +potential distant semantic cues. By combining with cross-map interleaved scan +(CIS), local pixel-wise interaction with positional inductive bias can also be +introduced to SSMX-Corr. The implicit low-level motion descriptor is proposed +as a non-visual prompt to enhance tracking robustness, addressing the +intermittent tip visibility problem. Extensive experiments on a dataset with +motorized needle insertion in both phantom and tissue samples demonstrate that +the proposed tracker outperforms other state-of-the-art trackers while ablation +studies further highlight the effectiveness of each proposed tracking module. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ EgoVid-5M: A Large-Scale Video-Action Dataset for Egocentric Video + Generation + + +
+ Video generation has emerged as a promising tool for world simulation, +leveraging visual data to replicate real-world environments. Within this +context, egocentric video generation, which centers on the human perspective, +holds significant potential for enhancing applications in virtual reality, +augmented reality, and gaming. However, the generation of egocentric videos +presents substantial challenges due to the dynamic nature of egocentric +viewpoints, the intricate diversity of actions, and the complex variety of +scenes encountered. Existing datasets are inadequate for addressing these +challenges effectively. To bridge this gap, we present EgoVid-5M, the first +high-quality dataset specifically curated for egocentric video generation. +EgoVid-5M encompasses 5 million egocentric video clips and is enriched with +detailed action annotations, including fine-grained kinematic control and +high-level textual descriptions. To ensure the integrity and usability of the +dataset, we implement a sophisticated data cleaning pipeline designed to +maintain frame consistency, action coherence, and motion smoothness under +egocentric conditions. Furthermore, we introduce EgoDreamer, which is capable +of generating egocentric videos driven simultaneously by action descriptions +and kinematic control signals. The EgoVid-5M dataset, associated action +annotations, and all data cleansing metadata will be released for the +advancement of research in egocentric video generation. + +
+
+ comment: Project Page: https://egovid.github.io/ +
+
+
+
+
+ + ☆ Multiscale Graph Construction Using Non-local Cluster Features + + +
+ This paper presents a multiscale graph construction method using both graph +and signal features. Multiscale graph is a hierarchical representation of the +graph, where a node at each level indicates a cluster in a finer resolution. To +obtain the hierarchical clusters, existing methods often use graph clustering; +however, they may ignore signal variations. As a result, these methods could +fail to detect the clusters having similar features on nodes. In this paper, we +consider graph and node-wise features simultaneously for multiscale clustering +of a graph. With given clusters of the graph, the clusters are merged +hierarchically in three steps: 1) Feature vectors in the clusters are +extracted. 2) Similarities among cluster features are calculated using optimal +transport. 3) A variable $k$-nearest neighbor graph (V$k$NNG) is constructed +and graph spectral clustering is applied to the V$k$NNG to obtain clusters at a +coarser scale. Additionally, the multiscale graph in this paper has +\textit{non-local} characteristics: Nodes with similar features are merged even +if they are spatially separated. In experiments on multiscale image and point +cloud segmentation, we demonstrate the effectiveness of the proposed method. + +
+
+
+
+
+ + ☆ A Chinese Multi-label Affective Computing Dataset Based on Social Media + Network Users + + +
+ Emotion and personality are central elements in understanding human +psychological states. Emotions reflect an individual's subjective experiences, +while personality reveals relatively stable behavioral and cognitive patterns. +Existing affective computing datasets often annotate emotion and personality +traits separately, lacking fine-grained labeling of micro-emotions and emotion +intensity in both single-label and multi-label classifications. Chinese emotion +datasets are extremely scarce, and datasets capturing Chinese user personality +traits are even more limited. To address these gaps, this study collected data +from the major social media platform Weibo, screening 11,338 valid users from +over 50,000 individuals with diverse MBTI personality labels and acquiring +566,900 posts along with the users' MBTI personality tags. Using the EQN method, +we compiled a multi-label Chinese affective computing dataset that integrates +the same user's personality traits with six emotions and micro-emotions, each +annotated with intensity levels. Validation results across multiple NLP +classification models demonstrate the dataset's strong utility. This dataset is +designed to advance machine recognition of complex human emotions and provide +data support for research in psychology, education, marketing, finance, and +politics. + +
+
+
+
+
+ + ☆ DyConfidMatch: Dynamic Thresholding and Re-sampling for 3D + Semi-supervised Learning + + +
+ Semi-supervised learning (SSL) leverages limited labeled and abundant +unlabeled data but often faces challenges with data imbalance, especially in 3D +contexts. This study investigates class-level confidence as an indicator of +learning status in 3D SSL, proposing a novel method that utilizes dynamic +thresholding to better use unlabeled data, particularly from underrepresented +classes. A re-sampling strategy is also introduced to mitigate bias towards +well-represented classes, ensuring equitable class representation. Through +extensive experiments in 3D SSL, our method surpasses state-of-the-art +counterparts in classification and detection tasks, highlighting its +effectiveness in tackling data imbalance. This approach presents a significant +advancement in SSL for 3D datasets, providing a robust solution for data +imbalance issues. + +
+
+ comment: Accepted by Pattern Recognition Journal +
+
+
+
+
+ + ☆ DEEGITS: Deep Learning based Framework for Measuring Heterogenous + Traffic State in Challenging Traffic Scenarios + + +
+ This paper presents DEEGITS (Deep Learning Based Heterogeneous Traffic State +Measurement), a comprehensive framework that leverages state-of-the-art +convolutional neural network (CNN) techniques to accurately and rapidly detect +vehicles and pedestrians, as well as to measure traffic states in challenging +scenarios (i.e., congestion, occlusion). In this study, we enhance the training +dataset through data fusion, enabling simultaneous detection of vehicles and +pedestrians. Image preprocessing and augmentation are subsequently performed to +improve the quality and quantity of the dataset. Transfer learning is applied +on the YOLOv8 pretrained model to increase the model's capability to identify a +diverse array of vehicles. Optimal hyperparameters are obtained using the Grid +Search algorithm, with the Stochastic Gradient Descent (SGD) optimizer +outperforming other optimizers under these settings. Extensive experimentation +and evaluation demonstrate substantial accuracy within the detection framework, +with the model achieving 0.794 mAP@0.5 on the validation set and 0.786 mAP@0.5 +on the test set, surpassing previous benchmarks on similar datasets. The +DeepSORT multi-object tracking algorithm is incorporated to track detected +vehicles and pedestrians in this study. Finally, the framework is tested to +measure heterogeneous traffic states in mixed traffic conditions. Two locations +with differing traffic compositions and congestion levels are selected: one +motorized-dominant location with moderate density and one +non-motorized-dominant location with higher density. Errors are statistically +insignificant for both cases, showing correlations from 0.99 to 0.88 and 0.91 +to 0.97 for heterogeneous traffic flow and speed measurements, respectively. + +
+
+ comment: Submitted for presentation at the 103rd Annual Meeting of + the Transportation Research Board and publication in Transportation Research + Record: Journal of the Transportation Research Board +
+
+
+
+
+ + ☆ Enhancing Multimodal Query Representation via Visual Dialogues for + End-to-End Knowledge Retrieval + + +
+ Existing multimodal retrieval systems often rely on disjointed models for +image comprehension, such as object detectors and caption generators, leading +to cumbersome implementations and training processes. To overcome this +limitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a +text retriever with the ability to understand multimodal queries via dynamic +modality interaction. Ret-XKnow leverages a partial convolution mechanism to +focus on visual information relevant to the given textual query, thereby +enhancing multimodal query representations. To effectively learn multimodal +interaction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset +automatically constructed from visual dialogue datasets. Our dataset +construction process ensures that the dialogues are transformed into suitable +information retrieval tasks using a text retriever. We demonstrate that our +approach not only significantly improves retrieval performance in zero-shot +settings but also achieves substantial improvements in fine-tuning scenarios. +Our code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow. + +
+
+
+
+
+ + ☆ SASE: A Searching Architecture for Squeeze and Excitation Operations + + +
+ In the past few years, channel-wise and spatial-wise attention blocks have +been widely adopted as supplementary modules in deep neural networks, enhancing +network representational abilities while introducing low complexity. Most +attention modules follow a squeeze-and-excitation paradigm. However, designing +such attention modules requires a substantial number of experiments and +computational resources. Neural Architecture Search (NAS), meanwhile, is able +to automate the design of neural networks and spares the numerous experiments +required for an optimal architecture. This motivates us to design a search +architecture that can automatically find near-optimal attention modules through +NAS. We propose SASE, a Searching Architecture for Squeeze and Excitation +operations, to form a plug-and-play attention block by searching within a certain +search space. The search space is separated into 4 different sets, each of which +corresponds to the squeeze or excitation operation along the channel or spatial +dimension. Additionally, the search sets include not only existing attention +blocks but also other operations that have not been utilized in attention +mechanisms before. To the best of our knowledge, SASE is the first attempt to +subdivide the attention search space and search for architectures beyond +currently known attention modules. The searched attention module is tested with +extensive experiments across a range of visual tasks. Experimental results +indicate that visual backbone networks (ResNet-50/101) using the SASE attention +module achieved the best performance compared to those using the current +state-of-the-art attention modules. Codes are included in the supplementary +material, and they will be made public later. + +
+
+
+
+
+ + ☆ Motion Control for Enhanced Complex Action Video Generation + + +
+ Existing text-to-video (T2V) models often struggle with generating videos +with sufficiently pronounced or complex actions. A key limitation lies in the +text prompt's inability to precisely convey intricate motion details. To +address this, we propose a novel framework, MVideo, designed to produce +long-duration videos with precise, fluid actions. MVideo overcomes the +limitations of text prompts by incorporating mask sequences as an additional +motion condition input, providing a clearer, more accurate representation of +intended actions. Leveraging foundational vision models such as GroundingDINO +and SAM2, MVideo automatically generates mask sequences, enhancing both +efficiency and robustness. Our results demonstrate that, after training, MVideo +effectively aligns text prompts with motion conditions to produce videos that +simultaneously meet both criteria. This dual control mechanism allows for more +dynamic video generation by enabling alterations to either the text prompt or +motion condition independently, or both in tandem. Furthermore, MVideo supports +motion condition editing and composition, facilitating the generation of videos +with more complex actions. MVideo thus advances T2V motion generation, setting +a strong benchmark for improved action depiction in current video diffusion +models. Our project page is available at https://mvideo-v1.github.io/. + +
+
+ comment: Project page: https://mvideo-v1.github.io/ +
+
+
+
+
+ + ☆ Robust Divergence Learning for Missing-Modality Segmentation + + +
+ Multimodal Magnetic Resonance Imaging (MRI) provides essential complementary +information for analyzing brain tumor subregions. While methods using four +common MRI modalities for automatic segmentation have shown success, they often +face challenges with missing modalities due to image quality issues, +inconsistent protocols, allergic reactions, or cost factors. Thus, developing a +segmentation paradigm that handles missing modalities is clinically valuable. A +novel single-modality parallel processing network framework based on H\"older +divergence and mutual information is introduced. Each modality is independently +input into a shared network backbone for parallel processing, preserving unique +information. Additionally, a dynamic sharing framework is introduced that +adjusts network parameters based on modality availability. A H\"older +divergence and mutual information-based loss functions are used for evaluating +discrepancies between predictions and labels. Extensive testing on the BraTS +2018 and BraTS 2020 datasets demonstrates that our method outperforms existing +techniques in handling missing modalities and validates each component's +effectiveness. + +
+
+
+
+
+ + ☆ Choix d'un espace de représentation image adapté à la détection + de réseaux routiers + + +
+ In recent years, algorithms that decompose an image into its +structure and texture components have emerged. In this paper, we present an +application of this type of decomposition to the problem of road network detection +in aerial or satellite imagery. The algorithmic procedure involves the image +decomposition (using a unique property), an alignment detection step based on +the Gestalt theory, and a refinement step using statistical active contours. + +
+
+ comment: in French language +
+
+
+
+
+ + ☆ Noisy image decomposition: a new structure, texture and noise model + based on local adaptivity + + +
+ These last few years, image decomposition algorithms have been proposed to +split an image into two parts: the structures and the textures. These +algorithms are not adapted to the case of noisy images because the textures are +corrupted by noise. In this paper, we propose a new model which decomposes an +image into three parts (structures, textures and noise) based on a local +regularization scheme. We compare our results with the recent work of Aujol and +Chambolle. We finish by giving another model which combines the advantages of +the two previous ones. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2411.05265 +
+
+
+
+
+ + ☆ Restoration algorithms and system performance evaluation for active + imagers + + +
+ This paper deals with two fields related to active imaging systems. First, we +begin to explore image processing algorithms to restore the artefacts like +speckle, scintillation and image dancing caused by atmospheric turbulence. +Next, we examine how to evaluate the performance of this kind of system. To do +this task, we propose a modified version of the German TRM3 metric which +allows us to obtain MTF-like measures. We use the database acquired during NATO-TG40 +field trials to make our tests. + +
+
+
+
+
+ + ☆ MBA-SLAM: Motion Blur Aware Dense Visual SLAM with Radiance Fields + Representation + + +
+ Emerging 3D scene representations, such as Neural Radiance Fields (NeRF) and +3D Gaussian Splatting (3DGS), have demonstrated their effectiveness in +Simultaneous Localization and Mapping (SLAM) for photo-realistic rendering, +particularly when using high-quality video sequences as input. However, +existing methods struggle with motion-blurred frames, which are common in +real-world scenarios like low-light or long-exposure conditions. This often +results in a significant reduction in both camera localization accuracy and map +reconstruction quality. To address this challenge, we propose a dense visual +SLAM pipeline (i.e. MBA-SLAM) to handle severe motion-blurred inputs. Our +approach integrates an efficient motion blur-aware tracker with either neural +radiance fields or Gaussian Splatting based mapper. By accurately modeling the +physical image formation process of motion-blurred images, our method +simultaneously learns 3D scene representation and estimates the cameras' local +trajectory during exposure time, enabling proactive compensation for motion +blur caused by camera movement. In our experiments, we demonstrate that +MBA-SLAM surpasses previous state-of-the-art methods in both camera +localization and map reconstruction, showcasing superior performance across a +range of datasets, including synthetic and real datasets featuring sharp images +as well as those affected by motion blur, highlighting the versatility and +robustness of our approach. Code is available at +https://github.com/WU-CVGL/MBA-SLAM. + +
+
+
+
+
+ + ☆ LBONet: Supervised Spectral Descriptors for Shape Analysis + + +
+ The Laplace-Beltrami operator has established itself in the field of +non-rigid shape analysis due to its many useful properties such as being +invariant under isometric transformation, having a countable eigensystem +forming an orthonormal basis, and fully characterizing geodesic distances of +the manifold. However, this invariancy only applies under isometric +deformations, which leads to a performance breakdown in many real-world +applications. In recent years emphasis has been placed upon extracting optimal +features using deep learning methods, however spectral signatures play a +crucial role and still add value. In this paper we take a step back, revisiting +the LBO and proposing a supervised way to learn several operators on a +manifold. Depending on the task, by applying these functions, we can train the +LBO eigenbasis to be more task-specific. The optimization of the LBO leads to +enormous improvements to established descriptors such as the heat kernel +signature in various tasks such as retrieval, classification, segmentation, and +correspondence, proving the adaption of the LBO eigenbasis to both global and +highly local learning settings. + +
+
+ comment: 14 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Scaling Properties of Diffusion Models for Perceptual Tasks + + +
+ In this paper, we argue that iterative computation with diffusion models +offers a powerful paradigm for not only generation but also visual perception +tasks. We unify tasks such as depth estimation, optical flow, and amodal +segmentation under the framework of image-to-image translation, and show how +diffusion models benefit from scaling training and test-time compute for these +perceptual tasks. Through a careful analysis of these scaling properties, we +formulate compute-optimal training and inference recipes to scale diffusion +models for visual perception tasks. Our models achieve competitive performance +to state-of-the-art methods using significantly less data and compute. To +access our code and models, see https://scaling-diffusion-perception.github.io . + +
+
+
+
+
+ + ♻ ☆ Forensic Iris Image-Based Post-Mortem Interval Estimation + + +
+ Post-mortem iris recognition is an emerging application of iris-based human +identification in a forensic setup. One factor that may be useful in +conditioning iris recognition methods is the tissue decomposition level, which +is correlated with the post-mortem interval (PMI), i.e., the number of hours that +have elapsed since death. PMI, however, is not always available, and its +precise estimation remains one of the core challenges in forensic examination. +This paper presents the first method known to us for estimating the PMI directly +from iris images captured after death. To assess the feasibility of the +iris-based PMI estimation, we designed models predicting the PMI from (a) +near-infrared (NIR), (b) visible (RGB), and (c) multispectral (RGB+NIR) +forensic iris images. Models were evaluated following a 10-fold +cross-validation, in (S1) sample-disjoint, (S2) subject-disjoint, and (S3) +cross-dataset scenarios. We explore two data balancing techniques for S3: +resampling-based balancing (S3-real), and synthetic data-supplemented balancing +(S3-synthetic). We found that using the multispectral data offers a +spectacularly low mean absolute error (MAE) of $\approx 3.5$ hours in the +scenario (S1), a bit worse MAE $\approx 17.5$ hours in the scenario (S2), and +MAE $\approx 45.77$ hours in the scenario (S3). Additionally, supplementing the +training set with synthetically-generated forensic iris images (S3-synthetic) +significantly enhances the models' ability to generalize to new NIR, RGB and +multispectral data collected in a different lab. This suggests that if the +environmental conditions are favorable (e.g., bodies are kept in low +temperatures), forensic iris images provide features that are indicative of the +PMI and can be automatically estimated. + +
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +twisted foreground objects when applied to images with foreground elements such +as human figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
+
+ comment: Accepted by 2024 5th International Conference on Computer Vision, + Image and Deep Learning +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ GaussianObject: High-Quality 3D Object Reconstruction from Four Views + with Gaussian Splatting SIGGRAPH + + +
+ Reconstructing and rendering 3D objects from highly sparse views is of +critical importance for promoting applications of 3D vision techniques and +improving user experience. However, images from sparse views only contain very +limited 3D information, leading to two significant challenges: 1) Difficulty in +building multi-view consistency as images for matching are too few; 2) +Partially omitted or highly compressed object information as view coverage is +insufficient. To tackle these challenges, we propose GaussianObject, a +framework to represent and render the 3D object with Gaussian splatting that +achieves high rendering quality with only 4 input images. We first introduce +techniques of visual hull and floater elimination, which explicitly inject +structure priors into the initial optimization process to help build multi-view +consistency, yielding a coarse 3D Gaussian representation. Then we construct a +Gaussian repair model based on diffusion models to supplement the omitted +object information, where Gaussians are further refined. We design a +self-generating strategy to obtain image pairs for training the repair model. +We further design a COLMAP-free variant, where pre-given accurate camera poses +are not required, which achieves competitive quality and facilitates wider +applications. GaussianObject is evaluated on several challenging datasets, +including MipNeRF360, OmniObject3D, OpenIllumination, and our-collected unposed +images, achieving superior performance from only four views and significantly +outperforming previous SOTA methods. Our demo is available at +https://gaussianobject.github.io/, and the code has been released at +https://github.com/GaussianObject/GaussianObject. + +
+
+ comment: ACM Transactions on Graphics (SIGGRAPH Asia 2024). Project page: + https://gaussianobject.github.io/ Code: + https://github.com/chensjtu/GaussianObject +
+
+
+
+
+ + ♻ ☆ Quantifying and Mitigating Unimodal Biases in Multimodal Large Language + Models: A Causal Perspective + + +
+ Recent advancements in Large Language Models (LLMs) have facilitated the +development of Multimodal LLMs (MLLMs). Despite their impressive capabilities, +MLLMs often suffer from over-reliance on unimodal biases (e.g., language bias +and vision bias), leading to incorrect answers or hallucinations in complex +multimodal tasks. To investigate this issue, we propose a causal framework to +interpret the biases in Visual Question Answering (VQA) problems. Within this +framework, we conduct an in-depth causal analysis to assess the causal effect +of these biases on MLLM predictions. Based on the analysis, we introduce 1) a +novel MORE dataset with 12,000 challenging VQA instances requiring multi-hop +reasoning and overcoming unimodal biases. 2) a causality-enhanced agent +framework CAVE that guides models to comprehensively integrate information from +different modalities and mitigate biases. Our experiments show that MLLMs +perform poorly on MORE, indicating strong unimodal biases and limited semantic +understanding. However, when integrated with our CAVE, promising improvements +in reasoning and bias mitigation can be seen. These findings provide important +insights for the development of more robust MLLMs and contribute to the broader +goal of advancing multimodal AI systems capable of deeper understanding and +reasoning. Our project page is at https://github.com/OpenCausaLab/MORE. + +
+
+
+
+
+ + ♻ ☆ Textured-GS: Gaussian Splatting with Spatially Defined Color and Opacity + + +
+ In this paper, we introduce Textured-GS, an innovative method for rendering +Gaussian splatting that incorporates spatially defined color and opacity +variations using Spherical Harmonics (SH). This approach enables each Gaussian +to exhibit a richer representation by accommodating varying colors and +opacities across its surface, significantly enhancing rendering quality +compared to traditional methods. To demonstrate the merits of our approach, we +have adapted the Mini-Splatting architecture to integrate textured Gaussians +without increasing the number of Gaussians. Our experiments across multiple +real-world datasets show that Textured-GS consistently outperforms both the +baseline Mini-Splatting and standard 3DGS in terms of visual fidelity. The +results highlight the potential of Textured-GS to advance Gaussian-based +rendering technologies, promising more efficient and high-quality scene +reconstructions. Our implementation is available at +https://github.com/ZhentaoHuang/Textured-GS. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity + Synthesis of MR Images with Structure Preservation + + +
+ Synthesizing medical images while preserving their structural information is +crucial in medical research. In such scenarios, the preservation of anatomical +content becomes especially important. Although recent advances have been made +by incorporating instance-level information to guide translation, these methods +overlook the spatial coherence of structural-level representation and the +anatomical invariance of content during translation. To address these issues, +we introduce hierarchical granularity discrimination, which exploits various +levels of semantic information present in medical images. Our strategy utilizes +three levels of discrimination granularity: pixel-level discrimination using a +Brain Memory Bank, structure-level discrimination on each brain structure with +a re-weighting strategy to focus on hard samples, and global-level +discrimination to ensure anatomical consistency during translation. The image +translation performance of our strategy has been evaluated on three independent +datasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed +state-of-the-art algorithms. Particularly, our model excels not only in +synthesizing normal structures but also in handling abnormal (pathological) +structures, such as brain tumors, despite the variations in contrast observed +across different imaging modalities due to their pathological characteristics. +The diagnostic value of synthesized MR images containing brain tumors has been +evaluated by radiologists. This indicates that our model may offer an +alternative solution in scenarios where specific MR modalities of patients are +unavailable. Extensive experiments further demonstrate the versatility of our +method, providing unique insights into medical image translation. + +
+
+
+
+
+ + ♻ ☆ BoQ: A Place is Worth a Bag of Learnable Queries CVPR 2024 + + +
+ In visual place recognition, accurately identifying and matching images of +locations under varying environmental conditions and viewpoints remains a +significant challenge. In this paper, we introduce a new technique, called +Bag-of-Queries (BoQ), which learns a set of global queries designed to capture +universal place-specific attributes. Unlike existing methods that employ +self-attention and generate the queries directly from the input features, BoQ +employs distinct learnable global queries, which probe the input features via +cross-attention, ensuring consistent information aggregation. In addition, our +technique provides an interpretable attention mechanism and integrates with +both CNN and Vision Transformer backbones. The performance of BoQ is +demonstrated through extensive experiments on 14 large-scale benchmarks. It +consistently outperforms current state-of-the-art techniques including NetVLAD, +MixVPR and EigenPlaces. Moreover, as a global retrieval technique (one-stage), +BoQ surpasses two-stage retrieval methods, such as Patch-NetVLAD, TransVPR and +R2Former, all while being orders of magnitude faster and more efficient. The +code and model weights are publicly available at +https://github.com/amaralibey/Bag-of-Queries. + +
+
+ comment: Accepted at CVPR 2024 +
+
+
+
+
+ + ♻ ☆ Deep Learning for Economists + + +
+ Deep learning provides powerful methods to impute structured information from +large-scale, unstructured text and image datasets. For example, economists +might wish to detect the presence of economic activity in satellite images, or +to measure the topics or entities mentioned in social media, the congressional +record, or firm filings. This review introduces deep neural networks, covering +methods such as classifiers, regression models, generative AI, and embedding +models. Applications include classification, document digitization, record +linkage, and methods for data exploration in massive scale text and image +corpora. When suitable methods are used, deep learning models can be cheap to +tune and can scale affordably to problems involving millions or billions of +data points. The review is accompanied by a companion website, EconDL, with +user-friendly demo notebooks, software resources, and a knowledge base that +provides technical details and additional applications. + +
+
+
+
+
+ + ♻ ☆ Into the Fog: Evaluating Robustness of Multiple Object Tracking + + +
+ State-of-the-art Multiple Object Tracking (MOT) approaches have shown +remarkable performance when trained and evaluated on current benchmarks. +However, these benchmarks primarily consist of clear weather scenarios, +overlooking adverse atmospheric conditions such as fog, haze, smoke and dust. +As a result, the robustness of trackers against these challenging conditions +remains underexplored. To address this gap, we introduce physics-based +volumetric fog simulation method for arbitrary MOT datasets, utilizing +frame-by-frame monocular depth estimation and a fog formation optical model. We +enhance our simulation by rendering both homogeneous and heterogeneous fog and +propose to use the dark channel prior method to estimate atmospheric light, +showing promising results even in night and indoor scenes. We present the +leading benchmark MOTChallenge (third release) augmented with fog (smoke for +indoor scenes) of various intensities and conduct a comprehensive evaluation of +MOT methods, revealing their limitations under fog and fog-like challenges. + +
+
+
+
+
+ + ♻ ☆ Optimal Transport on the Lie Group of Roto-translations + + +
+ The roto-translation group SE2 has been of active interest in image analysis +due to methods that lift the image data to multi-orientation representations +defined on this Lie group. This has led to impactful applications of +crossing-preserving flows for image de-noising, geodesic tracking, and +roto-translation equivariant deep learning. In this paper, we develop a +computational framework for optimal transportation over Lie groups, with a +special focus on SE2. We make several theoretical contributions (generalizable +to matrix Lie groups) such as the non-optimality of group actions as transport +maps, invariance and equivariance of optimal transport, and the quality of the +entropic-regularized optimal transport plan using geodesic distance +approximations. We develop a Sinkhorn like algorithm that can be efficiently +implemented using fast and accurate distance approximations of the Lie group +and GPU-friendly group convolutions. We report valuable advancements in the +experiments on 1) image barycentric interpolation, 2) interpolation of planar +orientation fields, and 3) Wasserstein gradient flows on SE2. We observe that +our framework of lifting images to SE2 and optimal transport with +left-invariant anisotropic metrics leads to equivariant transport along +dominant contours and salient line structures in the image. This yields sharper +and more meaningful interpolations compared to their counterparts on R^2 + +
+
+
+
+
+ + ♻ ☆ Extracting polygonal footprints in off-nadir images with Segment + Anything Model + + +
+ Building Footprint Extraction (BFE) from off-nadir aerial images often +involves roof segmentation and offset prediction to adjust roof boundaries to +the building footprint. However, this multi-stage approach typically produces +low-quality results, limiting its applicability in real-world data production. +To address this issue, we present OBMv2, an end-to-end and promptable model for +polygonal footprint prediction. Unlike its predecessor OBM, OBMv2 introduces a +novel Self Offset Attention (SOFA) mechanism that improves performance across +diverse building types, from bungalows to skyscrapers, enabling end-to-end +footprint prediction without post-processing. Additionally, we propose a +Multi-level Information System (MISS) to effectively leverage roof masks, +building masks, and offsets for accurate footprint prediction. We evaluate +OBMv2 on the BONAI and OmniCity-view3 datasets and demonstrate its +generalization on the Huizhou test set. The code will be available at +https://github.com/likaiucas/OBMv2. + +
+
+
+
+
+ + ♻ ☆ V-LoL: A Diagnostic Dataset for Visual Logical Learning + + +
+ Despite the successes of recent developments in visual AI, different +shortcomings still exist, from missing exact logical reasoning, to abstract +generalization abilities, to understanding complex and noisy scenes. +Unfortunately, existing benchmarks were not designed to capture more than a +few of these aspects. Whereas deep learning datasets focus on visually complex +data but simple visual reasoning tasks, inductive logic datasets involve +complex logical learning tasks but lack the visual component. To address +this, we propose the diagnostic visual logical learning dataset, V-LoL, that +seamlessly combines visual and logical challenges. Notably, we introduce the +first instantiation of V-LoL, V-LoL-Train - a visual rendition of a classic +benchmark in symbolic AI, the Michalski train problem. By incorporating +intricate visual scenes and flexible logical reasoning tasks within a versatile +framework, V-LoL-Train provides a platform for investigating a wide range of +visual logical learning challenges. We evaluate a variety of AI systems +including traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our +evaluations demonstrate that even SOTA AI faces difficulties in dealing with +visual logical learning challenges, highlighting unique advantages and +limitations of each methodology. Overall, V-LoL opens up new avenues for +understanding and enhancing current abilities in visual logical learning for AI +systems. + +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ VoxelKeypointFusion: Generalizable Multi-View Multi-Person Pose + Estimation + + +
+ In the rapidly evolving field of computer vision, the task of accurately +estimating the poses of multiple individuals from various viewpoints presents a +formidable challenge, especially if the estimations should be reliable as well. +This work presents an extensive evaluation of the generalization capabilities +of multi-view multi-person pose estimators to unseen datasets and presents a +new algorithm with strong performance in this task. It also studies the +improvements by additionally using depth information. Since the new approach +can not only generalize well to unseen datasets, but also to different +keypoints, the first multi-view multi-person whole-body estimator is presented. +To support further research on those topics, all of the work is publicly +accessible. + +
+
+
+
+
+ + ♻ ☆ SLYKLatent: A Learning Framework for Gaze Estimation Using Deep Facial + Feature Learning + + +
+ In this research, we present SLYKLatent, a novel approach for enhancing gaze +estimation by addressing appearance instability challenges in datasets due to +aleatoric uncertainties, covariant shifts, and test domain generalization. +SLYKLatent utilizes Self-Supervised Learning for initial training with facial +expression datasets, followed by refinement with a patch-based tri-branch +network and an inverse explained variance-weighted training loss function. Our +evaluation on benchmark datasets achieves a 10.9% improvement on Gaze360, +supersedes top MPIIFaceGaze results with 3.8%, and leads on a subset of +ETH-XGaze by 11.6%, surpassing existing methods by significant margins. +Adaptability tests on RAF-DB and Affectnet show 86.4% and 60.9% accuracies, +respectively. Ablation studies confirm the effectiveness of SLYKLatent's novel +components. + +
+
+
+
+
+ + ♻ ☆ Snakes and Ladders: Two Steps Up for VideoMamba + + +
+ Video understanding requires the extraction of rich spatio-temporal +representations, which transformer models achieve through self-attention. +Unfortunately, self-attention poses a computational burden. In NLP, Mamba has +surfaced as an efficient alternative for transformers. However, Mamba's +successes do not trivially extend to vision tasks, including those in video +analysis. In this paper, we theoretically analyze the differences between +self-attention and Mamba. We identify two limitations in Mamba's token +processing: historical decay and element contradiction. We propose +VideoMambaPro (VMP) that solves the identified limitations by adding masked +backward computation and elemental residual connections to a VideoMamba +backbone. Differently sized VideoMambaPro models surpass VideoMamba by 1.6-2.8% +and 1.1-1.9% top-1 on Kinetics-400 and Something-Something V2, respectively. +Even without extensive pre-training, our models present an increasingly +attractive and efficient alternative to current transformer models. Moreover, +our two solutions are orthogonal to recent advances in Vision Mamba models, and +are likely to provide further improvements in future models. + +
+
+ comment: New updated experiment results +
+
+
+
+
+ + ♻ ☆ Automatic dataset shift identification to support root cause analysis of + AI performance drift + + +
+ Shifts in data distribution can substantially harm the performance of +clinical AI models. Hence, various methods have been developed to detect the +presence of such shifts at deployment time. However, root causes of dataset +shifts are varied, and the choice of shift mitigation strategies is highly +dependent on the precise type of shift encountered at test time. As such, +detecting test-time dataset shift is not sufficient: precisely identifying +which type of shift has occurred is critical. In this work, we propose the +first unsupervised dataset shift identification framework, effectively +distinguishing between prevalence shift (caused by a change in the label +distribution), covariate shift (caused by a change in input characteristics) +and mixed shifts (simultaneous prevalence and covariate shifts). We discuss the +importance of self-supervised encoders for detecting subtle covariate shifts +and propose a novel shift detector leveraging both self-supervised encoders and +task model outputs for improved shift detection. We report promising results +for the proposed shift identification framework across three different imaging +modalities (chest radiography, digital mammography, and retinal fundus images) +on five types of real-world dataset shifts, using four large publicly available +datasets. + +
+
+ comment: Code available at + https://github.com/biomedia-mira/shift_identification +
+
+
+
+
+ + ♻ ☆ CLASS-M: Adaptive stain separation-based contrastive learning with + pseudo-labeling for histopathological image classification + + +
+ Histopathological image classification is an important task in medical image +analysis. Recent approaches generally rely on weakly supervised learning due to +the ease of acquiring case-level labels from pathology reports. However, +patch-level classification is preferable in applications where only a limited +number of cases are available or when local prediction accuracy is critical. On +the other hand, acquiring extensive datasets with localized labels for training +is not feasible. In this paper, we propose a semi-supervised patch-level +histopathological image classification model, named CLASS-M, that does not +require extensively labeled datasets. CLASS-M is formed by two main parts: a +contrastive learning module that uses separated Hematoxylin and Eosin images +generated through an adaptive stain separation process, and a module with +pseudo-labels using MixUp. We compare our model with other state-of-the-art +models on two clear cell renal cell carcinoma datasets. We demonstrate that our +CLASS-M model has the best performance on both datasets. Our code is available +at github.com/BzhangURU/Paper_CLASS-M/tree/main + +
+
+
+
+
+ + ♻ ☆ A Review of Electromagnetic Elimination Methods for low-field portable + MRI scanner + + +
+ This paper analyzes conventional and deep learning methods for eliminating +electromagnetic interference (EMI) in MRI systems. We compare traditional +analytical and adaptive techniques with advanced deep learning approaches. Key +strengths and limitations of each method are highlighted. Recent advancements +in active EMI elimination, such as external EMI receiver coils, are discussed +alongside deep learning methods, which show superior EMI suppression by +leveraging neural networks trained on MRI data. While deep learning improves +EMI elimination and diagnostic capabilities, it introduces security and safety +concerns, particularly in commercial applications. A balanced approach, +integrating conventional reliability with deep learning's advanced +capabilities, is proposed for more effective EMI suppression in MRI systems. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ Exploring Test-Time Adaptation for Object Detection in Continually + Changing Environments + + +
+ Real-world application models are commonly deployed in dynamic environments, +where the target domain distribution undergoes temporal changes. Continual +Test-Time Adaptation (CTTA) has recently emerged as a promising technique to +gradually adapt a source-trained model to continually changing target domains. +Despite recent advancements in addressing CTTA, two critical issues remain: 1) +Fixed thresholds for pseudo-labeling in existing methodologies lead to +low-quality pseudo-labels, as model confidence varies across categories and +domains; 2) Stochastic parameter restoration methods for mitigating +catastrophic forgetting fail to preserve critical information effectively, due +to their intrinsic randomness. To tackle these challenges for detection models +in CTTA scenarios, we present AMROD, featuring three core components. Firstly, +the object-level contrastive learning module extracts object-level features for +contrastive learning to refine the feature representation in the target domain. +Secondly, the adaptive monitoring module dynamically skips unnecessary +adaptation and updates the category-specific threshold based on predicted +confidence scores to enable efficiency and improve the quality of +pseudo-labels. Lastly, the adaptive randomized restoration mechanism +selectively resets inactive parameters with higher possibilities, ensuring the +retention of essential knowledge. We demonstrate the effectiveness of AMROD on +four CTTA object detection tasks, where AMROD outperforms existing methods, +especially achieving a 3.2 mAP improvement and a 20% increase in efficiency on +the Cityscapes-to-Cityscapes-C CTTA task. The code will be released. + +
+
+
+
+
+ + ♻ ☆ LT-DARTS: An Architectural Approach to Enhance Deep Long-Tailed Learning + + +
+ Deep long-tailed recognition has been widely studied to address the issue of +imbalanced data distributions in real-world scenarios. However, there has been +insufficient focus on the design of neural architectures, despite empirical +evidence suggesting that architecture can significantly impact performance. In +this paper, we attempt to mitigate long-tailed issues through architectural +improvements. To simplify the design process, we utilize Differential +Architecture Search (DARTS) to achieve this goal. Unfortunately, existing DARTS +methods struggle to perform well in long-tailed scenarios. To tackle this +challenge, we introduce Long-Tailed Differential Architecture Search +(LT-DARTS). Specifically, we conduct extensive experiments to explore +architectural components that demonstrate better performance on long-tailed +data and propose a new search space based on our observations. This ensures +that the architecture obtained through our search process incorporates superior +components. Additionally, we propose replacing the learnable linear classifier +with an Equiangular Tight Frame (ETF) classifier to further enhance our method. +This classifier effectively alleviates the biased search process and prevents +performance collapse. Extensive experimental evaluations demonstrate that our +approach consistently improves upon existing methods from an orthogonal +perspective and achieves state-of-the-art results with simple enhancements. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
+
+
+
+ + ♻ ☆ Continual Learning in the Frequency Domain NeurIPS 2024 + + +
+ Continual learning (CL) is designed to learn new tasks while preserving +existing knowledge. Replaying samples from earlier tasks has proven to be an +effective method to mitigate the forgetting of previously acquired knowledge. +However, the current research on the training efficiency of rehearsal-based +methods is insufficient, which limits the practical application of CL systems +in resource-limited scenarios. The human visual system (HVS) exhibits varying +sensitivities to different frequency components, enabling the efficient +elimination of visually redundant information. Inspired by HVS, we propose a +novel framework called Continual Learning in the Frequency Domain (CLFD). To +our knowledge, this is the first study to utilize frequency domain features to +enhance the performance and efficiency of CL training on edge devices. For the +input features of the feature extractor, CLFD employs wavelet transform to map +the original input image into the frequency domain, thereby effectively +reducing the size of input feature maps. Regarding the output features of the +feature extractor, CLFD selectively utilizes output features for distinct +classes for classification, thereby balancing the reusability and interference +of output features based on the frequency domain similarity of the classes +across various tasks. Optimizing only the input and output features of the +feature extractor allows for seamless integration of CLFD with various +rehearsal-based methods. Extensive experiments conducted in both cloud and edge +environments demonstrate that CLFD consistently improves the performance of +state-of-the-art (SOTA) methods in both precision and training efficiency. +Specifically, CLFD can increase the accuracy of the SOTA CL method by up to +6.83% and reduce the training time by 2.6$\times$. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Show Me What and Where has Changed? Question Answering and Grounding for + Remote Sensing Change Detection + + +
+ Remote sensing change detection aims to perceive changes occurring on the +Earth's surface from remote sensing data in different periods, and feed these +changes back to humans. However, most existing methods only focus on detecting +change regions, lacking the capability to interact with users to identify +changes that the users expect. In this paper, we introduce a new task named +Change Detection Question Answering and Grounding (CDQAG), which extends the +traditional change detection task by providing interpretable textual answers +and intuitive visual evidence. To this end, we construct the first CDQAG +benchmark dataset, termed QAG-360K, comprising over 360K triplets of questions, +textual answers, and corresponding high-quality visual masks. It encompasses 10 +essential land-cover categories and 8 comprehensive question types, which +provides a valuable and diverse dataset for remote sensing applications. +Furthermore, we present VisTA, a simple yet effective baseline method that +unifies the tasks of question answering and grounding by delivering both visual +and textual answers. Our method achieves state-of-the-art results on both the +classic change detection-based visual question answering (CDVQA) and the +proposed CDQAG datasets. Extensive qualitative and quantitative experimental +results provide useful insights for developing better CDQAG models, and we hope +that our work can inspire further research in this important yet underexplored +research field. The proposed benchmark dataset and method are available at +https://github.com/like413/VisTA. + +
+
+
+
+
+ + ♻ ☆ From Explicit Rules to Implicit Reasoning in an Interpretable Violence + Monitoring System + + +
+ Recently, research based on pre-trained models has demonstrated outstanding +performance in violence surveillance tasks. However, most of them were +black-box systems which faced challenges regarding explainability during +training and inference processes. An important question is how to incorporate +explicit knowledge into these implicit models, thereby designing expert-driven +and interpretable violence surveillance systems. This paper proposes a new +paradigm for weakly supervised violence monitoring (WSVM) called Rule base +Violence Monitoring (RuleVM). The proposed RuleVM uses a dual-branch structure +with different designs for images and text. One of the branches is called the +implicit branch, which uses only visual features for coarse-grained binary +classification. In this branch, image feature extraction is divided into two +channels: one responsible for extracting scene frames and the other focusing on +extracting actions. The other branch is called the explicit branch, which +utilizes language-image alignment to perform fine-grained classification. For +the language channel design in the explicit branch, the proposed RuleCLIP uses +the state-of-the-art YOLO-World model to detect objects in video frames, and +association rules are identified through data mining methods as descriptions of +the video. Leveraging the dual-branch architecture, RuleVM achieves +interpretable coarse-grained and fine-grained violence surveillance. Extensive +experiments were conducted on two commonly used benchmarks, and the results +show that RuleCLIP achieved the best performance in both coarse-grained and +fine-grained monitoring, significantly outperforming existing state-of-the-art +methods. Moreover, interpretability experiments uncovered some interesting +rules, such as the observation that as the number of people increases, the risk +level of violent behavior also rises. + +
+
+ comment: 12 pages,7 figures IEEE TSMCA (Under review) +
+
+
+
+
+ + ♻ ☆ *: Improving the 3D detector by introducing Voxel2Pillar feature + encoding and extracting multi-scale features + + +
+ The multi-line LiDAR is widely used in autonomous vehicles, so point +cloud-based 3D detectors are essential for autonomous driving. Extracting rich +multi-scale features is crucial for point cloud-based 3D detectors in +autonomous driving due to significant differences in the size of different +types of objects. However, because of the real-time requirements, large-size +convolution kernels are rarely used to extract large-scale features in the +backbone. Current 3D detectors commonly use feature pyramid networks to obtain +large-scale features; however, some objects containing fewer point clouds are +further lost during down-sampling, resulting in degraded performance. Since +pillar-based schemes require much less computation than voxel-based schemes, +they are more suitable for constructing real-time 3D detectors. Hence, we +propose the *, a pillar-based scheme. We redesigned the feature encoding, the +backbone, and the neck of the 3D detector. We propose the Voxel2Pillar feature +encoding, which uses a sparse convolution constructor to construct pillars with +richer point cloud features, especially height features. The Voxel2Pillar adds +more learnable parameters to the feature encoding, enabling the initial pillars +to have higher performance ability. We extract multi-scale and large-scale +features in the proposed fully sparse backbone, which does not utilize +large-size convolutional kernels; the backbone consists of the proposed +multi-scale feature extraction module. The neck consists of the proposed sparse +ConvNeXt, whose simple structure significantly improves the performance. We +validate the effectiveness of the proposed * on the Waymo Open Dataset, and the +object detection accuracy for vehicles, pedestrians, and cyclists is improved. +We also verify the effectiveness of each proposed module in detail through +ablation studies. + +
+
+ comment: Due to experimental data errors, it needs to be withdrawn +
+
+
+
+
+ + ♻ ☆ Stem-OB: Generalizable Visual Imitation Learning with Stem-Like + Convergent Observation through Diffusion Inversion + + +
+ Visual imitation learning methods demonstrate strong performance, yet they +lack generalization when faced with visual input perturbations, including +variations in lighting and textures, impeding their real-world application. We +propose Stem-OB that utilizes pretrained image diffusion models to suppress +low-level visual differences while maintaining high-level scene structures. +This image inversion process is akin to transforming the observation into a +shared representation, from which other observations stem, with extraneous +details removed. Stem-OB contrasts with data-augmentation approaches as it is +robust to various unspecified appearance changes without the need for +additional training. Our method is a simple yet highly effective plug-and-play +solution. Empirical results confirm the effectiveness of our approach in +simulated tasks and show an exceptionally significant improvement in real-world +applications, with an average increase of 22.2% in success rates compared to +the best baseline. See https://hukz18.github.io/Stem-Ob/ for more info. + +
+
+ comment: Arxiv preprint version, website: https://hukz18.github.io/Stem-Ob/ +
+
+
+
+
+ + ♻ ☆ LAuReL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce Learned Augmented Residual Layer (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Projecting Gaussian Ellipsoids While Avoiding Affine Projection + Approximation + + +
+ Recently, 3D Gaussian Splatting has dominated novel-view synthesis with its +real-time rendering speed and state-of-the-art rendering quality. However, +during the rendering process, the use of the Jacobian of the affine +approximation of the projection transformation leads to inevitable errors, +resulting in blurriness, artifacts and a lack of scene consistency in the final +rendered images. To address this issue, we introduce an ellipsoid-based +projection method to calculate the projection of the Gaussian ellipsoid on the +image plane, which is the primitive of 3D Gaussian Splatting. As our proposed +ellipsoid-based projection method cannot handle Gaussian ellipsoids with camera +origins inside them or parts lying below the $z=0$ plane in the camera space, we +designed a pre-filtering strategy. Experiments over multiple widely adopted +benchmark datasets show that using our ellipsoid-based projection method can +enhance the rendering quality of 3D Gaussian Splatting and its extensions. + +
+
+
+
+
+ + ♻ ☆ ViTOC: Vision Transformer and Object-aware Captioner + + +
+ This paper presents ViTOC (Vision Transformer and Object-aware Captioner), a +novel vision-language model for image captioning that addresses the challenges +of accuracy and diversity in generated descriptions. Unlike conventional +approaches, ViTOC employs a dual-path architecture based on Vision Transformer +and object detector, effectively fusing global visual features and local object +information through learnable vectors. The model introduces an innovative +object-aware prompting strategy that significantly enhances its capability in +handling long-tail data. Experiments on the standard COCO dataset demonstrate +that ViTOC outperforms baseline models across all evaluation metrics. +Additionally, we propose a reference-free evaluation method based on CLIP to +further validate the model's effectiveness. By utilizing pretrained visual +model parameters, ViTOC achieves efficient end-to-end training. + +
+
+
+
+
+ + ♻ ☆ Leveraging Pre-trained Models for FF-to-FFPE Histopathological Image + Translation + + +
+ The two primary types of Hematoxylin and Eosin (H&E) slides in histopathology +are Formalin-Fixed Paraffin-Embedded (FFPE) and Fresh Frozen (FF). FFPE slides +offer high quality histopathological images but require a labor-intensive +acquisition process. In contrast, FF slides can be prepared quickly, but the +image quality is relatively poor. Our task is to translate FF images into FFPE +style, thereby improving the image quality for diagnostic purposes. In this +paper, we propose Diffusion-FFPE, a method for FF-to-FFPE histopathological +image translation using a pre-trained diffusion model. Specifically, we utilize +a one-step diffusion model as the generator, which we fine-tune using LoRA +adapters within an adversarial learning framework. To enable the model to +effectively capture both global structural patterns and local details, we +introduce a multi-scale feature fusion module that leverages two VAE encoders +to extract features at different image resolutions, performing feature fusion +before inputting them into the UNet. Additionally, a pre-trained +vision-language model for histopathology serves as the backbone for the +discriminator, enhancing model performance. Our FF-to-FFPE translation +experiments on the TCGA-NSCLC dataset demonstrate that the proposed approach +outperforms existing methods. The code and models are released at +https://github.com/QilaiZhang/Diffusion-FFPE. + +
+
+ comment: Accepted at IEEE BIBM 2024 +
+
+
+
+
+ + ♻ ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its +potential to leverage abundant unlabeled data to enhance model robustness. +Pseudo labeling is a widely used strategy in semi supervised learning. However, +existing methods often suffer from noise contamination, which can undermine +model performance. To tackle this challenge, we introduce a novel +Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework. +Built upon the mean teacher network, we employ a Mix Augmentation module to +enhance the unlabeled data. By evaluating the synergy before and after +augmentation, we strategically partition the pseudo labels into distinct +regions. Additionally, we introduce a Region Loss Evaluation module to assess +the loss across each delineated area. Extensive experiments conducted on the LA +dataset have demonstrated superior performance over state-of-the-art +techniques, underscoring the efficiency and practicality of our framework. + +
+
+
+
+
+ + ♻ ☆ Six-Point Method for Multi-Camera Systems with Reduced Solution Space ECCV + + +
+ Relative pose estimation using point correspondences (PC) is a widely used +technique. A minimal configuration of six PCs is required for two views of +generalized cameras. In this paper, we present several minimal solvers that use +six PCs to compute the 6DOF relative pose of multi-camera systems, including a +minimal solver for the generalized camera and two minimal solvers for the +practical configuration of two-camera rigs. The equation construction is based +on the decoupling of rotation and translation. Rotation is represented by +Cayley or quaternion parametrization, and translation can be eliminated by +using the hidden variable technique. Ray bundle constraints are found and +proven when a subset of PCs relate the same cameras across two views. This is +the key to reducing the number of solutions and generating numerically stable +solvers. Moreover, all configurations of six-point problems for multi-camera +systems are enumerated. Extensive experiments demonstrate the superior accuracy +and efficiency of our solvers compared to state-of-the-art six-point methods. +The code is available at https://github.com/jizhaox/relpose-6pt + +
+
+ comment: Accepted to the European Conference on Computer Vision (ECCV), 2024, + for an oral presentation +
+
+
+
+
+ + ♻ ☆ DINO-LG: A Task-Specific DINO Model for Coronary Calcium Scoring + + +
+ Coronary artery disease (CAD) is one of the most common causes of mortality in +the world. Coronary artery calcium (CAC) scoring using computed tomography (CT) +is key for risk assessment to prevent coronary disease. Previous studies on +risk assessment and calcification detection in CT scans primarily use +approaches based on UNET architecture, frequently implemented on pre-built +models. However, these models are limited by the availability of annotated CT +scans containing CAC and suffer from imbalanced datasets, decreasing the +performance of CAC segmentation and scoring. In this study, we extend this +approach by incorporating the self-supervised learning (SSL) technique of DINO +(self-distillation with no labels) to eliminate limitations of scarce annotated +data in CT scans. The DINO model's ability to train without requiring CAC area +annotations enhances its robustness in generating distinct features. The DINO +model is trained to focus specifically on calcified areas by using labels, +aiming to generate features that effectively capture and highlight key +characteristics. The label-guided DINO (DINO-LG) enhances classification by +distinguishing CT slices that contain calcification from those that do not, +performing 57% better than the standard DINO model in this task. CAC scoring +and segmentation tasks are performed by a basic U-NET architecture, fed +specifically with CT slices containing calcified areas as identified by the +DINO-LG model. This targeted identification performed by the DINO-LG model improves +CAC segmentation performance by approximately 10% and significantly increases +CAC scoring accuracy. + +
+
+ comment: Developed by Center for Applied Artificial Intelligence (CAAI), + University of Kentucky +
+
+
+
+
+ + ♻ ☆ Personalize to generalize: Towards a universal medical multi-modality + generalization through personalization + + +
+ The differences among medical imaging modalities, driven by distinct +underlying principles, pose significant challenges for generalization in +multi-modal medical tasks. Beyond modality gaps, individual variations, such as +differences in organ size and metabolic rate, further impede a model's ability +to generalize effectively across both modalities and diverse populations. +Despite the importance of personalization, existing approaches to multi-modal +generalization often neglect individual differences, focusing solely on common +anatomical features. This limitation may result in weakened generalization in +various medical tasks. In this paper, we unveil that personalization is +critical for multi-modal generalization. Specifically, we propose an approach +to achieve personalized generalization through approximating the underlying +personalized invariant representation ${X}_h$ across various modalities by +leveraging individual-level constraints and a learnable biological prior. We +validate the feasibility and benefits of learning a personalized ${X}_h$, +showing that this representation is highly generalizable and transferable +across various multi-modal medical tasks. Extensive experimental results +consistently show that the additionally incorporated personalization +significantly improves performance and generalization across diverse scenarios, +confirming its effectiveness. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at: +https://github.com/chikap421/mseg_vcuq + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ♻ ☆ RealisHuman: A Two-Stage Approach for Refining Malformed Human Parts in + Generated Images + + +
+ In recent years, diffusion models have revolutionized visual generation, +outperforming traditional frameworks like Generative Adversarial Networks +(GANs). However, generating images of humans with realistic semantic parts, +such as hands and faces, remains a significant challenge due to their intricate +structural complexity. To address this issue, we propose a novel +post-processing solution named RealisHuman. The RealisHuman framework operates +in two stages. First, it generates realistic human parts, such as hands or +faces, using the original malformed parts as references, ensuring consistent +details with the original image. Second, it seamlessly integrates the rectified +human parts back into their corresponding positions by repainting the +surrounding areas to ensure smooth and realistic blending. The RealisHuman +framework significantly enhances the realism of human generation, as +demonstrated by notable improvements in both qualitative and quantitative +metrics. Code is available at https://github.com/Wangbenzhi/RealisHuman. + +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting ECCV 2024 + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with diffusion prior, they remain struggling to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ♻ ☆ ConMe: Rethinking Evaluation of Compositional Reasoning for Modern VLMs NeurIPS 2024 + + +
+ Compositional Reasoning (CR) entails grasping the significance of attributes, +relations, and word order. Recent Vision-Language Models (VLMs), comprising a +visual encoder and a Large Language Model (LLM) decoder, have demonstrated +remarkable proficiency in such reasoning tasks. This prompts a crucial +question: have VLMs effectively tackled the CR challenge? We conjecture that +existing CR benchmarks may not adequately push the boundaries of modern VLMs +due to the reliance on an LLM-only negative text generation pipeline. +Consequently, the negatives produced either appear as outliers from the natural +language distribution learned by VLMs' LLM decoders or as improbable within the +corresponding image context. To address these limitations, we introduce ConMe +-- a compositional reasoning benchmark and a novel data generation pipeline +leveraging VLMs to produce `hard CR Q&A'. Through a new concept of VLMs +conversing with each other to collaboratively expose their weaknesses, our +pipeline autonomously generates, evaluates, and selects challenging +compositional reasoning questions, establishing a robust CR benchmark, also +subsequently validated manually. Our benchmark provokes a noteworthy, up to +33%, decrease in CR performance compared to preceding benchmarks, reinstating +the CR challenge even for state-of-the-art VLMs. + +
+
+ comment: NeurIPS 2024 Camera Ready +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Rethinking negative sampling in content-based news recommendation + + +
+ News recommender systems are hindered by the brief lifespan of articles, as +they undergo rapid relevance decay. Recent studies have demonstrated the +potential of content-based neural techniques in tackling this problem. However, +these models often involve complex neural architectures and often lack +consideration for negative examples. In this study, we posit that the careful +sampling of negative examples has a big impact on the model's outcome. We +devise a negative sampling technique that not only improves the accuracy of the +model but also facilitates the decentralization of the recommendation system. +The experimental results obtained using the MIND dataset demonstrate that the +accuracy of the method under consideration can compete with that of +State-of-the-Art models. The utilization of the sampling technique is essential +in reducing model complexity and accelerating the training process, while +maintaining a high level of accuracy. Finally, we discuss how decentralized +models can help improve privacy and scalability. + +
+
+
+
+
+ + ☆ Scholarly Wikidata: Population and Exploration of Conference Data in + Wikidata using LLMs + + +
+ Several initiatives have been undertaken to conceptually model the domain of +scholarly data using ontologies and to create respective Knowledge Graphs. Yet, +the full potential seems yet to be unleashed, as automated means for populating +said ontologies are lacking, and respective initiatives from the Semantic +Web community are not necessarily connected: we propose to make scholarly data +more sustainably accessible by leveraging Wikidata's infrastructure and +automating its population in a sustainable manner through LLMs by tapping into +unstructured sources like conference Web sites and proceedings texts as well as +already existing structured conference datasets. While an initial analysis +shows that Semantic Web conferences are only minimally represented in Wikidata, +we argue that our methodology can help to populate, evolve and maintain +scholarly data as a community within Wikidata. Our main contributions include +(a) an analysis of ontologies for representing scholarly data to identify gaps +and relevant entities/properties in Wikidata, (b) semi-automated extraction -- +requiring (minimal) manual validation -- of conference metadata (e.g., +acceptance rates, organizer roles, programme committee members, best paper +awards, keynotes, and sponsors) from websites and proceedings texts using LLMs. +Finally, we discuss (c) extensions to visualization tools in the Wikidata +context for data exploration of the generated scholarly data. Our study focuses +on data from 105 Semantic Web-related conferences and extends/adds more than +6000 entities in Wikidata. It is important to note that the method can be more +generally applicable beyond Semantic Web-related conferences for enhancing +Wikidata's utility as a comprehensive scholarly resource. + Source Repository: https://github.com/scholarly-wikidata/ + DOI: https://doi.org/10.5281/zenodo.10989709 + License: Creative Commons CC0 (Data), MIT (Code) + +
+
+ comment: 17 pages, accepted at EKAW-24 +
+
+
+
+
+ + ☆ Neural Corrective Machine Unranking + + +
+ Machine unlearning in neural information retrieval (IR) systems requires +removing specific data whilst maintaining model performance. Applying existing +machine unlearning methods to IR may compromise retrieval effectiveness or +inadvertently expose unlearning actions due to the removal of particular items +from the retrieved results presented to users. We formalise corrective +unranking, which extends machine unlearning in (neural) IR context by +integrating substitute documents to preserve ranking integrity, and propose a +novel teacher-student framework, Corrective unRanking Distillation (CuRD), for +this task. CuRD (1) facilitates forgetting by adjusting the (trained) neural IR +model such that its output relevance scores of to-be-forgotten samples mimic +those of low-ranking, non-retrievable samples; (2) enables correction by +fine-tuning the relevance scores for the substitute samples to match those of +corresponding to-be-forgotten samples closely; (3) seeks to preserve +performance on samples that are not targeted for forgetting. We evaluate CuRD +on four neural IR models (BERTcat, BERTdot, ColBERT, PARADE) using MS MARCO and +TREC CAR datasets. Experiments with forget set sizes from 1 % and 20 % of the +training dataset demonstrate that CuRD outperforms seven state-of-the-art +baselines in terms of forgetting and correction while maintaining model +retention and generalisation capabilities. + +
+
+ comment: submitted to Information Sciences +
+
+
+
+
+ + ☆ Enhancing Multimodal Query Representation via Visual Dialogues for + End-to-End Knowledge Retrieval + + +
+ Existing multimodal retrieval systems often rely on disjointed models for +image comprehension, such as object detectors and caption generators, leading +to cumbersome implementations and training processes. To overcome this +limitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a +text retriever with the ability to understand multimodal queries via dynamic +modality interaction. Ret-XKnow leverages a partial convolution mechanism to +focus on visual information relevant to the given textual query, thereby +enhancing multimodal query representations. To effectively learn multimodal +interaction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset +automatically constructed from visual dialogue datasets. Our dataset +construction process ensures that the dialogues are transformed into suitable +information retrieval tasks using a text retriever. We demonstrate that our +approach not only significantly improves retrieval performance in zero-shot +settings but also achieves substantial improvements in fine-tuning scenarios. +Our code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow. + +
+
+
+
+
+ + ☆ A Large-Scale Study of Relevance Assessments with Large Language Models: + An Initial Look + + +
+ The application of large language models to provide relevance assessments +presents exciting opportunities to advance information retrieval, natural +language processing, and beyond, but to date many unknowns remain. This paper +reports on the results of a large-scale evaluation (the TREC 2024 RAG Track) +where four different relevance assessment approaches were deployed in situ: the +"standard" fully manual process that NIST has implemented for decades and three +different alternatives that take advantage of LLMs to different extents using +the open-source UMBRELA tool. This setup allows us to correlate system rankings +induced by the different approaches to characterize tradeoffs between cost and +quality. We find that in terms of nDCG@20, nDCG@100, and Recall@100, system +rankings induced by automatically generated relevance assessments from UMBRELA +correlate highly with those induced by fully manual assessments across a +diverse set of 77 runs from 19 teams. Our results suggest that automatically +generated UMBRELA judgments can replace fully manual judgments to accurately +capture run-level effectiveness. Surprisingly, we find that LLM assistance does +not appear to increase correlation with fully manual assessments, suggesting +that costs associated with human-in-the-loop processes do not bring obvious +tangible benefits. Overall, human assessors appear to be stricter than UMBRELA +in applying relevance criteria. Our work validates the use of LLMs in academic +TREC-style evaluations and provides the foundation for future studies. + +
+
+
+
+
+ + ♻ ☆ Explainable Enrichment-Driven GrAph Reasoner (EDGAR) for Large Knowledge + Graphs with Applications in Drug Repurposing + + +
+ Knowledge graphs (KGs) represent connections and relationships between +real-world entities. We propose a link prediction framework for KGs named +Enrichment-Driven GrAph Reasoner (EDGAR), which infers new edges by mining +entity-local rules. This approach leverages enrichment analysis, a +well-established statistical method used to identify mechanisms common to sets +of differentially expressed genes. EDGAR's inference results are inherently +explainable and rankable, with p-values indicating the statistical significance +of each enrichment-based rule. + We demonstrate the framework's effectiveness on a large-scale biomedical KG, +ROBOKOP, focusing on drug repurposing for Alzheimer disease (AD) as a case +study. Initially, we extracted 14 known drugs from the KG and identified 20 +contextual biomarkers through enrichment analysis, revealing functional +pathways relevant to shared drug efficacy for AD. Subsequently, using the top +1000 enrichment results, our system identified 1246 additional drug candidates +for AD treatment. The top 10 candidates were validated using evidence from +medical literature. + EDGAR is deployed within ROBOKOP, complete with a web user interface. This is +the first study to apply enrichment analysis to large graph completion and drug +repurposing. + +
+
+ comment: 10 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural + Networks + + +
+ Graphs have become increasingly integral to the advancement of recommendation +systems, particularly with the fast development of the graph neural network (GNN). +By exploring the virtue of rich node features and link information, GNN is +designed to provide personalized and accurate suggestions. Meanwhile, the +privacy leakage of GNN in such contexts has also captured special attention. +Prior work has revealed that a malicious user can utilize auxiliary knowledge +to extract sensitive link data of the target graph, integral to recommendation +systems, via the decision made by the target GNN model. This poses a +significant risk to the integrity and confidentiality of data used in +recommendation systems. Though important, previous works on GNN's privacy +leakage are still challenged in three aspects, i.e., limited stealing attack +scenarios, sub-optimal attack performance, and adaptation against defense. To +address these issues, we propose a diffusion model based link stealing attack, +named DM4Steal. It differs from previous work in three critical aspects. (i) +Generality: aiming at six attack scenarios with limited auxiliary knowledge, we +propose a novel training strategy for diffusion models so that DM4Steal is +transferable to diverse attack scenarios. (ii) Effectiveness: benefiting from +the retention of semantic structure in the diffusion model during the training +process, DM4Steal is capable of learning the precise topology of the target graph +through the GNN decision process. (iii) Adaptation: when GNN is defensive +(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling +the score model multiple times to keep performance degradation to a minimum, +thus DM4Steal implements a successful adaptive attack on defensive GNN. + +
+
+ comment: We found that there were critical problems in our paper, and we + needed to redo the experiment, which was incomplete +
+
+
+
+
+ + ♻ ☆ Query Optimization for Parametric Knowledge Refinement in + Retrieval-Augmented Large Language Models + + +
+ We introduce the Extract-Refine-Retrieve-Read (ERRR) framework, a novel +approach designed to bridge the pre-retrieval information gap in +Retrieval-Augmented Generation (RAG) systems through query optimization +tailored to meet the specific knowledge requirements of Large Language Models +(LLMs). Unlike conventional query optimization techniques used in RAG, the ERRR +framework begins by extracting parametric knowledge from LLMs, followed by +using a specialized query optimizer for refining these queries. This process +ensures the retrieval of only the most pertinent information essential for +generating accurate responses. Moreover, to enhance flexibility and reduce +computational costs, we propose a trainable scheme for our pipeline that +utilizes a smaller, tunable model as the query optimizer, which is refined +through knowledge distillation from a larger teacher model. Our evaluations on +various question-answering (QA) datasets and with different retrieval systems +show that ERRR consistently outperforms existing baselines, proving to be a +versatile and cost-effective module for improving the utility and accuracy of +RAG systems. + +
+
+
+
+
+ + ♻ ☆ Feature Interaction Fusion Self-Distillation Network For CTR Prediction + + +
+ Click-Through Rate (CTR) prediction plays a vital role in recommender +systems, online advertising, and search engines. Most of the current approaches +model feature interactions through stacked or parallel structures, with some +employing knowledge distillation for model compression. However, we observe +some limitations with these approaches: (1) In parallel structure models, the +explicit and implicit components are executed independently and simultaneously, +which leads to insufficient information sharing within the feature set. (2) The +introduction of knowledge distillation technology brings about the problems of +complex teacher-student framework design and low knowledge transfer efficiency. +(3) The dataset and the process of constructing high-order feature interactions +contain significant noise, which limits the model's effectiveness. To address +these limitations, we propose FSDNet, a CTR prediction framework incorporating +a plug-and-play fusion self-distillation module. Specifically, FSDNet forms +connections between explicit and implicit feature interactions at each layer, +enhancing the sharing of information between different features. The deepest +fusion layer is then used as the teacher model, utilizing self-distillation to +guide the training of shallow layers. Empirical evaluation across four +benchmark datasets validates the framework's efficacy and generalization +capabilities. The code is available on +https://anonymous.4open.science/r/FSDNet. + +
+
+
+
+
+
+
+
+ + Machine Learning 143 + +
+
+
+ + ☆ A Short Note on Evaluating RepNet for Temporal Repetition Counting in + Videos + + +
+ We discuss some consistent issues on how RepNet has been evaluated in various +papers. As a way to mitigate these issues, we report RepNet performance results +on different datasets, and release evaluation code and the RepNet checkpoint to +obtain these results. Code URL: +https://github.com/google-research/google-research/blob/master/repnet/ + +
+
+
+
+
+ + ☆ The Limited Impact of Medical Adaptation of Large Language and + Vision-Language Models EMNLP 2024 + + +
+ Several recent works seek to develop foundation models specifically for +medical applications, adapting general-purpose large language models (LLMs) and +vision-language models (VLMs) via continued pretraining on publicly available +biomedical corpora. These works typically claim that such domain-adaptive +pretraining (DAPT) improves performance on downstream medical tasks, such as +answering medical licensing exam questions. In this paper, we compare ten +public "medical" LLMs and two VLMs against their corresponding base models, +arriving at a different conclusion: all medical VLMs and nearly all medical +LLMs fail to consistently improve over their base models in the zero-/few-shot +prompting and supervised fine-tuning regimes for medical question-answering +(QA). For instance, across all tasks and model pairs we consider in the 3-shot +setting, medical LLMs only outperform their base models in 22.7% of cases, +reach a (statistical) tie in 36.8% of cases, and are significantly worse than +their base models in the remaining 40.5% of cases. Our conclusions are based on +(i) comparing each medical model head-to-head, directly against the +corresponding base model; (ii) optimizing the prompts for each model separately +in zero-/few-shot prompting; and (iii) accounting for statistical uncertainty +in comparisons. While these basic practices are not consistently adopted in the +literature, our ablations show that they substantially impact conclusions. +Meanwhile, we find that after fine-tuning on specific QA tasks, medical LLMs +can show performance improvements, but the benefits do not carry over to tasks +based on clinical notes. Our findings suggest that state-of-the-art +general-domain models may already exhibit strong medical knowledge and +reasoning capabilities, and offer recommendations to strengthen the conclusions +of future studies. + +
+
+ comment: Extended version of EMNLP 2024 paper arXiv:2411.04118. Includes + additional results on clinical note QA tasks and supervised fine-tuning + evaluations +
+
+
+
+
+ + ☆ Unsupervised Parameter-free Outlier Detection using HDBSCAN* Outlier + Profiles + + +
+ In machine learning and data mining, outliers are data points that +significantly differ from the dataset and often introduce irrelevant +information that can induce bias in its statistics and models. Therefore, +unsupervised methods are crucial to detect outliers if there is limited or no +information about them. Global-Local Outlier Scores based on Hierarchies +(GLOSH) is an unsupervised outlier detection method within HDBSCAN*, a +state-of-the-art hierarchical clustering method. GLOSH estimates outlier scores +for each data point by comparing its density to the highest density of the +region they reside in the HDBSCAN* hierarchy. GLOSH may be sensitive to +HDBSCAN*'s minpts parameter that influences density estimation. With limited +knowledge about the data, choosing an appropriate minpts value beforehand is +challenging as one or some minpts values may better represent the underlying +cluster structure than others. Additionally, in the process of searching for +``potential outliers'', one has to define the number of outliers n a dataset +has, which may be impractical and is often unknown. In this paper, we propose +an unsupervised strategy to find the ``best'' minpts value, leveraging the +range of GLOSH scores across minpts values to identify the value for which +GLOSH scores can best identify outliers from the rest of the dataset. Moreover, +we propose an unsupervised strategy to estimate a threshold for classifying +points into inliers and (potential) outliers without the need to pre-define any +value. Our experiments show that our strategies can automatically find the +minpts value and threshold that yield the best or near best outlier detection +results using GLOSH. + +
+
+ comment: Accepted at IEEE International Conference on Big Data, IEEE BigData + 2024 +
+
+
+
+
+ + ☆ LLMStinger: Jailbreaking LLMs using RL fine-tuned LLMs AAAI 2025 + + +
+ We introduce LLMStinger, a novel approach that leverages Large Language +Models (LLMs) to automatically generate adversarial suffixes for jailbreak +attacks. Unlike traditional methods, which require complex prompt engineering +or white-box access, LLMStinger uses a reinforcement learning (RL) loop to +fine-tune an attacker LLM, generating new suffixes based on existing attacks +for harmful questions from the HarmBench benchmark. Our method significantly +outperforms existing red-teaming approaches (we compared against 15 of the +latest methods), achieving a +57.2% improvement in Attack Success Rate (ASR) on +LLaMA2-7B-chat and a +50.3% ASR increase on Claude 2, both models known for +their extensive safety measures. Additionally, we achieved a 94.97% ASR on +GPT-3.5 and 99.4% on Gemma-2B-it, demonstrating the robustness and adaptability +of LLMStinger across open and closed-source models. + +
+
+ comment: Accepted at AAAI 2025 +
+
+
+
+
+ + ☆ Interaction Testing in Variation Analysis + + +
+ Relationships of cause and effect are of prime importance for explaining +scientific phenomena. Often, rather than just understanding the effects of +causes, researchers also wish to understand how a cause $X$ affects an outcome +$Y$ mechanistically -- i.e., what are the causal pathways that are activated +between $X$ and $Y$. For analyzing such questions, a range of methods has been +developed over decades under the rubric of causal mediation analysis. +Traditional mediation analysis focuses on decomposing the average treatment +effect (ATE) into direct and indirect effects, and therefore focuses on the ATE +as the central quantity. This corresponds to providing explanations for +associations in the interventional regime, such as when the treatment $X$ is +randomized. Commonly, however, it is of interest to explain associations in the +observational regime, and not just in the interventional regime. In this paper, +we introduce \textit{variation analysis}, an extension of mediation analysis that +focuses on the total variation (TV) measure between $X$ and $Y$, written as +$\mathrm{E}[Y \mid X=x_1] - \mathrm{E}[Y \mid X=x_0]$. The TV measure +encompasses both causal and confounded effects, as opposed to the ATE which +only encompasses causal (direct and mediated) variations. In this way, the TV +measure is suitable for providing explanations in the natural regime and +answering questions such as ``why is $X$ associated with $Y$?''. Our focus is +on decomposing the TV measure, in a way that explicitly includes direct, +indirect, and confounded variations. Furthermore, we also decompose the TV +measure to include interaction terms between these different pathways. +Subsequently, interaction testing is introduced, involving hypothesis tests to +determine if interaction terms are significantly different from zero. If +interactions are not significant, more parsimonious decompositions of the TV +measure can be used. + +
+
+
+
+
+ + ☆ Oblique Bayesian additive regression trees + + +
+ Current implementations of Bayesian Additive Regression Trees (BART) are +based on axis-aligned decision rules that recursively partition the feature +space using a single feature at a time. Several authors have demonstrated that +oblique trees, whose decision rules are based on linear combinations of +features, can sometimes yield better predictions than axis-aligned trees and +exhibit excellent theoretical properties. We develop an oblique version of BART +that leverages a data-adaptive decision rule prior that recursively partitions +the feature space along random hyperplanes. Using several synthetic and +real-world benchmark datasets, we systematically compared our oblique BART +implementation to axis-aligned BART and other tree ensemble methods, finding +that oblique BART was competitive with -- and sometimes much better than -- +those methods. + +
+
+
+
+
+ + ☆ Offline Adaptation of Quadruped Locomotion using Diffusion Models + + +
+ We present a diffusion-based approach to quadrupedal locomotion that +simultaneously addresses the limitations of learning and interpolating between +multiple skills (modes) and of offline adapting to new locomotion behaviours +after training. This is the first framework to apply classifier-free guided +diffusion to quadruped locomotion and demonstrate its efficacy by extracting +goal-conditioned behaviour from an originally unlabelled dataset. We show that +these capabilities are compatible with a multi-skill policy and can be applied +with little modification and minimal compute overhead, i.e., running entirely +on the robot's onboard CPU. We verify the validity of our approach with hardware +experiments on the ANYmal quadruped platform. + +
+
+
+
+
+ + ☆ Model agnostic local variable importance for locally dependent + relationships + + +
+ Global variable importance measures are commonly used to interpret machine +learning model results. Local variable importance techniques assess how +variables contribute to individual observations rather than the entire dataset. +Current methods typically fail to accurately reflect locally dependent +relationships between variables and instead focus on marginal importance +values. Additionally, they are not natively adapted for multi-class +classification problems. We propose a new model-agnostic method for calculating +local variable importance, CLIQUE, that captures locally dependent +relationships, contains improvements over permutation-based methods, and can be +directly applied to multi-class classification problems. Simulated and +real-world examples show that CLIQUE emphasizes locally dependent information +and properly reduces bias in regions where variables do not affect the +response. + +
+
+
+
+
+ + ☆ Process-aware Human Activity Recognition + + +
+ Humans naturally follow distinct patterns when conducting their daily +activities, which are driven by established practices and processes, such as +production workflows, social norms and daily routines. Human activity +recognition (HAR) algorithms usually use neural networks or machine learning +techniques to analyse inherent relationships within the data. However, these +approaches often overlook the contextual information in which the data are +generated, potentially limiting their effectiveness. We propose a novel +approach that incorporates process information from context to enhance the HAR +performance. Specifically, we align probabilistic events generated by machine +learning models with process models derived from contextual information. This +alignment adaptively weighs these two sources of information to optimise HAR +accuracy. Our experiments demonstrate that our approach achieves better +accuracy and Macro F1-score compared to baseline models. + +
+
+
+
+
+ + ☆ FinRobot: AI Agent for Equity Research and Valuation with Large Language + Models + + +
+ As financial markets grow increasingly complex, there is a rising need for +automated tools that can effectively assist human analysts in equity research, +particularly within sell-side research. While Generative AI (GenAI) has +attracted significant attention in this field, existing AI solutions often fall +short due to their narrow focus on technical factors and limited capacity for +discretionary judgment. These limitations hinder their ability to adapt to new +data in real-time and accurately assess risks, which diminishes their practical +value for investors. + This paper presents FinRobot, the first AI agent framework specifically +designed for equity research. FinRobot employs a multi-agent Chain of Thought +(CoT) system, integrating both quantitative and qualitative analyses to emulate +the comprehensive reasoning of a human analyst. The system is structured around +three specialized agents: the Data-CoT Agent, which aggregates diverse data +sources for robust financial integration; the Concept-CoT Agent, which mimics +an analyst's reasoning to generate actionable insights; and the Thesis-CoT +Agent, which synthesizes these insights into a coherent investment thesis and +report. FinRobot provides thorough company analysis supported by precise +numerical data, industry-appropriate valuation metrics, and realistic risk +assessments. Its dynamically updatable data pipeline ensures that research +remains timely and relevant, adapting seamlessly to new financial information. +Unlike existing automated research tools, such as CapitalCube and Wright +Reports, FinRobot delivers insights comparable to those produced by major +brokerage firms and fundamental research vendors. We open-source FinRobot at +\url{https://github.com/AI4Finance-Foundation/FinRobot}. + +
+
+ comment: The 1st Workshop on LLMs and Generative AI for Finance, ICAIF 2024 +
+
+
+
+
+ + ☆ Deep Learning Accelerated Quantum Transport Simulations in + Nanoelectronics: From Break Junctions to Field-Effect Transistors + + +
+ Quantum transport calculations are essential for understanding and designing +nanoelectronic devices, yet the trade-off between accuracy and computational +efficiency has long limited their practical applications. We present a general +framework that combines the deep learning tight-binding Hamiltonian (DeePTB) +approach with the non-equilibrium Green's Function (NEGF) method, enabling +efficient quantum transport calculations while maintaining first-principles +accuracy. We demonstrate the capabilities of the DeePTB-NEGF framework through +two representative applications: comprehensive simulation of break junction +systems, where conductance histograms show good agreement with experimental +measurements in both metallic contact and single-molecule junction cases; and +simulation of carbon nanotube field effect transistors through self-consistent +NEGF-Poisson calculations, capturing essential physics including the +electrostatic potential and transfer characteristic curves under finite bias +conditions. This framework bridges the gap between first-principles accuracy +and computational efficiency, providing a powerful tool for high-throughput +quantum transport simulations across different scales in nanoelectronics. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Learning Gaussian Multi-Index Models with Gradient Flow: Time Complexity + and Directional Convergence AISTATS 2025 + + +
+ This work focuses on the gradient flow dynamics of a neural network model +that uses correlation loss to approximate a multi-index function on +high-dimensional standard Gaussian data. Specifically, the multi-index function +we consider is a sum of neurons $f^*(x) \!=\! \sum_{j=1}^k \! \sigma^*(v_j^T +x)$ where $v_1, \dots, v_k$ are unit vectors, and $\sigma^*$ lacks the first +and second Hermite polynomials in its Hermite expansion. It is known that, for +the single-index case ($k\!=\!1$), overcoming the search phase requires +polynomial time complexity. We first generalize this result to multi-index +functions characterized by vectors in arbitrary directions. After the search +phase, it is not clear whether the network neurons converge to the index +vectors, or get stuck at a sub-optimal solution. When the index vectors are +orthogonal, we give a complete characterization of the fixed points and prove +that neurons converge to the nearest index vectors. Therefore, using $n \! +\asymp \! k \log k$ neurons ensures finding the full set of index vectors with +gradient flow with high probability over random initialization. When $ v_i^T +v_j \!=\! \beta \! \geq \! 0$ for all $i \neq j$, we prove the existence of a +sharp threshold $\beta_c \!=\! c/(c+k)$ at which the fixed point that computes +the average of the index vectors transitions from a saddle point to a minimum. +Numerical simulations show that using a correlation loss and a mild +overparameterization suffices to learn all of the index vectors when they are +nearly orthogonal, however, the correlation loss fails when the dot product +between the index vectors exceeds a certain threshold. + +
+
+ comment: 21 pages, 6 figures, under review by AISTATS 2025 +
+
+
+
+
+ + ☆ Locally Private Sampling with Public Data + + +
+ Local differential privacy (LDP) is increasingly employed in +privacy-preserving machine learning to protect user data before sharing it with +an untrusted aggregator. Most LDP methods assume that users possess only a +single data record, which is a significant limitation since users often gather +extensive datasets (e.g., images, text, time-series data) and frequently have +access to public datasets. To address this limitation, we propose a locally +private sampling framework that leverages both the private and public datasets +of each user. Specifically, we assume each user has two distributions: $p$ and +$q$ that represent their private dataset and the public dataset, respectively. +The objective is to design a mechanism that generates a private sample +approximating $p$ while simultaneously preserving $q$. We frame this objective +as a minimax optimization problem using $f$-divergence as the utility measure. +We fully characterize the minimax optimal mechanisms for general +$f$-divergences provided that $p$ and $q$ are discrete distributions. +Remarkably, we demonstrate that this optimal mechanism is universal across all +$f$-divergences. Experiments validate the effectiveness of our minimax optimal +sampler compared to the state-of-the-art locally private sampler. + +
+
+
+
+
+ + ☆ Can sparse autoencoders be used to decompose and interpret steering + vectors? + + +
+ Steering vectors are a promising approach to control the behaviour of large +language models. However, their underlying mechanisms remain poorly understood. +While sparse autoencoders (SAEs) may offer a potential method to interpret +steering vectors, recent findings show that SAE-reconstructed vectors often +lack the steering properties of the original vectors. This paper investigates +why directly applying SAEs to steering vectors yields misleading +decompositions, identifying two reasons: (1) steering vectors fall outside the +input distribution for which SAEs are designed, and (2) steering vectors can +have meaningful negative projections in feature directions, which SAEs are not +designed to accommodate. These limitations hinder the direct use of SAEs for +interpreting steering vectors. + +
+
+
+
+
+ + ☆ Optimal Oblivious Subspace Embeddings with Near-optimal Sparsity + + +
+ An oblivious subspace embedding is a random $m\times n$ matrix $\Pi$ such +that, for any $d$-dimensional subspace, with high probability $\Pi$ preserves +the norms of all vectors in that subspace within a $1\pm\epsilon$ factor. In +this work, we give an oblivious subspace embedding with the optimal dimension +$m=\Theta(d/\epsilon^2)$ that has a near-optimal sparsity of $\tilde +O(1/\epsilon)$ non-zero entries per column of $\Pi$. This is the first result +to nearly match the conjecture of Nelson and Nguyen [FOCS 2013] in terms of the +best sparsity attainable by an optimal oblivious subspace embedding, improving +on a prior bound of $\tilde O(1/\epsilon^6)$ non-zeros per column [Chenakkod et +al., STOC 2024]. We further extend our approach to the non-oblivious setting, +proposing a new family of Leverage Score Sparsified embeddings with Independent +Columns, which yield faster runtimes for matrix approximation and regression +tasks. + In our analysis, we develop a new method which uses a decoupling argument +together with the cumulant method for bounding the edge universality error of +isotropic random matrices. To achieve near-optimal sparsity, we combine this +general-purpose approach with new trace inequalities that leverage the +specific structure of our subspace embedding construction. + +
+
+
+
+
+ + ☆ Mapping Methane -- The Impact of Dairy Farm Practices on Emissions + Through Satellite Data and Machine Learning + + +
+ This study investigates the correlation between dairy farm characteristics +and methane concentrations as derived from satellite observations in Eastern +Canada. Utilizing data from 11 dairy farms collected between January 2020 and +December 2022, we integrated Sentinel-5P satellite methane data with critical +farm-level attributes, including herd genetics, feeding practices, and +management strategies. Initial analyses revealed significant correlations with +methane concentrations, leading to the application of Variance Inflation Factor +(VIF) and Principal Component Analysis (PCA) to address multicollinearity and +enhance model stability. Subsequently, machine learning models - specifically +Random Forest and Neural Networks - were employed to evaluate feature +importance and predict methane emissions. Our findings indicate a strong +negative correlation between the Estimated Breeding Value (EBV) for protein +percentage and methane concentrations, suggesting that genetic selection for +higher milk protein content could be an effective strategy for emissions +reduction. The integration of atmospheric transport models with satellite data +further refined our emission estimates, significantly enhancing accuracy and +spatial resolution. This research underscores the potential of advanced +satellite monitoring, machine learning techniques, and atmospheric modeling in +improving methane emission assessments within the dairy sector. It emphasizes +the critical role of farm-specific characteristics in developing effective +mitigation strategies. Future investigations should focus on expanding the +dataset and incorporating inversion modeling for more precise emission +quantification. Balancing ecological impacts with economic viability will be +essential for fostering sustainable dairy farming practices. + +
+
+ comment: 16 pages, 5 figures +
+
+
+
+
+ + ☆ Flow reconstruction in time-varying geometries using graph neural + networks + + +
+ The paper presents a Graph Attention Convolutional Network (GACN) for flow +reconstruction from very sparse data in time-varying geometries. The model +incorporates a feature propagation algorithm as a preprocessing step to handle +extremely sparse inputs, leveraging information from neighboring nodes to +initialize missing features. In addition, a binary indicator is introduced as a +validity mask to distinguish between the original and propagated data points, +enabling more effective learning from sparse inputs. Trained on a unique data +set of Direct Numerical Simulations (DNS) of a motored engine at a technically +relevant operating condition, the GACN shows robust performance across +different resolutions and domain sizes and can effectively handle unstructured +data and variable input sizes. The model is tested on previously unseen DNS +data as well as on an experimental data set from Particle Image Velocimetry +(PIV) measurements that were not considered during training. A comparative +analysis shows that the GACN consistently outperforms both a conventional +Convolutional Neural Network (CNN) and cubic interpolation methods on the DNS +and PIV test sets by achieving lower reconstruction errors and better capturing +fine-scale turbulent structures. In particular, the GACN effectively +reconstructs flow fields from domains up to 14 times larger than those observed +during training, with the performance advantage increasing for larger domains. + +
+
+
+
+
+ + ☆ Energy Dissipation Preserving Physics Informed Neural Network for + Allen-Cahn Equations + + +
+ This paper investigates a numerical solution of Allen-Cahn equation with +constant and degenerate mobility, with polynomial and logarithmic energy +functionals, with deterministic and random initial functions, and with +advective term in one, two, and three spatial dimensions, based on the +physics-informed neural network (PINN). To improve the learning capacity of the +PINN, we incorporate the energy dissipation property of the Allen-Cahn equation +as a penalty term into the loss function of the network. To facilitate the +learning process of random initials, we employ a continuous analogue of the +initial random condition by utilizing the Fourier series expansion. Adaptive +methods from traditional numerical analysis are also integrated to enhance the +effectiveness of the proposed PINN. Numerical results indicate a consistent +decrease in the discrete energy, while also revealing phenomena such as phase +separation and metastability. + +
+
+
+
+
+ + ☆ ScaleNet: Scale Invariance Learning in Directed Graphs + + +
+ Graph Neural Networks (GNNs) have advanced relational data analysis but lack +invariance learning techniques common in image classification. In node +classification with GNNs, it is actually the ego-graph of the center node that +is classified. This research extends the scale invariance concept to node +classification by drawing an analogy to image processing: just as scale +invariance is used in image classification to capture multi-scale features, +we propose the concept of ``scaled ego-graphs''. Scaled ego-graphs generalize +traditional ego-graphs by replacing undirected single-edges with +``scaled-edges'', which are ordered sequences of multiple directed edges. We +empirically assess the performance of the proposed scale invariance in graphs +on seven benchmark datasets, across both homophilic and heterophilic +structures. Our scale-invariance-based graph learning outperforms inception +models derived from random walks by being simpler, faster, and more accurate. +The scale invariance explains inception models' success on homophilic graphs +and limitations on heterophilic graphs. To ensure applicability of inception +models to heterophilic graphs as well, we further present ScaleNet, an +architecture that leverages multi-scaled features. ScaleNet achieves +state-of-the-art results on five out of seven datasets (four homophilic and one +heterophilic) and matches top performance on the remaining two, demonstrating +its excellent applicability. This represents a significant advance in graph +learning, offering a unified framework that enhances node classification across +various graph types. Our code is available at +https://github.com/Qin87/ScaleNet/tree/July25. + +
+
+ comment: Scale invariance in node classification is demonstrated and applied + in graph transformation to develop ScaleNet, which achieves state-of-the-art + performance on both homophilic and heterophilic directed graphs +
+
+
+
+
+ + ☆ Weakly-Supervised Anomaly Detection in Surveillance Videos Based on + Two-Stream I3D Convolution Network + + +
+ The widespread implementation of urban surveillance systems has necessitated +more sophisticated techniques for anomaly detection to ensure enhanced public +safety. This paper presents a significant advancement in the field of anomaly +detection through the application of Two-Stream Inflated 3D (I3D) Convolutional +Networks. These networks substantially outperform traditional 3D Convolutional +Networks (C3D) by more effectively extracting spatial and temporal features +from surveillance videos, thus improving the precision of anomaly detection. +Our research advances the field by implementing a weakly supervised learning +framework based on Multiple Instance Learning (MIL), which uniquely +conceptualizes surveillance videos as collections of 'bags' that contain +instances (video clips). Each instance is innovatively processed through a +ranking mechanism that prioritizes clips based on their potential to display +anomalies. This novel strategy not only enhances the accuracy and precision of +anomaly detection but also significantly diminishes the dependency on extensive +manual annotations. Moreover, through meticulous optimization of model +settings, including the choice of optimizer, our approach not only establishes +new benchmarks in the performance of anomaly detection systems but also offers +a scalable and efficient solution for real-world surveillance applications. +This paper contributes significantly to the field of computer vision by +delivering a more adaptable, efficient, and context-aware anomaly detection +system, which is poised to redefine practices in urban surveillance. + +
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ Optimal Transport-Based Displacement Interpolation with Data + Augmentation for Reduced Order Modeling of Nonlinear Dynamical Systems + + +
+ We present a novel reduced-order Model (ROM) that leverages optimal transport +(OT) theory and displacement interpolation to enhance the representation of +nonlinear dynamics in complex systems. While traditional ROM techniques face +challenges in this scenario, especially when data (i.e., observational +snapshots) is limited, our method addresses these issues by introducing a data +augmentation strategy based on OT principles. The proposed framework generates +interpolated solutions tracing geodesic paths in the space of probability +distributions, enriching the training dataset for the ROM. A key feature of our +approach is its ability to provide a continuous representation of the +solution's dynamics by exploiting a virtual-to-real time mapping. This enables +the reconstruction of solutions at finer temporal scales than those provided by +the original data. To further improve prediction accuracy, we employ Gaussian +Process Regression to learn the residual and correct the representation between +the interpolated snapshots and the physical solution. We demonstrate the +effectiveness of our methodology with atmospheric mesoscale benchmarks +characterized by highly nonlinear, advection-dominated dynamics. Our results +show improved accuracy and efficiency in predicting complex system behaviors, +indicating the potential of this approach for a wide range of applications in +computational physics and engineering. + +
+
+
+
+
+ + ☆ Bayesian Comparisons Between Representations + + +
+ Which neural networks are similar is a fundamental question for both machine +learning and neuroscience. Our novel method compares representations based on +Bayesian statistics about linear readouts from the representations. Concretely, +we suggest using the total variation distance or Jensen-Shannon distance +between prior predictive distributions to compare representations. The prior +predictive distribution is a full description of the inductive bias and +generalization of a model in Bayesian statistics, making it a great basis for +comparisons. As Jensen-Shannon distance and total variation distance are +metrics, our dissimilarity measures are pseudo-metrics for representations. For +a linear readout, our metrics just depend on the linear kernel matrix of the +representations. Thus, our metrics connect linear read-out based comparisons +to kernel based metrics like centered kernel alignment and representational +similarity analysis. We apply our new metrics to deep neural networks trained +on ImageNet-1k. Our new metrics can be computed efficiently including a +stochastic gradient without dimensionality reductions of the representations. +It broadly agrees with existing metrics, but is more stringent. It varies less +across different random image samples, and it measures how well two +representations could be distinguished based on a linear read out. Thus our +metric nicely extends our toolkit for comparing representations. + +
+
+
+
+
+ + ☆ Recommender systems and reinforcement learning for building control and + occupant interaction: A text-mining driven review of scientific literature + + +
+ The indoor environment greatly affects health and well-being; enhancing +health and reducing energy use in these settings is a key research focus. With +advancing Information and Communication Technology (ICT), recommendation +systems and reinforcement learning have emerged as promising methods to induce +behavioral changes that improve indoor environments and building energy +efficiency. This study employs text-mining and Natural Language Processing +(NLP) to examine these approaches in building control and occupant interaction. +Analyzing approximately 27,000 articles from the ScienceDirect database, we +found extensive use of recommendation systems and reinforcement learning for +space optimization, location recommendations, and personalized control +suggestions. Despite broad applications, their use in optimizing indoor +environments and energy efficiency is limited. Traditional recommendation +algorithms are commonly used, but optimizing indoor conditions and energy +efficiency often requires advanced machine learning techniques like +reinforcement and deep learning. This review highlights the potential for +expanding recommender systems and reinforcement learning applications in +buildings and indoor environments. Areas for innovation include predictive +maintenance, building-related product recommendations, and optimizing +environments for specific needs like sleep and productivity enhancements based +on user feedback. + +
+
+
+
+
+ + ☆ Searching Latent Program Spaces + + +
+ Program synthesis methods aim to automatically generate programs restricted +to a language that can explain a given specification of input-output pairs. +While purely symbolic approaches suffer from a combinatorial search space, +recent methods leverage neural networks to learn distributions over program +structures to narrow this search space significantly, enabling more efficient +search. However, for challenging problems, it remains difficult to train models +to perform program synthesis in one shot, making test-time search essential. +Most neural methods lack structured search mechanisms during inference, relying +instead on stochastic sampling or gradient updates, which can be inefficient. +In this work, we propose the Latent Program Network (LPN), a general algorithm +for program induction that learns a distribution over latent programs in a +continuous space, enabling efficient search and test-time adaptation. We +explore how to train these networks to optimize for test-time computation and +demonstrate the use of gradient-based search both during training and at test +time. We evaluate LPN on ARC-AGI, a program synthesis benchmark that evaluates +performance by generalizing programs to new inputs rather than explaining the +underlying specification. We show that LPN can generalize beyond its training +distribution and adapt to unseen tasks by utilizing test-time computation, +outperforming algorithms without test-time adaptation mechanisms. + +
+
+ comment: Code available at https://github.com/clement-bonnet/lpn +
+
+
+
+
+ + ☆ MVKTrans: Multi-View Knowledge Transfer for Robust Multiomics + Classification + + +
+ The distinct characteristics of multiomics data, including complex +interactions within and across biological layers and disease heterogeneity +(e.g., heterogeneity in etiology and clinical symptoms), drive us to develop +novel designs to address unique challenges in multiomics prediction. In this +paper, we propose the multi-view knowledge transfer learning (MVKTrans) +framework, which transfers intra- and inter-omics knowledge in an adaptive +manner by reviewing data heterogeneity and suppressing bias transfer, thereby +enhancing classification performance. Specifically, we design a graph +contrastive module that is trained on unlabeled data to effectively learn and +transfer the underlying intra-omics patterns to the supervised task. This +unsupervised pretraining promotes learning general and unbiased representations +for each modality, regardless of the downstream tasks. In light of the varying +discriminative capacities of modalities across different diseases and/or +samples, we introduce an adaptive and bi-directional cross-omics distillation +module. This module automatically identifies richer modalities and facilitates +dynamic knowledge transfer from more informative to less informative omics, +thereby enabling a more robust and generalized integration. Extensive +experiments on four real biomedical datasets demonstrate the superior +performance and robustness of MVKTrans compared to the state-of-the-art. Code +and data are available at https://github.com/Yaolab-fantastic/MVKTrans. + +
+
+
+
+
+ + ☆ TRACE: Transformer-based Risk Assessment for Clinical Evaluation + + +
+ We present TRACE (Transformer-based Risk Assessment for Clinical Evaluation), +a novel method for clinical risk assessment based on clinical data, leveraging +the self-attention mechanism for enhanced feature interaction and result +interpretation. Our approach is able to handle different data modalities, +including continuous, categorical and multiple-choice (checkbox) attributes. +The proposed architecture features a shared representation of the clinical data +obtained by integrating specialized embeddings of each data modality, enabling +the detection of high-risk individuals using Transformer encoder layers. To +assess the effectiveness of the proposed method, a strong baseline based on +non-negative multi-layer perceptrons (MLPs) is introduced. The proposed method +outperforms various baselines widely used in the domain of clinical risk +assessment, while effectively handling missing values. In terms of +explainability, our Transformer-based method offers easily interpretable +results via attention weights, further enhancing the clinicians' +decision-making process. + +
+
+
+
+
+ + ☆ Rethinking negative sampling in content-based news recommendation + + +
+ News recommender systems are hindered by the brief lifespan of articles, as +they undergo rapid relevance decay. Recent studies have demonstrated the +potential of content-based neural techniques in tackling this problem. However, +these models often involve complex neural architectures and often lack +consideration for negative examples. In this study, we posit that the careful +sampling of negative examples has a big impact on the model's outcome. We +devise a negative sampling technique that not only improves the accuracy of the +model but also facilitates the decentralization of the recommendation system. +The experimental results obtained using the MIND dataset demonstrate that the +accuracy of the method under consideration can compete with that of +State-of-the-Art models. The utilization of the sampling technique is essential +in reducing model complexity and accelerating the training process, while +maintaining a high level of accuracy. Finally, we discuss how decentralized +models can help improve privacy and scalability. + +
+
+
+
+
+ + ☆ FedSub: Introducing class-aware Subnetworks Fusion to Enhance + Personalized Federated Learning in Ubiquitous Systems + + +
+ Personalized Federated Learning is essential in AI-driven ubiquitous systems, +supporting the distributed development of models able to adapt to diverse and +evolving user behaviors while safeguarding privacy. Despite addressing +heterogeneous user data distributions in collaborative model training, existing +methods often face limitations balancing personalization and generalization, +oversimplifying user similarities, or relying heavily on global models. In this +paper, we propose FedSub, a novel federated approach designed to enhance +personalization through the use of class-aware prototypes and model +subnetworks. Prototypes serve as compact representations of user data, +clustered on the server to identify similarities based on specific label +patterns. Concurrently, subnetworks -- model components necessary to process +each class -- are extracted locally and fused by the server according to these +clusters, producing highly tailored model updates for each user. This +fine-grained, class-specific aggregation of clients' models allows FedSub to +capture the unique characteristics of individual user data patterns. The +effectiveness of FedSub is validated in three real-world scenarios +characterized by high data heterogeneity, derived from human activity +recognition and mobile health applications. Experimental evaluations +demonstrate FedSub's performance improvements with respect to the +state-of-the-art and significant advancements in personalization for ubiquitous +systems based on personal mobile and wearable devices. + +
+
+ comment: Submitted to Proceedings of the ACM on Interactive, Mobile, Wearable + and Ubiquitous Technologies (IMWUT) +
+
+
+
+
+ + ☆ Measuring similarity between embedding spaces using induced neighborhood + graphs + + +
+ Deep Learning techniques have excelled at generating embedding spaces that +capture semantic similarities between items. Often these representations are +paired, enabling experiments with analogies (pairs within the same domain) and +cross-modality (pairs across domains). These experiments are based on specific +assumptions about the geometry of embedding spaces, which allow finding paired +items by extrapolating the positional relationships between embedding pairs in +the training dataset, allowing for tasks such as finding new analogies, and +multimodal zero-shot classification. In this work, we propose a metric to +evaluate the similarity between paired item representations. Our proposal is +built from the structural similarity between the nearest-neighbors induced +graphs of each representation, and can be configured to compare spaces based on +different distance metrics and on different neighborhood sizes. We demonstrate +that our proposal can be used to identify similar structures at different +scales, which is hard to achieve with kernel methods such as Centered Kernel +Alignment (CKA). We further illustrate our method with two case studies: an +analogy task using GloVe embeddings, and zero-shot classification in the +CIFAR-100 dataset using CLIP embeddings. Our results show that accuracy in both +analogy and zero-shot classification tasks correlates with the embedding +similarity. These findings can help explain performance differences in these +tasks, and may lead to improved design of paired-embedding models in the +future. + +
+
+
+
+
+ + ☆ UniMat: Unifying Materials Embeddings through Multi-modal Learning + + +
+ Materials science datasets are inherently heterogeneous and are available in +different modalities such as characterization spectra, atomic structures, +microscopic images, and text-based synthesis conditions. The advancements in +multi-modal learning, particularly in vision and language models, have opened +new avenues for integrating data in different forms. In this work, we evaluate +common techniques in multi-modal learning (alignment and fusion) in unifying +some of the most important modalities in materials science: atomic structure, +X-ray diffraction patterns (XRD), and composition. We show that structure graph +modality can be enhanced by aligning with XRD patterns. Additionally, we show +that aligning and fusing more experimentally accessible data formats, such as +XRD patterns and compositions, can create more robust joint embeddings than +individual modalities across various tasks. This lays the groundwork for future +studies aiming to exploit the full potential of multi-modal data in materials +science, facilitating more informed decision-making in materials design and +discovery. + +
+
+
+
+
+ + ☆ Accelerating Quasi-Static Time Series Simulations with Foundation Models + + +
+ Quasi-static time series (QSTS) simulations have great potential for +evaluating the grid's ability to accommodate the large-scale integration of +distributed energy resources. However, as grids expand and operate closer to +their limits, iterative power flow solvers, central to QSTS simulations, become +computationally prohibitive and face increasing convergence issues. Neural +power flow solvers provide a promising alternative, speeding up power flow +computations by 3 to 4 orders of magnitude, though they are costly to train. In +this paper, we envision how recently introduced grid foundation models could +improve the economic viability of neural power flow solvers. Conceptually, +these models amortize training costs by serving as a foundation for a range of +grid operation and planning tasks beyond power flow solving, with only minimal +fine-tuning required. We call for collaboration between the AI and power grid +communities to develop and open-source these models, enabling all operators, +even those with limited resources, to benefit from AI without building +solutions from scratch. + +
+
+ comment: Equal contributors: A.P. and F.M.; Lead contact: A.P. +
+
+
+
+
+ + ☆ Estimating unknown parameters in differential equations with a + reinforcement learning based PSO method + + +
+ Differential equations offer a foundational yet powerful framework for +modeling interactions within complex dynamic systems and are widely applied +across numerous scientific fields. One common challenge in this area is +estimating the unknown parameters of these dynamic relationships. However, +traditional numerical optimization methods rely on the selection of initial +parameter values, making them prone to local optima. Meanwhile, deep learning +and Bayesian methods require training models on specific differential +equations, resulting in poor versatility. This paper reformulates the parameter +estimation problem of differential equations as an optimization problem by +introducing the concept of particles from the particle swarm optimization +algorithm. Building on reinforcement learning-based particle swarm optimization +(RLLPSO), this paper proposes a novel method, DERLPSO, for estimating unknown +parameters of differential equations. We compared its performance on three +typical ordinary differential equations with the state-of-the-art methods, +including the RLLPSO algorithm, traditional numerical methods, deep learning +approaches, and Bayesian methods. The experimental results demonstrate that our +DERLPSO consistently outperforms other methods in terms of performance, +achieving an average Mean Square Error of 1.13e-05, which reduces the error by +approximately 4 orders of magnitude compared to other methods. Apart from +ordinary differential equations, our DERLPSO also shows great promise for +estimating unknown parameters of partial differential equations. The DERLPSO +method proposed in this paper has high accuracy, is independent of initial +parameter values, and possesses strong versatility and stability. This work +provides new insights into unknown parameter estimation for differential +equations. + +
+
+
+
+
+ + ☆ Towards Secure Intelligent O-RAN Architecture: Vulnerabilities, Threats + and Promising Technical Solutions using LLMs + + +
+ The evolution of wireless communication systems will be fundamentally +impacted by an open radio access network (O-RAN), a new concept defining an +intelligent architecture with enhanced flexibility, openness, and the ability +to slice services more efficiently. For all its promises, and like any +technological advancement, O-RAN is not without risks that need to be carefully +assessed and properly addressed to accelerate its wide adoption in future +mobile networks. In this paper, we present an in-depth security analysis of the +O-RAN architecture, discussing the potential threats that may arise in the +different O-RAN architecture layers and their impact on the Confidentiality, +Integrity, and Availability (CIA) triad. We also promote the potential of zero +trust, Moving Target Defense (MTD), blockchain, and large language models (LLM) +technologies in fortifying O-RAN's security posture. Furthermore, we +numerically demonstrate the effectiveness of MTD in empowering robust deep +reinforcement learning methods for dynamic network slice admission control in +the O-RAN architecture. Moreover, we examine the effect of explainable AI (XAI) +based on LLMs in securing the system. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Gaussian Mixture Models Based Augmentation Enhances GNN Generalization + + +
+ Graph Neural Networks (GNNs) have shown great promise in tasks like node and +graph classification, but they often struggle to generalize, particularly to +unseen or out-of-distribution (OOD) data. These challenges are exacerbated when +training data is limited in size or diversity. To address these issues, we +introduce a theoretical framework using Rademacher complexity to compute a +regret bound on the generalization error and then characterize the effect of +data augmentation. This framework informs the design of GMM-GDA, an efficient +graph data augmentation (GDA) algorithm leveraging the capability of Gaussian +Mixture Models (GMMs) to approximate any distribution. Our approach not only +outperforms existing augmentation techniques in terms of generalization but +also offers improved time complexity, making it highly suitable for real-world +applications. + +
+
+
+
+
+ + ☆ Robot See, Robot Do: Imitation Reward for Noisy Financial Environments + + +
+ The sequential nature of decision-making in financial asset trading aligns +naturally with the reinforcement learning (RL) framework, making RL a common +approach in this domain. However, the low signal-to-noise ratio in financial +markets results in noisy estimates of environment components, including the +reward function, which hinders effective policy learning by RL agents. Given +the critical importance of reward function design in RL problems, this paper +introduces a novel and more robust reward function by leveraging imitation +learning, where a trend labeling algorithm acts as an expert. We integrate +imitation (expert's) feedback with reinforcement (agent's) feedback in a +model-free RL algorithm, effectively embedding the imitation learning problem +within the RL paradigm to handle the stochasticity of reward signals. Empirical +results demonstrate that this novel approach improves financial performance +metrics compared to traditional benchmarks and RL agents trained solely using +reinforcement feedback. + +
+
+
+
+
+ + ☆ Deep Generative Demand Learning for Newsvendor and Pricing + + +
+ We consider data-driven inventory and pricing decisions in the feature-based +newsvendor problem, where demand is influenced by both price and contextual +features and is modeled without any structural assumptions. The unknown demand +distribution results in a challenging conditional stochastic optimization +problem, further complicated by decision-dependent uncertainty and the +integration of features. Inspired by recent advances in deep generative +learning, we propose a novel approach leveraging conditional deep generative +models (cDGMs) to address these challenges. cDGMs learn the demand distribution +and generate probabilistic demand forecasts conditioned on price and features. +This generative approach enables accurate profit estimation and supports the +design of algorithms for two key objectives: (1) optimizing inventory for +arbitrary prices, and (2) jointly determining optimal pricing and inventory +levels. We provide theoretical guarantees for our approach, including the +consistency of profit estimation and convergence of our decisions to the +optimal solution. Extensive simulations-ranging from simple to complex +scenarios, including one involving textual features-and a real-world case study +demonstrate the effectiveness of our approach. Our method opens a new paradigm +in management science and operations research, is adaptable to extensions of +the newsvendor and pricing problems, and holds potential for solving other +conditional stochastic optimization problems. + +
+
+ comment: 30 pages, 6 figures +
+
+
+
+
+ + ☆ Dynamic Subset Tuning: Expanding the Operational Range of + Parameter-Efficient Training for Large Language Models NeurIPS 2024 + + +
+ We propose a novel parameter-efficient training (PET) method for large +language models that adapts models to downstream tasks by optimizing a small +subset of the existing model parameters. Unlike prior methods, this subset is +not fixed in location but rather which parameters are modified evolves over the +course of training. This dynamic parameter selection can yield good performance +with many fewer parameters than extant methods. Our method enables a seamless +scaling of the subset size across an arbitrary proportion of the total model +size, while popular PET approaches like prompt tuning and LoRA cover only a +small part of this spectrum. We match or outperform prompt tuning and LoRA in +most cases on a variety of NLP tasks (MT, QA, GSM8K, SuperGLUE) for a given +parameter budget across different model families and sizes. + +
+
+ comment: NeurIPS 2024 Workshop on Adaptive Foundation Models +
+
+
+
+
+ + ☆ XiYan-SQL: A Multi-Generator Ensemble Framework for Text-to-SQL + + +
+ To tackle the challenges of large language model performance in natural +language to SQL tasks, we introduce XiYan-SQL, an innovative framework that +employs a multi-generator ensemble strategy to improve candidate generation. We +introduce M-Schema, a semi-structured schema representation method designed to +enhance the understanding of database structures. To enhance the quality and +diversity of generated candidate SQL queries, XiYan-SQL integrates the +significant potential of in-context learning (ICL) with the precise control of +supervised fine-tuning. On one hand, we propose a series of training strategies +to fine-tune models to generate high-quality candidates with diverse +preferences. On the other hand, we implement the ICL approach with an example +selection method based on named entity recognition to prevent overemphasis on +entities. The refiner optimizes each candidate by correcting logical or +syntactical errors. To address the challenge of identifying the best candidate, +we fine-tune a selection model to distinguish nuances of candidate SQL queries. +The experimental results on multiple dialect datasets demonstrate the +robustness of XiYan-SQL in addressing challenges across different scenarios. +Overall, our proposed XiYan-SQL achieves the state-of-the-art execution +accuracy of 89.65% on the Spider test set, 69.86% on SQL-Eval, 41.20% on +NL2GQL, and a competitive score of 72.23% on the Bird development benchmark. +The proposed framework not only enhances the quality and diversity of SQL +queries but also outperforms previous methods. + +
+
+
+
+
+ + ☆ Hopfield-Fenchel-Young Networks: A Unified Framework for Associative + Memory Retrieval + + +
+ Associative memory models, such as Hopfield networks and their modern +variants, have garnered renewed interest due to advancements in memory capacity +and connections with self-attention in transformers. In this work, we introduce +a unified framework-Hopfield-Fenchel-Young networks-which generalizes these +models to a broader family of energy functions. Our energies are formulated as +the difference between two Fenchel-Young losses: one, parameterized by a +generalized entropy, defines the Hopfield scoring mechanism, while the other +applies a post-transformation to the Hopfield output. By utilizing Tsallis and +norm entropies, we derive end-to-end differentiable update rules that enable +sparse transformations, uncovering new connections between loss margins, +sparsity, and exact retrieval of single memory patterns. We further extend this +framework to structured Hopfield networks using the SparseMAP transformation, +allowing the retrieval of pattern associations rather than a single pattern. +Our framework unifies and extends traditional and modern Hopfield networks and +provides an energy minimization perspective for widely used +post-transformations like $\ell_2$-normalization and layer normalization-all +through suitable choices of Fenchel-Young losses and by using convex analysis +as a building block. Finally, we validate our Hopfield-Fenchel-Young networks +on diverse memory recall tasks, including free and sequential recall. +Experiments on simulated data, image retrieval, multiple instance learning, and +text rationalization demonstrate the effectiveness of our approach. + +
+
+ comment: 49 pages, 14 figures. arXiv admin note: text overlap with + arXiv:2402.13725 +
+
+
+
+
+ + ☆ DeepUQ: Assessing the Aleatoric Uncertainties from two Deep Learning + Methods NeurIPS 2024 + + +
+ Assessing the quality of aleatoric uncertainty estimates from uncertainty +quantification (UQ) deep learning methods is important in scientific contexts, +where uncertainty is physically meaningful and important to characterize and +interpret exactly. We systematically compare aleatoric uncertainty measured by +two UQ techniques, Deep Ensembles (DE) and Deep Evidential Regression (DER). +Our method focuses on both zero-dimensional (0D) and two-dimensional (2D) data, +to explore how the UQ methods function for different data dimensionalities. We +investigate uncertainty injected on the input and output variables and include +a method to propagate uncertainty in the case of input uncertainty so that we +can compare the predicted aleatoric uncertainty to the known values. We +experiment with three levels of noise. The aleatoric uncertainty predicted +across all models and experiments scales with the injected noise level. +However, the predicted uncertainty is miscalibrated to $\rm{std}(\sigma_{\rm +al})$ with the true uncertainty for half of the DE experiments and almost all +of the DER experiments. The predicted uncertainty is the least accurate for +both UQ methods for the 2D input uncertainty experiment and the high-noise +level. While these results do not apply to more complex data, they highlight +that further research on post-facto calibration for these methods would be +beneficial, particularly for high-noise and high-dimensional settings. + +
+
+ comment: Accepted to the Machine Learning for Physical Sciences workshop at + NeurIPS 2024; 11 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Intelligent Algorithms For Signature Diagnostics Of Three-Phase Motors + + +
+ The application of machine learning (ML) algorithms in the intelligent +diagnosis of three-phase engines has the potential to significantly enhance +diagnostic performance and accuracy. Traditional methods largely rely on +signature analysis, which, despite being a standard practice, can benefit from +the integration of advanced ML techniques. In our study, we innovate by +combining state-of-the-art algorithms with a novel unsupervised anomaly +generation methodology that takes into account the physics model of the engine. +This hybrid approach leverages the strengths of both supervised ML and +unsupervised signature analysis, achieving superior diagnostic accuracy and +reliability along with a wide industrial application. Our experimental results +demonstrate that this method significantly outperforms existing ML and non-ML +state-of-the-art approaches while retaining the practical advantages of an +unsupervised methodology. The findings highlight the potential of our approach +to significantly contribute to the field of engine diagnostics, offering a +robust and efficient solution for real-world applications. + +
+
+
+
+
+ + ☆ Grammarization-Based Grasping with Deep Multi-Autoencoder Latent Space + Exploration by Reinforcement Learning Agent ICRA 2025 + + +
+ Grasping by a robot in unstructured environments is deemed a critical +challenge because of the requirement for effective adaptation to a wide +variation in object geometries, material properties, and other environmental +factors. In this paper, we propose a novel framework for robotic grasping based +on the idea of compressing high-dimensional target and gripper features in a +common latent space using a set of autoencoders. Our approach simplifies +grasping by using three autoencoders dedicated to the target, the gripper, and +a third one that fuses their latent representations. This allows the RL agent +to achieve higher learning rates at the initial stages of exploration of a new +environment, as well as at non-zero shot grasp attempts. The agent explores the +latent space of the third autoencoder for better quality grasp without explicit +reconstruction of objects. By implementing the PoWER algorithm into the RL +training process, updates on the agent's policy will be made through the +perturbation in the reward-weighted latent space. The successful exploration +efficiently constrains both position and pose integrity for feasible executions +of grasps. We evaluate our system on a diverse set of objects, demonstrating +the high success rate in grasping with minimum computational overhead. We found +that this approach enhances the adaptation of the RL agent by more than 35 \% in +simulation experiments. + +
+
+ comment: Submitted for review at IEEE ICRA 2025 +
+
+
+
+
+ + ☆ Learning Locally Adaptive Metrics that Enhance Structural Representation + with $\texttt{LAMINAR}$ NeurIPS 2024 + + +
+ We present $\texttt{LAMINAR}$, a novel unsupervised machine learning pipeline +designed to enhance the representation of structure within data via producing a +more-informative distance metric. Analysis methods in the physical sciences +often rely on standard metrics to define geometric relationships in data, which +may fail to capture the underlying structure of complex data sets. +$\texttt{LAMINAR}$ addresses this by using a continuous-normalising-flow and +inverse-transform-sampling to define a Riemannian manifold in the data space +without the need for the user to specify a metric over the data a-priori. The +result is a locally-adaptive-metric that produces structurally-informative +density-based distances. We demonstrate the utility of $\texttt{LAMINAR}$ by +comparing its output to the Euclidean metric for structured data sets. + +
+
+ comment: Accepted to the NeurIPS 2024 Machine Learning and the Physical + Sciences workshop. 6 pages, 6 figures +
+
+
+
+
+ + ☆ Leveraging Pre-Trained Neural Networks to Enhance Machine Learning with + Variational Quantum Circuits + + +
+ Quantum Machine Learning (QML) offers tremendous potential but is currently +limited by the availability of qubits. We introduce an innovative approach that +utilizes pre-trained neural networks to enhance Variational Quantum Circuits +(VQC). This technique effectively separates approximation error from qubit +count and removes the need for restrictive conditions, making QML more viable +for real-world applications. Our method significantly improves parameter +optimization for VQC while delivering notable gains in representation and +generalization capabilities, as evidenced by rigorous theoretical analysis and +extensive empirical testing on quantum dot classification tasks. Moreover, our +results extend to applications such as human genome analysis, demonstrating the +broad applicability of our approach. By addressing the constraints of current +quantum hardware, our work paves the way for a new era of advanced QML +applications, unlocking the full potential of quantum computing in fields such +as machine learning, materials science, medicine, mimetics, and various +interdisciplinary areas. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Graph Neural Networks in Supply Chain Analytics and Optimization: + Concepts, Perspectives, Dataset and Benchmarks + + +
+ Graph Neural Networks (GNNs) have recently gained traction in transportation, +bioinformatics, language and image processing, but research on their +application to supply chain management remains limited. Supply chains are +inherently graph-like, making them ideal for GNN methodologies, which can +optimize and solve complex problems. The barriers include a lack of proper +conceptual foundations, familiarity with graph applications in SCM, and +real-world benchmark datasets for GNN-based supply chain research. To address +this, we discuss and connect supply chains with graph structures for effective +GNN application, providing detailed formulations, examples, mathematical +definitions, and task guidelines. Additionally, we present a multi-perspective +real-world benchmark dataset from a leading FMCG company in Bangladesh, +focusing on supply chain planning. We discuss various supply chain tasks using +GNNs and benchmark several state-of-the-art models on homogeneous and +heterogeneous graphs across six supply chain analytics tasks. Our analysis +shows that GNN-based models consistently outperform statistical Machine +Learning and other Deep Learning models by around 10-30% in regression, 10-30% +in classification and detection tasks, and 15-40% in anomaly detection tasks on +designated metrics. With this work, we lay the groundwork for solving supply +chain problems using GNNs, supported by conceptual discussions, methodological +insights, and a comprehensive dataset. + +
+
+ comment: 27 Pages. Extended journal version of SupplyGraph (arXiv:2401.15299). + In Review +
+
+
+
+
+ + ☆ MLV$^2$-Net: Rater-Based Majority-Label Voting for Consistent Meningeal + Lymphatic Vessel Segmentation ML4H 2024 + + +
+ Meningeal lymphatic vessels (MLVs) are responsible for the drainage of waste +products from the human brain. An impairment in their functionality has been +associated with aging as well as brain disorders like multiple sclerosis and +Alzheimer's disease. However, MLVs have only recently been described for the +first time in magnetic resonance imaging (MRI), and their ramified structure +renders manual segmentation particularly difficult. Further, as there is no +consistent notion of their appearance, human-annotated MLV structures contain a +high inter-rater variability that most automatic segmentation methods cannot +take into account. In this work, we propose a new rater-aware training scheme +for the popular nnU-Net model, and we explore rater-based ensembling strategies +for accurate and consistent segmentation of MLVs. This enables us to boost +nnU-Net's performance while obtaining explicit predictions in different +annotation styles and a rater-based uncertainty estimation. Our final model, +MLV$^2$-Net, achieves a Dice similarity coefficient of 0.806 with respect to +the human reference standard. The model further matches the human inter-rater +reliability and replicates age-related associations with MLV volume. + +
+
+ comment: ML4H 2024 +
+
+
+
+
+ + ☆ Efficient Whole Slide Image Classification through Fisher Vector + Representation + + +
+ The advancement of digital pathology, particularly through computational +analysis of whole slide images (WSI), is poised to significantly enhance +diagnostic precision and efficiency. However, the large size and complexity of +WSIs make it difficult to analyze and classify them using computers. This study +introduces a novel method for WSI classification by automating the +identification and examination of the most informative patches, thus +eliminating the need to process the entire slide. Our method involves +two-stages: firstly, it extracts only a few patches from the WSIs based on +their pathological significance; and secondly, it employs Fisher vectors (FVs) +for representing features extracted from these patches, which is known for its +robustness in capturing fine-grained details. This approach not only +accentuates key pathological features within the WSI representation but also +significantly reduces computational overhead, thus making the process more +efficient and scalable. We have rigorously evaluated the proposed method across +multiple datasets to benchmark its performance against comprehensive WSI +analysis and contemporary weakly-supervised learning methodologies. The +empirical results indicate that our focused analysis of select patches, +combined with Fisher vector representation, not only aligns with, but at times +surpasses, the classification accuracy of standard practices. Moreover, this +strategy notably diminishes computational load and resource expenditure, +thereby establishing an efficient and precise framework for WSI analysis in the +realm of digital pathology. + +
+
+
+
+
+ + ☆ SAD-TIME: a Spatiotemporal-fused network for depression detection with + Automated multi-scale Depth-wise and TIME-interval-related common feature + extractor + + +
+ Background and Objective: Depression is a severe mental disorder, and +accurate diagnosis is pivotal to the cure and rehabilitation of people with +depression. However, the current questionnaire-based diagnostic methods could +bring subjective biases and may be denied by subjects. In search of a more +objective means of diagnosis, researchers have begun to experiment with deep +learning-based methods for identifying depressive disorders in recent years. +Methods: In this study, a novel Spatiotemporal-fused network with Automated +multi-scale Depth-wise and TIME-interval-related common feature extractor +(SAD-TIME) is proposed. SAD-TIME incorporates an automated nodes' common +features extractor (CFE), a spatial sector (SpS), a modified temporal sector +(TeS), and a domain adversarial learner (DAL). The CFE includes a multi-scale +depth-wise 1D-convolutional neural network and a time-interval embedding +generator, where the unique information of each channel is preserved. The SpS +fuses the functional connectivity with the distance-based connectivity +containing spatial position of EEG electrodes. A multi-head-attention graph +convolutional network is also applied in the SpS to fuse the features from +different EEG channels. The TeS is based on long short-term memory and graph +transformer networks, where the temporal information of different time-windows +is fused. Moreover, the DAL is used after the SpS to obtain the +domain-invariant feature. Results: Experimental results under tenfold +cross-validation show that the proposed SAD-TIME method achieves 92.00% and +94.00% depression classification accuracies on two datasets, respectively, in +cross-subject mode. Conclusion: SAD-TIME is a robust depression detection +model, where the automatedly-generated features, the SpS and the TeS assist the +classification performance with the fusion of the innate spatiotemporal +information in the EEG signals. + +
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ An Information Theoretic Approach to Operationalize Right to Data + Protection + + +
+ The widespread practice of indiscriminate data scraping to fine-tune language +models (LMs) raises significant legal and ethical concerns, particularly +regarding compliance with data protection laws such as the General Data +Protection Regulation (GDPR). This practice often results in the unauthorized +use of personal information, prompting growing debate within the academic and +regulatory communities. Recent works have introduced the concept of generating +unlearnable datasets (by adding imperceptible noise to the clean data), such +that the underlying model achieves lower loss during training but fails to +generalize to the unseen test setting. Though somewhat effective, these +approaches are predominantly designed for images and are limited by several +practical constraints like requiring knowledge of the target model. To this +end, we introduce RegText, a framework that injects imperceptible spurious +correlations into natural language datasets, effectively rendering them +unlearnable without affecting semantic content. We demonstrate RegText's +utility through rigorous empirical analysis of small and large LMs. Notably, +RegText can restrict newer models like GPT-4o and Llama from learning on our +generated data, resulting in a drop in their test accuracy compared to their +zero-shot performance and paving the way for generating unlearnable text to +protect public data. + +
+
+ comment: First two authors contributed equally to this work +
+
+
+
+
+ + ☆ Methodology for a Statistical Analysis of Influencing Factors on 3D + Object Detection Performance + + +
+ In autonomous driving, object detection is an essential task to perceive the +environment by localizing and classifying objects. Most object detection +algorithms rely on deep learning for their superior performance. However, their +black box nature makes it challenging to ensure safety. In this paper, we +propose a first-of-its-kind methodology for statistical analysis of the +influence of various factors related to the objects to detect or the +environment on the detection performance of both LiDAR- and camera-based 3D +object detectors. We perform a univariate analysis between each of the factors +and the detection error in order to compare the strength of influence. To +better identify potential sources of detection errors, we also analyze the +performance in dependency of the influencing factors and examine the +interdependencies between the different influencing factors. Recognizing the +factors that influence detection performance helps identify robustness issues +in the trained object detector and supports the safety approval of object +detection systems. + +
+
+
+
+
+ + ☆ Learning Model Agnostic Explanations via Constraint Programming + + +
+ Interpretable Machine Learning faces a recurring challenge of explaining the +predictions made by opaque classifiers such as ensemble models, kernel methods, +or neural networks in terms that are understandable to humans. When the model +is viewed as a black box, the objective is to identify a small set of features +that jointly determine the black box response with minimal error. However, +finding such model-agnostic explanations is computationally demanding, as the +problem is intractable even for binary classifiers. In this paper, the task is +framed as a Constraint Optimization Problem, where the constraint solver seeks +an explanation of minimum error and bounded size for an input data instance and +a set of samples generated by the black box. From a theoretical perspective, +this constraint programming approach offers PAC-style guarantees for the output +explanation. We evaluate the approach empirically on various datasets and show +that it statistically outperforms the state-of-the-art heuristic Anchors +method. + +
+
+
+
+
+ + ☆ Trap-MID: Trapdoor-based Defense against Model Inversion Attacks NeurIPS + + +
+ Model Inversion (MI) attacks pose a significant threat to the privacy of Deep +Neural Networks by recovering training data distribution from well-trained +models. While existing defenses often rely on regularization techniques to +reduce information leakage, they remain vulnerable to recent attacks. In this +paper, we propose the Trapdoor-based Model Inversion Defense (Trap-MID) to +mislead MI attacks. A trapdoor is integrated into the model to predict a +specific label when the input is injected with the corresponding trigger. +Consequently, this trapdoor information serves as the "shortcut" for MI +attacks, leading them to extract trapdoor triggers rather than private data. We +provide theoretical insights into the impacts of trapdoor's effectiveness and +naturalness on deceiving MI attacks. In addition, empirical experiments +demonstrate the state-of-the-art defense performance of Trap-MID against +various MI attacks without the requirements for extra data or large +computational overhead. Our source code is publicly available at +https://github.com/ntuaislab/Trap-MID. + +
+
+ comment: Accepted by Neural Information Processing Systems (NeurIPS) 2024 +
+
+
+
+
+ + ☆ Machine Unlearning on Pre-trained Models by Residual Feature Alignment + Using LoRA + + +
+ Machine unlearning is a newly emerged technology that removes a subset of the +training data from a trained model without affecting the model performance on +the remaining data. This topic is becoming increasingly important in protecting +user privacy and eliminating harmful or outdated data. The key challenge lies +in effectively and efficiently unlearning specific information without +compromising the model's utility on the retained data. For the pre-trained +models, fine-tuning is an important way to achieve the unlearning target. +Previous work typically fine-tuned the entire model's parameters, which incurs +significant computation costs. In addition, the fine-tuning process may cause +shifts in the intermediate layer features, affecting the model's overall +utility. In this work, we propose a novel and efficient machine unlearning +method on pre-trained models. We term the method as Residual Feature Alignment +Unlearning. Specifically, we leverage LoRA (Low-Rank Adaptation) to decompose +the model's intermediate features into pre-trained features and residual +features. By adjusting the residual features, we align the unlearned model with +the pre-trained model at the intermediate feature level to achieve both +unlearning and remaining targets. The method aims to learn the zero residuals +on the retained set and shifted residuals on the unlearning set. Extensive +experiments on numerous datasets validate the effectiveness of our approach. + +
+
+
+
+
+ + ☆ One STEP at a time: Language Agents are Stepwise Planners + + +
+ Language agents have shown promising adaptability in dynamic environments to +perform complex tasks. However, despite the versatile knowledge embedded in +large language models, these agents still fall short when it comes to tasks +that require planning. We introduce STEP, a novel framework designed to +efficiently learn from previous experiences to enhance the planning +capabilities of language agents in future steps. Concretely, STEP functions +through four interconnected components. First, the Planner takes on the task, +breaks it down into subtasks and provides relevant insights. Then the Executor +generates action candidates, while the Evaluator ensures the actions align with +learned rules from previous experiences. Lastly, Memory stores experiences to +inform future decisions. In the ScienceWorld benchmark, our results show that +STEP consistently outperforms state-of-the-art models, achieving an overall +score of 67.4 and successfully completing 12 out of 18 tasks. These findings +highlight STEP's potential as a framework for enhancing planning capabilities +in language agents, paving the way for more sophisticated task-solving in +dynamic environments. + +
+
+
+
+
+ + ☆ Properties of fairness measures in the context of varying class + imbalance and protected group ratios + + +
+ Society is increasingly relying on predictive models in fields like criminal +justice, credit risk management, or hiring. To prevent such automated systems +from discriminating against people belonging to certain groups, fairness +measures have become a crucial component in socially relevant applications of +machine learning. However, existing fairness measures have been designed to +assess the bias between predictions for protected groups without considering +the imbalance in the classes of the target variable. Current research on the +potential effect of class imbalance on fairness focuses on practical +applications rather than dataset-independent measure properties. In this paper, +we study the general properties of fairness measures for changing class and +protected group proportions. For this purpose, we analyze the probability mass +functions of six of the most popular group fairness measures. We also measure +how the probability of achieving perfect fairness changes for varying class +imbalance ratios. Moreover, we relate the dataset-independent properties of +fairness measures described in this paper to classifier fairness in real-life +tasks. Our results show that measures such as Equal Opportunity and Positive +Predictive Parity are more sensitive to changes in class imbalance than +Accuracy Equality. These findings can help guide researchers and practitioners +in choosing the most appropriate fairness measures for their classification +problems. + +
+
+
+
+
+ + ☆ Material Property Prediction with Element Attribute Knowledge Graphs and + Multimodal Representation Learning + + +
+ Machine learning has become a crucial tool for predicting the properties of +crystalline materials. However, existing methods primarily represent material +information by constructing multi-edge graphs of crystal structures, often +overlooking the chemical and physical properties of elements (such as atomic +radius, electronegativity, melting point, and ionization energy), which have a +significant impact on material performance. To address this limitation, we +first constructed an element property knowledge graph and utilized an embedding +model to encode the element attributes within the knowledge graph. Furthermore, +we propose a multimodal fusion framework, ESNet, which integrates element +property features with crystal structure features to generate joint multimodal +representations. This provides a more comprehensive perspective for predicting +the performance of crystalline materials, enabling the model to consider both +microstructural composition and chemical characteristics of the materials. We +conducted experiments on the Materials Project benchmark dataset, which showed +leading performance in the bandgap prediction task and achieved results on a +par with existing benchmarks in the formation energy prediction task. + +
+
+
+
+
+ + ☆ Quantifying Qualitative Insights: Leveraging LLMs to Market Predict + + +
+ Recent advancements in Large Language Models (LLMs) have the potential to +transform financial analytics by integrating numerical and textual data. +However, challenges such as insufficient context when fusing multimodal +information and the difficulty in measuring the utility of qualitative outputs, +which LLMs generate as text, have limited their effectiveness in tasks such as +financial forecasting. This study addresses these challenges by leveraging +daily reports from securities firms to create high-quality contextual +information. The reports are segmented into text-based key factors and combined +with numerical data, such as price information, to form context sets. By +dynamically updating few-shot examples based on the query time, the sets +incorporate the latest information, forming a highly relevant set closely +aligned with the query point. Additionally, a crafted prompt is designed to +assign scores to the key factors, converting qualitative insights into +quantitative results. The derived scores undergo a scaling process, +transforming them into real-world values that are used for prediction. Our +experiments demonstrate that LLMs outperform time-series models in market +forecasting, though challenges such as imperfect reproducibility and limited +explainability remain. + +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ CLaSP: Learning Concepts for Time-Series Signals from Natural Language + Supervision + + +
+ This paper proposes a foundation model called "CLaSP" that can search time +series signals using natural language that describes the characteristics of the +signals as queries. Previous efforts to represent time series signal data in +natural language have had challenges in designing a conventional class of time +series signal characteristics, formulating their quantification, and creating a +dictionary of synonyms. To overcome these limitations, the proposed method +introduces a neural network based on contrastive learning. This network is +first trained using the datasets TRUCE and SUSHI, which consist of time series +signals and their corresponding natural language descriptions. Previous studies +have proposed vocabularies that data analysts use to describe signal +characteristics, and SUSHI was designed to cover these terms. We believe that a +neural network trained on these datasets will enable data analysts to search +using natural language vocabulary. Furthermore, our method does not require a +dictionary of predefined synonyms, and it leverages common sense knowledge +embedded in a large-scale language model (LLM). Experimental results +demonstrate that CLaSP enables natural language search of time series signal +data and can accurately learn the points at which signal data changes. + +
+
+
+
+
+ + ☆ Interpretable Syntactic Representations Enable Hierarchical Word Vectors + + +
+ The distributed representations currently used are dense and uninterpretable, +leading to interpretations that themselves are relative, overcomplete, and hard +to interpret. We propose a method that transforms these word vectors into +reduced syntactic representations. The resulting representations are compact +and interpretable allowing better visualization and comparison of the word +vectors and we successively demonstrate that the drawn interpretations are in +line with human judgment. The syntactic representations are then used to create +hierarchical word vectors using an incremental learning approach similar to the +hierarchical aspect of human learning. As these representations are drawn from +pre-trained vectors, the generation process and learning approach are +computationally efficient. Most importantly, we find out that syntactic +representations provide a plausible interpretation of the vectors and +subsequent hierarchical vectors outperform the original vectors in benchmark +tests. + +
+
+
+
+
+ + ☆ Physics Informed Distillation for Diffusion Models + + +
+ Diffusion models have recently emerged as a potent tool in generative +modeling. However, their inherent iterative nature often results in sluggish +image generation due to the requirement for multiple model evaluations. Recent +progress has unveiled the intrinsic link between diffusion models and +Probability Flow Ordinary Differential Equations (ODEs), thus enabling us to +conceptualize diffusion models as ODE systems. Simultaneously, Physics Informed +Neural Networks (PINNs) have substantiated their effectiveness in solving +intricate differential equations through implicit modeling of their solutions. +Building upon these foundational insights, we introduce Physics Informed +Distillation (PID), which employs a student model to represent the solution of +the ODE system corresponding to the teacher diffusion model, akin to the +principles employed in PINNs. Through experiments on CIFAR 10 and ImageNet +64x64, we observe that PID achieves performance comparable to recent +distillation methods. Notably, it demonstrates predictable trends concerning +method-specific hyperparameters and eliminates the need for synthetic dataset +generation during the distillation process, both of which contribute to its +easy-to-use nature as a distillation approach for Diffusion Models. Our code +and pre-trained checkpoint are publicly available at: +https://github.com/pantheon5100/pid_diffusion.git. + +
+
+
+
+
+ + ☆ Federated Graph Learning with Graphless Clients + + +
+ Federated Graph Learning (FGL) is tasked with training machine learning +models, such as Graph Neural Networks (GNNs), for multiple clients, each with +its own graph data. Existing methods usually assume that each client has both +node features and graph structure of its graph data. In real-world scenarios, +however, there exist federated systems where only a part of the clients have +such data while other clients (i.e. graphless clients) may only have node +features. This naturally leads to a novel problem in FGL: how to jointly train +a model over distributed graph data with graphless clients? In this paper, we +propose a novel framework FedGLS to tackle the problem in FGL with graphless +clients. In FedGLS, we devise a local graph learner on each graphless client +which learns the local graph structure with the structure knowledge transferred +from other clients. To enable structure knowledge transfer, we design a GNN +model and a feature encoder on each client. During local training, the feature +encoder retains the local graph structure knowledge together with the GNN model +via knowledge distillation, and the structure knowledge is transferred among +clients in global update. Our extensive experiments demonstrate the superiority +of the proposed FedGLS over five baselines. + +
+
+ comment: Accepted by Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ☆ Surprisingly Popular Voting for Concentric Rank-Order Models + + +
+ An important problem on social information sites is the recovery of ground +truth from individual reports when the experts are in the minority. The wisdom +of the crowd, i.e., the collective opinion of a group of individuals, fails in +such a scenario. However, the surprisingly popular (SP) +algorithm (Prelec et al., 2017) can recover the ground truth even when the +experts are in the minority, by asking the individuals to report additional +prediction reports--their beliefs about the reports of others. Several recent +works have extended the surprisingly popular algorithm to an equivalent voting +rule (SP-voting) to recover the ground truth ranking over a set of $m$ +alternatives. However, we are yet to fully understand when SP-voting can +recover the ground truth ranking, and if so, how many samples (votes and +predictions) it needs. We answer this question by proposing two rank-order +models and analyzing the sample complexity of SP-voting under these models. In +particular, we propose concentric mixtures of Mallows and Plackett-Luce models +with $G (\ge 2)$ groups. Our models generalize previously proposed concentric +mixtures of Mallows models with $2$ groups, and we highlight the importance of +$G > 2$ groups by identifying three distinct groups (expert, intermediate, and +non-expert) from existing datasets. Next, we provide conditions on the +parameters of the underlying models so that SP-voting can recover ground-truth +rankings with high probability, and also derive sample complexities under the +same. We complement the theoretical results by evaluating SP-voting on +simulated and real datasets. + +
+
+
+
+
+ + ☆ Coverage Analysis for Digital Cousin Selection -- Improving + Multi-Environment Q-Learning + + +
+ Q-learning is widely employed for optimizing various large-dimensional +networks with unknown system dynamics. Recent advancements include +multi-environment mixed Q-learning (MEMQ) algorithms, which utilize multiple +independent Q-learning algorithms across multiple, structurally related but +distinct environments and outperform several state-of-the-art Q-learning +algorithms in terms of accuracy, complexity, and robustness. We herein conduct +a comprehensive probabilistic coverage analysis to ensure optimal data coverage +conditions for MEMQ algorithms. First, we derive upper and lower bounds on the +expectation and variance of different coverage coefficients (CC) for MEMQ +algorithms. Leveraging these bounds, we develop a simple way of comparing the +utilities of multiple environments in MEMQ algorithms. This approach appears to +be near optimal versus our previously proposed partial ordering approach. We +also present a novel CC-based MEMQ algorithm to improve the accuracy and +complexity of existing MEMQ algorithms. Numerical experiments are conducted +using random network graphs with four different graph properties. Our algorithm +can reduce the average policy error (APE) by 65% compared to partial ordering +and is 95% faster than the exhaustive search. It also achieves 60% less APE +than several state-of-the-art reinforcement learning and prior MEMQ algorithms. +Additionally, we numerically verify the theoretical results and show their +scalability with the action-space size. + +
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Communication Efficient Decentralization for Smoothed Online Convex + Optimization + + +
+ We study the multi-agent Smoothed Online Convex Optimization (SOCO) problem, +where $N$ agents interact through a communication graph. In each round, each +agent $i$ receives a strongly convex hitting cost function $f^i_t$ in an online +fashion and selects an action $x^i_t \in \mathbb{R}^d$. The objective is to +minimize the global cumulative cost, which includes the sum of individual +hitting costs $f^i_t(x^i_t)$, a temporal "switching cost" for changing +decisions, and a spatial "dissimilarity cost" that penalizes deviations in +decisions among neighboring agents. We propose the first decentralized +algorithm for multi-agent SOCO and prove its asymptotic optimality. Our +approach allows each agent to operate using only local information from its +immediate neighbors in the graph. For finite-time performance, we establish +that the optimality gap in competitive ratio decreases with the time horizon +$T$ and can be conveniently tuned based on the per-round computation available +to each agent. Moreover, our results hold even when the communication graph +changes arbitrarily and adaptively over time. Finally, we establish that the +computational complexity per round depends only logarithmically on the number +of agents and almost linearly on their degree within the graph, ensuring +scalability for large-system implementations. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ Bangla Grammatical Error Detection Leveraging Transformer-based Token + Classification + + +
+ Bangla is the seventh most spoken language by a total number of speakers in +the world, and yet the development of an automated grammar checker in this +language is an understudied problem. Bangla grammatical error detection is a +task of detecting sub-strings of a Bangla text that contain grammatical, +punctuation, or spelling errors, which is crucial for developing an automated +Bangla typing assistant. Our approach involves breaking down the task as a +token classification problem and utilizing state-of-the-art transformer-based +models. Finally, we combine the output of these models and apply rule-based +post-processing to generate a more reliable and comprehensive result. Our +system is evaluated on a dataset consisting of over 25,000 texts from various +sources. Our best model achieves a Levenshtein distance score of 1.04. Finally, +we provide a detailed analysis of different components of our system. + +
+
+
+
+
+ + ☆ Learning-Augmented Algorithms for Online Concave Packing and Convex + Covering Problems + + +
+ Learning-augmented algorithms have been extensively studied across the +computer science community in recent years, driven by advances in machine +learning predictors, which can provide additional information to augment +classical algorithms. Such predictions are especially powerful in the context +of online problems, where decisions have to be made without knowledge of the +future, and which traditionally exhibit impossibility results bounding the +performance of any online algorithm. The study of learning-augmented algorithms +thus aims to use external advice prudently, to overcome classical impossibility +results when the advice is accurate, and still perform comparably to the +state-of-the-art online algorithms even when the advice is inaccurate. + In this paper, we present learning-augmented algorithmic frameworks for two +fundamental optimization settings, extending and generalizing prior works. For +online packing with concave objectives, we present a simple but overarching +strategy that switches between the advice and the state-of-the-art online +algorithm. For online covering with convex objectives, we greatly extend +primal-dual methods for online convex covering programs by Azar et al. (FOCS +2016) and previous learning-augmented framework for online covering linear +programs from the literature, to many new applications. We show that our +algorithms break impossibility results when the advice is accurate, while +maintaining comparable performance with state-of-the-art classical online +algorithms even when the advice is erroneous. + +
+
+ comment: 38 pages. In submission +
+
+
+
+
+ + ☆ Neural Conjugate Flows: Physics-informed architectures with flow + structure + + +
+ We introduce Neural Conjugate Flows (NCF), a class of neural network +architectures equipped with exact flow structure. By leveraging topological +conjugation, we prove that these networks are not only naturally isomorphic to +a continuous group, but are also universal approximators for flows of ordinary +differential equations (ODEs). Furthermore, topological properties of these +flows can be enforced by the architecture in an interpretable manner. We +demonstrate in numerical experiments how this topological group structure leads +to concrete computational gains over other physics informed neural networks in +estimating and extrapolating latent dynamics of ODEs, while training up to five +times faster than other flow-based architectures. + +
+
+
+
+
+ + ☆ Are LLMs Prescient? A Continuous Evaluation using Daily News as the + Oracle + + +
+ Many existing evaluation benchmarks for Large Language Models (LLMs) quickly +become outdated due to the emergence of new models and training data. These +benchmarks also fall short in assessing how LLM performance changes over time, +as they consist of static questions without a temporal dimension. To address +these limitations, we propose using future event prediction as a continuous +evaluation method to assess LLMs' temporal generalization and forecasting +abilities. Our benchmark, Daily Oracle, automatically generates question-answer +(QA) pairs from daily news, challenging LLMs to predict "future" event +outcomes. Our findings reveal that as pre-training data becomes outdated, LLM +performance degrades over time. While Retrieval Augmented Generation (RAG) has +the potential to enhance prediction accuracy, the performance degradation +pattern persists, highlighting the need for continuous model updates. + +
+
+
+
+
+ + ☆ Conditional Variable Flow Matching: Transforming Conditional Densities + with Amortized Conditional Optimal Transport + + +
+ Forecasting stochastic nonlinear dynamical systems under the influence of +conditioning variables is a fundamental challenge repeatedly encountered across +the biological and physical sciences. While flow-based models can impressively +predict the temporal evolution of probability distributions representing +possible outcomes of a specific process, existing frameworks cannot +satisfactorily account for the impact of conditioning variables on these +dynamics. Amongst several limitations, existing methods require training data +with paired conditions and are developed for discrete conditioning variables. +We propose Conditional Variable Flow Matching (CVFM), a framework for learning +flows transforming conditional distributions with amortization across +continuous conditioning variables - permitting predictions across the +conditional density manifold. This is accomplished through several novel +advances, in particular, simultaneous sample conditioned flows over the main +and conditioning variables, alongside a conditional Wasserstein distance and +kernel facilitating conditional optimal transport. Collectively, these advances +allow for learning system dynamics provided measurement data whose states and +conditioning variables are not in correspondence. We demonstrate CVFM on a +suite of increasingly challenging problems, including discrete and continuous +conditional mapping benchmarks, image-to-image domain transfer, and modeling +the temporal evolution of materials internal structure during manufacturing +processes. We observe that CVFM results in improved performance and convergence +characteristics over alternative conditional variants. + +
+
+
+
+
+ + ☆ SDDBench: A Benchmark for Synthesizable Drug Design + + +
+ A significant challenge in wet lab experiments with current drug design +generative models is the trade-off between pharmacological properties and +synthesizability. Molecules predicted to have highly desirable properties are +often difficult to synthesize, while those that are easily synthesizable tend +to exhibit less favorable properties. As a result, evaluating the +synthesizability of molecules in general drug design scenarios remains a +significant challenge in the field of drug discovery. The commonly used +synthetic accessibility (SA) score aims to evaluate the ease of synthesizing +generated molecules, but it falls short of guaranteeing that synthetic routes +can actually be found. Inspired by recent advances in top-down synthetic route +generation, we propose a new, data-driven metric to evaluate molecule +synthesizability. Our approach directly assesses the feasibility of synthetic +routes for a given molecule through our proposed round-trip score. This novel +metric leverages the synergistic duality between retrosynthetic planners and +reaction predictors, both of which are trained on extensive reaction datasets. +To demonstrate the efficacy of our method, we conduct a comprehensive +evaluation of round-trip scores alongside search success rate across a range of +representative molecule generative models. Code is available at +https://github.com/SongtaoLiu0823/SDDBench. + +
+
+
+
+
+ + ☆ TowerDebias: A Novel Debiasing Method based on the Tower Property + + +
+ Decision-making processes have increasingly come to rely on sophisticated +machine learning tools, raising concerns about the fairness of their +predictions with respect to any sensitive groups. The widespread use of +commercial black-box machine learning models necessitates careful consideration +of their legal and ethical implications on consumers. In situations where users +have access to these "black-box" models, a key question emerges: how can we +mitigate or eliminate the influence of sensitive attributes, such as race or +gender? We propose towerDebias (tDB), a novel approach designed to reduce the +influence of sensitive variables in predictions made by black-box models. Using +the Tower Property from probability theory, tDB aims to improve prediction +fairness during the post-processing stage in a manner amenable to the +Fairness-Utility Tradeoff. This method is highly flexible, requiring no prior +knowledge of the original model's internal structure, and can be extended to a +range of different applications. We provide a formal improvement theorem for +tDB and demonstrate its effectiveness in both regression and classification +tasks, underscoring its impact on the fairness-utility tradeoff. + +
+
+ comment: To be submitted to a journal soon +
+
+
+
+
+ + ☆ RESOLVE: Relational Reasoning with Symbolic and Object-Level Features + Using Vector Symbolic Processing + + +
+ Modern transformer-based encoder-decoder architectures struggle with +reasoning tasks due to their inability to effectively extract relational +information between input objects (data/tokens). Recent work introduced the +Abstractor module, embedded between transformer layers, to address this gap. +However, the Abstractor layer, while excelling at capturing relational +information (pure relational reasoning), faces challenges in tasks that require +both object and relational-level reasoning (partial relational reasoning). To +address this, we propose RESOLVE, a neuro-vector symbolic architecture that +combines object-level features with relational representations in +high-dimensional spaces, using fast and efficient operations such as bundling +(summation) and binding (Hadamard product) allowing both object-level features +and relational representations to coexist within the same structure without +interfering with one another. RESOLVE is driven by a novel attention mechanism +that operates in a bipolar high dimensional space, allowing fast attention +score computation compared to the state-of-the-art. By leveraging this design, +the model achieves both low compute latency and memory efficiency. RESOLVE also +offers better generalizability while achieving higher accuracy in purely +relational reasoning tasks such as sorting as well as partial relational +reasoning tasks such as math problem-solving compared to state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Hashing for Protein Structure Similarity Search + + +
+ Protein structure similarity search (PSSS), which tries to search for proteins +with similar structures, plays a crucial role across diverse domains from drug +design to protein function prediction and molecular evolution. Traditional +alignment-based PSSS methods, which directly calculate alignment on the protein +structures, are highly time-consuming with high memory cost. Recently, +alignment-free methods, which represent protein structures as fixed-length +real-valued vectors, are proposed for PSSS. Although these methods have lower +time and memory cost than alignment-based methods, their time and memory cost +is still too high for large-scale PSSS, and their accuracy is unsatisfactory. +In this paper, we propose a novel method, called +$\underline{\text{p}}$r$\underline{\text{o}}$tein +$\underline{\text{s}}$tructure $\underline{\text{h}}$ashing (POSH), for PSSS. +POSH learns a binary vector representation for each protein structure, which +can dramatically reduce the time and memory cost for PSSS compared with +real-valued vector representation based methods. Furthermore, in POSH we also +propose expressive hand-crafted features and a structure encoder to well model +both node and edge interactions in proteins. Experimental results on real +datasets show that POSH can outperform other methods to achieve +state-of-the-art accuracy. Furthermore, POSH achieves a memory saving of more +than six times and speed improvement of more than four times, compared with +other methods. + +
+
+
+
+
+ + ☆ Least Squares Training of Quadratic Convolutional Neural Networks with + Applications to System Theory + + +
+ This paper provides a least squares formulation for the training of a 2-layer +convolutional neural network using quadratic activation functions, a 2-norm +loss function, and no regularization term. Using this method, an analytic +expression for the globally optimal weights is obtained alongside a quadratic +input-output equation for the network. These properties make the network a +viable tool in system theory by enabling further analysis, such as the +sensitivity of the output to perturbations in the input, which is crucial for +safety-critical systems such as aircraft or autonomous vehicles. The least +squares method is compared to previously proposed strategies for training +quadratic networks and to a back-propagation-trained ReLU network. The proposed +method is applied to a system identification problem and a GPS position +estimation problem. The least squares network is shown to have a significantly +reduced training time with minimal compromises on prediction accuracy alongside +the advantages of having an analytic input-output equation. Although these +results only apply to 2-layer networks, this paper motivates the exploration of +deeper quadratic networks in the context of system theory. + +
+
+
+
+
+ + ☆ GPTree: Towards Explainable Decision-Making via LLM-powered Decision + Trees + + +
+ Traditional decision tree algorithms are explainable but struggle with +non-linear, high-dimensional data, limiting their applicability in complex +decision-making. Neural networks excel at capturing complex patterns but +sacrifice explainability in the process. In this work, we present GPTree, a +novel framework combining explainability of decision trees with the advanced +reasoning capabilities of LLMs. GPTree eliminates the need for feature +engineering and prompt chaining, requiring only a task-specific prompt and +leveraging a tree-based structure to dynamically split samples. We also +introduce an expert-in-the-loop feedback mechanism to further enhance +performance by enabling human intervention to refine and rebuild decision +paths, emphasizing the harmony between human expertise and machine +intelligence. Our decision tree achieved a 7.8% precision rate for identifying +"unicorn" startups at the inception stage of a startup, surpassing gpt-4o with +few-shot learning as well as the best human decision-makers (3.1% to 5.6%). + +
+
+
+
+
+ + ♻ ☆ Regional Style and Color Transfer + + +
+ This paper presents a novel contribution to the field of regional style +transfer. Existing methods often suffer from the drawback of applying style +homogeneously across the entire image, leading to stylistic inconsistencies or +twisted foreground objects when applied to images with foreground elements such +as person figures. To address this limitation, we propose a new approach that +leverages a segmentation network to precisely isolate foreground objects within +the input image. Subsequently, style transfer is applied exclusively to the +background region. The isolated foreground objects are then carefully +reintegrated into the style-transferred background. To enhance the visual +coherence between foreground and background, a color transfer step is employed +on the foreground elements prior to their reincorporation. Finally, we utilize +feathering techniques to achieve a seamless amalgamation of foreground and +background, resulting in a visually unified and aesthetically pleasing final +composition. Extensive evaluations demonstrate that our proposed approach +yields significantly more natural stylistic transformations compared to +conventional methods. + +
+
+ comment: Accepted by 2024 5th International Conference on Computer Vision, + Image and Deep Learning +
+
+
+
+
+ + ♻ ☆ A Single Transformer for Scalable Vision-Language Modeling + + +
+ We present SOLO, a single transformer for Scalable visiOn-Language mOdeling. +Current large vision-language models (LVLMs) such as LLaVA mostly employ +heterogeneous architectures that connect pre-trained visual encoders with large +language models (LLMs) to facilitate visual recognition and complex reasoning. +Although achieving remarkable performance with relatively lightweight training, +we identify four primary scalability limitations: (1) The visual capacity is +constrained by pre-trained visual encoders, which are typically an order of +magnitude smaller than LLMs. (2) The heterogeneous architecture complicates the +use of established hardware and software infrastructure. (3) Study of scaling +laws on such architecture must consider three separate components - visual +encoder, connector, and LLMs, which complicates the analysis. (4) The use of +existing visual encoders typically requires following a pre-defined +specification of image inputs pre-processing, for example, by reshaping inputs +to fixed-resolution square images, which presents difficulties in processing +and training on high-resolution images or those with unusual aspect ratio. A +unified single Transformer architecture, like SOLO, effectively addresses these +scalability concerns in LVLMs; however, its limited adoption in the modern +context likely stems from the absence of reliable training recipes that balance +both modalities and ensure stable training for billion-scale models. In this +paper, we introduce the first open-source training recipe for developing SOLO, +an open-source 7B LVLM using moderate academic resources. The training recipe +involves initializing from LLMs, sequential pre-training on ImageNet and +web-scale data, and instruction fine-tuning on our curated high-quality +datasets. On extensive evaluation, SOLO demonstrates performance comparable to +LLaVA-v1.5-7B, particularly excelling in visual mathematical reasoning. + +
+
+ comment: Accepted to TMLR +
+
+
+
+
+ + ♻ ☆ Physics-Informed Geometry-Aware Neural Operator + + +
+ Engineering design problems often involve solving parametric Partial +Differential Equations (PDEs) under variable PDE parameters and domain +geometry. Recently, neural operators have shown promise in learning PDE +operators and quickly predicting the PDE solutions. However, training these +neural operators typically requires large datasets, the acquisition of which +can be prohibitively expensive. To overcome this, physics-informed training +offers an alternative way of building neural operators, eliminating the high +computational costs associated with Finite Element generation of training data. +Nevertheless, current physics-informed neural operators struggle with +limitations, either in handling varying domain geometries or varying PDE +parameters. In this research, we introduce a novel method, the Physics-Informed +Geometry-Aware Neural Operator (PI-GANO), designed to simultaneously generalize +across both PDE parameters and domain geometries. We adopt a geometry encoder +to capture the domain geometry features, and design a novel pipeline to +integrate this component within the existing DCON architecture. Numerical +results demonstrate the accuracy and efficiency of the proposed method. All the +codes and data related to this work are available on GitHub: +https://github.com/WeihengZ/Physics-informed-Neural-Foundation-Operator. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2404.13646 +
+
+
+
+
+ + ♻ ☆ Insights and Current Gaps in Open-Source LLM Vulnerability Scanners: A + Comparative Analysis + + +
+ This report presents a comparative analysis of open-source vulnerability +scanners for conversational large language models (LLMs). As LLMs become +integral to various applications, they also present potential attack surfaces, +exposed to security risks such as information leakage and jailbreak attacks. +Our study evaluates prominent scanners - Garak, Giskard, PyRIT, and +CyberSecEval - that adapt red-teaming practices to expose these +vulnerabilities. We detail the distinctive features and practical use of these +scanners, outline unifying principles of their design and perform quantitative +evaluations to compare them. These evaluations uncover significant reliability +issues in detecting successful attacks, highlighting a fundamental gap for +future development. Additionally, we contribute a preliminary labelled dataset, +which serves as an initial step to bridge this gap. Based on the above, we +provide strategic recommendations to assist organizations in choosing the most +suitable scanner for their red-teaming needs, accounting for customizability, +test suite comprehensiveness, and industry-specific use cases. + +
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ $π_0$: A Vision-Language-Action Flow Model for General Robot Control + + +
+ Robot learning holds tremendous promise to unlock the full potential of +flexible, general, and dexterous robot systems, as well as to address some of +the deepest questions in artificial intelligence. However, bringing robot +learning to the level of generality required for effective real-world systems +faces major obstacles in terms of data, generalization, and robustness. In this +paper, we discuss how generalist robot policies (i.e., robot foundation models) +can address these challenges, and how we can design effective generalist robot +policies for complex and highly dexterous tasks. We propose a novel flow +matching architecture built on top of a pre-trained vision-language model (VLM) +to inherit Internet-scale semantic knowledge. We then discuss how this model +can be trained on a large and diverse dataset from multiple dexterous robot +platforms, including single-arm robots, dual-arm robots, and mobile +manipulators. We evaluate our model in terms of its ability to perform tasks in +zero shot after pre-training, follow language instructions from people and from +a high-level VLM policy, and its ability to acquire new skills via fine-tuning. +Our results cover a wide variety of tasks, such as laundry folding, table +cleaning, and assembling boxes. + +
+
+ comment: See project website for videos: + https://physicalintelligence.company/blog/pi0 +
+
+
+
+
+ + ♻ ☆ Physics-informed Discretization-independent Deep Compositional Operator + Network + + +
+ Solving parametric Partial Differential Equations (PDEs) for a broad range of +parameters is a critical challenge in scientific computing. To this end, neural +operators, which predict the PDE solution with variable PDE +parameter inputs, have been successfully used. However, the training of neural +operators typically demands large training datasets, the acquisition of which +can be prohibitively expensive. To address this challenge, physics-informed +training can offer a cost-effective strategy. However, current physics-informed +neural operators face limitations, either in handling irregular domain shapes +or in generalizing to various discrete representations of PDE parameters. In +this research, we introduce a novel physics-informed model architecture which +can generalize to various discrete representations of PDE parameters and +irregular domain shapes. Particularly, inspired by deep operator neural +networks, our model involves a discretization-independent learning of parameter +embedding repeatedly, and this parameter embedding is integrated with the +response embeddings through multiple compositional layers, for more +expressivity. Numerical results demonstrate the accuracy and efficiency of the +proposed method. All the codes and data related to this work are available on +GitHub: https://github.com/WeihengZ/PI-DCON. + +
+
+
+
+
+ + ♻ ☆ A Universal Deep Learning Framework for Materials X-ray Absorption + Spectra + + +
+ X-ray absorption spectroscopy (XAS) is a powerful characterization technique +for probing the local chemical environment of absorbing atoms. However, +analyzing XAS data presents significant challenges, often requiring extensive, +computationally intensive simulations, as well as significant domain expertise. +These limitations hinder the development of fast, robust XAS analysis pipelines +that are essential in high-throughput studies and for autonomous +experimentation. We address these challenges with OmniXAS, a framework that +contains a suite of transfer learning approaches for XAS prediction, each +contributing to improved accuracy and efficiency, as demonstrated on K-edge +spectra database covering eight 3d transition metals (Ti-Cu). The OmniXAS +framework is built upon three distinct strategies. First, we use M3GNet to +derive latent representations of the local chemical environment of absorption +sites as input for XAS prediction, achieving up to order-of-magnitude +improvements over conventional featurization techniques. Second, we employ a +hierarchical transfer learning strategy, training a universal multi-task model +across elements before fine-tuning for element-specific predictions. Models +based on this cascaded approach after element-wise fine-tuning outperform +element-specific models by up to 69%. Third, we implement cross-fidelity +transfer learning, adapting a universal model to predict spectra generated by +simulation of a different fidelity with a higher computational cost. This +approach improves prediction accuracy by up to 11% over models trained on the +target fidelity alone. Our approach boosts the throughput of XAS modeling by +orders of magnitude versus first-principles simulations and is extendable to +XAS prediction for a broader range of elements. This transfer learning +framework is generalizable to enhance deep-learning models that target other +properties in materials research. + +
+
+ comment: Main manuscript: 22 pages, 11 figures. Supplemental material (12 + pages, 6 figures) available as a separate file in arXiv ancillary files + (additional downloadable files) +
+
+
+
+
+ + ♻ ☆ Uncertainty of Thoughts: Uncertainty-Aware Planning Enhances Information + Seeking in Large Language Models NeurIPS 2024 + + +
+ In the face of uncertainty, the ability to *seek information* is of +fundamental importance. In many practical applications, such as medical +diagnosis and troubleshooting, the information needed to solve the task is not +initially given and has to be actively sought by asking follow-up questions +(for example, a doctor asking a patient for more details about their symptoms). +In this work, we introduce Uncertainty of Thoughts (UoT), an algorithm to +augment large language models with the ability to actively seek information by +asking effective questions. UoT combines 1) an *uncertainty-aware simulation +approach* which enables the model to simulate possible future scenarios and how +likely they are to occur, 2) *uncertainty-based rewards* motivated by +information gain which incentivizes the model to seek information, and 3) a +*reward propagation scheme* to select the optimal question to ask in a way that +maximizes the expected reward. In experiments on medical diagnosis, +troubleshooting, and the `20 Questions` game, UoT achieves an average +performance improvement of 38.1% in the rate of successful task completion +across multiple LLMs compared with direct prompting and also improves +efficiency (i.e., the number of questions needed to complete the task). Our +code has been released [here](https://github.com/zhiyuanhubj/UoT) + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Active Inference Meeting Energy-Efficient Control of Parallel and + Identical Machines + + +
+ We investigate the application of active inference in developing +energy-efficient control agents for manufacturing systems. Active inference, +rooted in neuroscience, provides a unified probabilistic framework integrating +perception, learning, and action, with inherent uncertainty quantification +elements. Our study explores deep active inference, an emerging field that +combines deep learning with the active inference decision-making framework. +Leveraging a deep active inference agent, we focus on controlling parallel and +identical machine workstations to enhance energy efficiency. We address +challenges posed by the problem's stochastic nature and delayed policy response +by introducing tailored enhancements to existing agent architectures. +Specifically, we introduce multi-step transition and hybrid horizon methods to +mitigate the need for complex planning. Our experimental results demonstrate +the effectiveness of these enhancements and highlight the potential of the +active inference-based approach. + +
+
+ comment: Accepted at the 10th International Conference on Machine Learning, + Optimization, and Data Science +
+
+
+
+
+ + ♻ ☆ On Training Survival Models with Scoring Rules + + +
+ Scoring rules are an established way of comparing predictive performances +across model classes. In the context of survival analysis, they require +adaptation in order to accommodate censoring. This work investigates using +scoring rules for model training rather than evaluation. Doing so, we establish +a general framework for training survival models that is model agnostic and can +learn event time distributions parametrically or non-parametrically. In +addition, our framework is not restricted to any specific scoring rule. While +we focus on neural network-based implementations, we also provide +proof-of-concept implementations using gradient boosting, generalized additive +models, and trees. Empirical comparisons on synthetic and real-world data +indicate that scoring rules can be successfully incorporated into model +training and yield competitive predictive performance with established +time-to-event models. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ CGRclust: Chaos Game Representation for Twin Contrastive Clustering of + Unlabelled DNA Sequences + + +
+ This study proposes CGRclust, a novel combination of unsupervised twin +contrastive clustering of Chaos Game Representations (CGR) of DNA sequences, +with convolutional neural networks (CNNs). To the best of our knowledge, +CGRclust is the first method to use unsupervised learning for image +classification (herein applied to two-dimensional CGR images) for clustering +datasets of DNA sequences. CGRclust overcomes the limitations of traditional +sequence classification methods by leveraging unsupervised twin contrastive +learning to detect distinctive sequence patterns, without requiring DNA +sequence alignment or biological/taxonomic labels. CGRclust accurately +clustered twenty-five diverse datasets, with sequence lengths ranging from 664 +bp to 100 kbp, including mitochondrial genomes of fish, fungi, and protists, as +well as viral whole genome assemblies and synthetic DNA sequences. Compared +with three recent clustering methods for DNA sequences (DeLUCS, iDeLUCS, and +MeShClust v3.0), CGRclust is the only method that surpasses 81.70% accuracy +across all four taxonomic levels tested for mitochondrial DNA genomes of fish. +Moreover, CGRclust also consistently demonstrates superior performance across +all the viral genomic datasets. The high clustering accuracy of CGRclust on +these twenty-five datasets, which vary significantly in terms of sequence +length, number of genomes, number of clusters, and level of taxonomy, +demonstrates its robustness, scalability, and versatility. + +
+
+ comment: 28 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Optimal vintage factor analysis with deflation varimax + + +
+ Vintage factor analysis is one important type of factor analysis that aims to +first find a low-dimensional representation of the original data, and then to +seek a rotation such that the rotated low-dimensional representation is +scientifically meaningful. The most widely used vintage factor analysis is the +Principal Component Analysis (PCA) followed by the varimax rotation. Despite +its popularity, little theoretical guarantee can be provided to date mainly +because varimax rotation requires solving a non-convex optimization over the +set of orthogonal matrices. + In this paper, we propose a deflation varimax procedure that solves each row +of an orthogonal matrix sequentially. In addition to its net computational gain +and flexibility, we are able to fully establish theoretical guarantees for the +proposed procedure in a broader context. Adopting this new deflation varimax as +the second step after PCA, we further analyze this two step procedure under a +general class of factor models. Our results show that it estimates the factor +loading matrix in the minimax optimal rate when the signal-to-noise-ratio (SNR) +is moderate or large. In the low SNR regime, we offer possible improvement over +using PCA and the deflation varimax when the additive noise under the factor +model is structured. The modified procedure is shown to be minimax optimal in +all SNR regimes. Our theory is valid for finite sample and allows the number of +the latent factors to grow with the sample size as well as the ambient +dimension to grow with, or even exceed, the sample size. Extensive simulation +and real data analysis further corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ On the Effects of Data Scale on UI Control Agents NeurIPS 2024 + + +
+ Autonomous agents that control computer interfaces to accomplish human tasks +are emerging. Leveraging LLMs to power such agents has been of special +interest, but unless fine-tuned on human-collected task demonstrations, +performance is still relatively low. In this work we study whether fine-tuning +alone is a viable approach for building real-world computer control agents. In +particular, we investigate how performance measured on both high and +low-level tasks in domain and out of domain scales as more training data is +collected. To this end we collect and release a new dataset, AndroidControl, +consisting of 15,283 demonstrations of everyday tasks with Android apps. +Compared to existing datasets, each AndroidControl task instance includes both +high and low-level human-generated instructions, allowing us to explore the +level of task complexity an agent can handle. Moreover, AndroidControl is the +most diverse computer control dataset to date, including 14,548 unique tasks +over 833 Android apps, thus allowing us to conduct in-depth analysis of the +model performance in and out of the domain of the training data. Using the +dataset, we find that when tested in domain, fine-tuned models outperform zero +and few-shot baselines and scale in such a way that robust performance might +feasibly be obtained simply by collecting more data. Out of domain, performance +scales significantly more slowly and suggests that in particular for high-level +tasks, fine-tuning on more data alone may be insufficient for achieving robust +out-of-domain performance. + +
+
+ comment: NeurIPS 2024 (Datasets and Benchmarks) +
+
+
+
+
+ + ♻ ☆ AudioProtoPNet: An interpretable deep learning model for bird sound + classification + + +
+ Deep learning models have significantly advanced acoustic bird monitoring by +being able to recognize numerous bird species based on their vocalizations. +However, traditional deep learning models are black boxes that provide no +insight into their underlying computations, limiting their usefulness to +ornithologists and machine learning engineers. Explainable models could +facilitate debugging, knowledge discovery, trust, and interdisciplinary +collaboration. This study introduces AudioProtoPNet, an adaptation of the +Prototypical Part Network (ProtoPNet) for multi-label bird sound +classification. It is an inherently interpretable model that uses a ConvNeXt +backbone to extract embeddings, with the classification layer replaced by a +prototype learning classifier trained on these embeddings. The classifier +learns prototypical patterns of each bird species' vocalizations from +spectrograms of training instances. During inference, audio recordings are +classified by comparing them to the learned prototypes in the embedding space, +providing explanations for the model's decisions and insights into the most +informative embeddings of each bird species. The model was trained on the +BirdSet training dataset, which consists of 9,734 bird species and over 6,800 +hours of recordings. Its performance was evaluated on the seven test datasets +of BirdSet, covering different geographical regions. AudioProtoPNet +outperformed the state-of-the-art model Perch, achieving an average AUROC of +0.90 and a cmAP of 0.42, with relative improvements of 7.1% and 16.7% over +Perch, respectively. These results demonstrate that even for the challenging +task of multi-label bird sound classification, it is possible to develop +powerful yet inherently interpretable deep learning models that provide +valuable insights for ornithologists and machine learning engineers. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Implicit Bias of Mirror Flow on Separable Data + + +
+ We examine the continuous-time counterpart of mirror descent, namely mirror +flow, on classification problems which are linearly separable. Such problems +are minimised `at infinity' and have many possible solutions; we study which +solution is preferred by the algorithm depending on the mirror potential. For +exponential tailed losses and under mild assumptions on the potential, we show +that the iterates converge in direction towards a $\phi_\infty$-maximum margin +classifier. The function $\phi_\infty$ is the \textit{horizon function} of the +mirror potential and characterises its shape `at infinity'. When the potential +is separable, a simple formula allows this function to be computed. We analyse +several examples of potentials and provide numerical experiments highlighting +our results. + +
+
+ comment: Neurips camera ready. Minor changes from the previous versions. + Mainly added full iterate trajectories (Figure 4) +
+
+
+
+
+ + ♻ ☆ Rethinking Distribution Shifts: Empirical Analysis and Inductive + Modeling for Tabular Data NeurIPS 2023 + + +
+ Different distribution shifts require different interventions, and algorithms +must be grounded in the specific shifts they address. However, methodological +development for robust algorithms typically relies on structural assumptions +that lack empirical validation. Advocating for an empirically grounded +data-driven approach to research, we build an empirical testbed comprising +natural shifts across 5 tabular datasets and 60,000 method configurations +encompassing imbalanced learning and distributionally robust optimization (DRO) +methods. We find $Y|X$-shifts are most prevalent on our testbed, in stark +contrast to the heavy focus on $X$ (covariate)-shifts in the ML literature. The +performance of robust algorithms varies significantly over shift types, and is +no better than that of vanilla methods. To understand why, we conduct an +in-depth empirical analysis of DRO methods and find that although often +neglected by researchers, implementation details -- such as the choice of +underlying model class (e.g., XGBoost) and hyperparameter selection -- have a +bigger impact on performance than the ambiguity set or its radius. To further +bridge that gap between methodological research and practice, we design case +studies that illustrate how such a data-driven, inductive understanding of +distribution shifts can enhance both data-centric and algorithmic +interventions. + +
+
+ comment: Conference version appeared in NeurIPS 2023, previously titled "On + the Need for a Language Describing Distribution Shifts: Illustrations on + Tabular Datasets" +
+
+
+
+
+ + ♻ ☆ Calibrating Bayesian Generative Machine Learning for Bayesiamplification + + +
+ Recently, combinations of generative and Bayesian machine learning have been +introduced in particle physics for both fast detector simulation and inference +tasks. These neural networks aim to quantify the uncertainty on the generated +distribution originating from limited training statistics. The interpretation +of a distribution-wide uncertainty however remains ill-defined. We show a clear +scheme for quantifying the calibration of Bayesian generative machine learning +models. For a Continuous Normalizing Flow applied to a low-dimensional toy +example, we evaluate the calibration of Bayesian uncertainties from either a +mean-field Gaussian weight posterior, or Monte Carlo sampling network weights, +to gauge their behaviour on unsteady distribution edges. Well calibrated +uncertainties can then be used to roughly estimate the number of uncorrelated +truth samples that are equivalent to the generated sample and clearly indicate +data amplification for smooth features of the distribution. + +
+
+ comment: 15 pages, 6 figures, updated references, fixed typo +
+
+
+
+
+ + ♻ ☆ Neural Persistence Dynamics + + +
+ We consider the problem of learning the dynamics in the topology of +time-evolving point clouds, the prevalent spatiotemporal model for systems +exhibiting collective behavior, such as swarms of insects and birds or +particles in physics. In such systems, patterns emerge from (local) +interactions among self-propelled entities. While several well-understood +governing equations for motion and interaction exist, they are notoriously +difficult to fit to data, as most prior work requires knowledge about +individual motion trajectories, i.e., a requirement that is challenging to +satisfy with an increasing number of entities. To evade such confounding +factors, we investigate collective behavior from a $\textit{topological +perspective}$, but instead of summarizing entire observation sequences (as done +previously), we propose learning a latent dynamical model from topological +features $\textit{per time point}$. The latter is then used to formulate a +downstream regression task to predict the parametrization of some a priori +specified governing equation. We implement this idea based on a latent ODE +learned from vectorized (static) persistence diagrams and show that a +combination of recent stability results for persistent homology justifies this +modeling choice. Various (ablation) experiments not only demonstrate the +relevance of each model component but provide compelling empirical evidence +that our proposed model - $\textit{Neural Persistence Dynamics}$ - +substantially outperforms the state-of-the-art across a diverse set of +parameter regression tasks. + +
+
+
+
+
+ + ♻ ☆ Circuit design in biology and machine learning. I. Random networks and + dimensional reduction + + +
+ A biological circuit is a neural or biochemical cascade, taking inputs and +producing outputs. How have biological circuits learned to solve environmental +challenges over the history of life? The answer certainly follows Dobzhansky's +famous quote that ``nothing in biology makes sense except in the light of +evolution.'' But that quote leaves out the mechanistic basis by which natural +selection's trial-and-error learning happens, which is exactly what we have to +understand. How does the learning process that designs biological circuits +actually work? How much insight can we gain about the form and function of +biological circuits by studying the processes that have made those circuits? +Because life's circuits must often solve the same problems as those faced by +machine learning, such as environmental tracking, homeostatic control, +dimensional reduction, or classification, we can begin by considering how +machine learning designs computational circuits to solve problems. We can then +ask: How much insight do those computational circuits provide about the design +of biological circuits? How much does biology differ from computers in the +particular circuit designs that it uses to solve problems? This article steps +through two classic machine learning models to set the foundation for analyzing +broad questions about the design of biological circuits. One insight is the +surprising power of randomly connected networks. Another is the central role of +internal models of the environment embedded within biological circuits, +illustrated by a model of dimensional reduction and trend prediction. Overall, +many challenges in biology have machine learning analogs, suggesting hypotheses +about how biology's circuits are designed. + +
+
+ comment: Added background info in two text boxes and new figure, edited + throughout +
+
+
+
+
+ + ♻ ☆ No Free Lunch in LLM Watermarking: Trade-offs in Watermarking Design + Choices + + +
+ Advances in generative models have made it possible for AI-generated text, +code, and images to mirror human-generated content in many applications. +Watermarking, a technique that aims to embed information in the output of a +model to verify its source, is useful for mitigating the misuse of such +AI-generated content. However, we show that common design choices in LLM +watermarking schemes make the resulting systems surprisingly susceptible to +attack -- leading to fundamental trade-offs in robustness, utility, and +usability. To navigate these trade-offs, we rigorously study a set of simple +yet effective attacks on common watermarking systems, and propose guidelines +and defenses for LLM watermarking in practice. + +
+
+
+
+
+ + ♻ ☆ Controlling Large Electric Vehicle Charging Stations via User Behavior + Modeling and Stochastic Programming + + +
+ This paper introduces an Electric Vehicle Charging Station (EVCS) model that +incorporates real-world constraints, such as slot power limitations, contract +threshold overruns penalties, or early disconnections of electric vehicles +(EVs). We propose a formulation of the problem of EVCS control under +uncertainty, and implement two Multi-Stage Stochastic Programming approaches +that leverage user-provided information, namely, Model Predictive Control and +Two-Stage Stochastic Programming. The model addresses uncertainties in charging +session start and end times, as well as in energy demand. A user's behavior +model based on a sojourn-time-dependent stochastic process enhances cost +reduction while maintaining customer satisfaction. The benefits of the two +proposed methods are showcased against two baselines over a 22-day simulation +using a real-world dataset. The two-stage approach demonstrates robustness +against early disconnections by considering a wider range of uncertainty +scenarios for optimization. The algorithm prioritizing user satisfaction over +electricity cost achieves a 20% and 36% improvement in two user satisfaction +metrics compared to an industry-standard baseline. Additionally, the algorithm +striking the best balance between cost and user satisfaction exhibits a mere 3% +relative cost increase compared to the theoretically optimal baseline - for +which the nonanticipativity constraint is relaxed - while attaining 94% and 84% +of the user satisfaction performance in the two used satisfaction metrics. + +
+
+
+
+
+ + ♻ ☆ Exponential separations between classical and quantum learners + + +
+ Despite significant effort, the quantum machine learning community has only +demonstrated quantum learning advantages for artificial cryptography-inspired +datasets when dealing with classical data. In this paper we address the +challenge of finding learning problems where quantum learning algorithms can +achieve a provable exponential speedup over classical learning algorithms. We +reflect on computational learning theory concepts related to this question and +discuss how subtle differences in definitions can result in significantly +different requirements and tasks for the learner to meet and solve. We examine +existing learning problems with provable quantum speedups and find that they +largely rely on the classical hardness of evaluating the function that +generates the data, rather than identifying it. To address this, we present two +new learning separations where the classical difficulty primarily lies in +identifying the function generating the data. Furthermore, we explore +computational hardness assumptions that can be leveraged to prove quantum +speedups in scenarios where data is quantum-generated, which implies likely +quantum advantages in a plethora of more natural settings (e.g., in condensed +matter and high energy physics). We also discuss the limitations of the +classical shadow paradigm in the context of learning separations, and how +physically-motivated settings such as characterizing phases of matter and +Hamiltonian learning fit in the computational learning framework. + +
+
+ comment: this article supersedes arXiv:2208.06339 +
+
+
+
+
+ + ♻ ☆ On the Robustness of Neural Collapse and the Neural Collapse of + Robustness + + +
+ Neural Collapse refers to the curious phenomenon in the end of training of a +neural network, where feature vectors and classification weights converge to a +very simple geometrical arrangement (a simplex). While it has been observed +empirically in various cases and has been theoretically motivated, its +connection with crucial properties of neural networks, like their +generalization and robustness, remains unclear. In this work, we study the +stability properties of these simplices. We find that the simplex structure +disappears under small adversarial attacks, and that perturbed examples "leap" +between simplex vertices. We further analyze the geometry of networks that are +optimized to be robust against adversarial perturbations of the input, and find +that Neural Collapse is a pervasive phenomenon in these cases as well, with +clean and perturbed representations forming aligned simplices, and giving rise +to a robust simple nearest-neighbor classifier. By studying the propagation of +the amount of collapse inside the network, we identify novel properties of both +robust and non-robust machine learning models, and show that earlier, unlike +later layers maintain reliable simplices on perturbed data. Our code is +available at https://github.com/JingtongSu/robust_neural_collapse . + +
+
+ comment: Transactions on Machine Learning Research, 2024 +
+
+
+
+
+ + ♻ ☆ Predictive Inference in Multi-environment Scenarios + + +
+ We address the challenge of constructing valid confidence intervals and sets +in problems of prediction across multiple environments. We investigate two +types of coverage suitable for these problems, extending the jackknife and +split-conformal methods to show how to obtain distribution-free coverage in +such non-traditional, potentially hierarchical data-generating scenarios. We +demonstrate a novel resizing method to adapt to problem difficulty, which +applies both to existing approaches for predictive inference and the methods we +develop; this reduces prediction set sizes using limited information from the +test environment, a key to the methods' practical performance, which we +evaluate through neurochemical sensing and species classification datasets. Our +contributions also include extensions for settings with non-real-valued +responses, a theory of consistency for predictive inference in these general +problems, and insights on the limits of conditional coverage. + +
+
+
+
+
+ + ♻ ☆ Investigating the Effectiveness of Explainability Methods in Parkinson's + Detection from Speech + + +
+ Speech impairments in Parkinson's disease (PD) provide significant early +indicators for diagnosis. While models for speech-based PD detection have shown +strong performance, their interpretability remains underexplored. This study +systematically evaluates several explainability methods to identify PD-specific +speech features, aiming to support the development of accurate, interpretable +models for clinical decision-making in PD diagnosis and monitoring. Our +methodology involves (i) obtaining attributions and saliency maps using +mainstream interpretability techniques, (ii) quantitatively evaluating the +faithfulness of these maps and their combinations obtained via union and +intersection through a range of established metrics, and (iii) assessing the +information conveyed by the saliency maps for PD detection from an auxiliary +classifier. Our results reveal that, while explanations are aligned with the +classifier, they often fail to provide valuable information for domain experts. + +
+
+ comment: The first two authors contributed equally to this research: author + order is alphabetical +
+
+
+
+
+ + ♻ ☆ GeSubNet: Gene Interaction Inference for Disease Subtype Network + Generation ICLR 2025 + + +
+ Retrieving gene functional networks from knowledge databases presents a +challenge due to the mismatch between disease networks and subtype-specific +variations. Current solutions, including statistical and deep learning methods, +often fail to effectively integrate gene interaction knowledge from databases +or explicitly learn subtype-specific interactions. To address this mismatch, we +propose GeSubNet, which learns a unified representation capable of predicting +gene interactions while distinguishing between different disease subtypes. +Graphs generated by such representations can be considered subtype-specific +networks. GeSubNet is a multi-step representation learning framework with three +modules: First, a deep generative model learns distinct disease subtypes from +patient gene expression profiles. Second, a graph neural network captures +representations of prior gene networks from knowledge databases, ensuring +accurate physical gene interactions. Finally, we integrate these two +representations using an inference loss that leverages graph generation +capabilities, conditioned on the patient separation loss, to refine +subtype-specific information in the learned representation. GeSubNet +consistently outperforms traditional methods, with average improvements of +30.6%, 21.0%, 20.1%, and 56.6% across four graph evaluation metrics, averaged +over four cancer datasets. Particularly, we conduct a biological simulation +experiment to assess how the behavior of selected genes from over 11,000 +candidates affects subtypes or patient distributions. The results show that the +generated network has the potential to identify subtype-specific genes with an +83% likelihood of impacting patient distribution shifts. The GeSubNet resource +is available: https://anonymous.4open.science/r/GeSubNet/ + +
+
+ comment: Under review as a conference paper at ICLR 2025 +
+
+
+
+
+ + ♻ ☆ Gradient Normalization Provably Benefits Nonconvex SGD under + Heavy-Tailed Noise + + +
+ This paper investigates the roles of gradient normalization and clipping in +ensuring the convergence of Stochastic Gradient Descent (SGD) under +heavy-tailed noise. While existing approaches consider gradient clipping +indispensable for SGD convergence, we theoretically demonstrate that gradient +normalization alone without clipping is sufficient to ensure convergence. +Furthermore, we establish that combining gradient normalization with clipping +offers significantly improved convergence rates compared to using either +technique in isolation, particularly as gradient noise diminishes. With these +results, our work provides the first theoretical evidence demonstrating the +benefits of gradient normalization in SGD under heavy-tailed noise. Finally, we +introduce an accelerated SGD variant that incorporates both gradient +normalization and clipping, further enhancing convergence rates under +heavy-tailed noise. + +
+
+
+
+
+ + ♻ ☆ V-LoL: A Diagnostic Dataset for Visual Logical Learning + + +
+ Despite the successes of recent developments in visual AI, different +shortcomings still exist; from missing exact logical reasoning, to abstract +generalization abilities, to understanding complex and noisy scenes. +Unfortunately, existing benchmarks were not designed to capture more than a +few of these aspects. Whereas deep learning datasets focus on visually complex +data but simple visual reasoning tasks, inductive logic datasets involve +complex logical learning tasks, however, lack the visual component. To address +this, we propose the diagnostic visual logical learning dataset, V-LoL, that +seamlessly combines visual and logical challenges. Notably, we introduce the +first instantiation of V-LoL, V-LoL-Train, - a visual rendition of a classic +benchmark in symbolic AI, the Michalski train problem. By incorporating +intricate visual scenes and flexible logical reasoning tasks within a versatile +framework, V-LoL-Train provides a platform for investigating a wide range of +visual logical learning challenges. We evaluate a variety of AI systems +including traditional symbolic AI, neural AI, as well as neuro-symbolic AI. Our +evaluations demonstrate that even SOTA AI faces difficulties in dealing with +visual logical learning challenges, highlighting unique advantages and +limitations of each methodology. Overall, V-LoL opens up new avenues for +understanding and enhancing current abilities in visual logical learning for AI +systems. + +
+
+
+
+
+ + ♻ ☆ Are Large Language Models Table-based Fact-Checkers? SC + + +
+ Table-based Fact Verification (TFV) aims to extract the entailment relation +between statements and structured tables. Existing TFV methods based on +small-scaled models suffer from insufficient labeled data and weak zero-shot +ability. Recently, the appearance of Large Language Models (LLMs) has attracted +considerable attention in research fields. They have shown powerful zero-shot and +in-context learning abilities on several NLP tasks, but their potential on TFV +is still unknown. In this work, we implement a preliminary study about whether +LLMs are table-based fact-checkers. In detail, we design diverse prompts to +explore how the in-context learning can help LLMs in TFV, i.e., zero-shot and +few-shot TFV capability. Besides, we carefully design and construct TFV +instructions to study the performance gain brought by the instruction tuning of +LLMs. Experimental results demonstrate that LLMs can achieve acceptable results +on zero-shot and few-shot TFV with prompt engineering, while instruction-tuning +can stimulate the TFV capability significantly. We also make some valuable +findings about the format of zero-shot prompts and the number of in-context +examples. Finally, we analyze some possible directions to promote the accuracy +of TFV via LLMs, which is beneficial to further research of table reasoning. + +
+
+ comment: CSCWD 2024 +
+
+
+
+
+ + ♻ ☆ ASTM :Autonomous Smart Traffic Management System Using Artificial + Intelligence CNN and LSTM + + +
+ In the modern world, the development of Artificial Intelligence (AI) has +contributed to improvements in various areas, including automation, computer +vision, fraud detection, and more. AI can be leveraged to enhance the +efficiency of Autonomous Smart Traffic Management (ASTM) systems and reduce +traffic congestion rates. This paper presents an Autonomous Smart Traffic +Management (STM) system that uses AI to improve traffic flow rates. The system +employs the YOLO V5 Convolutional Neural Network to detect vehicles in traffic +management images. Additionally, it predicts the number of vehicles for the +next 12 hours using a Recurrent Neural Network with Long Short-Term Memory +(RNN-LSTM). The Smart Traffic Management Cycle Length Analysis manages the +traffic cycle length based on these vehicle predictions, aided by AI. From the +results of the RNN-LSTM model for predicting vehicle numbers over the next 12 +hours, we observe that the model predicts traffic with a Mean Squared Error +(MSE) of 4.521 vehicles and a Root Mean Squared Error (RMSE) of 2.232 vehicles. +After simulating the STM system in the CARLA simulation environment, we found +that the Traffic Management Congestion Flow Rate with ASTM (21 vehicles per +minute) is 50\% higher than the rate without STM (around 15 vehicles per +minute). Additionally, the Traffic Management Vehicle Pass Delay with STM (5 +seconds per vehicle) is 70\% lower than without STM (around 12 seconds per +vehicle). These results demonstrate that the STM system using AI can increase +traffic flow by 50\% and reduce vehicle pass delays by 70\%. + +
+
+ comment: In process to IEEE Intelligent Vehicle Symposium 2025 +
+
+
+
+
+ + ♻ ☆ SynthesizRR: Generating Diverse Datasets with Retrieval Augmentation EMNLP 2024 + + +
+ It is often desirable to distill the capabilities of large language models +(LLMs) into smaller student models due to compute and memory constraints. One +way to do this for classification tasks is via dataset synthesis, which can be +accomplished by generating examples of each label from the LLM. Prior +approaches to synthesis use few-shot prompting, which relies on the LLM's +parametric knowledge to generate usable examples. However, this leads to issues +of repetition, bias towards popular entities, and stylistic differences from +human text. In this work, we propose Synthesize by Retrieval and Refinement +(SynthesizRR), which uses retrieval augmentation to introduce variety into the +dataset synthesis process: as retrieved passages vary, the LLM is seeded with +different content to generate its examples. We empirically study the synthesis +of six datasets, covering topic classification, sentiment analysis, tone +detection, and humor, requiring complex synthesis strategies. We find that +SynthesizRR greatly improves lexical and semantic diversity, similarity to +human-written text, and distillation performance, when compared to 32-shot +prompting and four prior approaches. We release our code to perform all steps +at https://github.com/amazon-science/synthesizrr + +
+
+ comment: Published as a main conference paper at EMNLP 2024. Code available at + https://github.com/amazon-science/synthesizrr +
+
+
+
+
+ + ♻ ☆ Harmonic Path Integral Diffusion + + +
+ In this manuscript, we present a novel approach for sampling from a +continuous multivariate probability distribution, which may either be +explicitly known (up to a normalization factor) or represented via empirical +samples. Our method constructs a time-dependent bridge from a delta function +centered at the origin of the state space at $t=0$, optimally transforming it +into the target distribution at $t=1$. We formulate this as a Stochastic +Optimal Control problem of the Path Integral Control type, with a cost function +comprising (in its basic form) a quadratic control term, a quadratic state +term, and a terminal constraint. This framework, which we refer to as Harmonic +Path Integral Diffusion (H-PID), leverages an analytical solution through a +mapping to an auxiliary quantum harmonic oscillator in imaginary time. + The H-PID framework results in a set of efficient sampling algorithms, +without the incorporation of Neural Networks. The algorithms are validated on +two standard use cases: a mixture of Gaussians over a grid and images from +CIFAR-10. The transparency of the method allows us to analyze the algorithms in +detail, particularly revealing that the current weighted state is an order +parameter for the dynamic phase transition, signaling earlier, at $t<1$, that +the sample generation process is almost complete. We contrast these algorithms +with other sampling methods, particularly simulated annealing and path integral +sampling, highlighting their advantages in terms of analytical control, +accuracy, and computational efficiency on benchmark problems. + Additionally, we extend the methodology to more general cases where the +underlying stochastic differential equation includes an external deterministic, +possibly non-conservative force, and where the cost function incorporates a +gauge potential term. + +
+
+
+
+
+ + ♻ ☆ Exact Fractional Inference via Re-Parametrization & Interpolation + between Tree-Re-Weighted- and Belief Propagation- Algorithms + + +
+ Computing the partition function, $Z$, of an Ising model over a graph of $N$ +\enquote{spins} is most likely exponential in $N$. Efficient variational +methods, such as Belief Propagation (BP) and Tree Re-Weighted (TRW) algorithms, +compute $Z$ approximately by minimizing the respective (BP- or TRW-) free +energy. We generalize the variational scheme by building a $\lambda$-fractional +interpolation, $Z^{(\lambda)}$, where $\lambda=0$ and $\lambda=1$ correspond to +TRW- and BP-approximations, respectively. This fractional scheme -- coined +Fractional Belief Propagation (FBP) -- guarantees that in the attractive +(ferromagnetic) case $Z^{(TRW)} \geq Z^{(\lambda)} \geq Z^{(BP)}$, and there +exists a unique (\enquote{exact}) $\lambda_*$ such that $Z=Z^{(\lambda_*)}$. +Generalizing the re-parametrization approach of +\citep{wainwright_tree-based_2002} and the loop series approach of +\citep{chertkov_loop_2006}, we show how to express $Z$ as a product, $\forall +\lambda:\ Z=Z^{(\lambda)}{\tilde Z}^{(\lambda)}$, where the multiplicative +correction, ${\tilde Z}^{(\lambda)}$, is an expectation over a node-independent +probability distribution built from node-wise fractional marginals. Our +theoretical analysis is complemented by extensive experiments with models from +Ising ensembles over planar and random graphs of medium and large sizes. Our +empirical study yields a number of interesting observations, such as the +ability to estimate ${\tilde Z}^{(\lambda)}$ with $O(N^{2::4})$ fractional +samples and suppression of variation in $\lambda_*$ estimates with an increase +in $N$ for instances from a particular random Ising ensemble, where $[2::4]$ +indicates a range from $2$ to $4$. We also discuss the applicability of this +approach to the problem of image de-noising. + +
+
+
+
+
+ + ♻ ☆ A General Recipe for the Analysis of Randomized Multi-Armed Bandit + Algorithms + + +
+ In this paper we propose a general methodology to derive regret bounds for +randomized multi-armed bandit algorithms. It consists in checking a set of +sufficient conditions on the sampling probability of each arm and on the family +of distributions to prove a logarithmic regret. As a direct application we +revisit two famous bandit algorithms, Minimum Empirical Divergence (MED) and +Thompson Sampling (TS), under various models for the distributions including +single parameter exponential families, Gaussian distributions, bounded +distributions, or distributions satisfying some conditions on their moments. In +particular, we prove that MED is asymptotically optimal for all these models, +but also provide a simple regret analysis of some TS algorithms for which the +optimality is already known. We then further illustrate the interest of our +approach, by analyzing a new Non-Parametric TS algorithm (h-NPTS), adapted to +some families of unbounded reward distributions with a bounded h-moment. This +model can for instance capture some non-parametric families of distributions +whose variance is upper bounded by a known constant. + +
+
+
+
+
+ + ♻ ☆ Active learning of digenic functions with boolean matrix logic + programming + + +
+ We apply logic-based machine learning techniques to facilitate cellular +engineering and drive biological discovery, based on comprehensive databases of +metabolic processes called genome-scale metabolic network models (GEMs). +Predicted host behaviours are not always correctly described by GEMs. Learning +the intricate genetic interactions within GEMs presents computational and +empirical challenges. To address these, we describe a novel approach called +Boolean Matrix Logic Programming (BMLP) by leveraging boolean matrices to +evaluate large logic programs. We introduce a new system, $BMLP_{active}$, +which efficiently explores the genomic hypothesis space by guiding informative +experimentation through active learning. In contrast to sub-symbolic methods, +$BMLP_{active}$ encodes a state-of-the-art GEM of a widely accepted bacterial +host in an interpretable and logical representation using datalog logic +programs. Notably, $BMLP_{active}$ can successfully learn the interaction +between a gene pair with fewer training examples than random experimentation, +overcoming the increase in experimental design space. $BMLP_{active}$ enables +rapid optimisation of metabolic models and offers a realistic approach to a +self-driving lab for microbial engineering. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2405.06724 +
+
+
+
+
+ + ♻ ☆ The Probabilistic Tsetlin Machine: A Novel Approach to Uncertainty + Quantification + + +
+ Tsetlin Machines (TMs) have emerged as a compelling alternative to +conventional deep learning methods, offering notable advantages such as smaller +memory footprint, faster inference, fault-tolerant properties, and +interpretability. Although various adaptations of TMs have expanded their +applicability across diverse domains, a fundamental gap remains in +understanding how TMs quantify uncertainty in their predictions. In response, +this paper introduces the Probabilistic Tsetlin Machine (PTM) framework, aimed +at providing a robust, reliable, and interpretable approach for uncertainty +quantification. Unlike the original TM, the PTM learns the probability of +staying on each state of each Tsetlin Automaton (TA) across all clauses. These +probabilities are updated using the feedback tables that are part of the TM +framework: Type I and Type II feedback. During inference, TAs decide their +actions by sampling states based on learned probability distributions, akin to +Bayesian neural networks when generating weight values. In our experimental +analysis, we first illustrate the spread of the probabilities across TA states +for the noisy-XOR dataset. Then we evaluate the PTM alongside benchmark models +using both simulated and real-world datasets. The experiments on the simulated +dataset reveal the PTM's effectiveness in uncertainty quantification, +particularly in delineating decision boundaries and identifying regions of high +uncertainty. Moreover, when applied to multiclass classification tasks using +the Iris dataset, the PTM demonstrates competitive performance in terms of +predictive entropy and expected calibration error, showcasing its potential as +a reliable tool for uncertainty estimation. Our findings underscore the +importance of selecting appropriate models for accurate uncertainty +quantification in predictive tasks, with the PTM offering a particularly +interpretable and effective solution. + +
+
+ comment: 12 pages, 5 figures, 6 tables, accepted and presented at ICAAI 2024, + London +
+
+
+
+
+ + ♻ ☆ Exact, Tractable Gauss-Newton Optimization in Deep Reversible + Architectures Reveal Poor Generalization NeurIPS 2024 + + +
+ Second-order optimization has been shown to accelerate the training of deep +neural networks in many applications, often yielding faster progress per +iteration on the training loss compared to first-order optimizers. However, the +generalization properties of second-order methods are still being debated. +Theoretical investigations have proved difficult to carry out outside the +tractable settings of heavily simplified model classes -- thus, the relevance +of existing theories to practical deep learning applications remains unclear. +Similarly, empirical studies in large-scale models and real datasets are +significantly confounded by the necessity to approximate second-order updates +in practice. It is often unclear whether the observed generalization behaviour +arises specifically from the second-order nature of the parameter updates, or +instead reflects the specific structured (e.g.\ Kronecker) approximations used +or any damping-based interpolation towards first-order updates. Here, we show +for the first time that exact Gauss-Newton (GN) updates take on a tractable +form in a class of deep reversible architectures that are sufficiently +expressive to be meaningfully applied to common benchmark datasets. We exploit +this novel setting to study the training and generalization properties of the +GN optimizer. We find that exact GN generalizes poorly. In the mini-batch +training setting, this manifests as rapidly saturating progress even on the +\emph{training} loss, with parameter updates found to overfit each +mini-batch without producing the features that would support generalization +to other mini-batches. We show that our experiments run in the ``lazy'' regime, +in which the neural tangent kernel (NTK) changes very little during the course +of training. This behaviour is associated with having no significant changes in +neural representations, explaining the lack of generalization. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Review of Electromagnetic Elimination Methods for low-field portable + MRI scanner + + +
+ This paper analyzes conventional and deep learning methods for eliminating +electromagnetic interference (EMI) in MRI systems. We compare traditional +analytical and adaptive techniques with advanced deep learning approaches. Key +strengths and limitations of each method are highlighted. Recent advancements +in active EMI elimination, such as external EMI receiver coils, are discussed +alongside deep learning methods, which show superior EMI suppression by +leveraging neural networks trained on MRI data. While deep learning improves +EMI elimination and diagnostic capabilities, it introduces security and safety +concerns, particularly in commercial applications. A balanced approach, +integrating conventional reliability with deep learning's advanced +capabilities, is proposed for more effective EMI suppression in MRI systems. + +
+
+ comment: Accepted by 2024 5th International Conference on Machine Learning and + Computer Application +
+
+
+
+
+ + ♻ ☆ TreeC: a method to generate interpretable energy management systems + using a metaheuristic algorithm + + +
+ Energy management systems (EMS) have traditionally been implemented using +rule-based control (RBC) and model predictive control (MPC) methods. However, +recent research has explored the use of reinforcement learning (RL) as a +promising alternative. This paper introduces TreeC, a machine learning method +that utilizes the covariance matrix adaptation evolution strategy metaheuristic +algorithm to generate an interpretable EMS modeled as a decision tree. Unlike +RBC and MPC approaches, TreeC learns the decision strategy of the EMS based on +historical data, adapting the control model to the controlled energy grid. The +decision strategy is represented as a decision tree, providing interpretability +compared to RL methods that often rely on black-box models like neural +networks. TreeC is evaluated against MPC with perfect forecast and RL EMSs in +two case studies taken from literature: an electric grid case and a household +heating case. In the electric grid case, TreeC achieves an average energy loss +and constraint violation score of 19.2, which is close to MPC and RL EMSs that +achieve scores of 14.4 and 16.2 respectively. All three methods control the +electric grid well especially when compared to the random EMS, which obtains an +average score of 12 875. In the household heating case, TreeC performs +similarly to MPC on the adjusted and averaged electricity cost and total +discomfort (0.033 EUR/m$^2$ and 0.42 Kh for TreeC compared to 0.037 EUR/m$^2$ +and 2.91 Kh for MPC), while outperforming RL (0.266 EUR/m$^2$ and 24.41 Kh). + +
+
+ comment: Accepted version Knowledge based system +
+
+
+
+
+ + ♻ ☆ The effect of dataset size and the process of big data mining for + investigating solar-thermal desalination by using machine learning + + +
+ Machine learning's application in solar-thermal desalination is limited by +data shortage and inconsistent analysis. This study develops an optimized +dataset collection and analysis process for the representative solar still. By +ultra-hydrophilic treatment on the condensation cover, the dataset collection +process reduces the collection time by 83.3%. Over 1,000 datasets are +collected, which is nearly one order of magnitude larger than up-to-date works. +Then, a new interdisciplinary process flow is proposed. Some meaningful results +are obtained that were not addressed by previous studies. It is found that +Random Forest might be a better choice for datasets larger than 1,000 due to +both high accuracy and fast speed. Besides, the dataset range affects the +quantified importance (weighted value) of factors significantly, with up to a +115% increment. Moreover, the results show that machine learning has a high +accuracy on the extrapolation prediction of productivity, where the minimum +mean relative prediction error is just around 4%. The results of this work not +only show the necessity of the dataset characteristics' effect but also provide +a standard process for studying solar-thermal desalination by machine learning, +which would pave the way for interdisciplinary study. + +
+
+
+
+
+ + ♻ ☆ Advantages of Neural Population Coding for Deep Learning + + +
+ Scalar variables, e.g., the orientation of a shape in an image, are commonly +predicted using a single output neuron in a neural network. In contrast, the +mammalian cortex represents variables with a population of neurons. In this +population code, each neuron is most active at its preferred value and shows +partial activity for other values. Here, we investigate the benefit of using a +population code for the output layer of a neural network. We compare population +codes against single-neuron outputs and one-hot vectors. First, we show +theoretically and in experiments with synthetic data that population codes +improve robustness to input noise in networks of stacked linear layers. Second, +we demonstrate the benefit of using population codes to encode ambiguous +outputs, such as the pose of symmetric objects. Using the T-LESS dataset of +feature-less real-world objects, we show that population codes improve the +accuracy of predicting 3D object orientation from image input. + +
+
+
+
+
+ + ♻ ☆ AudioMarkBench: Benchmarking Robustness of Audio Watermarking NeurIPS + + +
+ The increasing realism of synthetic speech, driven by advancements in +text-to-speech models, raises ethical concerns regarding impersonation and +disinformation. Audio watermarking offers a promising solution via embedding +human-imperceptible watermarks into AI-generated audios. However, the +robustness of audio watermarking against common/adversarial perturbations +remains understudied. We present AudioMarkBench, the first systematic benchmark +for evaluating the robustness of audio watermarking against watermark removal +and watermark forgery. AudioMarkBench includes a new dataset created from +Common-Voice across languages, biological sexes, and ages, 3 state-of-the-art +watermarking methods, and 15 types of perturbations. We benchmark the +robustness of these methods against the perturbations in no-box, black-box, and +white-box settings. Our findings highlight the vulnerabilities of current +watermarking techniques and emphasize the need for more robust and fair audio +watermarking solutions. Our dataset and code are publicly available at +https://github.com/moyangkuo/AudioMarkBench. + +
+
+ comment: To appear in NeurIPS Datasets and Benchmarks, 2024 +
+
+
+
+
+ + ♻ ☆ pLDDT-Predictor: High-speed Protein Screening Using Transformer and ESM2 + + +
+ Recent advancements in protein structure prediction, particularly AlphaFold2, +have revolutionized structural biology by achieving near-experimental accuracy +($\text{average RMSD} < 1.5\text{\AA}$). However, the computational demands of +these models (approximately 30 minutes per protein on an RTX 4090) +significantly limit their application in high-throughput protein screening. +While large language models like ESM (Evolutionary Scale Modeling) have shown +promise in extracting structural information directly from protein sequences, +rapid assessment of protein structure quality for large-scale analyses remains +a major challenge. + We introduce pLDDT-Predictor, a high-speed protein screening tool that +achieves a $250,000\times$ speedup compared to AlphaFold2 by leveraging +pre-trained ESM2 protein embeddings and a Transformer architecture. Our model +predicts AlphaFold2's pLDDT (predicted Local Distance Difference Test) scores +with a Pearson correlation of 0.7891 and processes proteins in just 0.007 +seconds on average. Using a comprehensive dataset of 1.5 million diverse +protein sequences (ranging from 50 to 2048 amino acids), we demonstrate that +pLDDT-Predictor accurately classifies high-confidence structures (pLDDT $>$ 70) +with 91.2\% accuracy and achieves an MSE of 84.8142 compared to AlphaFold2's +predictions. + The source code and pre-trained models are freely available at +\url{https://github.com/jw-chae/pLDDT_Predictor}, enabling the research +community to perform rapid, large-scale protein structure quality assessments. + +
+
+ comment: 6 pages main topic, 8 pages including citation, 4 figures +
+
+
+
+
+ + ♻ ☆ DM4Steal: Diffusion Model For Link Stealing Attack On Graph Neural + Networks + + +
+ Graph has become increasingly integral to the advancement of recommendation +systems, particularly with the fast development of graph neural network(GNN). +By exploring the virtue of rich node features and link information, GNN is +designed to provide personalized and accurate suggestions. Meanwhile, the +privacy leakage of GNN in such contexts has also captured special attention. +Prior work has revealed that a malicious user can utilize auxiliary knowledge +to extract sensitive link data of the target graph, integral to recommendation +systems, via the decision made by the target GNN model. This poses a +significant risk to the integrity and confidentiality of data used in +recommendation system. Though important, previous works on GNN's privacy +leakage are still challenged in three aspects, i.e., limited stealing attack +scenarios, sub-optimal attack performance, and adaptation against defense. To +address these issues, we propose a diffusion model based link stealing attack, +named DM4Steal. It differs previous work from three critical aspects. (i) +Generality: aiming at six attack scenarios with limited auxiliary knowledge, we +propose a novel training strategy for diffusion models so that DM4Steal is +transferable to diverse attack scenarios. (ii) Effectiveness: benefiting from +the retention of semantic structure in the diffusion model during the training +process, DM4Steal is capable to learn the precise topology of the target graph +through the GNN decision process. (iii) Adaptation: when GNN is defensive +(e.g., DP, Dropout), DM4Steal relies on the stability that comes from sampling +the score model multiple times to keep performance degradation to a minimum, +thus DM4Steal implements successful adaptive attack on defensive GNN. + +
+
+ comment: We found that there were critical problems in our paper, and we + needed to redo the experiment, which was incomplete +
+
+
+
+
+ + ♻ ☆ LAuReL: Learned Augmented Residual Layer ICML + + +
+ One of the core pillars of efficient deep learning methods is architectural +improvements such as the residual/skip connection, which has led to +significantly better model convergence and quality. Since then the residual +connection has become ubiquitous in not just convolutional neural networks but +also transformer-based architectures, the backbone of LLMs. + In this paper we introduce \emph{Learned Augmented Residual Layer} (LAuReL) +-- a novel generalization of the canonical residual connection -- with the goal +to be an in-situ replacement of the latter while outperforming on both model +quality and footprint metrics. Our experiments show that using LAuReL can help +boost performance for both vision and language models. For example, on the +ResNet-50, ImageNet 1K task, it achieves $60\%$ of the gains from adding an +extra layer, while only adding $0.003\%$ more parameters, and matches it while +adding $2.6\times$ fewer parameters. + +
+
+ comment: Accepted at the 2nd Efficient Systems for Foundation Models Workshop + at the International Conference on Machine Learning (ICML) 2024 +
+
+
+
+
+ + ♻ ☆ Data movement limits to frontier model training + + +
+ We present a theoretical model of distributed training, and use it to analyze +how far dense and sparse training runs can be scaled. Under our baseline +assumptions, given a three month training duration, data movement bottlenecks +begin to significantly lower hardware utilization for training runs exceeding +about $10^{28}$ FLOP, two orders of magnitude above the largest training run to +date, suggesting the arrival of fundamental barriers to scaling in three years +given recent rates of growth. A training run exceeding about $10^{31}$ FLOP is +infeasible even at low utilization. However, more aggressive batch size scaling +and/or shorter and fatter model shapes, if achievable, have the potential to +permit much larger training runs. + +
+
+
+
+
+ + ♻ ☆ DAGER: Exact Gradient Inversion for Large Language Models + + +
+ Federated learning works by aggregating locally computed gradients from +multiple clients, thus enabling collaborative training without sharing private +client data. However, prior work has shown that the data can actually be +recovered by the server using so-called gradient inversion attacks. While these +attacks perform well when applied on images, they are limited in the text +domain and only permit approximate reconstruction of small batches and short +input sequences. In this work, we propose DAGER, the first algorithm to recover +whole batches of input text exactly. DAGER leverages the low-rank structure of +self-attention layer gradients and the discrete nature of token embeddings to +efficiently check if a given token sequence is part of the client data. We use +this check to exactly recover full batches in the honest-but-curious setting +without any prior on the data for both encoder- and decoder-based architectures +using exhaustive heuristic search and a greedy approach, respectively. We +provide an efficient GPU implementation of DAGER and show experimentally that +it recovers full batches of size up to 128 on large language models (LLMs), +beating prior attacks in speed (20x at same batch size), scalability (10x +larger batches), and reconstruction quality (ROUGE-1/2 > 0.99). + +
+
+
+
+
+ + ♻ ☆ Neural Network Verification with Branch-and-Bound for General + Nonlinearities + + +
+ Branch-and-bound (BaB) is among the most effective techniques for neural +network (NN) verification. However, existing works on BaB for NN verification +have mostly focused on NNs with piecewise linear activations, especially ReLU +networks. In this paper, we develop a general framework, named GenBaB, to +conduct BaB on general nonlinearities to verify NNs with general architectures, +based on linear bound propagation for NN verification. To decide which neuron +to branch, we design a new branching heuristic which leverages linear bounds as +shortcuts to efficiently estimate the potential improvement after branching. To +decide nontrivial branching points for general nonlinear functions, we propose +to pre-optimize branching points, which can be efficiently leveraged during +verification with a lookup table. We demonstrate the effectiveness of our +GenBaB on verifying a wide range of NNs, including NNs with activation +functions such as Sigmoid, Tanh, Sine and GeLU, as well as NNs involving +multi-dimensional nonlinear operations such as multiplications in LSTMs and +Vision Transformers. Our framework also allows the verification of general +nonlinear computation graphs and enables verification applications beyond +simple NNs, particularly for AC Optimal Power Flow (ACOPF). GenBaB is part of +the latest $\alpha,\!\beta$-CROWN, the winner of the 4th and the 5th +International Verification of Neural Networks Competition (VNN-COMP 2023 and +2024). + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ ADI: Adversarial Dominating Inputs in Vertical Federated Learning + Systems + + +
+ Vertical federated learning (VFL) system has recently become prominent as a +concept to process data distributed across many individual sources without the +need to centralize it. Multiple participants collaboratively train models based +on their local data in a privacy-aware manner. To date, VFL has become a de +facto solution to securely learn a model among organizations, allowing +knowledge to be shared without compromising privacy of any individuals. Despite +the prosperous development of VFL systems, we find that certain inputs of a +participant, named adversarial dominating inputs (ADIs), can dominate the joint +inference towards the direction of the adversary's will and force other +(victim) participants to make negligible contributions, losing rewards that are +usually offered regarding the importance of their contributions in federated +learning scenarios. We conduct a systematic study on ADIs by first proving +their existence in typical VFL systems. We then propose gradient-based methods +to synthesize ADIs of various formats and exploit common VFL systems. We +further launch greybox fuzz testing, guided by the saliency score of ``victim'' +participants, to perturb adversary-controlled inputs and systematically explore +the VFL attack surface in a privacy-preserving manner. We conduct an in-depth +study on the influence of critical parameters and settings in synthesizing +ADIs. Our study reveals new VFL attack opportunities, promoting the +identification of unknown threats before breaches and building more secure VFL +systems. + +
+
+
+
+
+ + ♻ ☆ Mitigating Gradient Overlap in Deep Residual Networks with Gradient + Normalization for Improved Non-Convex Optimization + + +
+ In deep learning, Residual Networks (ResNets) have proven effective in +addressing the vanishing gradient problem, allowing for the successful training +of very deep networks. However, skip connections in ResNets can lead to +gradient overlap, where gradients from both the learned transformation and the +skip connection combine, potentially resulting in overestimated gradients. This +overestimation can cause inefficiencies in optimization, as some updates may +overshoot optimal regions, affecting weight updates. To address this, we +examine Z-score Normalization (ZNorm) as a technique to manage gradient +overlap. ZNorm adjusts the gradient scale, standardizing gradients across +layers and reducing the negative impact of overlapping gradients. Our +experiments demonstrate that ZNorm improves training process, especially in +non-convex optimization scenarios common in deep learning, where finding +optimal solutions is challenging. These findings suggest that ZNorm can affect +the gradient flow, enhancing performance in large-scale data processing where +accuracy is critical. + +
+
+
+
+
+ + ♻ ☆ Doubly Mild Generalization for Offline Reinforcement Learning NeurIPS 2024 + + +
+ Offline Reinforcement Learning (RL) suffers from the extrapolation error and +value overestimation. From a generalization perspective, this issue can be +attributed to the over-generalization of value functions or policies towards +out-of-distribution (OOD) actions. Significant efforts have been devoted to +mitigating such generalization, and recent in-sample learning approaches have +further succeeded in entirely eschewing it. Nevertheless, we show that mild +generalization beyond the dataset can be trusted and leveraged to improve +performance under certain conditions. To appropriately exploit generalization +in offline RL, we propose Doubly Mild Generalization (DMG), comprising (i) mild +action generalization and (ii) mild generalization propagation. The former +refers to selecting actions in a close neighborhood of the dataset to maximize +the Q values. Even so, the potential erroneous generalization can still be +propagated, accumulated, and exacerbated by bootstrapping. In light of this, +the latter concept is introduced to mitigate the generalization propagation +without impeding the propagation of RL learning signals. Theoretically, DMG +guarantees better performance than the in-sample optimal policy in the oracle +generalization scenario. Even under worst-case generalization, DMG can still +control value overestimation at a certain level and lower bound the +performance. Empirically, DMG achieves state-of-the-art performance across +Gym-MuJoCo locomotion tasks and challenging AntMaze tasks. Moreover, benefiting +from its flexibility in both generalization aspects, DMG enjoys a seamless +transition from offline to online learning and attains strong online +fine-tuning performance. + +
+
+ comment: Accepted to NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking the Power of Timestamps for Robust Time Series Forecasting: A + Global-Local Fusion Perspective NeurIPS 2024 + + +
+ Time series forecasting has played a pivotal role across various industries, +including finance, transportation, energy, healthcare, and climate. Due to the +abundant seasonal information they contain, timestamps possess the potential to +offer robust global guidance for forecasting techniques. However, existing +works primarily focus on local observations, with timestamps being treated +merely as an optional supplement that remains underutilized. When data gathered +from the real world is polluted, the absence of global information will damage +the robust prediction capability of these algorithms. To address these +problems, we propose a novel framework named GLAFF. Within this framework, the +timestamps are modeled individually to capture the global dependencies. Working +as a plugin, GLAFF adaptively adjusts the combined weights for global and local +information, enabling seamless collaboration with any time series forecasting +backbone. Extensive experiments conducted on nine real-world datasets +demonstrate that GLAFF significantly enhances the average performance of widely +used mainstream forecasting models by 12.5%, surpassing the previous +state-of-the-art method by 5.5%. + +
+
+ comment: Accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Synergy-Guided Regional Supervision of Pseudo Labels for Semi-Supervised + Medical Image Segmentation + + +
+ Semi-supervised learning has received considerable attention for its +potential to leverage abundant unlabeled data to enhance model robustness. +Pseudo labeling is a widely used strategy in semi supervised learning. However, +existing methods often suffer from noise contamination, which can undermine +model performance. To tackle this challenge, we introduce a novel +Synergy-Guided Regional Supervision of Pseudo Labels (SGRS-Net) framework. +Built upon the mean teacher network, we employ a Mix Augmentation module to +enhance the unlabeled data. By evaluating the synergy before and after +augmentation, we strategically partition the pseudo labels into distinct +regions. Additionally, we introduce a Region Loss Evaluation module to assess +the loss across each delineated area. Extensive experiments conducted on the LA +dataset have demonstrated superior performance over state-of-the-art +techniques, underscoring the efficiency and practicality of our framework. + +
+
+
+
+
+ + ♻ ☆ Advanced User Credit Risk Prediction Model using LightGBM, XGBoost and + Tabnet with SMOTEENN + + +
+ Bank credit risk is a significant challenge in modern financial transactions, +and the ability to identify qualified credit card holders among a large number +of applicants is crucial for the profitability of a bank's credit card +business. In the past, screening applicants' conditions often +required a significant amount of manual labor, which was time-consuming and +labor-intensive. Although the accuracy and reliability of previously used ML +models have been continuously improving, the pursuit of more reliable and +powerful AI intelligent models is undoubtedly the unremitting pursuit by major +banks in the financial industry. In this study, we used a dataset of over +40,000 records provided by a commercial bank as the research object. We +compared various dimensionality reduction techniques such as PCA and T-SNE for +preprocessing high-dimensional datasets and performed in-depth adaptation and +tuning of distributed models such as LightGBM and XGBoost, as well as deep +models like Tabnet. After a series of research and processing, we obtained +excellent research results by combining SMOTEENN with these techniques. The +experiments demonstrated that LightGBM combined with PCA and SMOTEENN +techniques can assist banks in accurately predicting potential high-quality +customers, showing relatively outstanding performance compared to other models. + +
+
+ comment: 8 pages on IEEE ICPICS +
+
+
+
+
+ + ♻ ☆ General Geospatial Inference with a Population Dynamics Foundation Model + + +
+ Supporting the health and well-being of dynamic populations around the world +requires governmental agencies, organizations and researchers to understand and +reason over complex relationships between human behavior and local contexts in +order to identify high-risk groups and strategically allocate limited +resources. Traditional approaches to these classes of problems often entail +developing manually curated, task-specific features and models to represent +human behavior and the natural and built environment, which can be challenging +to adapt to new, or even, related tasks. To address this, we introduce a +Population Dynamics Foundation Model (PDFM) that aims to capture the +relationships between diverse data modalities and is applicable to a broad +range of geospatial tasks. We first construct a geo-indexed dataset for postal +codes and counties across the United States, capturing rich aggregated +information on human behavior from maps, busyness, and aggregated search +trends, and environmental factors such as weather and air quality. We then +model this data and the complex relationships between locations using a graph +neural network, producing embeddings that can be adapted to a wide range of +downstream tasks using relatively simple models. We evaluate the effectiveness +of our approach by benchmarking it on 27 downstream tasks spanning three +distinct domains: health indicators, socioeconomic factors, and environmental +measurements. The approach achieves state-of-the-art performance on all 27 +geospatial interpolation tasks, and on 25 out of the 27 extrapolation and +super-resolution tasks. We combined the PDFM with a state-of-the-art +forecasting foundation model, TimesFM, to predict unemployment and poverty, +achieving performance that surpasses fully supervised forecasting. The full set +of embeddings and sample code are publicly available for researchers. + +
+
+ comment: 28 pages, 16 figures, preprint; v2: updated github url +
+
+
+
+
+ + ♻ ☆ A Unified Framework for Analyzing Meta-algorithms in Online Convex + Optimization + + +
+ In this paper, we analyze the problem of online convex optimization in +different settings, including different feedback types +(full-information/semi-bandit/bandit/etc) in either stochastic or +non-stochastic setting and different notions of regret (static adversarial +regret/dynamic regret/adaptive regret). This is done through a framework which +allows us to systematically propose and analyze meta-algorithms for the various +settings described above. We show that any algorithm for online linear +optimization with fully adaptive adversaries is an algorithm for online convex +optimization. We also show that any such algorithm that requires +full-information feedback may be transformed to an algorithm with semi-bandit +feedback with comparable regret bound. We further show that algorithms that are +designed for fully adaptive adversaries using deterministic semi-bandit +feedback can obtain similar bounds using only stochastic semi-bandit feedback +when facing oblivious adversaries. We use this to describe general +meta-algorithms to convert first order algorithms to zeroth order algorithms +with comparable regret bounds. Our framework allows us to analyze online +optimization in various settings, recovers several results in the literature +with a simplified proof technique, and provides new results. + +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Verifiable Neural Network Inference Service ACSA + + +
+ Machine learning has revolutionized data analysis and pattern recognition, +but its resource-intensive training has limited accessibility. Machine Learning +as a Service (MLaaS) simplifies this by enabling users to delegate their data +samples to an MLaaS provider and obtain the inference result using a +pre-trained model. Despite its convenience, leveraging MLaaS poses significant +privacy and reliability concerns to the client. Specifically, sensitive +information from the client inquiry data can be leaked to an adversarial MLaaS +provider. Meanwhile, the lack of a verifiability guarantee can potentially +result in biased inference results or even unfair payment issues. While +existing trustworthy machine learning techniques, such as those relying on +verifiable computation or secure computation, offer solutions to privacy and +reliability concerns, they fall short of simultaneously protecting the privacy +of client data and providing provable inference verifiability. + In this paper, we propose vPIN, a privacy-preserving and verifiable CNN +inference scheme that preserves privacy for client data samples while ensuring +verifiability for the inference. vPIN makes use of partial homomorphic +encryption and commit-and-prove succinct non-interactive argument of knowledge +techniques to achieve desirable security properties. In vPIN, we develop +various optimization techniques to minimize the proving circuit for homomorphic +inference evaluation thereby, improving the efficiency and performance of our +technique. We fully implemented and evaluated our vPIN scheme on standard +datasets (e.g., MNIST, CIFAR-10). Our experimental results show that vPIN +achieves high efficiency in terms of proving time, verification time, and proof +size, while providing client data privacy guarantees and provable +verifiability. + +
+
+ comment: Accepted at the Annual Computer Security Applications Conference + (ACSAC) 2024. Source code: github.com/vt-asaplab/vPIN +
+
+
+
+
+ + ♻ ☆ Learning Memory Mechanisms for Decision Making through Demonstrations + + +
+ In Partially Observable Markov Decision Processes, integrating an agent's +history into memory poses a significant challenge for decision-making. +Traditional imitation learning, relying on observation-action pairs for expert +demonstrations, fails to capture the expert's memory mechanisms used in +decision-making. To capture memory processes as demonstrations, we introduce +the concept of memory dependency pairs $(p, q)$ indicating that events at time +$p$ are recalled for decision-making at time $q$. We introduce AttentionTuner +to leverage memory dependency pairs in Transformers and find significant +improvements across several tasks compared to standard Transformers when +evaluated on Memory Gym and the Long-term Memory Benchmark. Code is available +at https://github.com/WilliamYue37/AttentionTuner. + +
+
+
+
+
+ + ♻ ☆ MSEG-VCUQ: Multimodal SEGmentation with Enhanced Vision Foundation + Models, Convolutional Neural Networks, and Uncertainty Quantification for + High-Speed Video Phase Detection Data + + +
+ Purpose: High-speed video (HSV) phase detection (PD) segmentation is vital in +nuclear reactors, chemical processing, and electronics cooling for detecting +vapor, liquid, and microlayer phases. Traditional segmentation models face +pixel-level accuracy and generalization issues in multimodal data. MSEG-VCUQ +introduces VideoSAM, a hybrid framework leveraging convolutional neural +networks (CNNs) and transformer-based vision models to enhance segmentation +accuracy and generalizability across complex multimodal PD tasks. Methods: +VideoSAM combines U-Net CNN and the Segment Anything Model (SAM) for advanced +feature extraction and segmentation across diverse HSV PD modalities, spanning +fluids like water, FC-72, nitrogen, and argon under varied heat flux +conditions. The framework also incorporates uncertainty quantification (UQ) to +assess pixel-based discretization errors, delivering reliable metrics such as +contact line density and dry area fraction under experimental conditions. +Results: VideoSAM outperforms SAM and modality-specific CNN models in +segmentation accuracy, excelling in environments with complex phase boundaries, +overlapping bubbles, and dynamic liquid-vapor interactions. Its hybrid +architecture supports cross-dataset generalization, adapting effectively to +varying modalities. The UQ module provides accurate error estimates, enhancing +the reliability of segmentation outputs for advanced HSV PD research. +Conclusion: MSEG-VCUQ, via VideoSAM, offers a robust solution for HSV PD +segmentation, addressing previous limitations with advanced deep learning and +UQ techniques. The open-source datasets and tools introduced enable scalable, +precise, and adaptable segmentation for multimodal PD datasets, supporting +advancements in HSV analysis and autonomous experimentation. The codes and data +used for this paper are publicly available at: +\url{https://github.com/chikap421/mseg_vcuq} + +
+
+ comment: Under Review in EAAI +
+
+
+
+
+ + ♻ ☆ SPDIM: Source-Free Unsupervised Conditional and Label Shift Adaptation + in EEG + + +
+ The non-stationary nature of electroencephalography (EEG) introduces +distribution shifts across domains (e.g., days and subjects), posing a +significant challenge to EEG-based neurotechnology generalization. Without +labeled calibration data for target domains, the problem is a source-free +unsupervised domain adaptation (SFUDA) problem. For scenarios with constant +label distribution, Riemannian geometry-aware statistical alignment frameworks +on the symmetric positive definite (SPD) manifold are considered +state-of-the-art. However, many practical scenarios, including EEG-based sleep +staging, exhibit label shifts. Here, we propose a geometric deep learning +framework for SFUDA problems under specific distribution shifts, including +label shifts. We introduce a novel, realistic generative model and show that +prior Riemannian statistical alignment methods on the SPD manifold can +compensate for specific marginal and conditional distribution shifts but hurt +generalization under label shifts. As a remedy, we propose a +parameter-efficient manifold optimization strategy termed SPDIM. SPDIM uses the +information maximization principle to learn a single SPD-manifold-constrained +parameter per target domain. In simulations, we demonstrate that SPDIM can +compensate for the shifts under our generative model. Moreover, using public +EEG-based brain-computer interface and sleep staging datasets, we show that +SPDIM outperforms prior approaches. + +
+
+
+
+
+ + ♻ ☆ Sample Complexity of Opinion Formation on Networks with Linear + Regression Models + + +
+ Consider public health officials aiming to spread awareness about a new +vaccine in a community interconnected by a social network. How can they +distribute information with minimal resources, so as to avoid polarization and +ensure community-wide convergence of opinion? To tackle such challenges, we +initiate the study of sample complexity of opinion convergence in networks. Our +framework is built on the recognized opinion formation game, where we regard +the opinion of each agent as a data-derived model, unlike previous works that +treat opinions as data-independent scalars. The opinion model for every agent +is initially learned from its local samples and evolves game-theoretically as +all agents communicate with neighbors and revise their models towards an +equilibrium. Our focus is on the sample complexity needed to ensure that the +opinions converge to an equilibrium such that the final model of every agent +has low generalization error. + Our paper has two main technical results. First, we present a novel +polynomial time optimization framework to quantify the total sample complexity +for arbitrary networks, when the underlying learning problem is (generalized) +linear regression. Second, we leverage this optimization to study the network +gain which measures the improvement of sample complexity when learning over a +network compared to that in isolation. Towards this end, we derive network gain +bounds for various network classes including cliques, star graphs, and random +regular graphs. Additionally, our framework provides a method to study sample +distribution within the network, suggesting that it is sufficient to allocate +samples inversely to the degree. Empirical results on both synthetic and +real-world networks strongly support our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ BIOSCAN-5M: A Multimodal Dataset for Insect Biodiversity + + +
+ As part of an ongoing worldwide effort to comprehend and monitor insect +biodiversity, this paper presents the BIOSCAN-5M Insect dataset to the machine +learning community and establishes several benchmark tasks. BIOSCAN-5M is a +comprehensive dataset containing multi-modal information for over 5 million +insect specimens, and it significantly expands existing image-based biological +datasets by including taxonomic labels, raw nucleotide barcode sequences, +assigned barcode index numbers, geographical, and size information. We propose +three benchmark experiments to demonstrate the impact of the multi-modal data +types on the classification and clustering accuracy. First, we pretrain a +masked language model on the DNA barcode sequences of the BIOSCAN-5M dataset, +and demonstrate the impact of using this large reference library on species- +and genus-level classification performance. Second, we propose a zero-shot +transfer learning task applied to images and DNA barcodes to cluster feature +embeddings obtained from self-supervised learning, to investigate whether +meaningful clusters can be derived from these representation embeddings. Third, +we benchmark multi-modality by performing contrastive learning on DNA barcodes, +image data, and taxonomic information. This yields a general shared embedding +space enabling taxonomic classification using multiple types of information and +modalities. The code repository of the BIOSCAN-5M Insect dataset is available +at https://github.com/bioscan-ml/BIOSCAN-5M. + +
+
+
+
+
+ + ♻ ☆ Interpretability Needs a New Paradigm + + +
+ Interpretability is the study of explaining models in understandable terms to +humans. At present, interpretability is divided into two paradigms: the +intrinsic paradigm, which believes that only models designed to be explained +can be explained, and the post-hoc paradigm, which believes that black-box +models can be explained. At the core of this debate is how each paradigm +ensures its explanations are faithful, i.e., true to the model's behavior. This +is important, as false but convincing explanations lead to unsupported +confidence in artificial intelligence (AI), which can be dangerous. This +paper's position is that we should think about new paradigms while staying +vigilant regarding faithfulness. First, by examining the history of paradigms +in science, we see that paradigms are constantly evolving. Then, by examining +the current paradigms, we can understand their underlying beliefs, the value +they bring, and their limitations. Finally, this paper presents 3 emerging +paradigms for interpretability. The first paradigm designs models such that +faithfulness can be easily measured. Another optimizes models such that +explanations become faithful. The last paradigm proposes to develop models that +produce both a prediction and an explanation. + +
+
+
+
+
+ + ♻ ☆ Probabilistic Emulation of a Global Climate Model with Spherical + DYffusion NeurIPS 2024 + + +
+ Data-driven deep learning models are transforming global weather forecasting. +It is an open question if this success can extend to climate modeling, where +the complexity of the data and long inference rollouts pose significant +challenges. Here, we present the first conditional generative model that +produces accurate and physically consistent global climate ensemble simulations +by emulating a coarse version of the United States' primary operational global +forecast model, FV3GFS. Our model integrates the dynamics-informed diffusion +framework (DYffusion) with the Spherical Fourier Neural Operator (SFNO) +architecture, enabling stable 100-year simulations at 6-hourly timesteps while +maintaining low computational overhead compared to single-step deterministic +baselines. The model achieves near gold-standard performance for climate model +emulation, outperforming existing approaches and demonstrating promising +ensemble skill. This work represents a significant advance towards efficient, +data-driven climate simulations that can enhance our understanding of the +climate system and inform adaptation strategies. + +
+
+ comment: NeurIPS 2024; Code is available at + https://github.com/Rose-STL-Lab/spherical-dyffusion +
+
+
+
+
+ + ♻ ☆ Taming Latent Diffusion Model for Neural Radiance Field Inpainting ECCV 2024 + + +
+ Neural Radiance Field (NeRF) is a representation for 3D reconstruction from +multi-view images. Despite some recent work showing preliminary success in +editing a reconstructed NeRF with diffusion prior, they remain struggling to +synthesize reasonable geometry in completely uncovered regions. One major +reason is the high diversity of synthetic contents from the diffusion model, +which hinders the radiance field from converging to a crisp and deterministic +geometry. Moreover, applying latent diffusion models on real data often yields +a textural shift incoherent to the image condition due to auto-encoding errors. +These two problems are further reinforced with the use of pixel-distance +losses. To address these issues, we propose tempering the diffusion model's +stochasticity with per-scene customization and mitigating the textural shift +with masked adversarial training. During the analyses, we also found the +commonly used pixel and perceptual losses are harmful in the NeRF inpainting +task. Through rigorous experiments, our framework yields state-of-the-art NeRF +inpainting results on various real-world scenes. Project page: +https://hubert0527.github.io/MALD-NeRF + +
+
+ comment: Accepted to ECCV 2024. Project page: + https://hubert0527.github.io/MALD-NeRF +
+
+
+
+
+ + ♻ ☆ Super Consistency of Neural Network Landscapes and Learning Rate + Transfer + + +
+ Recently, there has been growing evidence that if the width and depth of a +neural network are scaled toward the so-called rich feature learning limit +($\mu$P and its depth extension), then some hyperparameters -- such as the +learning rate -- exhibit transfer from small to very large models. From an +optimization perspective, this phenomenon is puzzling, as it implies that the +loss landscape is consistently similar across very different model sizes. In +this work, we study the landscape through the lens of the loss Hessian, with a +focus on its largest eigenvalue (i.e. the sharpness), and find that certain +spectral properties under $\mu$P are largely independent of the size of the +network, and remain consistent as training progresses. We name this property +Super Consistency of the landscape. On the other hand, we show that in the +Neural Tangent Kernel (NTK) and other scaling regimes, the sharpness exhibits +very different dynamics at different scales. But what causes these differences +in the sharpness dynamics? Through a connection between the Hessian's and the +NTK's spectrum, we argue that the cause lies in the presence (for $\mu$P) or +progressive absence (for the NTK scaling) of feature learning. We corroborate +our claims with a substantial suite of experiments, covering a wide range of +datasets and architectures: from ResNets and Vision Transformers trained on +benchmark vision datasets to Transformers-based language models trained on +WikiText. + +
+
+ comment: The paper has been accepted at NeurIPS 2024. This is a revised + version of the paper previously titled "Why do Learning Rates Transfer? + Reconciling Optimization and Scaling Limits for Deep Learning" +
+
+
+
+
+ + ♻ ☆ "No Matter What You Do": Purifying GNN Models via Backdoor Unlearning + + +
+ Recent studies have exposed that GNNs are vulnerable to several adversarial +attacks, among which backdoor attack is one of the toughest. Similar to Deep +Neural Networks (DNNs), backdoor attacks in GNNs lie in the fact that the +attacker modifies a portion of graph data by embedding triggers and enforces +the model to learn the trigger feature during the model training process. +Despite the massive prior backdoor defense works on DNNs, defending against +backdoor attacks in GNNs is largely unexplored, severely hindering the +widespread application of GNNs in real-world tasks. To bridge this gap, we +present GCleaner, the first backdoor mitigation method on GNNs. GCleaner can +mitigate the presence of the backdoor logic within backdoored GNNs by reversing +the backdoor learning procedure, aiming to restore the model performance to a +level similar to that is directly trained on the original clean dataset. To +achieve this objective, we ask: How to recover universal and hard backdoor +triggers in GNNs? How to unlearn the backdoor trigger feature while maintaining +the model performance? We conduct the graph trigger recovery via the +explanation method to identify optimal trigger locations, facilitating the +search of universal and hard backdoor triggers in the feature space of the +backdoored model through maximal similarity. Subsequently, we introduce the +backdoor unlearning mechanism, which combines knowledge distillation and +gradient-based explainable knowledge for fine-grained backdoor erasure. +Extensive experimental evaluations on four benchmark datasets demonstrate that +GCleaner can reduce the backdoor attack success rate to 10% with only 1% of +clean data, and has almost negligible degradation in model performance, which +far outperforms the state-of-the-art (SOTA) defense methods. + +
+
+ comment: 18 pages, 12 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Data-Prep-Kit: getting your data ready for LLM application development + + +
+ Data preparation is the first and a very important step towards any Large +Language Model (LLM) development. This paper introduces an easy-to-use, +extensible, and scale-flexible open-source data preparation toolkit called Data +Prep Kit (DPK). DPK is architected and designed to enable users to scale their +data preparation to their needs. With DPK they can prepare data on a local +machine or effortlessly scale to run on a cluster with thousands of CPU Cores. +DPK comes with a highly scalable, yet extensible set of modules that transform +natural language and code data. If the user needs additional transforms, they +can be easily developed using extensive DPK support for transform creation. +These modules can be used independently or pipelined to perform a series of +operations. In this paper, we describe DPK architecture and show its +performance from a small scale to a very large number of CPUs. The modules from +DPK have been used for the preparation of Granite Models [1] [2]. We believe +DPK is a valuable contribution to the AI community to easily prepare data to +enhance the performance of their LLM models or to fine-tune models with +Retrieval-Augmented Generation (RAG). + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ 3D Modelling to Address Pandemic Challenges: A Project-Based Learning + Methodology + + +
+ The use of 3D modelling in medical education is a revolutionary tool during +the learning process. In fact, this type of technology enables a more +interactive teaching approach, making information retention more effective and +enhancing students' understanding. 3D modelling allows for the creation of +precise representations of the human body, as well as interaction with +three-dimensional models, giving students a better spatial understanding of the +different organs and systems and enabling simulations of surgical and technical +procedures. This way, medical education is enriched with a more realistic and +safe educational experience. The goal is to understand whether, when students +and schools are challenged, they play an important role in addressing health +issues in their community. School-led projects are directed towards educational +scenarios that emphasize STEM education, tackling relevant public health +problems through open-school initiatives. By implementing an educational +scenario focused on 3D modelling and leveraging technology, we aim to raise +community awareness on public health issues. + +
+
+
+
+
+ + ☆ DiVR: incorporating context from diverse VR scenes for human trajectory + prediction + + +
+ Virtual environments provide a rich and controlled setting for collecting +detailed data on human behavior, offering unique opportunities for predicting +human trajectories in dynamic scenes. However, most existing approaches have +overlooked the potential of these environments, focusing instead on static +contexts without considering user-specific factors. Employing the CREATTIVE3D +dataset, our work models trajectories recorded in virtual reality (VR) scenes +for diverse situations including road-crossing tasks with user interactions and +simulated visual impairments. We propose Diverse Context VR Human Motion +Prediction (DiVR), a cross-modal transformer based on the Perceiver +architecture that integrates both static and dynamic scene context using a +heterogeneous graph convolution network. We conduct extensive experiments +comparing DiVR against existing architectures including MLP, LSTM, and +transformers with gaze and point cloud context. Additionally, we also stress +test our model's generalizability across different users, tasks, and scenes. +Results show that DiVR achieves higher accuracy and adaptability compared to +other models and to static graphs. This work highlights the advantages of using +VR datasets for context-aware human trajectory modeling, with potential +applications in enhancing user experiences in the metaverse. Our source code is +publicly available at https://gitlab.inria.fr/ffrancog/creattive3d-divr-model. + +
+
+
+
+
+ + ☆ Enhancing Multimodal Query Representation via Visual Dialogues for + End-to-End Knowledge Retrieval + + +
+ Existing multimodal retrieval systems often rely on disjointed models for +image comprehension, such as object detectors and caption generators, leading +to cumbersome implementations and training processes. To overcome this +limitation, we propose an end-to-end retrieval system, Ret-XKnow, to endow a +text retriever with the ability to understand multimodal queries via dynamic +modality interaction. Ret-XKnow leverages a partial convolution mechanism to +focus on visual information relevant to the given textual query, thereby +enhancing multimodal query representations. To effectively learn multimodal +interaction, we also introduce the Visual Dialogue-to-Retrieval (ViD2R) dataset +automatically constructed from visual dialogue datasets. Our dataset +construction process ensures that the dialogues are transformed into suitable +information retrieval tasks using a text retriever. We demonstrate that our +approach not only significantly improves retrieval performance in zero-shot +settings but also achieves substantial improvements in fine-tuning scenarios. +Our code is publicly available: https://github.com/yeongjoonJu/Ret_XKnow. + +
+
+
+
+
+ + ☆ PerceiverS: A Multi-Scale Perceiver with Effective Segmentation for + Long-Term Expressive Symbolic Music Generation + + +
+ Music generation has progressed significantly, especially in the domain of +audio generation. However, generating symbolic music that is both +long-structured and expressive remains a significant challenge. In this paper, +we propose PerceiverS (Segmentation and Scale), a novel architecture designed +to address this issue by leveraging both Effective Segmentation and Multi-Scale +attention mechanisms. Our approach enhances symbolic music generation by +simultaneously learning long-term structural dependencies and short-term +expressive details. By combining cross-attention and self-attention in a +Multi-Scale setting, PerceiverS captures long-range musical structure while +preserving performance nuances. The proposed model, evaluated on datasets like +Maestro, demonstrates improvements in generating coherent and diverse music +with both structural consistency and expressive variation. The project demos +and the generated music samples can be accessed through the link: +https://perceivers.github.io. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 84 + +
+
+
+ + ☆ Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial + Approach + + +
+ Deep learning underpins most of the currently advanced natural language +processing (NLP) tasks such as textual classification, neural machine +translation (NMT), abstractive summarization and question-answering (QA). +However, the robustness of the models, particularly QA models, against +adversarial attacks is a critical concern that remains insufficiently explored. +This paper introduces QA-Attack (Question Answering Attack), a novel word-level +adversarial strategy that fools QA models. Our attention-based attack exploits +the customized attention mechanism and deletion ranking strategy to identify +and target specific words within contextual passages. It creates deceptive +inputs by carefully choosing and substituting synonyms, preserving grammatical +integrity while misleading the model to produce incorrect responses. Our +approach demonstrates versatility across various question types, particularly +when dealing with extensive long textual inputs. Extensive experiments on +multiple benchmark datasets demonstrate that QA-Attack successfully deceives +baseline QA models and surpasses existing adversarial techniques regarding +success rate, semantics changes, BLEU score, fluency and grammar error rate. + +
+
+
+
+
+ + ☆ Beyond the Safety Bundle: Auditing the Helpful and Harmless Dataset + + +
+ In an effort to mitigate the harms of large language models (LLMs), learning +from human feedback (LHF) has been used to steer LLMs towards outputs that are +intended to be both less harmful and more helpful. Despite the widespread +adoption of LHF in practice, the quality of this feedback and its effectiveness +as a safety mitigation technique remain unclear. This study addresses these +issues by auditing the widely-used Helpful and Harmless (HH) dataset by +Anthropic. Our work includes: (1) a thorough investigation of the dataset's +content through both manual and automated evaluation; (2) experiments +demonstrating the dataset's impact on models' safety; and (3) an analysis of +the 100 most influential papers citing this dataset. Through our audit, we +showcase how conceptualization failures and quality issues identified in the HH +dataset can create additional harms by leading to disparate safety behaviors +across demographic groups. Our findings highlight the need for more nuanced, +context-sensitive approaches to safety mitigation in LLMs. + +
+
+ comment: Prepared for conference submission +
+
+
+
+
+ + ☆ Retrieval, Reasoning, Re-ranking: A Context-Enriched Framework for + Knowledge Graph Completion + + +
+ The Knowledge Graph Completion (KGC) task aims to infer the missing entity +from an incomplete triple. Existing embedding-based methods rely solely on +triples in the KG, which is vulnerable to specious relation patterns and +long-tail entities. On the other hand, text-based methods struggle with the +semantic gap between KG triples and natural language. Apart from triples, +entity contexts (e.g., labels, descriptions, aliases) also play a significant +role in augmenting KGs. To address these limitations, we propose KGR3, a +context-enriched framework for KGC. KGR3 is composed of three modules. Firstly, +the Retrieval module gathers supporting triples from the KG, collects plausible +candidate answers from a base embedding model, and retrieves context for each +related entity. Then, the Reasoning module employs a large language model to +generate potential answers for each query triple. Finally, the Re-ranking +module combines candidate answers from the two modules mentioned above, and +fine-tunes an LLM to provide the best answer. Extensive experiments on widely +used datasets demonstrate that KGR3 consistently improves various KGC methods. +Specifically, the best variant of KGR3 achieves absolute Hits@1 improvements of +12.3% and 5.6% on the FB15k237 and WN18RR datasets. + +
+
+
+
+
+ + ☆ Large Language Models Can Self-Improve in Long-context Reasoning + + +
+ Large language models (LLMs) have achieved substantial progress in processing +long contexts but still struggle with long-context reasoning. Existing +approaches typically involve fine-tuning LLMs with synthetic data, which +depends on annotations from human experts or advanced models like GPT-4, thus +restricting further advancements. To address this issue, we investigate the +potential for LLMs to self-improve in long-context reasoning and propose SEALONG, +an approach specifically designed for this purpose. This approach is +straightforward: we sample multiple outputs for each question, score them with +Minimum Bayes Risk, and then apply supervised fine-tuning or preference +optimization based on these outputs. Extensive experiments on several leading +LLMs demonstrate the effectiveness of SEALONG, with an absolute improvement of +$4.2$ points for Llama-3.1-8B-Instruct. Furthermore, SEALONG achieves superior +performance compared to prior approaches that depend on data produced by human +experts or advanced models. We anticipate that this work will open new avenues +for self-improvement techniques in long-context scenarios, which are essential +for the continual advancement of LLMs. + +
+
+ comment: Project Page: https://github.com/SihengLi99/SEALONG +
+
+
+
+
+ + ☆ On the Role of Speech Data in Reducing Toxicity Detection Bias + + +
+ Text toxicity detection systems exhibit significant biases, producing +disproportionate rates of false positives on samples mentioning demographic +groups. But what about toxicity detection in speech? To investigate the extent +to which text-based biases are mitigated by speech-based systems, we produce a +set of high-quality group annotations for the multilingual MuTox dataset, and +then leverage these annotations to systematically compare speech- and +text-based toxicity classifiers. Our findings indicate that access to speech +data during inference supports reduced bias against group mentions, +particularly for ambiguous and disagreement-inducing samples. Our results also +suggest that improving classifiers, rather than transcription pipelines, is +more helpful for reducing group bias. We publicly release our annotations and +provide recommendations for future toxicity dataset construction. + +
+
+
+
+
+ + ☆ Language Models as Causal Effect Generators + + +
+ We present a framework for large language model (LLM) based data generation +with controllable causal structure. In particular, we define a procedure for +turning any language model and any directed acyclic graph (DAG) into a +sequence-driven structural causal model (SD-SCM). Broadly speaking, an SD-SCM +is a causal model with user-defined structure and LLM-defined structural +equations. We characterize how an SD-SCM allows sampling from observational, +interventional, and counterfactual distributions according to the desired +causal structure. We then leverage this procedure to propose a new type of +benchmark for causal inference methods, generating individual-level +counterfactual data without needing to manually specify functional +relationships between variables. We create an example benchmark consisting of +thousands of datasets, and test a suite of popular estimation methods on these +datasets for average, conditional average, and individual treatment effect +estimation, both with and without hidden confounding. Apart from generating +data, the same procedure also allows us to test for the presence of a causal +effect that might be encoded in an LLM. This procedure can underpin auditing +LLMs for misinformation, discrimination, or otherwise undesirable behavior. We +believe SD-SCMs can serve as a useful tool in any application that would +benefit from sequential data with controllable causal structure. + +
+
+
+
+
+ + ☆ ExpressivityArena: Can LLMs Express Information Implicitly? + + +
+ While Large Language Models (LLMs) have demonstrated remarkable performance +in certain dimensions, their ability to express implicit language cues that +humans use for effective communication remains unclear. This paper presents +ExpressivityArena, a Python library for measuring the implicit communication +abilities of LLMs. We provide a comprehensive framework to evaluate +expressivity of arbitrary LLMs and explore its practical implications. To this +end, we refine the definition and measurements of "expressivity," and use our +framework in a set of small experiments. These experiments test LLMs in +creative and logical tasks such as poetry, coding, and emotion-based responses. +They are then evaluated by an automated grader, through ExpressivityArena, +which we verify to be the most pragmatic for testing expressivity. Building on +these experiments, we deepen our understanding of the expressivity of LLMs by +assessing their ability to remain expressive in conversations. Our findings +indicate that LLMs are capable of generating and understanding expressive +content, however, with some limitations. These insights will inform the future +development and deployment of expressive LLMs. We provide the code for +ExpressivityArena alongside our paper. + +
+
+ comment: 8 pages, 22 figures +
+
+
+
+
+ + ☆ Can adversarial attacks by large language models be attributed? + + +
+ Attributing outputs from Large Language Models (LLMs) in adversarial +settings-such as cyberattacks and disinformation-presents significant +challenges that are likely to grow in importance. We investigate this +attribution problem using formal language theory, specifically language +identification in the limit as introduced by Gold and extended by Angluin. By +modeling LLM outputs as formal languages, we analyze whether finite text +samples can uniquely pinpoint the originating model. Our results show that due +to the non-identifiability of certain language classes, under some mild +assumptions about overlapping outputs from fine-tuned models it is +theoretically impossible to attribute outputs to specific LLMs with certainty. +This holds also when accounting for expressivity limitations of Transformer +architectures. Even with direct model access or comprehensive monitoring, +significant computational hurdles impede attribution efforts. These findings +highlight an urgent need for proactive measures to mitigate risks posed by +adversarial LLM use as their influence continues to expand. + +
+
+ comment: 7 pages, 1 figure +
+
+
+
+
+ + ☆ Derivational Morphology Reveals Analogical Generalization in Large + Language Models + + +
+ What mechanisms underlie linguistic generalization in large language models +(LLMs)? This question has attracted considerable attention, with most studies +analyzing the extent to which the language skills of LLMs resemble rules. As of +yet, it is not known whether linguistic generalization in LLMs could equally +well be explained as the result of analogical processes, which can be +formalized as similarity operations on stored exemplars. A key shortcoming of +prior research is its focus on linguistic phenomena with a high degree of +regularity, for which rule-based and analogical approaches make the same +predictions. Here, we instead examine derivational morphology, specifically +English adjective nominalization, which displays notable variability. We +introduce a new method for investigating linguistic generalization in LLMs: +focusing on GPT-J, we fit cognitive models that instantiate rule-based and +analogical learning to the LLM training data and compare their predictions on a +set of nonce adjectives with those of the LLM, allowing us to draw direct +conclusions regarding underlying mechanisms. As expected, rule-based and +analogical models explain the predictions of GPT-J equally well for adjectives +with regular nominalization patterns. However, for adjectives with variable +nominalization patterns, the analogical model provides a much better match. +Furthermore, GPT-J's behavior is sensitive to the individual word frequencies, +even for regular forms, a behavior that is consistent with an analogical +account of regular forms but not a rule-based one. These findings refute the +hypothesis that GPT-J's linguistic generalization on adjective nominalization +involves rules, suggesting similarity operations on stored exemplars as the +underlying mechanism. Overall, our study suggests that analogical processes +play a bigger role in the linguistic generalization of LLMs than previously +thought. + +
+
+
+
+
+ + ☆ JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified + Multimodal Understanding and Generation + + +
+ We present JanusFlow, a powerful framework that unifies image understanding +and generation in a single model. JanusFlow introduces a minimalist +architecture that integrates autoregressive language models with rectified +flow, a state-of-the-art method in generative modeling. Our key finding +demonstrates that rectified flow can be straightforwardly trained within the +large language model framework, eliminating the need for complex architectural +modifications. To further improve the performance of our unified model, we +adopt two key strategies: (i) decoupling the understanding and generation +encoders, and (ii) aligning their representations during unified training. +Extensive experiments show that JanusFlow achieves comparable or superior +performance to specialized models in their respective domains, while +significantly outperforming existing unified approaches across standard +benchmarks. This work represents a step toward more efficient and versatile +vision-language models. + +
+
+
+
+
+ + ☆ From General to Specific: Utilizing General Hallucation to Automatically + Measure the Role Relationship Fidelity for Specific Role-Play Agents + + +
+ The advanced role-playing capabilities of Large Language Models (LLMs) have +paved the way for developing Role-Playing Agents (RPAs). However, existing +benchmarks, such as HPD, which incorporates manually scored character +relationships into the context for LLMs to sort coherence, and SocialBench, +which uses specific profiles generated by LLMs in the context of +multiple-choice tasks to assess character preferences, face limitations like +poor generalizability, implicit and inaccurate judgments, and excessive context +length. To address the above issues, we propose an automatic, scalable, and +generalizable paradigm. Specifically, we construct a benchmark by extracting +relations from a general knowledge graph and leverage RPA's inherent +hallucination properties to prompt it to interact across roles, employing +ChatGPT for stance detection and defining relationship hallucination along with +three related metrics. Extensive experiments validate the effectiveness and +stability of our metrics. Our findings further explore factors influencing +these metrics and discuss the trade-off between relationship hallucination and +factuality. + +
+
+
+
+
+ + ☆ CryptoLLM: Unleashing the Power of Prompted LLMs for SmartQnA and + Classification of Crypto Posts + + +
+ The rapid growth of social media has resulted in a large volume of +user-generated content, particularly in niche domains such as cryptocurrency. +This task focuses on developing robust classification models to accurately +categorize cryptocurrency-related social media posts into predefined classes, +including but not limited to objective, positive, negative, etc. Additionally, +the task requires participants to identify the most relevant answers from a set +of posts in response to specific questions. By leveraging advanced LLMs, this +research aims to enhance the understanding and filtering of cryptocurrency +discourse, thereby facilitating more informed decision-making in this volatile +sector. We have used a prompt-based technique to solve the classification task +for reddit posts and twitter posts. Also, we have used 64-shot technique along +with prompts on GPT-4-Turbo model to determine whether an answer is relevant to +a question or not. + +
+
+ comment: Accepted at FIRE 2024 (Track: Opinion Extraction and Question + Answering from CryptoCurrency-Related Tweets and Reddit posts (CryptOQA)) +
+
+
+
+
+ + ☆ Mapping the Podcast Ecosystem with the Structured Podcast Research + Corpus + + +
+ Podcasts provide highly diverse content to a massive listener base through a +unique on-demand modality. However, limited data has prevented large-scale +computational analysis of the podcast ecosystem. To fill this gap, we introduce +a massive dataset of over 1.1M podcast transcripts that is largely +comprehensive of all English language podcasts available through public RSS +feeds from May and June of 2020. This data is not limited to text, but rather +includes audio features and speaker turns for a subset of 370K episodes, and +speaker role inferences and other metadata for all 1.1M episodes. Using this +data, we also conduct a foundational investigation into the content, structure, +and responsiveness of this ecosystem. Together, our data and analyses open the +door to continued computational research of this popular and impactful medium. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Trustful LLMs: Customizing and Grounding Text Generation with Knowledge + Bases and Dual Decoders + + +
+ Although people are impressed by the content generation skills of large +language models, the use of LLMs, such as ChatGPT, is limited by the domain +grounding of the content. The correctness and groundedness of the generated +content need to be based on a verified context, such as results from +Retrieval-Augmented Generation (RAG). One important issue when adapting LLMs to +a customized domain is that the generated responses are often incomplete, or +the additions are not verified and may even be hallucinated. Prior studies on +hallucination detection have focused on evaluation metrics, which are not +easily adaptable to dynamic domains and can be vulnerable to attacks like +jail-breaking. In this work, we propose 1) a post-processing algorithm that +leverages knowledge triplets in RAG context to correct hallucinations and 2) a +dual-decoder model that fuses RAG context to guide the generation process. + +
+
+
+
+
+ + ☆ Verbosity $\neq$ Veracity: Demystify Verbosity Compensation Behavior of + Large Language Models + + +
+ When unsure about an answer, humans often respond with more words than +necessary, hoping that part of the response will be correct. We observe a +similar behavior in large language models (LLMs), which we term "Verbosity +Compensation" (VC). VC is harmful because it confuses the user understanding, +leading to low efficiency, and influences the LLM services by increasing the +latency and cost of generating useless tokens. In this paper, we present the +first work that defines and analyzes Verbosity Compensation, explores its +causes, and proposes a simple mitigating approach. We define Verbosity +Compensation as the behavior of generating responses that can be compressed +without information loss when prompted to write concisely. Our experiments, +conducted on five datasets of knowledge and reasoning-based QA tasks with 14 +newly developed LLMs, reveal three conclusions. 1) We reveal a pervasive +presence of verbosity compensation across all models and all datasets. Notably, +GPT-4 exhibits a VC frequency of 50.40%. 2) We reveal the large performance gap +between verbose and concise responses, with a notable difference of 27.61% on +the Qasper dataset. We also demonstrate that this difference does not naturally +diminish as LLM capability increases. Both 1) and 2) highlight the urgent need +to mitigate the frequency of VC behavior and disentangle verbosity with +veracity. We propose a simple yet effective cascade algorithm that replaces the +verbose responses with the other model-generated responses. The results show +that our approach effectively alleviates the VC of the Mistral model from +63.81% to 16.16% on the Qasper dataset. 3) We also find that verbose responses +exhibit higher uncertainty across all five datasets, suggesting a strong +connection between verbosity and model uncertainty. Our dataset and code are +available at https://github.com/psunlpgroup/VerbosityLLM. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Tucano: Advancing Neural Text Generation for Portuguese + + +
+ Significant advances have been made in natural language processing in recent +years. However, our current deep learning approach to language modeling +requires substantial resources in terms of data and computation. One of the +side effects of this data-hungry paradigm is the current schism between +languages, separating those considered high-resource, where most of the +development happens and resources are available, and the low-resource ones, +which struggle to attain the same level of performance and autonomy. This study +aims to introduce a new set of resources to stimulate the future development of +neural text generation in Portuguese. In this work, we document the development +of GigaVerbo, a concatenation of deduplicated Portuguese text corpora amounting +to 200 billion tokens. Via this corpus, we trained a series of +decoder-transformers named Tucano. Our models perform equal or superior to +other Portuguese and multilingual language models of similar size in several +Portuguese benchmarks. The evaluation of our models also reveals that model +performance on many currently available benchmarks used by the Portuguese NLP +community has little to no correlation with the scaling of token ingestion +during training, highlighting the limitations of such evaluations when it comes +to the assessment of Portuguese generative language models. All derivatives of +our study are openly released on GitHub and Hugging Face. See +https://nkluge-correa.github.io/Tucano/ + +
+
+
+
+
+ + ☆ IAE: Irony-based Adversarial Examples for Sentiment Analysis Systems + + +
+ Adversarial examples, which are inputs deliberately perturbed with +imperceptible changes to induce model errors, have raised serious concerns for +the reliability and security of deep neural networks (DNNs). While adversarial +attacks have been extensively studied in continuous data domains such as +images, the discrete nature of text presents unique challenges. In this paper, +we propose Irony-based Adversarial Examples (IAE), a method that transforms +straightforward sentences into ironic ones to create adversarial text. This +approach exploits the rhetorical device of irony, where the intended meaning is +opposite to the literal interpretation, requiring a deeper understanding of +context to detect. The IAE method is particularly challenging due to the need +to accurately locate evaluation words, substitute them with appropriate +collocations, and expand the text with suitable ironic elements while +maintaining semantic coherence. Our research makes the following key +contributions: (1) We introduce IAE, a strategy for generating textual +adversarial examples using irony. This method does not rely on pre-existing +irony corpora, making it a versatile tool for creating adversarial text in +various NLP tasks. (2) We demonstrate that the performance of several +state-of-the-art deep learning models on sentiment analysis tasks significantly +deteriorates when subjected to IAE attacks. This finding underscores the +susceptibility of current NLP systems to adversarial manipulation through +irony. (3) We compare the impact of IAE on human judgment versus NLP systems, +revealing that humans are less susceptible to the effects of irony in text. + +
+
+
+
+
+ + ☆ Ethical Concern Identification in NLP: A Corpus of ACL Anthology Ethics + Statements + + +
+ What ethical concerns, if any, do LLM researchers have? We introduce EthiCon, +a corpus of 1,580 ethical concern statements extracted from scientific papers +published in the ACL Anthology. We extract ethical concern keywords from the +statements and show promising results in automating the concern identification +process. Through a survey, we compare the ethical concerns of the corpus to the +concerns listed by the general public and professionals in the field. Finally, +we compare our retrieved ethical concerns with existing taxonomies pointing to +gaps and future research directions. + +
+
+
+
+
+ + ☆ Chain Association-based Attacking and Shielding Natural Language + Processing Systems + + +
+ Association as a gift lets people avoid mentioning something in +completely straightforward words and allows others to understand what they +intend to refer to. In this paper, we propose a chain association-based +adversarial attack against natural language processing systems, utilizing the +comprehension gap between humans and machines. We first generate a chain +association graph for Chinese characters based on the association paradigm for +building the search space of potential adversarial examples. Then, we introduce a +discrete particle swarm optimization algorithm to search for the optimal +adversarial examples. We conduct comprehensive experiments and show that +advanced natural language processing models and applications, including large +language models, are vulnerable to our attack, while humans appear good at +understanding the perturbed text. We also explore two methods, including +adversarial training and associative graph-based recovery, to shield systems +from chain association-based attack. Since a few examples use some +derogatory terms, this paper contains materials that may be offensive or +upsetting to some people. + +
+
+
+
+
+ + ☆ Likelihood as a Performance Gauge for Retrieval-Augmented Generation NAACL 2025 + + +
+ Recent work finds that retrieval-augmented generation with large language +models is prone to be influenced by the order of retrieved documents in the +context. However, the lack of in-depth analysis limits the use of this +phenomenon for prompt engineering in practice. In this study, we posit that +likelihoods serve as an effective gauge for language model performance. Through +experiments on two question-answering datasets with a variety of +state-of-the-art language models, we reveal correlations between answer +accuracy and the likelihood of the question at both the corpus level and the +instance level. In addition, we find that question likelihood can also indicate +the position of the task-relevant information in the context. Based on these +findings, we propose two methods that use question likelihood as a gauge for +selecting and constructing prompts that lead to better performance. We +demonstrate their effectiveness with experiments. In addition, our +likelihood-based methods are efficient, as they only need to compute the +likelihood of the input, requiring much fewer language model passes than +heuristic prompt engineering methods that require generating responses. Our +analysis deepens our understanding of how input prompts affect model +performance and provides a promising direction for efficient prompt +optimization. + +
+
+ comment: Under review at NAACL 2025. Code is available at + https://github.com/lyutyuh/poptimizer +
+
+
+
+
+ + ☆ Automatic Album Sequencing + + +
+ Album sequencing is a critical part of the album production process. +Recently, a data-driven approach was proposed that sequences general +collections of independent media by extracting the narrative essence of the +items in the collections. While this approach implies an album sequencing +technique, it is not widely accessible to a less technical audience, requiring +advanced knowledge of machine learning techniques to use. To address this, we +introduce a new user-friendly web-based tool that allows a less technical +audience to upload music tracks, execute this technique in one click, and +subsequently presents the result in a clean visualization to the user. To both +increase the number of templates available to the user and address shortcomings +of previous work, we also introduce a new direct transformer-based album +sequencing method. We find that our more direct method outperforms a random +baseline but does not reach the same performance as the narrative essence +approach. Both methods are included in our web-based user interface, and this +-- alongside a full copy of our implementation -- is publicly available at +https://github.com/dylanashley/automatic-album-sequencing + +
+
+ comment: presented as a late breaking demo in the 25th International Society + for Music Information Retrieval Conference; 3 pages in main text, 3 figures + in main text; source code available at + https://github.com/dylanashley/automatic-album-sequencing +
+
+
+
+
+ + ☆ Spider 2.0: Evaluating Language Models on Real-World Enterprise + Text-to-SQL Workflows + + +
+ Real-world enterprise text-to-SQL workflows often involve complex cloud or +local data across various database systems, multiple SQL queries in various +dialects, and diverse operations from data transformation to analytics. We +introduce Spider 2.0, an evaluation framework comprising 632 real-world +text-to-SQL workflow problems derived from enterprise-level database use cases. +The databases in Spider 2.0 are sourced from real data applications, often +containing over 1,000 columns and stored in local or cloud database systems +such as BigQuery and Snowflake. We show that solving problems in Spider 2.0 +frequently requires understanding and searching through database metadata, +dialect documentation, and even project-level codebases. This challenge calls +for models to interact with complex SQL workflow environments, process +extremely long contexts, perform intricate reasoning, and generate multiple SQL +queries with diverse operations, often exceeding 100 lines, which goes far +beyond traditional text-to-SQL challenges. Our evaluations indicate that based +on o1-preview, our code agent framework successfully solves only 17.0% of the +tasks, compared with 91.2% on Spider 1.0 and 73.0% on BIRD. Our results on +Spider 2.0 show that while language models have demonstrated remarkable +performance in code generation -- especially in prior text-to-SQL benchmarks -- +they require significant improvement in order to achieve adequate performance +for real-world enterprise usage. Progress on Spider 2.0 represents crucial +steps towards developing intelligent, autonomous, code agents for real-world +enterprise settings. Our code, baseline models, and data are available at +https://spider2-sql.github.io. + +
+
+
+
+
+ + ☆ Mitigating Bias in Queer Representation within Large Language Models: A + Collaborative Agent Approach NeurIPS 2024 + + +
+ Large Language Models (LLMs) often perpetuate biases in pronoun usage, +leading to misrepresentation or exclusion of queer individuals. This paper +addresses the specific problem of biased pronoun usage in LLM outputs, +particularly the inappropriate use of traditionally gendered pronouns ("he," +"she") when inclusive language is needed to accurately represent all +identities. We introduce a collaborative agent pipeline designed to mitigate +these biases by analyzing and optimizing pronoun usage for inclusivity. Our +multi-agent framework includes specialized agents for both bias detection and +correction. Experimental evaluations using the Tango dataset-a benchmark +focused on gender pronoun usage-demonstrate that our approach significantly +improves inclusive pronoun classification, achieving a 32.6 percentage point +increase over GPT-4o in correctly disagreeing with inappropriate traditionally +gendered pronouns $(\chi^2 = 38.57, p < 0.0001)$. These results accentuate the +potential of agent-driven frameworks in enhancing fairness and inclusivity in +AI-generated content, demonstrating their efficacy in reducing biases and +promoting socially responsible AI. + +
+
+ comment: NeurIPS 2024 Queer in AI Workshop +
+
+
+
+
+ + ☆ Annotating Constructions with UD: the experience of the Italian + Constructicon + + +
+ The paper describes a first attempt at linking the Italian constructicon to +UD resources + +
+
+
+
+
+ + ☆ Direct Preference Optimization Using Sparse Feature-Level Constraints + + +
+ The alignment of large language models (LLMs) with human preferences remains +a key challenge. While post-training techniques like Reinforcement Learning +from Human Feedback (RLHF) and Direct Preference Optimization (DPO) have +achieved notable success, they often introduce computational inefficiencies and +training instability. In this paper, we propose Feature-level constrained +Preference Optimization (FPO), a novel method designed to simplify the +alignment process while ensuring stability. FPO leverages pre-trained Sparse +Autoencoders (SAEs) and introduces feature-level constraints, allowing for +efficient, sparsity-enforced alignment. Our approach enjoys efficiency by using +sparse features activated in a well-trained sparse autoencoder and the quality +of sequential KL divergence by using the feature-level offline reference. +Experimental results on benchmark datasets demonstrate that FPO achieves a +5.08% absolute improvement in win rate with much lower computational cost +compared to state-of-the-art baselines, making it a promising solution for +efficient and controllable LLM alignments. + +
+
+
+
+
+ + ☆ Multimodal Clinical Reasoning through Knowledge-augmented Rationale + Generation + + +
+ Clinical rationales play a pivotal role in accurate disease diagnosis; +however, many models predominantly use discriminative methods and overlook the +importance of generating supportive rationales. Rationale distillation is a +process that transfers knowledge from large language models (LLMs) to smaller +language models (SLMs), thereby enhancing the latter's ability to break down +complex tasks. Despite its benefits, rationale distillation alone is inadequate +for addressing domain knowledge limitations in tasks requiring specialized +expertise, such as disease diagnosis. Effectively embedding domain knowledge in +SLMs poses a significant challenge. While current LLMs are primarily geared +toward processing textual data, multimodal LLMs that incorporate time series +data, especially electronic health records (EHRs), are still evolving. To +tackle these limitations, we introduce ClinRaGen, an SLM optimized for +multimodal rationale generation in disease diagnosis. ClinRaGen incorporates a +unique knowledge-augmented attention mechanism to merge domain knowledge with +time series EHR data, utilizing a stepwise rationale distillation strategy to +produce both textual and time series-based clinical rationales. Our evaluations +show that ClinRaGen markedly improves the SLM's capability to interpret +multimodal EHR data and generate accurate clinical rationales, supporting more +reliable disease diagnosis, advancing LLM applications in healthcare, and +narrowing the performance divide between LLMs and SLMs. + +
+
+ comment: 11 pages. 4 figures +
+
+
+
+
+ + ☆ Circuit Complexity Bounds for RoPE-based Transformer Architecture + + +
+ Characterizing the expressive power of the Transformer architecture is critical +to understanding its capacity limits and scaling law. Recent works provide the +circuit complexity bounds to Transformer-like architecture. On the other hand, +Rotary Position Embedding ($\mathsf{RoPE}$) has emerged as a crucial technique +in modern large language models, offering superior performance in capturing +positional information compared to traditional position embeddings, which shows +great potential in application prospects, particularly for the long context +scenario. Empirical evidence also suggests that $\mathsf{RoPE}$-based +Transformer architectures demonstrate greater generalization capabilities +compared to conventional Transformer models. In this work, we establish a +tighter circuit complexity bound for Transformers with $\mathsf{RoPE}$ +attention. Our key contribution is that we show that unless $\mathsf{TC}^0 = +\mathsf{NC}^1$, a $\mathsf{RoPE}$-based Transformer with +$\mathrm{poly}(n)$-precision, $O(1)$ layers, hidden dimension $d \leq O(n)$ +cannot solve the arithmetic problem or the Boolean formula value problem. This +result significantly demonstrates the fundamental limitation of the +expressivity of the $\mathsf{RoPE}$-based Transformer architecture, although it +achieves giant empirical success. Our theoretical framework not only +establishes tighter complexity bounds but also may instruct further work on the +$\mathsf{RoPE}$-based Transformer. + +
+
+
+
+
+ + ☆ Problem-Oriented Segmentation and Retrieval: Case Study on Tutoring + Conversations EMNLP 2024 + + +
+ Many open-ended conversations (e.g., tutoring lessons or business meetings) +revolve around pre-defined reference materials, like worksheets or meeting +bullets. To provide a framework for studying such conversation structure, we +introduce Problem-Oriented Segmentation & Retrieval (POSR), the task of jointly +breaking down conversations into segments and linking each segment to the +relevant reference item. As a case study, we apply POSR to education where +effectively structuring lessons around problems is critical yet difficult. We +present LessonLink, the first dataset of real-world tutoring lessons, featuring +3,500 segments, spanning 24,300 minutes of instruction and linked to 116 SAT +math problems. We define and evaluate several joint and independent approaches +for POSR, including segmentation (e.g., TextTiling), retrieval (e.g., ColBERT), +and large language models (LLMs) methods. Our results highlight that modeling +POSR as one joint task is essential: POSR methods outperform independent +segmentation and retrieval pipelines by up to +76% on joint metrics and surpass +traditional segmentation methods by up to +78% on segmentation metrics. We +demonstrate POSR's practical impact on downstream education applications, +deriving new insights on the language and time use in real-world lesson +structures. + +
+
+ comment: EMNLP 2024 Findings. Our code and dataset are open-sourced at + https://github.com/rosewang2008/posr +
+
+
+
+
+ + ☆ Entropy Controllable Direct Preference Optimization + + +
+ In the post-training of large language models (LLMs), Reinforcement Learning +from Human Feedback (RLHF) is an effective approach to achieve generation +aligned with human preferences. Direct Preference Optimization (DPO) allows for +policy training with a simple binary cross-entropy loss without a reward model. +The objective of DPO is regularized by reverse KL divergence that encourages +mode-seeking fitting to the reference policy. Nonetheless, we indicate that +minimizing reverse KL divergence could fail to capture a mode of the reference +distribution, which may hurt the policy's performance. Based on this +observation, we propose a simple modification to DPO, H-DPO, which allows for +control over the entropy of the resulting policy, enhancing the distribution's +sharpness and thereby enabling mode-seeking fitting more effectively. In our +experiments, we show that H-DPO outperformed DPO across various tasks, +demonstrating superior results in pass@$k$ evaluations for mathematical tasks. +Moreover, H-DPO is simple to implement, requiring only minor modifications to +the loss calculation of DPO, which makes it highly practical and promising for +wide-ranging applications in the training of LLMs. + +
+
+
+
+
+ + ☆ Contrastive Language Prompting to Ease False Positives in Medical + Anomaly Detection + + +
+ A pre-trained visual-language model, contrastive language-image pre-training +(CLIP), successfully accomplishes various downstream tasks with text prompts, +such as finding images or localizing regions within the image. Despite CLIP's +strong multi-modal data capabilities, it remains limited in specialized +environments, such as medical applications. For this purpose, many CLIP +variants-i.e., BioMedCLIP, and MedCLIP-SAMv2-have emerged, but false positives +related to normal regions persist. Thus, we aim to present a simple yet +important goal of reducing false positives in medical anomaly detection. We +introduce a Contrastive LAnguage Prompting (CLAP) method that leverages both +positive and negative text prompts. This straightforward approach identifies +potential lesion regions by visual attention to the positive prompts in the +given image. To reduce false positives, we attenuate attention on normal +regions using negative prompts. Extensive experiments with the BMAD dataset, +including six biomedical benchmarks, demonstrate that CLAP method enhances +anomaly detection performance. Our future plans include developing an automated +fine prompting method for more practical usage. + +
+
+ comment: 4 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Large Language Models as Neurolinguistic Subjects: Identifying Internal + Representations for Form and Meaning + + +
+ This study investigates the linguistic understanding of Large Language Models +(LLMs) regarding signifier (form) and signified (meaning) by distinguishing two +LLM evaluation paradigms: psycholinguistic and neurolinguistic. Traditional +psycholinguistic evaluations often reflect statistical biases that may +misrepresent LLMs' true linguistic capabilities. We introduce a neurolinguistic +approach, utilizing a novel method that combines minimal pair and diagnostic +probing to analyze activation patterns across model layers. This method allows +for a detailed examination of how LLMs represent form and meaning, and whether +these representations are consistent across languages. Our contributions are +three-fold: (1) We compare neurolinguistic and psycholinguistic methods, +revealing distinct patterns in LLM assessment; (2) We demonstrate that LLMs +exhibit higher competence in form compared to meaning, with the latter largely +correlated to the former; (3) We present new conceptual minimal pair datasets +for Chinese (COMPS-ZH) and German (COMPS-DE), complementing existing English +datasets. + +
+
+
+
+
+ + ☆ SecEncoder: Logs are All You Need in Security + + +
+ Large and Small Language Models (LMs) are typically pretrained using +extensive volumes of text, which are sourced from publicly accessible platforms +such as Wikipedia, Book Corpus, or through web scraping. These models, due to +their exposure to a wide range of language data, exhibit impressive +generalization capabilities and can perform a multitude of tasks +simultaneously. However, they often fall short when it comes to domain-specific +tasks due to their broad training data. This paper introduces SecEncoder, a +specialized small language model that is pretrained using security logs. +SecEncoder is designed to address the domain-specific limitations of general +LMs by focusing on the unique language and patterns found in security logs. +Experimental results indicate that SecEncoder outperforms other LMs, such as +BERTlarge, DeBERTa-v3-large and OpenAI's Embedding (textembedding-ada-002) +models, which are pretrained mainly on natural language, across various tasks. +Furthermore, although SecEncoder is primarily pretrained on log data, it +outperforms models pretrained on natural language for a range of tasks beyond +log analysis, such as incident prioritization and threat intelligence document +retrieval. This suggests that domain specific pretraining with logs can +significantly enhance the performance of LMs in security. These findings pave +the way for future research into security-specific LMs and their potential +applications. + +
+
+
+
+
+ + ☆ Prompt-enhanced Network for Hateful Meme Classification + + +
+ The dynamic expansion of social media has led to an inundation of hateful +memes on media platforms, accentuating the growing need for efficient +identification and removal. Acknowledging the constraints of conventional +multimodal hateful meme classification, which heavily depends on external +knowledge and poses the risk of including irrelevant or redundant content, we +developed Pen -- a prompt-enhanced network framework based on the prompt +learning approach. Specifically, after constructing the sequence through the +prompt method and encoding it with a language model, we performed region +information global extraction on the encoded sequence for multi-view +perception. By capturing global information about inference instances and +demonstrations, Pen facilitates category selection by fully leveraging sequence +information. This approach significantly improves model classification +accuracy. Additionally, to bolster the model's reasoning capabilities in the +feature space, we introduced prompt-aware contrastive learning into the +framework to improve the quality of sample feature distributions. Through +extensive ablation experiments on two public datasets, we evaluate the +effectiveness of the Pen framework, concurrently comparing it with +state-of-the-art model baselines. Our research findings highlight that Pen +surpasses manual prompt methods, showcasing superior generalization and +classification accuracy in hateful meme classification tasks. Our code is +available at https://github.com/juszzi/Pen. + +
+
+ comment: Published in Proceedings of the Thirty-Third International Joint + Conference on Artificial Intelligence Main Track. Pages 6397-6405 +
+
+
+
+
+ + ☆ SparrowVQE: Visual Question Explanation for Course Content Understanding + + +
+ Visual Question Answering (VQA) research seeks to create AI systems to answer +natural language questions in images, yet VQA methods often yield overly +simplistic and short answers. This paper aims to advance the field by +introducing Visual Question Explanation (VQE), which enhances the ability of +VQA to provide detailed explanations rather than brief responses and address +the need for more complex interaction with visual content. We first created an +MLVQE dataset from a 14-week streamed video machine learning course, including +885 slide images, 110,407 words of transcripts, and 9,416 designed +question-answer (QA) pairs. Next, we proposed a novel SparrowVQE, a small 3 +billion-parameter multimodal model. We trained our model with a three-stage +training mechanism consisting of multimodal pre-training (slide images and +transcripts feature alignment), instruction tuning (tuning the pre-trained +model with transcripts and QA pairs), and domain fine-tuning (fine-tuning slide +image and QA pairs). Eventually, our SparrowVQE can understand and connect +visual information using the SigLIP model with transcripts using the Phi-2 +language model with an MLP adapter. Experimental results demonstrate that our +SparrowVQE achieves better performance in our developed MLVQE dataset and +outperforms state-of-the-art methods in the other five benchmark VQA datasets. +The source code is available at +https://github.com/YoushanZhang/SparrowVQE. + +
+
+
+
+
+ + ☆ Rapid Response: Mitigating LLM Jailbreaks with a Few Examples + + +
+ As large language models (LLMs) grow more powerful, ensuring their safety +against misuse becomes crucial. While researchers have focused on developing +robust defenses, no method has yet achieved complete invulnerability to +attacks. We propose an alternative approach: instead of seeking perfect +adversarial robustness, we develop rapid response techniques that look to block +whole classes of jailbreaks after observing only a handful of attacks. To study +this setting, we develop RapidResponseBench, a benchmark that measures a +defense's robustness against various jailbreak strategies after adapting to a +few observed examples. We evaluate five rapid response methods, all of which +use jailbreak proliferation, where we automatically generate additional +jailbreaks similar to the examples observed. Our strongest method, which +fine-tunes an input classifier to block proliferated jailbreaks, reduces attack +success rate by a factor greater than 240 on an in-distribution set of +jailbreaks and a factor greater than 15 on an out-of-distribution set, having +observed just one example of each jailbreaking strategy. Moreover, further +studies suggest that the quality of the proliferation model and the number of +proliferated examples play a key role in the effectiveness of this defense. +Overall, our results highlight the potential of responding rapidly to novel +jailbreaks to limit LLM misuse. + +
+
+
+
+
+ + ☆ Controlled Evaluation of Syntactic Knowledge in Multilingual Language + Models + + +
+ Language models (LMs) are capable of acquiring elements of human-like +syntactic knowledge. Targeted syntactic evaluation tests have been employed to +measure how well they form generalizations about syntactic phenomena in +high-resource languages such as English. However, we still lack a thorough +understanding of LMs' capacity for syntactic generalizations in low-resource +languages, which are responsible for much of the diversity of syntactic +patterns worldwide. In this study, we develop targeted syntactic evaluation +tests for three low-resource languages (Basque, Hindi, and Swahili) and use +them to evaluate five families of open-access multilingual Transformer LMs. We +find that some syntactic tasks prove relatively easy for LMs while others +(agreement in sentences containing indirect objects in Basque, agreement across +a prepositional phrase in Swahili) are challenging. We additionally uncover +issues with publicly available Transformers, including a bias toward the +habitual aspect in Hindi in multilingual BERT and underperformance compared to +similar-sized models in XGLM-4.5B. + +
+
+
+
+
+ + ☆ IdentifyMe: A Challenging Long-Context Mention Resolution Benchmark + + +
+ Recent evaluations of LLMs on coreference resolution have revealed that +traditional output formats and evaluation metrics do not fully capture the +models' referential understanding. To address this, we introduce IdentifyMe, a +new benchmark for mention resolution presented in a multiple-choice question +(MCQ) format, commonly used for evaluating LLMs. IdentifyMe features long +narratives and employs heuristics to exclude easily identifiable mentions, +creating a more challenging task. The benchmark also consists of a curated +mixture of different mention types and corresponding entities, allowing for a +fine-grained analysis of model performance. We evaluate both closed- and open +source LLMs on IdentifyMe and observe a significant performance gap (20-30%) +between the state-of-the-art sub-10B open models vs. closed ones. We observe +that pronominal mentions, which have limited surface information, are typically +much harder for models to resolve than nominal mentions. Additionally, we find +that LLMs often confuse entities when their mentions overlap in nested +structures. The highest-scoring model, GPT-4o, achieves 81.9% accuracy, +highlighting the strong referential capabilities of state-of-the-art LLMs while +also indicating room for further improvement. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ BudgetMLAgent: A Cost-Effective LLM Multi-Agent system for Automating + Machine Learning Tasks + + +
+ Large Language Models (LLMs) excel in diverse applications including +generation of code snippets, but often struggle with generating code for +complex Machine Learning (ML) tasks. Although existing LLM single-agent based +systems give varying performance depending on the task complexity, they purely +rely on larger and expensive models such as GPT-4. Our investigation reveals +that no-cost and low-cost models such as Gemini-Pro, Mixtral and CodeLlama +perform far worse than GPT-4 in a single-agent setting. With the motivation of +developing a cost-efficient LLM based solution for solving ML tasks, we propose +an LLM Multi-Agent based system which leverages combination of experts using +profiling, efficient retrieval of past observations, LLM cascades, and +ask-the-expert calls. Through empirical analysis on ML engineering tasks in the +MLAgentBench benchmark, we demonstrate the effectiveness of our system, using +no-cost models, namely Gemini as the base LLM, paired with GPT-4 in cascade and +expert to serve occasional ask-the-expert calls for planning. With 94.2\% +reduction in the cost (from \$0.931 per run cost averaged over all tasks for +GPT-4 single agent system to \$0.054), our system is able to yield better +average success rate of 32.95\% as compared to GPT-4 single-agent system +yielding 22.72\% success rate averaged over all the tasks of MLAgentBench. + +
+
+ comment: Presented at AIMLSystems '24 +
+
+
+
+
+ + ☆ DecoPrompt : Decoding Prompts Reduces Hallucinations when Large Language + Models Meet False Premises + + +
+ While large language models (LLMs) have demonstrated increasing power, they +have also called upon studies on their hallucinated outputs that deviate from +factually correct statements. In this paper, we focus on one important scenario +of false premises, where LLMs are distracted by misaligned claims although the +model possesses the required factual knowledge to answer original questions +accurately. Inspired by the observation that entropy of the false-premise +prompt is closely related to its likelihood to elicit hallucination generation, +we propose a new prompting algorithm, named DecoPrompt, to mitigate +hallucination. DecoPrompt leverages LLMs to "decode" the false-premise prompts +without really eliciting hallucination output from LLMs. We perform experiments +on two datasets, demonstrating that DecoPrompt can reduce hallucinations +effectively on outputs from different LLMs. Moreover, DecoPrompt exhibits +cross-model transferability, which facilitates its applications to scenarios +such as LLMs of large sizes or unavailable model logits. + +
+
+
+
+
+ + ☆ Efficient and Accurate Prompt Optimization: the Benefit of Memory in + Exemplar-Guided Reflection + + +
+ Automatic prompt engineering aims to enhance the generation quality of large +language models (LLMs). Recent works utilize feedbacks generated from erroneous +cases to guide the prompt optimization. During inference, they may further +retrieve several semantically-related exemplars and concatenate them to the +optimized prompts to improve the performance. However, those works only utilize +the feedback at the current step, ignoring historical and unselected feedbacks +which are potentially beneficial. Moreover, the selection of exemplars only +considers the general semantic relationship and may not be optimal in terms of +task performance and matching with the optimized prompt. In this work, we +propose an Exemplar-Guided Reflection with Memory mechanism (ERM) to realize +more efficient and accurate prompt optimization. Specifically, we design an +exemplar-guided reflection mechanism where the feedback generation is +additionally guided by the generated exemplars. We further build two kinds of +memory to fully utilize the historical feedback information and support more +effective exemplar retrieval. Empirical evaluations show our method surpasses +previous state-of-the-arts with fewer optimization steps, i.e., improving F1 +score by 10.1 on LIAR dataset, and reducing half of the optimization steps on +ProTeGi. + +
+
+
+
+
+ + ♻ ☆ GlossLM: A Massively Multilingual Corpus and Pretrained Model for + Interlinear Glossed Text EMNLP 2024 + + +
+ Language documentation projects often involve the creation of annotated text +in a format such as interlinear glossed text (IGT), which captures fine-grained +morphosyntactic analyses in a morpheme-by-morpheme format. However, there are +few existing resources providing large amounts of standardized, easily +accessible IGT data, limiting their applicability to linguistic research, and +making it difficult to use such data in NLP modeling. + We compile the largest existing corpus of IGT data from a variety of sources, +covering over 450k examples across 1.8k languages, to enable research on +crosslingual transfer and IGT generation. We normalize much of our data to +follow a standard set of labels across languages. + Furthermore, we explore the task of automatically generating IGT in order to +aid documentation projects. As many languages lack sufficient monolingual data, +we pretrain a large multilingual model on our corpus. We demonstrate the +utility of this model by finetuning it on monolingual corpora, outperforming +SOTA models by up to 6.6\%. Our pretrained model and dataset are available on +Hugging Face. + +
+
+ comment: EMNLP 2024. First two authors are equal contribution +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (eg CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with \textbf{five} image classification datasets. + +
+
+
+
+
+ + ♻ ☆ One fish, two fish, but not the whole sea: Alignment reduces language + models' conceptual diversity + + +
+ Researchers in social science and psychology have recently proposed using +large language models (LLMs) as replacements for humans in behavioral research. +In addition to arguments about whether LLMs accurately capture population-level +patterns, this has raised questions about whether LLMs capture human-like +conceptual diversity. Separately, it is debated whether post-training alignment +(RLHF or RLAIF) affects models' internal diversity. Inspired by human studies, +we use a new way of measuring the conceptual diversity of +synthetically-generated LLM "populations" by relating the internal variability +of simulated individuals to the population-level variability. We use this +approach to evaluate non-aligned and aligned LLMs on two domains with rich +human behavioral data. While no model reaches human-like diversity, aligned +models generally display less diversity than their instruction fine-tuned +counterparts. Our findings highlight potential trade-offs between increasing +models' value alignment and decreasing the diversity of their conceptual +representations. + +
+
+ comment: 17 pages, 10 figures; corrected figure version +
+
+
+
+
+ + ♻ ☆ CodeTree: Agent-guided Tree Search for Code Generation with Large + Language Models + + +
+ Pre-trained on massive amounts of code and text data, large language models +(LLMs) have demonstrated remarkable achievements in performing code generation +tasks. With additional execution-based feedback, these models can act as agents +with capabilities to self-refine and improve generated code autonomously. +However, on challenging coding tasks with extremely large search space, current +agentic approaches still struggle with multi-stage planning, generating, and +debugging. To address this problem, we propose CodeTree, a framework for LLM +agents to efficiently explore the search space in different stages of the code +generation process. Specifically, we adopted a unified tree structure to +explicitly explore different coding strategies, generate corresponding coding +solutions, and subsequently refine the solutions. In each stage, critical +decision-making (ranking, termination, expanding) of the exploration process is +guided by both the environmental execution-based feedback and +LLM-agent-generated feedback. We comprehensively evaluated CodeTree on 7 code +generation benchmarks and demonstrated the significant performance gains of +CodeTree against strong baselines. Using GPT-4o as the base model, we +consistently achieved top results of 95.1 on HumanEval, 98.7 on MBPP, and 43.0 +on CodeContests. On the challenging SWEBench benchmark, our approach led to +significant performance gains. + +
+
+
+
+
+ + ♻ ☆ Context-aware Inductive Knowledge Graph Completion with Latent Type + Constraints and Subgraph Reasoning + + +
+ Inductive knowledge graph completion (KGC) aims to predict missing triples +with unseen entities. Recent works focus on modeling reasoning paths between +the head and tail entity as direct supporting evidence. However, these methods +depend heavily on the existence and quality of reasoning paths, which limits +their general applicability in different scenarios. In addition, we observe +that latent type constraints and neighboring facts inherent in KGs are also +vital in inferring missing triples. To effectively utilize all useful +information in KGs, we introduce CATS, a novel context-aware inductive KGC +solution. With sufficient guidance from proper prompts and supervised +fine-tuning, CATS activates the strong semantic understanding and reasoning +capabilities of large language models to assess the existence of query triples, +which consist of two modules. First, the type-aware reasoning module evaluates +whether the candidate entity matches the latent entity type as required by the +query relation. Then, the subgraph reasoning module selects relevant reasoning +paths and neighboring facts, and evaluates their correlation to the query +triple. Experiment results on three widely used datasets demonstrate that CATS +significantly outperforms state-of-the-art methods in 16 out of 18 +transductive, inductive, and few-shot settings with an average absolute MRR +improvement of 7.2%. + +
+
+
+
+
+ + ♻ ☆ Plausible Extractive Rationalization through Semi-Supervised Entailment + Signal ACL + + +
+ The increasing use of complex and opaque black box models requires the +adoption of interpretable measures, one such option is extractive rationalizing +models, which serve as a more interpretable alternative. These models, also +known as Explain-Then-Predict models, employ an explainer model to extract +rationales and subsequently condition the predictor with the extracted +information. Their primary objective is to provide precise and faithful +explanations, represented by the extracted rationales. In this paper, we take a +semi-supervised approach to optimize for the plausibility of extracted +rationales. We adopt a pre-trained natural language inference (NLI) model and +further fine-tune it on a small set of supervised rationales ($10\%$). The NLI +predictor is leveraged as a source of supervisory signals to the explainer via +entailment alignment. We show that, by enforcing the alignment agreement +between the explanation and answer in a question-answering task, the +performance can be improved without access to ground truth labels. We evaluate +our approach on the ERASER dataset and show that our approach achieves +comparable results with supervised extractive models and outperforms +unsupervised approaches by $> 100\%$. + +
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ Self-training Large Language Models through Knowledge Detection EMNLP + + +
+ Large language models (LLMs) often necessitate extensive labeled datasets and +training compute to achieve impressive performance across downstream tasks. +This paper explores a self-training paradigm, where the LLM autonomously +curates its own labels and selectively trains on unknown data samples +identified through a reference-free consistency method. Empirical evaluations +demonstrate significant improvements in reducing hallucination in generation +across multiple subjects. Furthermore, the selective training framework +mitigates catastrophic forgetting in out-of-distribution benchmarks, addressing +a critical limitation in training LLMs. Our findings suggest that such an +approach can substantially reduce the dependency on large labeled datasets, +paving the way for more scalable and cost-effective language model training. + +
+
+ comment: EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ Enhancing Language Model Factuality via Activation-Based Confidence + Calibration and Guided Decoding EMNLP 2024 + + +
+ Calibrating language models (LMs) aligns their generation confidence with the +actual likelihood of answer correctness, which can inform users about LMs' +reliability and mitigate hallucinated content. However, prior calibration +methods, such as self-consistency-based and logit-based approaches, are either +limited in inference-time efficiency or fall short of providing informative +signals. Moreover, simply filtering out low-confidence responses reduces the +LM's helpfulness when the answers are correct. Therefore, effectively using +calibration techniques to enhance an LM's factuality remains an unsolved +challenge. In this paper, we first propose an activation-based calibration +method, ActCab, which trains a linear layer on top of the LM's last-layer +activations that can better capture the representations of knowledge. Built on +top of ActCab, we further propose CoDec, a confidence-guided decoding strategy +to elicit truthful answers with high confidence from LMs. By evaluating on five +popular QA benchmarks, ActCab achieves superior calibration performance than +all competitive baselines, e.g., by reducing the average expected calibration +error (ECE) score by up to 39%. Further experiments on CoDec show consistent +improvements in several LMs' factuality on challenging QA datasets, such as +TruthfulQA, highlighting the value of confidence signals in enhancing +factuality. + +
+
+ comment: EMNLP 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ Exploiting User Comments for Early Detection of Fake News Prior to + Users' Commenting + + +
+ Both accuracy and timeliness are key factors in detecting fake news on social +media. However, most existing methods encounter an accuracy-timeliness dilemma: +Content-only methods guarantee timeliness but perform moderately because of +limited available information, while social context-based ones generally +perform better but inevitably lead to latency because of social context +accumulation needs. To break such a dilemma, a feasible but not well-studied +solution is to leverage social contexts (e.g., comments) from historical news +for training a detection model and apply it to newly emerging news without +social contexts. This requires the model to (1) sufficiently learn helpful +knowledge from social contexts, and (2) be well compatible with situations that +social contexts are available or not. To achieve this goal, we propose to +absorb and parameterize useful knowledge from comments in historical news and +then inject it into a content-only detection model. Specifically, we design the +Comments ASsisted FakE News Detection method (CAS-FEND), which transfers useful +knowledge from a comment-aware teacher model to a content-only student model +and detects newly emerging news with the student model. Experiments show that +the CAS-FEND student model outperforms all content-only methods and even +comment-aware ones with 1/4 comments as inputs, demonstrating its superiority +for early detection. + +
+
+ comment: 19 pages, 6 figures, 7 tables. The article has been accepted by + Frontiers of Computer Science (FCS), with the DOI: + {10.1007/s11704-024-40674-6} +
+
+
+
+
+ + ♻ ☆ How Do Large Language Models Acquire Factual Knowledge During + Pretraining? NeurIPS 2024 + + +
+ Despite the recent observation that large language models (LLMs) can store +substantial factual knowledge, there is a limited understanding of the +mechanisms of how they acquire factual knowledge through pretraining. This work +addresses this gap by studying how LLMs acquire factual knowledge during +pretraining. The findings reveal several important insights into the dynamics +of factual knowledge acquisition during pretraining. First, counterintuitively, +we observe that pretraining on more data shows no significant improvement in +the model's capability to acquire and maintain factual knowledge. Next, there +is a power-law relationship between training steps and forgetting of +memorization and generalization of factual knowledge, and LLMs trained with +duplicated training data exhibit faster forgetting. Third, training LLMs with +larger batch sizes can enhance the models' robustness to forgetting. Overall, +our observations suggest that factual knowledge acquisition in LLM pretraining +occurs by progressively increasing the probability of factual knowledge +presented in the pretraining data at each step. However, this increase is +diluted by subsequent forgetting. Based on this interpretation, we demonstrate +that we can provide plausible explanations for recently observed behaviors of +LLMs, such as the poor performance of LLMs on long-tail knowledge and the +benefits of deduplicating the pretraining corpus. + +
+
+ comment: Accepted at NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ A Unified Multi-Task Learning Architecture for Hate Detection Leveraging + User-Based Information + + +
+ Hate speech, offensive language, aggression, racism, sexism, and other +abusive language are common phenomena in social media. There is a need for +Artificial Intelligence (AI)-based intervention which can filter hate content at +scale. Most existing hate speech detection solutions have utilized the features +by treating each post as an isolated input instance for the classification. +This paper addresses this issue by introducing a unique model that improves +hate speech identification for the English language by utilising intra-user and +inter-user-based information. The experiment is conducted over single-task +learning (STL) and multi-task learning (MTL) paradigms that use deep neural +networks, such as convolutional neural networks (CNN), gated recurrent unit +(GRU), bidirectional encoder representations from the transformer (BERT), and A +Lite BERT (ALBERT). We use three benchmark datasets and conclude that combining +certain user features with textual features gives significant improvements in +macro-F1 and weighted-F1. + +
+
+ comment: 7 pages, 1 figure, and two tables. Accepted at the 20th International + Conference on Natural Language Processing (ICON) 2023. + https://aclanthology.org/2023.icon-1.53 +
+
+
+
+
+ + ♻ ☆ An Early FIRST Reproduction and Improvements to Single-Token Decoding + for Fast Listwise Reranking + + +
+ Recent advances have demonstrated that large language models (LLMs) excel as +listwise rerankers, but their high computational demands remain a barrier to +widespread adoption. Further, the traditional language modeling (LM) objective +is not ideally suited for reranking tasks. FIRST is a novel approach that +addresses these challenges by integrating a learning-to-rank objective and +leveraging the logits of only the first generated token, thereby significantly +reducing inference latency compared to traditional LLM rerankers. In this +study, we extend the evaluation of FIRST to the TREC Deep Learning datasets +(DL19-22), validating its robustness across diverse domains. We investigate the +influence of different first-stage retrievers on FIRST rerankers, observing +diminishing returns and patterns consistent with traditional LLM rerankers. +Through applying the FIRST objective to a broader range of backbone models, we +achieve effectiveness surpassing the original implementation. Our experiments +confirm that fast reranking with single-token logits does not compromise +out-of-domain reranking quality. To better quantify the computational savings +in the original study, we measure and compare latency to find a 21%-42% gain +across various models and benchmarks. Moreover, while LM training implicitly +improves zero-shot single-token reranking, our experiments also raise questions +about whether LM pre-training may hinder subsequent fine-tuning with the FIRST +objective. These findings pave the way for more efficient and effective +listwise reranking in future applications. + +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ The Dark Patterns of Personalized Persuasion in Large Language Models: + Exposing Persuasive Linguistic Features for Big Five Personality Traits in + LLMs Responses + + +
+ This study explores how the Large Language Models (LLMs) adjust linguistic +features to create personalized persuasive outputs. While research showed that +LLMs personalize outputs, a gap remains in understanding the linguistic +features of their persuasive capabilities. We identified 13 linguistic features +crucial for influencing personalities across different levels of the Big Five +model of personality. We analyzed how prompts with personality trait +information influenced the output of 19 LLMs across five model families. The +findings show that models use more anxiety-related words for neuroticism, +increase achievement-related words for conscientiousness, and employ fewer +cognitive processes words for openness to experience. Some model families excel +at adapting language for openness to experience, others for conscientiousness, +while only one model adapts language for neuroticism. Our findings show how +LLMs tailor responses based on personality cues in prompts, indicating their +potential to create persuasive content affecting the mind and well-being of the +recipients. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ Efficient LLM Comparative Assessment: a Product of Experts Framework for + Pairwise Comparisons + + +
+ LLM-as-a-judge approaches are a practical and effective way of assessing a +range of text tasks. However, when using pairwise comparisons to rank a set of +candidates, the computational cost scales quadratically with the number of +candidates, which has practical limitations. This paper introduces a Product of +Expert (PoE) framework for efficient LLM Comparative Assessment. Here +individual comparisons are considered experts that provide information on a +pair's score difference. The PoE framework combines the information from these +experts to yield an expression that can be maximized with respect to the +underlying set of candidates, and is highly flexible where any form of expert +can be assumed. When Gaussian experts are used one can derive simple +closed-form solutions for the optimal candidate ranking, and expressions for +selecting which comparisons should be made to maximize the probability of this +ranking. Our approach enables efficient comparative assessment, where by using +only a small subset of the possible comparisons, one can generate score +predictions that correlate well with human judgements. We evaluate the approach +on multiple NLG tasks and demonstrate that our framework can yield considerable +computational savings when performing pairwise comparative assessment. With +many candidate texts, using as few as 2% of comparisons the PoE solution can +achieve similar performance to when all comparisons are used. + +
+
+
+
+
+ + ♻ ☆ Qwen2.5-Coder Technical Report + + +
+ In this report, we introduce the Qwen2.5-Coder series, a significant upgrade +from its predecessor, CodeQwen1.5. This series includes six models: +Qwen2.5-Coder-(0.5B/1.5B/3B/7B/14B/32B). As a code-specific model, +Qwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretrained +on a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning, +scalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder +demonstrates impressive code generation capabilities while retaining general +and math skills. These models have been evaluated on a wide range of +code-related tasks, achieving state-of-the-art (SOTA) performance across more +than 10 benchmarks, including code generation, completion, reasoning, and +repair, consistently outperforming larger models of the same model size. We +believe that the release of the Qwen2.5-Coder series will advance research in +code intelligence and, with its permissive licensing, support wider adoption by +developers in real-world applications. + +
+
+
+
+
+ + ♻ ☆ Kwai-STaR: Transform LLMs into State-Transition Reasoners + + +
+ Mathematical reasoning presents a significant challenge to the cognitive +capabilities of LLMs. Various methods have been proposed to enhance the +mathematical ability of LLMs. However, few recognize the value of state +transition for LLM reasoning. In this work, we define mathematical +problem-solving as a process of transiting from an initial unsolved state to +the final resolved state, and propose Kwai-STaR framework, which transforms +LLMs into State-Transition Reasoners to improve their intuitive reasoning +capabilities. Our approach comprises three main steps: (1) Define the state +space tailored to the mathematical reasoning. (2) Generate state-transition +data based on the state space. (3) Convert original LLMs into State-Transition +Reasoners via a curricular training strategy. Our experiments validate the +effectiveness of Kwai-STaR in enhancing mathematical reasoning: After training +on the small-scale Kwai-STaR dataset, general LLMs, including Mistral-7B and +LLaMA-3, achieve considerable performance gain on the GSM8K and GSM-Hard +dataset. Additionally, the state transition-based design endows Kwai-STaR with +remarkable training and inference efficiency. Further experiments are underway +to establish the generality of Kwai-STaR. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ LLMs for Generating and Evaluating Counterfactuals: A Comprehensive + Study EMNLP + + +
+ As NLP models become more complex, understanding their decisions becomes more +crucial. Counterfactuals (CFs), where minimal changes to inputs flip a model's +prediction, offer a way to explain these models. While Large Language Models +(LLMs) have shown remarkable performance in NLP tasks, their efficacy in +generating high-quality CFs remains uncertain. This work fills this gap by +investigating how well LLMs generate CFs for two NLU tasks. We conduct a +comprehensive comparison of several common LLMs, and evaluate their CFs, +assessing both intrinsic metrics, and the impact of these CFs on data +augmentation. Moreover, we analyze differences between human and LLM-generated +CFs, providing insights for future research directions. Our results show that +LLMs generate fluent CFs, but struggle to keep the induced changes minimal. +Generating CFs for Sentiment Analysis (SA) is less challenging than NLI where +LLMs show weaknesses in generating CFs that flip the original label. This also +reflects on the data augmentation performance, where we observe a large gap +between augmenting with human and LLMs CFs. Furthermore, we evaluate LLMs' +ability to assess CFs in a mislabelled data setting, and show that they have a +strong bias towards agreeing with the provided labels. GPT4 is more robust +against this bias and its scores correlate well with automatic metrics. Our +findings reveal several limitations and point to potential future work +directions. + +
+
+ comment: Accepted to EMNLP Findings 2024 +
+
+
+
+
+ + ♻ ☆ RLHF Workflow: From Reward Modeling to Online RLHF + + +
+ We present the workflow of Online Iterative Reinforcement Learning from Human +Feedback (RLHF) in this technical report, which is widely reported to +outperform its offline counterpart by a large margin in the recent large +language model (LLM) literature. However, existing open-source RLHF projects +are still largely confined to the offline learning setting. In this technical +report, we aim to fill in this gap and provide a detailed recipe that is easy +to reproduce for online iterative RLHF. In particular, since online human +feedback is usually infeasible for open-source communities with limited +resources, we start by constructing preference models using a diverse set of +open-source datasets and use the constructed proxy preference model to +approximate human feedback. Then, we discuss the theoretical insights and +algorithmic principles behind online iterative RLHF, followed by a detailed +practical implementation. Our trained LLM achieves impressive performance on +LLM chatbot benchmarks, including AlpacaEval-2, Arena-Hard, and MT-Bench, as +well as other academic benchmarks such as HumanEval and TruthfulQA. We have +shown that supervised fine-tuning (SFT) and iterative RLHF can obtain +state-of-the-art performance with fully open-source datasets. Further, we have +made our models, curated datasets, and comprehensive step-by-step code +guidebooks publicly available. Please refer to +https://github.com/RLHFlow/RLHF-Reward-Modeling and +https://github.com/RLHFlow/Online-RLHF for more detailed information. + +
+
+ comment: Published in Transactions on Machine Learning Research (09/2024) +
+
+
+
+
+ + ♻ ☆ LeKUBE: A Legal Knowledge Update BEnchmark + + +
+ Recent advances in Large Language Models (LLMs) have significantly shaped the +applications of AI in multiple fields, including the studies of legal +intelligence. Trained on extensive legal texts, including statutes and legal +documents, the legal LLMs can capture important legal knowledge/concepts +effectively and provide important support for downstream legal applications +such as legal consultancy. Yet, the dynamic nature of legal statutes and +interpretations also poses new challenges to the use of LLMs in legal +applications. Particularly, how to update the legal knowledge of LLMs +effectively and efficiently has become an important research problem in +practice. Existing benchmarks for evaluating knowledge update methods are +mostly designed for the open domain and cannot address the specific challenges +of the legal domain, such as the nuanced application of new legal knowledge, +the complexity and lengthiness of legal regulations, and the intricate nature +of legal reasoning. To address this gap, we introduce the Legal Knowledge +Update BEnchmark, i.e. LeKUBE, which evaluates knowledge update methods for +legal LLMs across five dimensions. Specifically, we categorize the needs of +knowledge updates in the legal domain with the help of legal professionals, and +then hire annotators from law schools to create synthetic updates to the +Chinese Criminal and Civil Code as well as sets of questions of which the +answers would change after the updates. Through a comprehensive evaluation of +state-of-the-art knowledge update methods, we reveal a notable gap between +existing knowledge update methods and the unique needs of the legal domain, +emphasizing the need for further research and development of knowledge update +mechanisms tailored for legal LLMs. + +
+
+
+
+
+ + ♻ ☆ Exploring Advanced Large Language Models with LLMsuite + + +
+ This tutorial explores the advancements and challenges in the development of +Large Language Models (LLMs) such as ChatGPT and Gemini. It addresses inherent +limitations like temporal knowledge cutoffs, mathematical inaccuracies, and the +generation of incorrect information, proposing solutions like Retrieval +Augmented Generation (RAG), Program-Aided Language Models (PAL), and frameworks +such as ReAct and LangChain. The integration of these techniques enhances LLM +performance and reliability, especially in multi-step reasoning and complex +task execution. The paper also covers fine-tuning strategies, including +instruction fine-tuning, parameter-efficient methods like LoRA, and +Reinforcement Learning from Human Feedback (RLHF) as well as Reinforced +Self-Training (ReST). Additionally, it provides a comprehensive survey of +transformer architectures and training techniques for LLMs. The source code can +be accessed by contacting the author via email for a request. + +
+
+ comment: Keywords: Language Model Benchmarking, Pre-Trained LLM Comparison, + LLM Performance Analysis, NLP Model Evaluation Tools, Public Dataset + Inference for LLMs, BLEU and ROUGE Metrics for LLM, Open Source LLM Testing + Tools, Large Language Model Evaluation Software, NLP Benchmarking Suite, + Comprehensive LLM Evaluation Toolkit +
+
+
+
+
+ + ♻ ☆ OmAgent: A Multi-modal Agent Framework for Complex Video Understanding + with Task Divide-and-Conquer + + +
+ Recent advancements in Large Language Models (LLMs) have expanded their +capabilities to multimodal contexts, including comprehensive video +understanding. However, processing extensive videos such as 24-hour CCTV +footage or full-length films presents significant challenges due to the vast +data and processing demands. Traditional methods, like extracting key frames or +converting frames to text, often result in substantial information loss. To +address these shortcomings, we develop OmAgent, which efficiently stores and +retrieves relevant video frames for specific queries, preserving the detailed +content of videos. Additionally, it features a Divide-and-Conquer Loop capable +of autonomous reasoning, dynamically invoking APIs and tools to enhance query +processing and accuracy. This approach ensures robust video understanding, +significantly reducing information loss. Experimental results affirm OmAgent's +efficacy in handling various types of videos and complex tasks. Moreover, we +have endowed it with greater autonomy and a robust tool-calling system, +enabling it to accomplish even more intricate tasks. + +
+
+
+
+
+ + ♻ ☆ SciDFM: A Large Language Model with Mixture-of-Experts for Science NeurIPS + 2024 + + +
+ Recently, there has been a significant upsurge of interest in leveraging +large language models (LLMs) to assist scientific discovery. However, most LLMs +only focus on general science, while they lack domain-specific knowledge, such +as chemical molecules and amino acid sequences. To bridge these gaps, we +introduce SciDFM, a mixture-of-experts LLM, which is trained from scratch and +is able to conduct college-level scientific reasoning and understand molecules +and amino acid sequences. We collect a large-scale training corpus containing +numerous scientific papers and books from different disciplines as well as data +from domain-specific databases. We further fine-tune the pre-trained model on +lots of instruction data to improve performances on downstream benchmarks. From +experiment results, we show that SciDFM achieves strong performance on general +scientific benchmarks such as SciEval and SciQ, and it reaches a SOTA +performance on domain-specific benchmarks among models of similar size. We +further analyze the expert layers and show that the results of expert selection +vary with data from different disciplines. To benefit the broader research +community, we open-source SciDFM at +https://huggingface.co/OpenDFM/SciDFM-MoE-A5.6B-v1.0. + +
+
+ comment: 12 pages, 1 figure, 9 tables. Technical Report, accepted by NeurIPS + 2024 Workshop FM4Science +
+
+
+
+
+ + ♻ ☆ How Does the Textual Information Affect the Retrieval of Multimodal + In-Context Learning? EMNLP 2024 + + +
+ The increase in parameter size of multimodal large language models (MLLMs) +introduces significant capabilities, particularly in-context learning, where +MLLMs enhance task performance without updating pre-trained parameters. This +effectiveness, however, hinges on the appropriate selection of in-context +examples, a process that is currently biased towards visual data, overlooking +textual information. Furthermore, the area of supervised retrievers for MLLMs, +crucial for optimal in-context example selection, continues to be +uninvestigated. Our study offers an in-depth evaluation of the impact of +textual information on the unsupervised selection of in-context examples in +multimodal contexts, uncovering a notable sensitivity of retriever performance +to the employed modalities. Responding to this, we introduce a novel supervised +MLLM-retriever MSIER that employs a neural network to select examples that +enhance multimodal in-context learning efficiency. This approach is validated +through extensive testing across three distinct tasks, demonstrating the +method's effectiveness. Additionally, we investigate the influence of +modalities on our supervised retrieval method's training and pinpoint factors +contributing to our model's success. This exploration paves the way for future +advancements, highlighting the potential for refined in-context learning in +MLLMs through the strategic use of multimodal data. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Deception Detection from Linguistic and Physiological Data Streams Using + Bimodal Convolutional Neural Networks + + +
+ Deception detection is gaining increasing interest due to ethical and +security concerns. This paper explores the application of convolutional neural +networks for the purpose of multimodal deception detection. We use a dataset +built by interviewing 104 subjects about two topics, with one truthful and one +falsified response from each subject about each topic. In particular, we make +three main contributions. First, we extract linguistic and physiological +features from this data to train and construct the neural network models. +Second, we propose a fused convolutional neural network model using both +modalities in order to achieve an improved overall performance. Third, we +compare our new approach with earlier methods designed for multimodal deception +detection. We find that our system outperforms regular classification methods; +our results indicate the feasibility of using neural networks for deception +detection even in the presence of limited amounts of data. + +
+
+ comment: Accepted by 2024 5th International Conference on Information Science, + Parallel and Distributed Systems +
+
+
+
+
+ + ♻ ☆ SKVQ: Sliding-window Key and Value Cache Quantization for Large Language + Models + + +
+ Large language models (LLMs) can now handle longer sequences of tokens, +enabling complex tasks like book understanding and generating lengthy novels. +However, the key-value (KV) cache required for LLMs consumes substantial memory +as context length increases, becoming the bottleneck for deployment. In this +paper, we present a strategy called SKVQ, which stands for sliding-window KV +cache quantization, to address the issue of extremely low bitwidth KV cache +quantization. To achieve this, SKVQ rearranges the channels of the KV cache in +order to improve the similarity of channels in quantization groups, and applies +clipped dynamic quantization at the group level. Additionally, SKVQ ensures +that the most recent window tokens in the KV cache are preserved with high +precision. This helps maintain the accuracy of a small but important portion of +the KV cache. SKVQ achieves high compression ratios while maintaining accuracy. +Our evaluation on LLMs demonstrates that SKVQ surpasses previous quantization +approaches, allowing for quantization of the KV cache to 2-bit keys and 1.5-bit +values with minimal loss of accuracy. With SKVQ, it is possible to process +context lengths of up to 1M on an 80GB memory GPU for a 7b model and achieve up to 7 +times faster decoding. + +
+
+
+
+
+ + ♻ ☆ Harnessing Earnings Reports for Stock Predictions: A QLoRA-Enhanced LLM + Approach + + +
+ Accurate stock market predictions following earnings reports are crucial for +investors. Traditional methods, particularly classical machine learning models, +struggle with these predictions because they cannot effectively process and +interpret extensive textual data contained in earnings reports and often +overlook nuances that influence market movements. This paper introduces an +advanced approach by employing Large Language Models (LLMs) instruction +fine-tuned with a novel combination of instruction-based techniques and +quantized low-rank adaptation (QLoRA) compression. Our methodology integrates +'base factors', such as financial metric growth and earnings transcripts, with +'external factors', including recent market indices performances and analyst +grades, to create a rich, supervised dataset. This comprehensive dataset +enables our models to achieve superior predictive performance in terms of +accuracy, weighted F1, and Matthews correlation coefficient (MCC), especially +evident in the comparison with benchmarks such as GPT-4. We specifically +highlight the efficacy of the llama-3-8b-Instruct-4bit model, which showcases +significant improvements over baseline models. The paper also discusses the +potential of expanding the output capabilities to include a 'Hold' option and +extending the prediction horizon, aiming to accommodate various investment +styles and time frames. This study not only demonstrates the power of +integrating cutting-edge AI with fine-tuned financial data but also paves the +way for future research in enhancing AI-driven financial analysis tools. + +
+
+ comment: Accepted by 2024 6th International Conference on Data-driven + Optimization of Complex Systems +
+
+
+
+
+ + ♻ ☆ MASIVE: Open-Ended Affective State Identification in English and Spanish EMNLP 2024 + + +
+ In the field of emotion analysis, much NLP research focuses on identifying a +limited number of discrete emotion categories, often applied across languages. +These basic sets, however, are rarely designed with textual data in mind, and +culture, language, and dialect can influence how particular emotions are +interpreted. In this work, we broaden our scope to a practically unbounded set +of \textit{affective states}, which includes any terms that humans use to +describe their experiences of feeling. We collect and publish MASIVE, a dataset +of Reddit posts in English and Spanish containing over 1,000 unique affective +states each. We then define the new problem of \textit{affective state +identification} for language generation models framed as a masked span +prediction task. On this task, we find that smaller finetuned multilingual +models outperform much larger LLMs, even on region-specific Spanish affective +states. Additionally, we show that pretraining on MASIVE improves model +performance on existing emotion benchmarks. Finally, through machine +translation experiments, we find that native speaker-written data is vital to +good performance on this task. + +
+
+ comment: EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ Exploring Diverse Methods in Visual Question Answering + + +
+ This study explores innovative methods for improving Visual Question +Answering (VQA) using Generative Adversarial Networks (GANs), autoencoders, and +attention mechanisms. Leveraging a balanced VQA dataset, we investigate three +distinct strategies. Firstly, GAN-based approaches aim to generate answer +embeddings conditioned on image and question inputs, showing potential but +struggling with more complex tasks. Secondly, autoencoder-based techniques +focus on learning optimal embeddings for questions and images, achieving +comparable results with GAN due to better ability on complex questions. Lastly, +attention mechanisms, incorporating Multimodal Compact Bilinear pooling (MCB), +address language priors and attention modeling, albeit with a +complexity-performance trade-off. This study underscores the challenges and +opportunities in VQA and suggests avenues for future research, including +alternative GAN formulations and attentional mechanisms. + +
+
+ comment: Accepted by 2024 5th International Conference on Electronic + Communication and Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ LAMP: A Language Model on the Map + + +
+ Large Language Models (LLMs) are poised to play an increasingly important +role in our lives, providing assistance across a wide array of tasks. In the +geospatial domain, LLMs have demonstrated the ability to answer generic +questions, such as identifying a country's capital; nonetheless, their utility +is hindered when it comes to answering fine-grained questions about specific +places, such as grocery stores or restaurants, which constitute essential +aspects of people's everyday lives. This is mainly because the places in our +cities haven't been systematically fed into LLMs, so as to understand and +memorize them. This study introduces a novel framework for fine-tuning a +pre-trained model on city-specific data, to enable it to provide accurate +recommendations, while minimizing hallucinations. We share our model, LAMP, and +the data used to train it. We conduct experiments to analyze its ability to +correctly retrieve spatial objects, and compare it to well-known open- and +closed-source language models, such as GPT-4. Finally, we explore its emerging +capabilities through a case study on day planning. + +
+
+
+
+
+ + ♻ ☆ Game-theoretic LLM: Agent Workflow for Negotiation Games + + +
+ This paper investigates the rationality of large language models (LLMs) in +strategic decision-making contexts, specifically within the framework of game +theory. We evaluate several state-of-the-art LLMs across a spectrum of +complete-information and incomplete-information games. Our findings reveal that +LLMs frequently deviate from rational strategies, particularly as the +complexity of the game increases with larger payoff matrices or deeper +sequential trees. + To address these limitations, we design multiple game-theoretic workflows +that guide the reasoning and decision-making processes of LLMs. These workflows +aim to enhance the models' ability to compute Nash Equilibria and make rational +choices, even under conditions of uncertainty and incomplete information. +Experimental results demonstrate that the adoption of these workflows +significantly improves the rationality and robustness of LLMs in game-theoretic +tasks. Specifically, with the workflow, LLMs exhibit marked improvements in +identifying optimal strategies, achieving near-optimal allocations in +negotiation scenarios, and reducing susceptibility to exploitation during +negotiations. Furthermore, we explore the meta-strategic considerations of +whether it is rational for agents to adopt such workflows, recognizing that the +decision to use or forgo the workflow constitutes a game-theoretic issue in +itself. + Our research contributes to a deeper understanding of LLMs' decision-making +capabilities in strategic contexts and provides insights into enhancing their +rationality through structured workflows. The findings have implications for +the development of more robust and strategically sound AI agents capable of +navigating complex interactive environments. Code and data supporting this +study are available at \url{https://github.com/Wenyueh/game_theory}. + +
+
+ comment: 45 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Dynamic Adaptive Optimization for Effective Sentiment Analysis + Fine-Tuning on Large Language Models + + +
+ Sentiment analysis plays a crucial role in various domains, such as business +intelligence and financial forecasting. Large language models (LLMs) have +become a popular paradigm for sentiment analysis, leveraging multi-task +learning to address specific tasks concurrently. However, LLMs with fine-tuning +for sentiment analysis often underperform due to the inherent challenges in +managing diverse task complexities. Moreover, constant-weight approaches in +multi-task learning struggle to adapt to variations in data characteristics, +further complicating model effectiveness. To address these issues, we propose a +novel multi-task learning framework with a dynamic adaptive optimization (DAO) +module. This module is designed as a plug-and-play component that can be +seamlessly integrated into existing models, providing an effective and flexible +solution for multi-task learning. The key component of the DAO module is +dynamic adaptive loss, which dynamically adjusts the weights assigned to +different tasks based on their relative importance and data characteristics +during training. Sentiment analyses on a standard and customized financial text +dataset demonstrate that the proposed framework achieves superior performance. +Specifically, this work improves the Mean Squared Error (MSE) and Accuracy +(ACC) by 15.58% and 1.24% respectively, compared with previous work. + +
+
+
+
+
+ + ♻ ☆ Reminding Multimodal Large Language Models of Object-aware Knowledge + with Retrieved Tags EMNLP 2024 + + +
+ Despite recent advances in the general visual instruction-following ability +of Multimodal Large Language Models (MLLMs), they still struggle with critical +problems when required to provide a precise and detailed response to a visual +instruction: (1) failure to identify novel objects or entities, (2) mention of +non-existent objects, and (3) neglect of object's attributed details. Intuitive +solutions include improving the size and quality of data or using larger +foundation models. They show effectiveness in mitigating these issues, but at +an expensive cost of collecting a vast amount of new data and introducing a +significantly larger model. Standing at the intersection of these approaches, +we examine the three object-oriented problems from the perspective of the +image-to-text mapping process by the multimodal connector. In this paper, we +first identify the limitations of multimodal connectors stemming from +insufficient training data. Driven by this, we propose to enhance the mapping +with retrieval-augmented tag tokens, which contain rich object-aware +information such as object names and attributes. With our Tag-grounded visual +instruction tuning with retrieval Augmentation (TUNA), we outperform baselines +that share the same language model and training data on 12 benchmarks. +Furthermore, we show the zero-shot capability of TUNA when provided with +specific datastores. + +
+
+ comment: Main Conference at EMNLP 2024 +
+
+
+
+
+ + ♻ ☆ SLANG: New Concept Comprehension of Large Language Models EMNLP 2024 + + +
+ The dynamic nature of language, particularly evident in the realm of slang +and memes on the Internet, poses serious challenges to the adaptability of +large language models (LLMs). Traditionally anchored to static datasets, these +models often struggle to keep up with the rapid linguistic evolution +characteristic of online communities. This research aims to bridge this gap by +enhancing LLMs' comprehension of the evolving new concepts on the Internet, +without the high cost of continual retraining. In pursuit of this goal, we +introduce $\textbf{SLANG}$, a benchmark designed to autonomously integrate +novel data and assess LLMs' ability to comprehend emerging concepts, alongside +$\textbf{FOCUS}$, an approach that uses causal inference to help LLMs +understand new phrases and their colloquial context. Our benchmark and approach +involve understanding real-world instances of linguistic shifts, serving as +contextual beacons, to form more precise and contextually relevant connections +between newly emerging expressions and their meanings. The empirical analysis +shows that our causal inference-based approach outperforms the baseline methods +in terms of precision and relevance in the comprehension of Internet slang and +memes. + +
+
+ comment: EMNLP 2024 Main +
+
+
+
+
+ + ♻ ☆ MMLongBench-Doc: Benchmarking Long-context Document Understanding with + Visualizations NeurIPS 2024 + + +
+ Understanding documents with rich layouts and multi-modal components is a +long-standing and practical task. Recent Large Vision-Language Models (LVLMs) +have made remarkable strides in various tasks, particularly in single-page +document understanding (DU). However, their abilities on long-context DU remain +an open problem. This work presents MMLongBench-Doc, a long-context, +multi-modal benchmark comprising 1,062 expert-annotated questions. Distinct +from previous datasets, it is constructed upon 130 lengthy PDF-formatted +documents with an average of 49.4 pages and 20,971 textual tokens. Towards +comprehensive evaluation, answers to these questions rely on pieces of evidence +from (1) different sources (text, image, chart, table, and layout structure) +and (2) various locations (i.e. page number). Moreover, 33.2% of the questions +are cross-page questions requiring evidence across multiple pages. 22.8% of the +questions are designed to be unanswerable for detecting potential +hallucinations. Experiments on 14 LVLMs demonstrate that long-context DU +greatly challenges current models. Notably, the best-performing model, GPT-4o, +achieves an F1 score of only 42.7%, while the second-best, GPT-4V, scores +31.4%. Furthermore, 12 LVLMs (all except GPT-4o and GPT-4V) even present worse +performance than their LLM counterparts which are fed with lossy-parsed OCR +documents. These results validate the necessity of future research toward more +capable long-context LVLMs. Project Page: +https://mayubo2333.github.io/MMLongBench-Doc + +
+
+ comment: Accepted to NeurIPS 2024 Datasets and Benchmarks Track (Spotlight) +
+
+
+
+
+ + ♻ ☆ Self-Data Distillation for Recovering Quality in Pruned Large Language + Models + + +
+ Large language models have driven significant progress in natural language +processing, but their deployment requires substantial compute and memory +resources. As models scale, compression techniques become essential for +balancing model quality with computational efficiency. Structured pruning, +which removes less critical components of the model, is a promising strategy +for reducing complexity. However, one-shot pruning often results in significant +quality degradation, particularly in tasks requiring multi-step reasoning. To +recover lost quality, supervised fine-tuning (SFT) is commonly applied, but it +can lead to catastrophic forgetting by shifting the model's learned data +distribution. Therefore, addressing the degradation from both pruning and SFT +is essential to preserve the original model's quality. In this work, we utilize +self-data distilled fine-tuning to address these challenges. Our approach +leverages the original, unpruned model to generate a distilled dataset that +preserves semantic richness and mitigates catastrophic forgetting by +maintaining alignment with the base model's knowledge. Empirically, we +demonstrate that self-data distillation consistently outperforms standard SFT, +improving average accuracy by up to 8% on the HuggingFace OpenLLM Leaderboard +v1. Specifically, when pruning six decoder blocks on Llama3.1-8B Instruct +(i.e., 32 to 26 layers, reducing the model size from 8.03B to 6.72B +parameters), our method retains 91.2% of the original model's accuracy compared +to 81.7% with SFT, while reducing real-world FLOPs by 16.3%. Furthermore, +combining self-data distilled models through model merging yields enhanced +quality retention. Additionally, leveraging these pruned models in speculative +decoding increases token acceptance rates, thereby improving inference +efficiency in applied settings. + +
+
+ comment: 13 pages, 4 figures, 6 Tables (Main Paper) + 5 pages (Supplementary + Material) +
+
+
+
+
+ + ♻ ☆ On Active Privacy Auditing in Supervised Fine-tuning for White-Box + Language Models + + +
+ The pretraining and fine-tuning approach has become the leading technique for +various NLP applications. However, recent studies reveal that fine-tuning data, +due to their sensitive nature, domain-specific characteristics, and +identifiability, pose significant privacy concerns. To help develop more +privacy-resilient fine-tuning models, we introduce a novel active privacy +auditing framework, dubbed Parsing, designed to identify and quantify privacy +leakage risks during the supervised fine-tuning (SFT) of language models (LMs). +The framework leverages improved white-box membership inference attacks (MIAs) +as the core technology, utilizing novel learning objectives and a two-stage +pipeline to monitor the privacy of the LMs' fine-tuning process, maximizing the +exposure of privacy risks. Additionally, we have improved the effectiveness of +MIAs on large LMs including GPT-2, Llama2, and certain variants of them. Our +research aims to provide the SFT community of LMs with a reliable, ready-to-use +privacy auditing tool, and to offer valuable insights into safeguarding privacy +during the fine-tuning process. Experimental results confirm the framework's +efficiency across various models and tasks, emphasizing notable privacy +concerns in the fine-tuning process. Project code is available at +https://anonymous.4open.science/r/PARSING-4817/. + +
+
+
+
+
+ + ♻ ☆ Stronger Models are NOT Stronger Teachers for Instruction Tuning + + +
+ Instruction tuning has been widely adopted to ensure large language models +(LLMs) follow user instructions effectively. The resulting +instruction-following capabilities of LLMs heavily rely on the instruction +datasets used for tuning. Recently, synthetic instruction datasets have emerged +as an economically viable solution to provide LLMs diverse and high-quality +instructions. However, existing approaches typically assume that larger or +stronger models are stronger teachers for instruction tuning, and hence simply +adopt these models as response generators to the synthetic instructions. In +this paper, we challenge this commonly-adopted assumption. Our extensive +experiments across five base models and twenty response generators reveal that +larger and stronger models are not necessarily stronger teachers of smaller +models. We refer to this phenomenon as the Larger Models' Paradox. We observe +that existing metrics cannot precisely predict the effectiveness of response +generators since they ignore the compatibility between teachers and base models +being fine-tuned. We thus develop a novel metric, named as +Compatibility-Adjusted Reward (CAR) to measure the effectiveness of response +generators. Our experiments across five base models demonstrate that CAR +outperforms almost all baselines. + +
+
+
+
+
+ + ♻ ☆ Freeze-Omni: A Smart and Low Latency Speech-to-speech Dialogue Model + with Frozen LLM + + +
+ Rapidly developing large language models (LLMs) have brought tremendous +intelligent applications. GPT-4o's excellent duplex speech interaction ability +has recently brought impressive experience to users. Researchers have recently +proposed several multi-modal LLMs in this direction that can achieve +speech-to-speech dialogue. This paper proposes a novel speech-text multimodal +LLM architecture called Freeze-Omni. Our main contribution is that the speech +input and output modalities can be easily connected to a textual LLM while +keeping the LLM's parameters frozen throughout the training process. We +designed 3-stage training strategies both for the modeling of speech input and +output, enabling Freeze-Omni to obtain speech-to-speech dialogue ability using +text-speech paired data (such as ASR and TTS data) and only 60,000 multi-round +text Q&A data on 8 GPUs. Moreover, we can effectively ensure that the +intelligence of the Freeze-Omni in the speech modality is at the same level +compared with that in the text modality of its backbone LLM, while the +end-to-end latency of the spoken response achieves a low level. In addition, we +also designed a method to achieve duplex dialogue ability through multi-task +training, making Freeze-Omni have a more natural style of dialogue ability +between the users. Freeze-Omni mainly provides a possibility for researchers to +conduct multimodal LLM under the condition of a frozen LLM, avoiding various +impacts caused by the catastrophic forgetting of LLM caused by fewer data and +training resources. + +
+
+ comment: Project Page: https://freeze-omni.github.io/ +
+
+
+
+
+ + ♻ ☆ Entity-Aware Self-Attention and Contextualized GCN for Enhanced Relation + Extraction in Long Sentences + + +
+ Relation extraction as an important natural language processing (NLP) task is +to identify relations between named entities in text. Recently, graph +convolutional networks over dependency trees have been widely used to capture +syntactic features and achieved attractive performance. However, most existing +dependency-based approaches ignore the positive influence of the words outside +the dependency trees, sometimes conveying rich and useful information on +relation extraction. In this paper, we propose a novel model, Entity-aware +Self-attention Contextualized GCN (ESC-GCN), which efficiently incorporates +syntactic structure of input sentences and semantic context of sequences. To be +specific, relative position self-attention obtains the overall semantic +pairwise correlation related to word position, and contextualized graph +convolutional networks capture rich intra-sentence dependencies between words +by adequately pruning operations. Furthermore, entity-aware attention layer +dynamically selects which token is more decisive to make final relation +prediction. In this way, our proposed model not only reduces the noisy impact +from dependency trees, but also obtains easily-ignored entity-related semantic +representation. Extensive experiments on various tasks demonstrate that our +model achieves encouraging performance as compared to existing dependency-based +and sequence-based models. In particular, our model excels in extracting relations +between entities of long sentences. + +
+
+
+
+
+ + ♻ ☆ Entity-Aware Biaffine Attention Model for Improved Constituent Parsing + with Reduced Entity Violations + + +
+ Constituency parsing involves analyzing a sentence by breaking it into +sub-phrases, or constituents. While many deep neural models have achieved +state-of-the-art performance in this task, they often overlook the +entity-violating issue, where an entity fails to form a complete sub-tree in +the resultant parsing tree. To address this, we propose an entity-aware +biaffine attention model for constituent parsing. This model incorporates +entity information into the biaffine attention mechanism by using additional +entity role vectors for potential phrases, which enhances the parsing accuracy. +We introduce a new metric, the Entity Violating Rate (EVR), to quantify the +extent of entity violations in parsing results. Experiments on three popular +datasets-ONTONOTES, PTB, and CTB-demonstrate that our model achieves the lowest +EVR while maintaining high precision, recall, and F1-scores comparable to +existing models. Further evaluation in downstream tasks, such as sentence +sentiment analysis, highlights the effectiveness of our model and the validity +of the proposed EVR metric. + +
+
+
+
+
+ + ♻ ☆ Explaining Large Language Models Decisions Using Shapley Values + + +
+ The emergence of large language models (LLMs) has opened up exciting +possibilities for simulating human behavior and cognitive processes, with +potential applications in various domains, including marketing research and +consumer behavior analysis. However, the validity of utilizing LLMs as +stand-ins for human subjects remains uncertain due to glaring divergences that +suggest fundamentally different underlying processes at play and the +sensitivity of LLM responses to prompt variations. This paper presents a novel +approach based on Shapley values from cooperative game theory to interpret LLM +behavior and quantify the relative contribution of each prompt component to the +model's output. Through two applications - a discrete choice experiment and an +investigation of cognitive biases - we demonstrate how the Shapley value method +can uncover what we term "token noise" effects, a phenomenon where LLM +decisions are disproportionately influenced by tokens providing minimal +informative content. This phenomenon raises concerns about the robustness and +generalizability of insights obtained from LLMs in the context of human +behavior simulation. Our model-agnostic approach extends its utility to +proprietary LLMs, providing a valuable tool for practitioners and researchers +to strategically optimize prompts and mitigate apparent cognitive biases. Our +findings underscore the need for a more nuanced understanding of the factors +driving LLM responses before relying on them as substitutes for human subjects +in survey settings. We emphasize the importance of researchers reporting +results conditioned on specific prompt templates and exercising caution when +drawing parallels between human behavior and LLMs. + +
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4pi-1.23/) +
+
+
+
+
+ + ♻ ☆ CogErgLLM: Exploring Large Language Model Systems Design Perspective + Using Cognitive Ergonomics ICML'24 + + +
+ Integrating cognitive ergonomics with LLMs is crucial for improving safety, +reliability, and user satisfaction in human-AI interactions. Current LLM +designs often lack this integration, resulting in systems that may not fully +align with human cognitive capabilities and limitations. This oversight +exacerbates biases in LLM outputs and leads to suboptimal user experiences due +to inconsistent application of user-centered design principles. Researchers are +increasingly leveraging NLP, particularly LLMs, to model and understand human +behavior across social sciences, psychology, psychiatry, health, and +neuroscience. Our position paper explores the need to integrate cognitive +ergonomics into LLM design, providing a comprehensive framework and practical +guidelines for ethical development. By addressing these challenges, we aim to +advance safer, more reliable, and ethically sound human-AI interactions. + +
+
+ comment: 10 Page, 3 Figures. Accepted in: (i) ICML'24: LLMs & Cognition + Workshop (Non-archival; OpenReview: + https://openreview.net/forum?id=63C9YSc77p) (ii) EMNLP'24 : NLP for Science + Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4science-1.22/) +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 52 + +
+
+
+ + ☆ DPU: Dynamic Prototype Updating for Multimodal Out-of-Distribution + Detection + + +
+ Out-of-distribution (OOD) detection is essential for ensuring the robustness +of machine learning models by identifying samples that deviate from the +training distribution. While traditional OOD detection has primarily focused on +single-modality inputs, such as images, recent advances in multimodal models +have demonstrated the potential of leveraging multiple modalities (e.g., video, +optical flow, audio) to enhance detection performance. However, existing +methods often overlook intra-class variability within in-distribution (ID) +data, assuming that samples of the same class are perfectly cohesive and +consistent. This assumption can lead to performance degradation, especially +when prediction discrepancies are uniformly amplified across all samples. To +address this issue, we propose Dynamic Prototype Updating (DPU), a novel +plug-and-play framework for multimodal OOD detection that accounts for +intra-class variations. Our method dynamically updates class center +representations for each class by measuring the variance of similar samples +within each batch, enabling adaptive adjustments. This approach allows us to +amplify prediction discrepancies based on the updated class centers, thereby +improving the model's robustness and generalization across different +modalities. Extensive experiments on two tasks, five datasets, and nine base +OOD algorithms demonstrate that DPU significantly improves OOD detection +performance, setting a new state-of-the-art in multimodal OOD detection, with +improvements of up to 80 percent in Far-OOD detection. To facilitate +accessibility and reproducibility, our code is publicly available on GitHub. + +
+
+
+
+
+ + ☆ GTA: Global Tracklet Association for Multi-Object Tracking in Sports ACCV 2024 + + +
+ Multi-object tracking in sports scenarios has become one of the focal points +in computer vision, experiencing significant advancements through the +integration of deep learning techniques. Despite these breakthroughs, +challenges remain, such as accurately re-identifying players upon re-entry into +the scene and minimizing ID switches. In this paper, we propose an +appearance-based global tracklet association algorithm designed to enhance +tracking performance by splitting tracklets containing multiple identities and +connecting tracklets seemingly from the same identity. This method can serve as +a plug-and-play refinement tool for any multi-object tracker to further boost +their performance. The proposed method achieved a new state-of-the-art +performance on the SportsMOT dataset with HOTA score of 81.04%. Similarly, on +the SoccerNet dataset, our method enhanced multiple trackers' performance, +consistently increasing the HOTA score from 79.41% to 83.11%. These significant +and consistent improvements across different trackers and datasets underscore +our proposed method's potential impact on the application of sports player +tracking. We open-source our project codebase at +https://github.com/sjc042/gta-link.git. + +
+
+ comment: Accepted by ACCV 2024 MLCSA Workshop +
+
+
+
+
+ + ☆ Latent Space Disentanglement in Diffusion Transformers Enables Precise + Zero-shot Semantic Editing + + +
+ Diffusion Transformers (DiTs) have recently achieved remarkable success in +text-guided image generation. In image editing, DiTs project text and image +inputs to a joint latent space, from which they decode and synthesize new +images. However, it remains largely unexplored how multimodal information +collectively forms this joint space and how they guide the semantics of the +synthesized images. In this paper, we investigate the latent space of DiT +models and uncover two key properties: First, DiT's latent space is inherently +semantically disentangled, where different semantic attributes can be +controlled by specific editing directions. Second, consistent semantic editing +requires utilizing the entire joint latent space, as neither encoded image nor +text alone contains enough semantic information. We show that these editing +directions can be obtained directly from text prompts, enabling precise +semantic control without additional training or mask annotations. Based on +these insights, we propose a simple yet effective Encode-Identify-Manipulate +(EIM) framework for zero-shot fine-grained image editing. Specifically, we +first encode both the given source image and the text prompt that describes the +image, to obtain the joint latent embedding. Then, using our proposed Hessian +Score Distillation Sampling (HSDS) method, we identify editing directions that +control specific target attributes while preserving other image features. These +directions are guided by text prompts and used to manipulate the latent +embeddings. Moreover, we propose a new metric to quantify the disentanglement +degree of the latent space of diffusion models. Extensive experiment results on +our new curated benchmark dataset and analysis demonstrate DiT's +disentanglement properties and effectiveness of the EIM framework. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2408.13335 +
+
+
+
+
+ + ☆ An Explainable Machine Learning Approach for Age and Gender Estimation + in Living Individuals Using Dental Biometrics + + +
+ Objectives: Age and gender estimation is crucial for various applications, +including forensic investigations and anthropological studies. This research +aims to develop a predictive system for age and gender estimation in living +individuals, leveraging dental measurements such as Coronal Height (CH), +Coronal Pulp Cavity Height (CPCH), and Tooth Coronal Index (TCI). Methods: +Machine learning models were employed in our study, including Cat Boost +Classifier (Catboost), Gradient Boosting Machine (GBM), Ada Boost Classifier +(AdaBoost), Random Forest (RF), eXtreme Gradient Boosting (XGB), Light Gradient +Boosting Machine (LGB), and Extra Trees Classifier (ETC), to analyze dental +data from 862 living individuals (459 males and 403 females). Specifically, +periapical radiographs from six teeth per individual were utilized, including +premolars and molars from both maxillary and mandibular. A novel ensemble +learning technique was developed, which uses multiple models each tailored to +distinct dental metrics, to estimate age and gender accurately. Furthermore, an +explainable AI model has been created utilizing SHAP, enabling dental experts +to make judicious decisions based on comprehensible insight. Results: The RF +and XGB models were particularly effective, yielding the highest F1 score for +age and gender estimation. Notably, the XGB model showed a slightly better +performance in age estimation, achieving an F1 score of 73.26%. A similar trend +for the RF model was also observed in gender estimation, achieving a F1 score +of 77.53%. Conclusions: This study marks a significant advancement in dental +forensic methods, showcasing the potential of machine learning to automate age +and gender estimation processes with improved accuracy. + +
+
+
+
+
+ + ☆ TractoEmbed: Modular Multi-level Embedding framework for white matter + tract segmentation ICPR + + +
+ White matter tract segmentation is crucial for studying brain structural +connectivity and neurosurgical planning. However, segmentation remains +challenging due to issues like class imbalance between major and minor tracts, +structural similarity, subject variability, symmetric streamlines between +hemispheres etc. To address these challenges, we propose TractoEmbed, a modular +multi-level embedding framework, that encodes localized representations through +learning tasks in respective encoders. In this paper, TractoEmbed introduces a +novel hierarchical streamline data representation that captures maximum spatial +information at each level i.e. individual streamlines, clusters, and patches. +Experiments show that TractoEmbed outperforms state-of-the-art methods in white +matter tract segmentation across different datasets, and spanning various age +groups. The modular framework directly allows the integration of additional +embeddings in future works. + +
+
+ comment: Accepted at 27th International Conference on Pattern Recognition + (ICPR), 2024 15 pages, 2 figures +
+
+
+
+
+ + ☆ Comprehensive and Comparative Analysis between Transfer Learning and + Custom Built VGG and CNN-SVM Models for Wildfire Detection + + +
+ Contemporary Artificial Intelligence (AI) and Machine Learning (ML) research +places a significant emphasis on transfer learning, showcasing its +transformative potential in enhancing model performance across diverse domains. +This paper examines the efficiency and effectiveness of transfer learning in +the context of wildfire detection. Three purpose-built models -- Visual +Geometry Group (VGG)-7, VGG-10, and Convolutional Neural Network (CNN)-Support +Vector Machine(SVM) CNN-SVM -- are rigorously compared with three pretrained +models -- VGG-16, VGG-19, and Residual Neural Network (ResNet) ResNet101. We +trained and evaluated these models using a dataset that captures the +complexities of wildfires, incorporating variables such as varying lighting +conditions, time of day, and diverse terrains. The objective is to discern how +transfer learning performs against models trained from scratch in addressing +the intricacies of the wildfire detection problem. By assessing the performance +metrics, including accuracy, precision, recall, and F1 score, a comprehensive +understanding of the advantages and disadvantages of transfer learning in this +specific domain is obtained. This study contributes valuable insights to the +ongoing discourse, guiding future directions in AI and ML research. Keywords: +Wildfire prediction, deep learning, machine learning fire, detection + +
+
+ comment: In Proc. of the 2024 IEEE International Conference On Intelligent + Computing in Data Sciences +
+
+
+
+
+ + ☆ EAPCR: A Universal Feature Extractor for Scientific Data without + Explicit Feature Relation Patterns + + +
+ Conventional methods, including Decision Tree (DT)-based methods, have been +effective in scientific tasks, such as non-image medical diagnostics, system +anomaly detection, and inorganic catalysis efficiency prediction. However, most +deep-learning techniques have struggled to surpass or even match this level of +success as traditional machine-learning methods. The primary reason is that +these applications involve multi-source, heterogeneous data where features lack +explicit relationships. This contrasts with image data, where pixels exhibit +spatial relationships; textual data, where words have sequential dependencies; +and graph data, where nodes are connected through established associations. The +absence of explicit Feature Relation Patterns (FRPs) presents a significant +challenge for deep learning techniques in scientific applications that are not +image, text, and graph-based. In this paper, we introduce EAPCR, a universal +feature extractor designed for data without explicit FRPs. Tested across +various scientific tasks, EAPCR consistently outperforms traditional methods +and bridges the gap where deep learning models fall short. To further +demonstrate its robustness, we synthesize a dataset without explicit FRPs. +While Kolmogorov-Arnold Network (KAN) and feature extractors like Convolutional +Neural Networks (CNNs), Graph Convolutional Networks (GCNs), and Transformers +struggle, EAPCR excels, demonstrating its robustness and superior performance +in scientific tasks without FRPs. + +
+
+
+
+
+ + ☆ TomoGRAF: A Robust and Generalizable Reconstruction Network for + Single-View Computed Tomography + + +
+ Computed tomography (CT) provides high spatial resolution visualization of 3D +structures for scientific and clinical applications. Traditional +analytical/iterative CT reconstruction algorithms require hundreds of angular +data samplings, a condition that may not be met in practice due to physical and +mechanical limitations. Sparse view CT reconstruction has been proposed using +constrained optimization and machine learning methods with varying success, +less so for ultra-sparse view CT reconstruction with one to two views. Neural +radiance field (NeRF) is a powerful tool for reconstructing and rendering 3D +natural scenes from sparse views, but its direct application to 3D medical +image reconstruction has been minimally successful due to the differences +between optical and X-ray photon transportation. Here, we develop a novel +TomoGRAF framework incorporating the unique X-ray transportation physics to +reconstruct high-quality 3D volumes using ultra-sparse projections without +prior. TomoGRAF captures the CT imaging geometry, simulates the X-ray casting +and tracing process, and penalizes the difference between simulated and ground +truth CT sub-volume during training. We evaluated the performance of TomoGRAF +on an unseen dataset of distinct imaging characteristics from the training data +and demonstrated a vast leap in performance compared with state-of-the-art deep +learning and NeRF methods. TomoGRAF provides the first generalizable solution +for image-guided radiotherapy and interventional radiology applications, where +only one or a few X-ray views are available, but 3D volumetric information is +desired. + +
+
+
+
+
+ + ☆ CameraHMR: Aligning People with Perspective 3DV 2025 + + +
+ We address the challenge of accurate 3D human pose and shape estimation from +monocular images. The key to accuracy and robustness lies in high-quality +training data. Existing training datasets containing real images with pseudo +ground truth (pGT) use SMPLify to fit SMPL to sparse 2D joint locations, +assuming a simplified camera with default intrinsics. We make two contributions +that improve pGT accuracy. First, to estimate camera intrinsics, we develop a +field-of-view prediction model (HumanFoV) trained on a dataset of images +containing people. We use the estimated intrinsics to enhance the 4D-Humans +dataset by incorporating a full perspective camera model during SMPLify +fitting. Second, 2D joints provide limited constraints on 3D body shape, +resulting in average-looking bodies. To address this, we use the BEDLAM dataset +to train a dense surface keypoint detector. We apply this detector to the +4D-Humans dataset and modify SMPLify to fit the detected keypoints, resulting +in significantly more realistic body shapes. Finally, we upgrade the HMR2.0 +architecture to include the estimated camera parameters. We iterate model +training and SMPLify fitting initialized with the previously trained model. +This leads to more accurate pGT and a new model, CameraHMR, with +state-of-the-art accuracy. Code and pGT are available for research purposes. + +
+
+ comment: 3DV 2025 +
+
+
+
+
+ + ☆ TIPO: Text to Image with Text Presampling for Prompt Optimization + + +
+ TIPO (Text to Image with text pre-sampling for Prompt Optimization) is an +innovative framework designed to enhance text-to-image (T2I) generation by +language model (LM) for automatic prompt engineering. By refining and extending +user-provided prompts, TIPO bridges the gap between simple inputs and the +detailed prompts required for high-quality image generation. Unlike previous +approaches that rely on Large Language Models (LLMs) or reinforcement learning +(RL), TIPO adjusts user input prompts with the distribution of a trained prompt +dataset, eliminating the need for complex runtime cost via lightweight model. +This pre-sampling approach enables efficient and scalable prompt optimization, +grounded in the model's training distribution. Experimental results demonstrate +TIPO's effectiveness in improving aesthetic scores, reducing image corruption, +and better aligning generated images with dataset distributions. These findings +highlight the critical role of prompt engineering in T2I systems and open +avenues for broader applications of automatic prompt refinement. + +
+
+ comment: 21 pages, 13 figures +
+
+
+
+
+ + ☆ Material Transforms from Disentangled NeRF Representations + + +
+ In this paper, we first propose a novel method for transferring material +transformations across different scenes. Building on disentangled Neural +Radiance Field (NeRF) representations, our approach learns to map Bidirectional +Reflectance Distribution Functions (BRDF) from pairs of scenes observed in +varying conditions, such as dry and wet. The learned transformations can then +be applied to unseen scenes with similar materials, therefore effectively +rendering the transformation learned with an arbitrary level of intensity. +Extensive experiments on synthetic scenes and real-world objects validate the +effectiveness of our approach, showing that it can learn various +transformations such as wetness, painting, coating, etc. Our results highlight +not only the versatility of our method but also its potential for practical +applications in computer graphics. We publish our method implementation, along +with our synthetic/real datasets on +https://github.com/astra-vision/BRDFTransform + +
+
+
+
+
+ + ☆ GaussianAnything: Interactive Point Cloud Latent Diffusion for 3D + Generation + + +
+ While 3D content generation has advanced significantly, existing methods +still face challenges with input formats, latent space design, and output +representations. This paper introduces a novel 3D generation framework that +addresses these challenges, offering scalable, high-quality 3D generation with +an interactive Point Cloud-structured Latent space. Our framework employs a +Variational Autoencoder (VAE) with multi-view posed RGB-D(epth)-N(ormal) +renderings as input, using a unique latent space design that preserves 3D shape +information, and incorporates a cascaded latent diffusion model for improved +shape-texture disentanglement. The proposed method, GaussianAnything, supports +multi-modal conditional 3D generation, allowing for point cloud, caption, and +single/multi-view image inputs. Notably, the newly proposed latent space +naturally enables geometry-texture disentanglement, thus allowing 3D-aware +editing. Experimental results demonstrate the effectiveness of our approach on +multiple datasets, outperforming existing methods in both text- and +image-conditioned 3D generation. + +
+
+ comment: project page: https://nirvanalan.github.io/projects/GA/ +
+
+
+
+
+ + ☆ LLMPhy: Complex Physical Reasoning Using Large Language Models and World + Models + + +
+ Physical reasoning is an important skill needed for robotic agents when +operating in the real world. However, solving such reasoning problems often +involves hypothesizing and reflecting over complex multi-body interactions +under the effect of a multitude of physical forces and thus learning all such +interactions poses a significant hurdle for state-of-the-art machine learning +frameworks, including large language models (LLMs). To study this problem, we +propose a new physical reasoning task and a dataset, dubbed TraySim. Our task +involves predicting the dynamics of several objects on a tray that is given an +external impact -- the domino effect of the ensued object interactions and +their dynamics thus offering a challenging yet controlled setup, with the goal +of reasoning being to infer the stability of the objects after the impact. To +solve this complex physical reasoning task, we present LLMPhy, a zero-shot +black-box optimization framework that leverages the physics knowledge and +program synthesis abilities of LLMs, and synergizes these abilities with the +world models built into modern physics engines. Specifically, LLMPhy uses an +LLM to generate code to iteratively estimate the physical hyperparameters of +the system (friction, damping, layout, etc.) via an implicit +analysis-by-synthesis approach using a (non-differentiable) simulator in the +loop and uses the inferred parameters to imagine the dynamics of the scene +towards solving the reasoning task. To show the effectiveness of LLMPhy, we +present experiments on our TraySim dataset to predict the steady-state poses of +the objects. Our results show that the combination of the LLM and the physics +engine leads to state-of-the-art zero-shot physical reasoning performance, +while demonstrating superior convergence against standard black-box +optimization methods and better estimation of the physical parameters. + +
+
+
+
+
+ + ☆ Wavelet Latent Diffusion (Wala): Billion-Parameter 3D Generative Model + with Compact Wavelet Encodings + + +
+ Large-scale 3D generative models require substantial computational resources +yet often fall short in capturing fine details and complex geometries at high +resolutions. We attribute this limitation to the inefficiency of current +representations, which lack the compactness required to model the generative +models effectively. To address this, we introduce a novel approach called +Wavelet Latent Diffusion, or WaLa, that encodes 3D shapes into wavelet-based, +compact latent encodings. Specifically, we compress a $256^3$ signed distance +field into a $12^3 \times 4$ latent grid, achieving an impressive 2427x +compression ratio with minimal loss of detail. This high level of compression +allows our method to efficiently train large-scale generative networks without +increasing the inference time. Our models, both conditional and unconditional, +contain approximately one billion parameters and successfully generate +high-quality 3D shapes at $256^3$ resolution. Moreover, WaLa offers rapid +inference, producing shapes within two to four seconds depending on the +condition, despite the model's scale. We demonstrate state-of-the-art +performance across multiple datasets, with significant improvements in +generation quality, diversity, and computational efficiency. We open-source our +code and, to the best of our knowledge, release the largest pretrained 3D +generative models across different modalities. + +
+
+
+
+
+ + ☆ Artistic Neural Style Transfer Algorithms with Activation Smoothing + + +
+ The works of Gatys et al. demonstrated the capability of Convolutional Neural +Networks (CNNs) in creating artistic style images. This process of transferring +content images in different styles is called Neural Style Transfer (NST). In +this paper, we re-implement image-based NST, fast NST, and arbitrary NST. We +also explore to utilize ResNet with activation smoothing in NST. Extensive +experimental results demonstrate that smoothing transformation can greatly +improve the quality of stylization results. + +
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ☆ JanusFlow: Harmonizing Autoregression and Rectified Flow for Unified + Multimodal Understanding and Generation + + +
+ We present JanusFlow, a powerful framework that unifies image understanding +and generation in a single model. JanusFlow introduces a minimalist +architecture that integrates autoregressive language models with rectified +flow, a state-of-the-art method in generative modeling. Our key finding +demonstrates that rectified flow can be straightforwardly trained within the +large language model framework, eliminating the need for complex architectural +modifications. To further improve the performance of our unified model, we +adopt two key strategies: (i) decoupling the understanding and generation +encoders, and (ii) aligning their representations during unified training. +Extensive experiments show that JanusFlow achieves comparable or superior +performance to specialized models in their respective domains, while +significantly outperforming existing unified approaches across standard +benchmarks. This work represents a step toward more efficient and versatile +vision-language models. + +
+
+
+
+
+ + ☆ Commissioning An All-Sky Infrared Camera Array for Detection Of Airborne + Objects + + +
+ To date there is little publicly available scientific data on Unidentified +Aerial Phenomena (UAP) whose properties and kinematics purportedly reside +outside the performance envelope of known phenomena. To address this +deficiency, the Galileo Project is designing, building, and commissioning a +multi-modal ground-based observatory to continuously monitor the sky and +conduct a rigorous long-term aerial census of all aerial phenomena, including +natural and human-made. One of the key instruments is an all-sky infrared +camera array using eight uncooled long-wave infrared FLIR Boson 640 cameras. +Their calibration includes a novel extrinsic calibration method using airplane +positions from Automatic Dependent Surveillance-Broadcast (ADS-B) data. We +establish a first baseline for the system performance over five months of field +operation, using a real-world dataset derived from ADS-B data, synthetic 3-D +trajectories, and a hand-labelled real-world dataset. We report acceptance +rates (e.g. viewable airplanes that are recorded) and detection efficiencies +(e.g. recorded airplanes which are successfully detected) for a variety of +weather conditions, range and aircraft size. We reconstruct $\sim$500,000 +trajectories of aerial objects from this commissioning period. A toy outlier +search focused on large sinuosity of the 2-D reconstructed trajectories flags +about 16% of trajectories as outliers. After manual review, 144 trajectories +remain ambiguous: they are likely mundane objects but cannot be elucidated at +this stage of development without distance and kinematics estimation or other +sensor modalities. Our observed count of ambiguous outliers combined with +systematic uncertainties yields an upper limit of 18,271 outliers count for the +five-month interval at a 95% confidence level. This likelihood-based method to +evaluate significance is applicable to all of our future outlier searches. + +
+
+
+
+
+ + ☆ SimBase: A Simple Baseline for Temporal Video Grounding + + +
+ This paper presents SimBase, a simple yet effective baseline for temporal +video grounding. While recent advances in temporal grounding have led to +impressive performance, they have also driven network architectures toward +greater complexity, with a range of methods to (1) capture temporal +relationships and (2) achieve effective multimodal fusion. In contrast, this +paper explores the question: How effective can a simplified approach be? To +investigate, we design SimBase, a network that leverages lightweight, +one-dimensional temporal convolutional layers instead of complex temporal +structures. For cross-modal interaction, SimBase only employs an element-wise +product instead of intricate multimodal fusion. Remarkably, SimBase achieves +state-of-the-art results on two large-scale datasets. As a simple yet powerful +baseline, we hope SimBase will spark new ideas and streamline future +evaluations in temporal video grounding. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ DuoLift-GAN:Reconstructing CT from Single-view and Biplanar X-Rays with + Generative Adversarial Networks + + +
+ Computed tomography (CT) provides highly detailed three-dimensional (3D) +medical images but is costly, time-consuming, and often inaccessible in +intraoperative settings (Organization et al. 2011). Recent advancements have +explored reconstructing 3D chest volumes from sparse 2D X-rays, such as +single-view or orthogonal double-view images. However, current models tend to +process 2D images in a planar manner, prioritizing visual realism over +structural accuracy. In this work, we introduce DuoLift Generative Adversarial +Networks (DuoLift-GAN), a novel architecture with dual branches that +independently elevate 2D images and their features into 3D representations. +These 3D outputs are merged into a unified 3D feature map and decoded into a +complete 3D chest volume, enabling richer 3D information capture. We also +present a masked loss function that directs reconstruction towards critical +anatomical regions, improving structural accuracy and visual quality. This +paper demonstrates that DuoLift-GAN significantly enhances reconstruction +accuracy while achieving superior visual realism compared to existing methods. + +
+
+
+
+
+ + ☆ Learning Disentangled Representations for Perceptual Point Cloud Quality + Assessment via Mutual Information Minimization + + +
+ No-Reference Point Cloud Quality Assessment (NR-PCQA) aims to objectively +assess the human perceptual quality of point clouds without relying on +pristine-quality point clouds for reference. It is becoming increasingly +significant with the rapid advancement of immersive media applications such as +virtual reality (VR) and augmented reality (AR). However, current NR-PCQA +models attempt to indiscriminately learn point cloud content and distortion +representations within a single network, overlooking their distinct +contributions to quality information. To address this issue, we propose DisPA, +a novel disentangled representation learning framework for NR-PCQA. The +framework trains a dual-branch disentanglement network to minimize mutual +information (MI) between representations of point cloud content and distortion. +Specifically, to fully disentangle representations, the two branches adopt +different philosophies: the content-aware encoder is pretrained by a masked +auto-encoding strategy, which can allow the encoder to capture semantic +information from rendered images of distorted point clouds; the +distortion-aware encoder takes a mini-patch map as input, which forces the +encoder to focus on low-level distortion patterns. Furthermore, we utilize an +MI estimator to estimate the tight upper bound of the actual MI and further +minimize it to achieve explicit representation disentanglement. Extensive +experimental results demonstrate that DisPA outperforms state-of-the-art +methods on multiple PCQA datasets. + +
+
+
+
+
+ + ☆ Deep Learning 2.0: Artificial Neurons That Matter -- Reject Correlation, + Embrace Orthogonality CVPR 2025 + + +
+ We introduce a yat-product-powered neural network, the Neural Matter Network +(NMN), a breakthrough in deep learning that achieves non-linear pattern +recognition without activation functions. Our key innovation relies on the +yat-product, which naturally induces non-linearity by +projecting inputs into a pseudo-metric space, eliminating the need for +traditional activation functions while maintaining only a softmax layer for +final class probability distribution. This approach simplifies network +architecture and provides unprecedented transparency into the network's +decision-making process. Our comprehensive empirical evaluation across +different datasets demonstrates that NMN consistently outperforms traditional +MLPs. The results challenge the assumption that separate activation functions +are necessary for effective deep-learning models. The implications of this work +extend beyond immediate architectural benefits: by eliminating intermediate +activation functions while preserving non-linear capabilities, yat-MLP +establishes a new paradigm for neural network design that combines simplicity +with effectiveness. Most importantly, our approach provides unprecedented +insights into the traditionally opaque "black-box" nature of neural networks, +offering a clearer understanding of how these models process and classify +information. + +
+
+ comment: Submitted to CVPR 2025 +
+
+
+
+
+ + ☆ Isometric Transformations for Image Augmentation in Mueller Matrix + Polarimetry + + +
+ Mueller matrix polarimetry captures essential information about polarized +light interactions with a sample, presenting unique challenges for data +augmentation in deep learning due to its distinct structure. While +augmentations are an effective and affordable way to enhance dataset diversity +and reduce overfitting, standard transformations like rotations and flips do +not preserve the polarization properties in Mueller matrix images. To this end, +we introduce a versatile simulation framework that applies physically +consistent rotations and flips to Mueller matrices, tailored to maintain +polarization fidelity. Our experimental results across multiple datasets reveal +that conventional augmentations can lead to misleading results when applied to +polarimetric data, underscoring the necessity of our physics-based approach. In +our experiments, we first compare our polarization-specific augmentations +against real-world captures to validate their physical consistency. We then +apply these augmentations in a semantic segmentation task, achieving +substantial improvements in model generalization and performance. This study +underscores the necessity of physics-informed data augmentation for +polarimetric imaging in deep learning (DL), paving the way for broader adoption +and more robust applications across diverse research in the field. In +particular, our framework unlocks the potential of DL models for polarimetric +datasets with limited sample sizes. Our code implementation is available at +github.com/hahnec/polar_augment. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ TLDR: Traffic Light Detection using Fourier Domain Adaptation in Hostile + WeatheR + + +
+ The scarcity of comprehensive datasets in the traffic light detection and +recognition domain and the poor performance of state-of-the-art models under +hostile weather conditions present significant challenges. To address these +issues, this paper proposes a novel approach by merging two widely used +datasets, LISA and S2TLD. The merged dataset is further processed to tackle +class imbalance, a common problem in this domain. This merged dataset becomes +our source domain. Synthetic rain and fog are added to the dataset to create +our target domain. We employ Fourier Domain Adaptation (FDA) to create a final +dataset with a minimized domain gap between the two datasets, helping the model +trained on this final dataset adapt to rainy and foggy weather conditions. +Additionally, we explore Semi-Supervised Learning (SSL) techniques to leverage +the available data more effectively. Experimental results demonstrate that +models trained on FDA-augmented images outperform those trained without FDA +across confidence-dependent and independent metrics, like mAP50, mAP50-95, +Precision, and Recall. The best-performing model, YOLOv8, achieved a Precision +increase of 5.1860%, Recall increase of 14.8009%, mAP50 increase of 9.5074%, +and mAP50-95 increase of 19.5035%. On average, percentage increases of 7.6892% +in Precision, 19.9069% in Recall, 15.8506% in mAP50, and 23.8099% in mAP50-95 +were observed across all models, highlighting the effectiveness of FDA in +mitigating the impact of adverse weather conditions on model performance. These +improvements pave the way for real-world applications where reliable +performance in challenging environmental conditions is critical. + +
+
+ comment: Under Review at IEEE Transactions of Artificial Intelligence. 10 + Pages, 7 Figures +
+
+
+
+
+ + ☆ Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse + Tensor-based Transformer + + +
+ The evolution of 3D visualization techniques has fundamentally transformed +how we interact with digital content. At the forefront of this change is point +cloud technology, offering an immersive experience that surpasses traditional +2D representations. However, the massive data size of point clouds presents +significant challenges in data compression. Current methods for lossy point +cloud attribute compression (PCAC) generally focus on reconstructing the +original point clouds with minimal error. However, for point cloud +visualization scenarios, the reconstructed point clouds with distortion still +need to undergo a complex rendering process, which affects the final +user-perceived quality. In this paper, we propose an end-to-end deep learning +framework that seamlessly integrates PCAC with differentiable rendering, +denoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of +rendered multiview images for viewing. In a differentiable manner, the impact +of the rendering process on the reconstructed point clouds is taken into +account. Moreover, we characterize point clouds as sparse tensors and propose a +sparse tensor-based transformer, called SP-Trans. By aligning with the local +density of the point cloud and utilizing an enhanced local attention mechanism, +SP-Trans captures the intricate relationships within the point cloud, further +improving feature analysis and synthesis within the framework. Extensive +experiments demonstrate that the proposed RO-PCAC achieves state-of-the-art +compression performance, compared to existing reconstruction-oriented methods, +including traditional, learning-based, and hybrid methods. + +
+
+
+
+
+ + ☆ Joint multi-dimensional dynamic attention and transformer for general + image restoration + + +
+ Outdoor images often suffer from severe degradation due to rain, haze, and +noise, impairing image quality and challenging high-level tasks. Current image +restoration methods struggle to handle complex degradation while maintaining +efficiency. This paper introduces a novel image restoration architecture that +combines multi-dimensional dynamic attention and self-attention within a U-Net +framework. To leverage the global modeling capabilities of transformers and the +local modeling capabilities of convolutions, we integrate sole CNNs in the +encoder-decoder and sole transformers in the latent layer. Additionally, we +design convolutional kernels with selected multi-dimensional dynamic attention +to capture diverse degraded inputs efficiently. A transformer block with +transposed self-attention further enhances global feature extraction while +maintaining efficiency. Extensive experiments demonstrate that our method +achieves a better balance between performance and computational complexity +across five image restoration tasks: deraining, deblurring, denoising, +dehazing, and enhancement, as well as superior performance for high-level +vision tasks. The source code will be available at +https://github.com/House-yuyu/MDDA-former. + +
+
+
+
+
+ + ☆ INTRABENCH: Interactive Radiological Benchmark + + +
+ Current interactive segmentation approaches, inspired by the success of +META's Segment Anything model, have achieved notable advancements; however, +they come with substantial limitations that hinder their practical application +in real clinical scenarios. These include unrealistic human interaction +requirements, such as slice-by-slice operations for 2D models on 3D data, a +lack of iterative refinement, and insufficient evaluation experiments. These +shortcomings prevent accurate assessment of model performance and lead to +inconsistent outcomes across studies. IntRaBench overcomes these challenges by +offering a comprehensive and reproducible framework for evaluating interactive +segmentation methods in realistic, clinically relevant scenarios. It includes +diverse datasets, target structures, and segmentation models, and provides a +flexible codebase that allows seamless integration of new models and prompting +strategies. Additionally, we introduce advanced techniques to minimize +clinician interaction, ensuring fair comparisons between 2D and 3D models. By +open-sourcing IntRaBench, we invite the research community to integrate their +models and prompting techniques, ensuring continuous and transparent evaluation +of interactive segmentation models in 3D medical imaging. + +
+
+ comment: Undergoing Peer-Review +
+
+
+
+
+ + ☆ Diverse capability and scaling of diffusion and auto-regressive models + when learning abstract rules NeurIPS2024 + + +
+ Humans excel at discovering regular structures from limited samples and +applying inferred rules to novel settings. We investigate whether modern +generative models can similarly learn underlying rules from finite samples and +perform reasoning through conditional sampling. Inspired by Raven's Progressive +Matrices task, we designed the GenRAVEN dataset, where each sample consists of +three rows, and one of 40 relational rules governing the object position, +number, or attributes applies to all rows. We trained generative models to +learn the data distribution, where samples are encoded as integer arrays to +focus on rule learning. We compared two generative model families: diffusion +(EDM, DiT, SiT) and autoregressive models (GPT2, Mamba). We evaluated their +ability to generate structurally consistent samples and perform panel +completion via unconditional and conditional sampling. We found diffusion +models excel at unconditional generation, producing more novel and consistent +samples from scratch and memorizing less, but performing less well in panel +completion, even with advanced conditional sampling methods. Conversely, +autoregressive models excel at completing missing panels in a rule-consistent +manner but generate less consistent samples unconditionally. We observe diverse +data scaling behaviors: for both model families, rule learning emerges at a +certain dataset size - on the order of thousands of examples per rule. With more training data, +diffusion models improve both their unconditional and conditional generation +capabilities. However, for autoregressive models, while panel completion +improves with more training data, unconditional generation consistency +declines. Our findings highlight complementary capabilities and limitations of +diffusion and autoregressive models in rule learning and reasoning tasks, +suggesting avenues for further research into their mechanisms and potential for +human-like reasoning. + +
+
+ comment: 12 pages, 5 figures. Accepted to NeurIPS2024 Workshop on System 2 + Reasoning At Scale as long paper +
+
+
+
+
+ + ☆ CDXFormer: Boosting Remote Sensing Change Detection with Extended Long + Short-Term Memory + + +
+ In complex scenes and varied conditions, effectively integrating +spatial-temporal context is crucial for accurately identifying changes. +However, current RS-CD methods lack a balanced consideration of performance and +efficiency. CNNs lack global context, Transformers have quadratic computational +complexity, and Mambas are restricted by CUDA acceleration. In this paper, we +propose CDXFormer, with a core component that is a powerful XLSTM-based feature +enhancement layer, integrating the advantages of linear computational +complexity, global context perception, and strong interpretability. +Specifically, we introduce a scale-specific Feature Enhancer layer, +incorporating a Cross-Temporal Global Perceptron customized for +semantic-accurate deep features, and a Cross-Temporal Spatial Refiner +customized for detail-rich shallow features. Additionally, we propose a +Cross-Scale Interactive Fusion module to progressively interact global change +representations with spatial responses. Extensive experimental results +demonstrate that CDXFormer achieves state-of-the-art performance across three +benchmark datasets, offering a compelling balance between efficiency and +accuracy. Code is available at https://github.com/xwmaxwma/rschange. + +
+
+
+
+
+ + ☆ NL-SLAM for OC-VLN: Natural Language Grounded SLAM for Object-Centric + VLN + + +
+ Landmark-based navigation (e.g. go to the wooden desk) and relative +positional navigation (e.g. move 5 meters forward) are distinct navigation +challenges solved very differently in existing robotics navigation methodology. +We present a new dataset, OC-VLN, in order to distinctly evaluate grounding +object-centric natural language navigation instructions in a method for +performing landmark-based navigation. We also propose Natural Language grounded +SLAM (NL-SLAM), a method to ground natural language instruction to robot +observations and poses. We actively perform NL-SLAM in order to follow +object-centric natural language navigation instructions. Our methods leverage +pre-trained vision and language foundation models and require no task-specific +training. We construct two strong baselines from state-of-the-art methods on +related tasks, Object Goal Navigation and Vision Language Navigation, and we +show that our approach, NL-SLAM, outperforms these baselines across all our +metrics of success on OC-VLN. Finally, we successfully demonstrate the +effectiveness of NL-SLAM for performing navigation instruction following in the +real world on a Boston Dynamics Spot robot. + +
+
+
+
+
+ + ☆ Towards Vision Mixture of Experts for Wildlife Monitoring on the Edge + + +
+ The explosion of IoT sensors in industrial, consumer and remote sensing use +cases has come with unprecedented demand for computing infrastructure to +transmit and to analyze petabytes of data. Concurrently, the world is slowly +shifting its focus towards more sustainable computing. For these reasons, there +has been a recent effort to reduce the footprint of related computing +infrastructure, especially by deep learning algorithms, for advanced insight +generation. The `TinyML' community is actively proposing methods to save +communication bandwidth and excessive cloud storage costs while reducing +algorithm inference latency and promoting data privacy. Such proposed +approaches should ideally process multiple types of data, including time +series, audio, satellite images, and video, near the network edge as multiple +data streams have been shown to improve the discriminative ability of learning +algorithms, especially for generating fine grained results. Incidentally, there +has been recent work on data driven conditional computation of subnetworks that +has shown real progress in using a single model to share parameters among very +different types of inputs such as images and text, reducing the computation +requirement of multi-tower multimodal networks. Inspired by this line of work, +we explore similar per patch conditional computation for the first time for +mobile vision transformers (vision only case), that will eventually be used for +single-tower multimodal edge models. We evaluate the model on Cornell Sap +Sucker Woods 60, a fine grained bird species discrimination dataset. Our +initial experiments use $4X$ fewer parameters compared to MobileViTV2-1.0 with +a $1$% accuracy drop on the iNaturalist '21 birds test data provided as part of +the SSW60 dataset. + +
+
+
+
+
+ + ☆ Large-scale Remote Sensing Image Target Recognition and Automatic + Annotation + + +
+ This paper presents a method for object recognition and automatic labeling in +large-area remote sensing images called LRSAA. The method integrates YOLOv11 +and MobileNetV3-SSD object detection algorithms through ensemble learning to +enhance model performance. Furthermore, it employs Poisson disk sampling +segmentation techniques and the EIOU metric to optimize the training and +inference processes of segmented images, followed by the integration of +results. This approach not only reduces the demand for computational resources +but also achieves a good balance between accuracy and speed. The source code +for this project has been made publicly available on +https://github.com/anaerovane/LRSAA. + +
+
+
+
+
+ + ☆ Horticultural Temporal Fruit Monitoring via 3D Instance Segmentation and + Re-Identification using Point Clouds + + +
+ Robotic fruit monitoring is a key step toward automated agricultural +production systems. Robots can significantly enhance plant and temporal fruit +monitoring by providing precise, high-throughput assessments that overcome the +limitations of traditional manual methods. Fruit monitoring is a challenging +task due to the significant variation in size, shape, orientation, and +occlusion of fruits. Also, fruits may be harvested or newly grown between +recording sessions. Most methods are 2D image-based and they lack the 3D +structure, depth, and spatial information, which represent key aspects of fruit +monitoring. 3D colored point clouds, instead, can offer this information but +they introduce challenges such as their sparsity and irregularity. In this +paper, we present a novel approach for temporal fruit monitoring that addresses +point clouds collected in a greenhouse over time. Our method segments fruits +using a learning-based instance segmentation approach directly on the point +cloud. Each segmented fruit is processed by a 3D sparse convolutional neural +network to extract descriptors, which are used in an attention-based matching +network to associate fruits with their instances from previous data +collections. Experimental results on a real dataset of strawberries demonstrate +that our approach outperforms other methods for fruits re-identification over +time, allowing for precise temporal fruit monitoring in real and complex +scenarios. + +
+
+ comment: Submitted to IEEE Robotics and Automation Letters +
+
+
+
+
+ + ☆ Interaction Asymmetry: A General Principle for Learning Composable + Abstractions + + +
+ Learning disentangled representations of concepts and re-composing them in +unseen ways is crucial for generalizing to out-of-domain situations. However, +the underlying properties of concepts that enable such disentanglement and +compositional generalization remain poorly understood. In this work, we propose +the principle of interaction asymmetry which states: "Parts of the same concept +have more complex interactions than parts of different concepts". We formalize +this via block diagonality conditions on the $(n+1)$th order derivatives of the +generator mapping concepts to observed data, where different orders of +"complexity" correspond to different $n$. Using this formalism, we prove that +interaction asymmetry enables both disentanglement and compositional +generalization. Our results unify recent theoretical results for learning +concepts of objects, which we show are recovered as special cases with +$n\!=\!0$ or $1$. We provide results for up to $n\!=\!2$, thus extending these +prior works to more flexible generator functions, and conjecture that the same +proof strategies generalize to larger $n$. Practically, our theory suggests +that, to disentangle concepts, an autoencoder should penalize its latent +capacity and the interactions between concepts during decoding. We propose an +implementation of these criteria using a flexible Transformer-based VAE, with a +novel regularizer on the attention weights of the decoder. On synthetic image +datasets consisting of objects, we provide evidence that this model can achieve +comparable object disentanglement to existing models that use more explicit +object-centric priors. + +
+
+ comment: Preprint, under review +
+
+
+
+
+ + ☆ Novel View Synthesis with Pixel-Space Diffusion Models + + +
+ Synthesizing a novel view from a single input image is a challenging task. +Traditionally, this task was approached by estimating scene depth, warping, and +inpainting, with machine learning models enabling parts of the pipeline. More +recently, generative models are being increasingly employed in novel view +synthesis (NVS), often encompassing the entire end-to-end system. In this work, +we adapt a modern diffusion model architecture for end-to-end NVS in the pixel +space, substantially outperforming previous state-of-the-art (SOTA) techniques. +We explore different ways to encode geometric information into the network. Our +experiments show that while these methods may enhance performance, their impact +is minor compared to utilizing improved generative models. Moreover, we +introduce a novel NVS training scheme that utilizes single-view datasets, +capitalizing on their relative abundance compared to their multi-view +counterparts. This leads to improved generalization capabilities to scenes with +out-of-domain content. + +
+
+
+
+
+ + ♻ ☆ Strike the Balance: On-the-Fly Uncertainty based User Interactions for + Long-Term Video Object Segmentation ACCV 2024 + + +
+ In this paper, we introduce a variant of video object segmentation (VOS) that +bridges interactive and semi-automatic approaches, termed Lazy Video Object +Segmentation (ziVOS). In contrast to both tasks, which handle video object +segmentation in an off-line manner (i.e., pre-recorded sequences), we propose +through ziVOS to target online recorded sequences. Here, we strive to strike a +balance between performance and robustness for long-term scenarios by +soliciting user feedback on-the-fly during the segmentation process. Hence, +we aim to maximize the tracking duration of an object of interest, while +requiring minimal user corrections to maintain tracking over an extended +period. We propose a competitive baseline, i.e., Lazy-XMem, as a reference for +future works in ziVOS. Our proposed approach uses an uncertainty estimation of +the tracking state to determine whether a user interaction is necessary to +refine the model's prediction. To quantitatively assess the performance of our +method and the user's workload, we introduce complementary metrics alongside +those already established in the field. We evaluate our approach using the +recently introduced LVOS dataset, which offers numerous long-term videos. Our +code is publicly available at https://github.com/Vujas-Eteph/LazyXMem. + +
+
+ comment: Accepted at ACCV 2024 +
+
+
+
+
+ + ♻ ☆ LADDER: Language Driven Slice Discovery and Error Rectification + + +
+ Error slice discovery associates structured patterns with model errors. +Existing methods discover error slices by clustering the error-prone samples +with similar patterns or assigning discrete attributes to each sample for +post-hoc analysis. While these methods aim for interpretability and easier +mitigation through reweighting or rebalancing, they may not capture the full +complexity of error patterns due to incomplete or missing attributes. Contrary +to the existing approach, this paper utilizes the reasoning capabilities of the +Large Language Model (LLM) to analyze complex error patterns and generate +testable hypotheses. This paper proposes LADDER: Language Driven slice +Discovery and Error Rectification. It first projects the model's representation +into a language-aligned feature space (e.g., CLIP) to preserve semantics in the +original model feature space. This ensures the accurate retrieval of sentences +that highlight the model's errors. Next, the LLM utilizes the sentences and +generates hypotheses to discover error slices. Finally, we mitigate the error +by fine-tuning the classification head by creating a group-balanced dataset +using the hypotheses. Our entire method does not require any attribute +annotation, either explicitly or through external tagging models. We validate +our method with five image classification datasets. + +
+
+
+
+
+ + ♻ ☆ LoTLIP: Improving Language-Image Pre-training for Long Text + Understanding + + +
+ Understanding long text is in great demand in practice but beyond the reach +of most language-image pre-training (LIP) models. In this work, we empirically +confirm that the key reason causing such an issue is that the training images +are usually paired with short captions, leaving certain tokens easily +overshadowed by salient tokens. Towards this problem, our initial attempt is to +relabel the data with long captions; however, learning directly from them may +lead to performance degradation in understanding short text (e.g., in the image +classification task). Then, after incorporating corner tokens to aggregate +diverse textual information, we manage to help the model catch up to its +original level of short text understanding yet greatly enhance its capability +of long text understanding. We further look into whether the model can +continuously benefit from longer captions and notice a clear trade-off between +the performance and the efficiency. Finally, we validate the effectiveness of +our approach using a self-constructed large-scale dataset, which consists of +100M long caption oriented text-image pairs. Our method demonstrates superior +performance in long-text-image retrieval tasks. The project page is available +at https://wuw2019.github.io/lot-lip. + +
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS NeurIPS 2024 + + +
+ Recent advances in real-time neural rendering using point-based techniques +have enabled broader adoption of 3D representations. However, foundational +approaches like 3D Gaussian Splatting impose substantial storage overhead, as +Structure-from-Motion (SfM) points can grow to millions, often requiring +gigabyte-level disk space for a single unbounded scene. This growth presents +scalability challenges and hinders splatting efficiency. To address this, we +introduce LightGaussian, a method for transforming 3D Gaussians into a more +compact format. Inspired by Network Pruning, LightGaussian identifies Gaussians +with minimal global significance on scene reconstruction, and applies a pruning +and recovery process to reduce redundancy while preserving visual quality. +Knowledge distillation and pseudo-view augmentation then transfer spherical +harmonic coefficients to a lower degree, yielding compact representations. +Gaussian Vector Quantization, based on each Gaussian's global significance, +further lowers bitwidth with minimal accuracy loss. LightGaussian achieves an +average 15x compression rate while boosting FPS from 144 to 237 within the +3D-GS framework, enabling efficient complex scene representation on the +Mip-NeRF 360 and Tank & Temple datasets. The proposed Gaussian pruning approach +is also adaptable to other 3D representations (e.g., Scaffold-GS), +demonstrating strong generalization capabilities. + +
+
+ comment: NeurIPS 2024, Project page: https://lightgaussian.github.io/ +
+
+
+
+
+ + ♻ ☆ Odd-One-Out: Anomaly Detection by Comparing with Neighbors + + +
+ This paper introduces a novel anomaly detection (AD) problem that focuses on +identifying `odd-looking' objects relative to the other instances in a given +scene. In contrast to the traditional AD benchmarks, anomalies in our task are +scene-specific, defined by the regular instances that make up the majority. +Since object instances may be only partly visible from a single viewpoint, our +setting employs multiple views of each scene as input. To provide a testbed for +future research in this task, we introduce two benchmarks, ToysAD-8K and +PartsAD-15K. We propose a novel method that constructs 3D object-centric +representations from multiple 2D views for each instance and detects the +anomalous ones through a cross-instance comparison. We rigorously analyze our +method quantitatively and qualitatively on the presented benchmarks. + +
+
+ comment: Codes & Dataset at https://github.com/VICO-UoE/OddOneOutAD +
+
+
+
+
+ + ♻ ☆ WavShadow: Wavelet Based Shadow Segmentation and Removal + + +
+ Shadow removal and segmentation remain challenging tasks in computer vision, +particularly in complex real world scenarios. This study presents a novel +approach that enhances the ShadowFormer model by incorporating Masked +Autoencoder (MAE) priors and Fast Fourier Convolution (FFC) blocks, leading to +significantly faster convergence and improved performance. We introduce key +innovations: (1) integration of MAE priors trained on Places2 dataset for +better context understanding, (2) adoption of Haar wavelet features for +enhanced edge detection and multiscale analysis, and (3) implementation of a +modified SAM Adapter for robust shadow segmentation. Extensive experiments on +the challenging DESOBA dataset demonstrate that our approach achieves state of +the art results, with notable improvements in both convergence speed and shadow +removal quality. + +
+
+
+
+
+ + ♻ ☆ Meta-Learned Modality-Weighted Knowledge Distillation for Robust + Multi-Modal Learning with Missing Data + + +
+ In multi-modal learning, some modalities are more influential than others, +and their absence can have a significant impact on classification/segmentation +accuracy. Addressing this challenge, we propose a novel approach called +Meta-learned Modality-weighted Knowledge Distillation (MetaKD), which enables +multi-modal models to maintain high accuracy even when key modalities are +missing. MetaKD adaptively estimates the importance weight of each modality +through a meta-learning process. These learned importance weights guide a +pairwise modality-weighted knowledge distillation process, allowing +high-importance modalities to transfer knowledge to lower-importance ones, +resulting in robust performance despite missing inputs. Unlike previous methods +in the field, which are often task-specific and require significant +modifications, our approach is designed to work in multiple tasks (e.g., +segmentation and classification) with minimal adaptation. Experimental results +on five prevalent datasets, including three Brain Tumor Segmentation datasets +(BraTS2018, BraTS2019 and BraTS2020), the Alzheimer's Disease Neuroimaging +Initiative (ADNI) classification dataset and the Audiovision-MNIST +classification dataset, demonstrate the proposed model is able to outperform +the compared models by a large margin. + +
+
+
+
+
+ + ♻ ☆ Interpret Your Decision: Logical Reasoning Regularization for + Generalization in Visual Classification NeurIPS2024 + + +
+ Vision models excel in image classification but struggle to generalize to +unseen data, such as classifying images from unseen domains or discovering +novel categories. In this paper, we explore the relationship between logical +reasoning and deep learning generalization in visual classification. A logical +regularization termed L-Reg is derived which bridges a logical analysis +framework to image classification. Our work reveals that L-Reg reduces the +complexity of the model in terms of the feature distribution and classifier +weights. Specifically, we unveil the interpretability brought by L-Reg, as it +enables the model to extract the salient features, such as faces to persons, +for classification. Theoretical analysis and experiments demonstrate that L-Reg +enhances generalization across various scenarios, including multi-domain +generalization and generalized category discovery. In complex real-world +scenarios where images span unknown classes and unseen domains, L-Reg +consistently improves generalization, highlighting its practical efficacy. + +
+
+ comment: Accepted by NeurIPS2024 as Spotlight +
+
+
+
+
+ + ♻ ☆ Pseudo-triplet Guided Few-shot Composed Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging task that aims to retrieve +the target image with a multimodal query, i.e., a reference image, and its +complementary modification text. As previous supervised or zero-shot learning +paradigms all fail to strike a good trade-off between the model's +generalization ability and retrieval performance, recent researchers have +introduced the task of few-shot CIR (FS-CIR) and proposed a textual +inversion-based network based on pretrained CLIP model to realize it. Despite +its promising performance, the approach encounters two key limitations: simply +relying on the few annotated samples for CIR model training and +indiscriminately selecting training triplets for CIR model fine-tuning. To +address these two limitations, we propose a novel two-stage pseudo triplet +guided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an +attentive masking and captioning-based pseudo triplet generation method, to +construct pseudo triplets from pure image data and use them to fulfill the +CIR-task specific pretraining. In the second stage, we propose a challenging +triplet-based CIR fine-tuning method, where we design a pseudo modification +text-based sample challenging score estimation strategy and a robust top +range-based random sampling strategy for sampling robust challenging triplets +to promote the model fine-tuning. Notably, our scheme is plug-and-play and +compatible with any existing supervised CIR models. We test our scheme across +two backbones on three public datasets (i.e., FashionIQ, CIRR, and +Birds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4% +respectively, demonstrating our scheme's efficacy. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ Bootstrapping Reinforcement Learning with Imitation for Vision-Based + Agile Flight + + +
+ Learning visuomotor policies for agile quadrotor flight presents significant +difficulties, primarily from inefficient policy exploration caused by +high-dimensional visual inputs and the need for precise and low-latency +control. To address these challenges, we propose a novel approach that combines +the performance of Reinforcement Learning (RL) and the sample efficiency of +Imitation Learning (IL) in the task of vision-based autonomous drone racing. +While RL provides a framework for learning high-performance controllers through +trial and error, it faces challenges with sample efficiency and computational +demands due to the high dimensionality of visual inputs. Conversely, IL +efficiently learns from visual expert demonstrations, but it remains limited by +the expert's performance and state distribution. To overcome these limitations, +our policy learning framework integrates the strengths of both approaches. Our +framework contains three phases: training a teacher policy using RL with +privileged state information, distilling it into a student policy via IL, and +adaptive fine-tuning via RL. Testing in both simulated and real-world scenarios +shows our approach can not only learn in scenarios where RL from scratch fails +but also outperforms existing IL methods in both robustness and performance, +successfully navigating a quadrotor through a race course using only visual +information. Videos of the experiments are available at +https://rpg.ifi.uzh.ch/bootstrap-rl-with-il/index.html. + +
+
+ comment: 8th Annual Conference on Robot Learning (CoRL) +
+
+
+
+
+ + ♻ ☆ REVEX: A Unified Framework for Removal-Based Explainable Artificial + Intelligence in Video + + +
+ We developed REVEX, a removal-based video explanations framework. This work +extends fine-grained explanation frameworks for computer vision data and adapts +six existing techniques to video by adding temporal information and local +explanations. The adapted methods were evaluated across networks, datasets, +image classes, and evaluation metrics. By decomposing explanation into steps, +strengths and weaknesses were revealed in the studied methods, for example, on +pixel clustering and perturbations in the input. Video LIME outperformed other +methods with deletion values up to 31% lower and insertion up to 30% higher, +depending on method and network. Video RISE achieved superior performance in +the average drop metric, with values 10% lower. In contrast, +localization-based metrics revealed low performance across all methods, with +significant variation depending on network. Pointing game accuracy reached +53%, and IoU-based metrics remained below 20%. Drawing on the findings across +XAI methods, we further examine the limitations of the employed XAI evaluation +metrics and highlight their suitability in different applications. + +
+
+
+
+
+ + ♻ ☆ Transfer Learning for Wildlife Classification: Evaluating YOLOv8 against + DenseNet, ResNet, and VGGNet on a Custom Dataset + + +
+ This study evaluates the performance of various deep learning models, +specifically DenseNet, ResNet, VGGNet, and YOLOv8, for wildlife species +classification on a custom dataset. The dataset comprises 575 images of 23 +endangered species sourced from reputable online repositories. The study +utilizes transfer learning to fine-tune pre-trained models on the dataset, +focusing on reducing training time and enhancing classification accuracy. The +results demonstrate that YOLOv8 outperforms other models, achieving a training +accuracy of 97.39% and a validation F1-score of 96.50%. These findings suggest +that YOLOv8, with its advanced architecture and efficient feature extraction +capabilities, holds great promise for automating wildlife monitoring and +conservation efforts. + +
+
+ comment: This is published in Journal of Artificial Intelligence and Capsule + Networks, December 2024, Volume 6, Issue 4, Pages 415-435 +
+
+
+
+
+ + ♻ ☆ LLMs Can Evolve Continually on Modality for X-Modal Reasoning + + +
+ Multimodal Large Language Models (MLLMs) have gained significant attention +due to their impressive capabilities in multimodal understanding. However, +existing methods rely heavily on extensive modal-specific pretraining and +joint-modal tuning, leading to significant computational burdens when expanding +to new modalities. In this paper, we propose PathWeave, a flexible and scalable +framework with modal-Path sWitching and ExpAnsion abilities that enables MLLMs +to continually EVolve on modalities for $\mathbb{X}$-modal reasoning. We +leverage the concept of Continual Learning and develop an incremental training +strategy atop pre-trained MLLMs, enabling their expansion to new modalities +using uni-modal data, without executing joint-modal pretraining. In detail, a +novel Adapter-in-Adapter (AnA) framework is introduced, in which uni-modal and +cross-modal adapters are seamlessly integrated to facilitate efficient modality +alignment and collaboration. Additionally, an MoE-based gating module is +applied between two types of adapters to further enhance the multimodal +interaction. To investigate the proposed method, we establish a challenging +benchmark called Continual Learning of Modality (MCL), which consists of +high-quality QA data from five distinct modalities: image, video, audio, depth +and point cloud. Extensive experiments demonstrate the effectiveness of the +proposed AnA framework on learning plasticity and memory stability during +continual learning. Furthermore, PathWeave performs comparably to +state-of-the-art MLLMs while concurrently reducing parameter training burdens +by 98.73%. Our code is available at https://github.com/JiazuoYu/PathWeave + +
+
+
+
+
+ + ♻ ☆ L4DR: LiDAR-4DRadar Fusion for Weather-Robust 3D Object Detection + + +
+ LiDAR-based vision systems are integral for 3D object detection, which is +crucial for autonomous navigation. However, they suffer from performance +degradation in adverse weather conditions due to the quality deterioration of +LiDAR point clouds. Fusing LiDAR with the weather-robust 4D radar sensor is +expected to solve this problem. However, the fusion of LiDAR and 4D radar is +challenging because they differ significantly in terms of data quality and the +degree of degradation in adverse weather. To address these issues, we introduce +L4DR, a weather-robust 3D object detection method that effectively achieves +LiDAR and 4D Radar fusion. Our L4DR includes Multi-Modal Encoding (MME) and +Foreground-Aware Denoising (FAD) technique to reconcile sensor gaps, which is +the first exploration of the complementarity of early fusion between LiDAR and +4D radar. Additionally, we design an Inter-Modal and Intra-Modal ({IM}2 ) +parallel feature extraction backbone coupled with a Multi-Scale Gated Fusion +(MSGF) module to counteract the varying degrees of sensor degradation under +adverse weather conditions. Experimental evaluation on a VoD dataset with +simulated fog proves that L4DR is more adaptable to changing weather +conditions. It delivers a significant performance increase under different fog +levels, improving the 3D mAP by up to 20.0% over the traditional LiDAR-only +approach. Moreover, the results on the K-Radar dataset validate the consistent +performance improvement of L4DR in real-world adverse weather conditions. + +
+
+
+
+
+ + ♻ ☆ Adapting Segment Anything Model to Multi-modal Salient Object Detection + with Semantic Feature Fusion Guidance + + +
+ Although most existing multi-modal salient object detection (SOD) methods +demonstrate effectiveness through training models from scratch, the limited +multi-modal data hinders these methods from reaching optimality. In this paper, +we propose a novel framework to explore and exploit the powerful feature +representation and zero-shot generalization ability of the pre-trained Segment +Anything Model (SAM) for multi-modal SOD. Despite serving as a recent vision +fundamental model, driving the class-agnostic SAM to comprehend and detect +salient objects accurately is non-trivial, especially in challenging scenes. To +this end, we develop \underline{SAM} with se\underline{m}antic +f\underline{e}ature fu\underline{s}ion guidanc\underline{e} (Sammese), which +incorporates multi-modal saliency-specific knowledge into SAM to adapt SAM to +multi-modal SOD tasks. However, it is difficult for SAM trained on single-modal +data to directly mine the complementary benefits of multi-modal inputs and +comprehensively utilize them to achieve accurate saliency prediction. To +address these issues, we first design a multi-modal complementary fusion module +to extract robust multi-modal semantic features by integrating information from +visible and thermal or depth image pairs. Then, we feed the extracted +multi-modal semantic features into both the SAM image encoder and mask decoder +for fine-tuning and prompting, respectively. Specifically, in the image +encoder, a multi-modal adapter is proposed to adapt the single-modal SAM to +multi-modal information. In the mask decoder, a semantic-geometric prompt +generation strategy is proposed to produce corresponding embeddings with +various saliency cues. Extensive experiments on both RGB-D and RGB-T SOD +benchmarks show the effectiveness of the proposed framework. The code will be +available at \url{https://github.com/Angknpng/Sammese}. + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ CausalDiff: Causality-Inspired Disentanglement via Diffusion Model for + Adversarial Defense NeurIPS 2024 + + +
+ Despite ongoing efforts to defend neural classifiers from adversarial +attacks, they remain vulnerable, especially to unseen attacks. In contrast, +humans are difficult to be cheated by subtle manipulations, since we make +judgments only based on essential factors. Inspired by this observation, we +attempt to model label generation with essential label-causative factors and +incorporate label-non-causative factors to assist data generation. For an +adversarial example, we aim to discriminate the perturbations as non-causative +factors and make predictions only based on the label-causative factors. +Concretely, we propose a causal diffusion model (CausalDiff) that adapts +diffusion models for conditional data generation and disentangles the two types +of causal factors by learning towards a novel causal information bottleneck +objective. Empirically, CausalDiff has significantly outperformed +state-of-the-art defense methods on various unseen attacks, achieving an +average robustness of 86.39% (+4.01%) on CIFAR-10, 56.25% (+3.13%) on +CIFAR-100, and 82.62% (+4.93%) on GTSRB (German Traffic Sign Recognition +Benchmark). + +
+
+ comment: accepted by NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Act in Collusion: A Persistent Distributed Multi-Target Backdoor in + Federated Learning + + +
+ Federated learning, a novel paradigm designed to protect data privacy, is +vulnerable to backdoor attacks due to its distributed nature. Current research +often designs attacks based on a single attacker with a single backdoor, +overlooking more realistic and complex threats in federated learning. We +propose a more practical threat model for federated learning: the distributed +multi-target backdoor. In this model, multiple attackers control different +clients, embedding various triggers and targeting different classes, +collaboratively implanting backdoors into the global model via central +aggregation. Empirical validation shows that existing methods struggle to +maintain the effectiveness of multiple backdoors in the global model. Our key +insight is that similar backdoor triggers cause parameter conflicts and +injecting new backdoors disrupts gradient directions, significantly weakening +some backdoors' performance. To solve this, we propose a Distributed +Multi-Target Backdoor Attack (DMBA), ensuring efficiency and persistence of +backdoors from different malicious clients. To avoid parameter conflicts, we +design a multi-channel dispersed frequency trigger strategy to maximize trigger +differences. To mitigate gradient interference, we introduce backdoor replay in +local training to neutralize conflicting gradients. Extensive validation shows +that 30 rounds after the attack, Attack Success Rates of three different +backdoors from various clients remain above 93%. The code will be made publicly +available after the review period. + +
+
+
+
+
+ + ♻ ☆ PhyTracker: An Online Tracker for Phytoplankton + + +
+ Phytoplankton, a crucial component of aquatic ecosystems, requires efficient +monitoring to understand marine ecological processes and environmental +conditions. Traditional phytoplankton monitoring methods, relying on non-in +situ observations, are time-consuming and resource-intensive, limiting timely +analysis. To address these limitations, we introduce PhyTracker, an intelligent +in situ tracking framework designed for automatic tracking of phytoplankton. +PhyTracker overcomes significant challenges unique to phytoplankton monitoring, +such as constrained mobility within water flow, inconspicuous appearance, and +the presence of impurities. Our method incorporates three innovative modules: a +Texture-enhanced Feature Extraction (TFE) module, an Attention-enhanced +Temporal Association (ATA) module, and a Flow-agnostic Movement Refinement +(FMR) module. These modules enhance feature capture, differentiate between +phytoplankton and impurities, and refine movement characteristics, +respectively. Extensive experiments on the PMOT dataset validate the +superiority of PhyTracker in phytoplankton tracking, and additional tests on +the MOT dataset demonstrate its general applicability, outperforming +conventional tracking methods. This work highlights key differences between +phytoplankton and traditional objects, offering an effective solution for +phytoplankton monitoring. + +
+
+ comment: 13 pages, eleven figures +
+
+
+
+
+
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ A Theoretical Analysis of Recommendation Loss Functions under Negative + Sampling + + +
+ Recommender Systems (RSs) are pivotal in diverse domains such as e-commerce, +music streaming, and social media. This paper conducts a comparative analysis +of prevalent loss functions in RSs: Binary Cross-Entropy (BCE), Categorical +Cross-Entropy (CCE), and Bayesian Personalized Ranking (BPR). Exploring the +behaviour of these loss functions across varying negative sampling settings, we +reveal that BPR and CCE are equivalent when one negative sample is used. +Additionally, we demonstrate that all losses share a common global minimum. +Evaluation of RSs mainly relies on ranking metrics known as Normalized +Discounted Cumulative Gain (NDCG) and Mean Reciprocal Rank (MRR). We produce +bounds of the different losses for negative sampling settings to establish a +probabilistic lower bound for NDCG. We show that the BPR bound on NDCG is +weaker than that of BCE, contradicting the common assumption that BPR is +superior to BCE in RSs training. Experiments on five datasets and four models +empirically support these theoretical findings. Our code is available at +\url{https://anonymous.4open.science/r/recsys_losses} . + +
+
+ comment: main paper 8 pages, 4 figures +
+
+
+
+
+ + ☆ Unlocking Legal Knowledge with Multi-Layered Embedding-Based Retrieval + + +
+ This work addresses the challenge of capturing the complexities of legal +knowledge by proposing a multi-layered embedding-based retrieval method for +legal and legislative texts. Creating embeddings not only for individual +articles but also for their components (paragraphs, clauses) and structural +groupings (books, titles, chapters, etc), we seek to capture the subtleties of +legal information through the use of dense vectors of embeddings, representing +it at varying levels of granularity. Our method meets various information needs +by allowing the Retrieval Augmented Generation system to provide accurate +responses, whether for specific segments or entire sections, tailored to the +user's query. We explore the concepts of aboutness, semantic chunking, and +inherent hierarchy within legal texts, arguing that this method enhances the +legal information retrieval. Despite the focus being on Brazil's legislative +methods and the Brazilian Constitution, which follow a civil law tradition, our +findings should in principle be applicable across different legal systems, +including those adhering to common law traditions. Furthermore, the principles +of the proposed method extend beyond the legal domain, offering valuable +insights for organizing and retrieving information in any field characterized +by information encoded in hierarchical text. + +
+
+ comment: 27 pages, 10 figures +
+
+
+
+
+ + ☆ Advancing Sustainability via Recommender Systems: A Survey + + +
+ Human behavioral patterns and consumption paradigms have emerged as pivotal +determinants in environmental degradation and climate change, with quotidian +decisions pertaining to transportation, energy utilization, and resource +consumption collectively precipitating substantial ecological impacts. +Recommender systems, which generate personalized suggestions based on user +preferences and historical interaction data, exert considerable influence on +individual behavioral trajectories. However, conventional recommender systems +predominantly optimize for user engagement and economic metrics, inadvertently +neglecting the environmental and societal ramifications of their +recommendations, potentially catalyzing over-consumption and reinforcing +unsustainable behavioral patterns. Given their instrumental role in shaping +user decisions, there exists an imperative need for sustainable recommender +systems that incorporate sustainability principles to foster eco-conscious and +socially responsible choices. This comprehensive survey addresses this critical +research gap by presenting a systematic analysis of sustainable recommender +systems. As these systems can simultaneously advance multiple sustainability +objectives--including resource conservation, sustainable consumer behavior, and +social impact enhancement--examining their implementations across distinct +application domains provides a more rigorous analytical framework. Through a +methodological analysis of domain-specific implementations encompassing +transportation, food, buildings, and auxiliary sectors, we can better elucidate +how these systems holistically advance sustainability objectives while +addressing sector-specific constraints and opportunities. Moreover, we +delineate future research directions for evolving recommender systems beyond +sustainability advocacy toward fostering environmental resilience and social +consciousness in society. + +
+
+ comment: 20 pages, 10 figures. Working paper: https://github.com/enoche/SusRec +
+
+
+
+
+ + ☆ Overhead-free User-side Recommender Systems + + +
+ Traditionally, recommendation algorithms have been designed for service +developers. But recently, a new paradigm called user-side recommender systems +has been proposed. User-side recommender systems are built and used by end +users, in sharp contrast to traditional provider-side recommender systems. Even +if the official recommender system offered by the provider is not fair, end +users can create and enjoy their own user-side recommender systems by +themselves. Although the concept of user-side recommender systems is +attractive, the problem is they require tremendous communication costs between +the user and the official system. Even the most efficient user-side recommender +systems require about 5 times more costs than provider-side recommender +systems. Such high costs hinder the adoption of user-side recommender systems. +In this paper, we propose overhead-free user-side recommender systems, +RecCycle, which realizes user-side recommender systems without any +communication overhead. The main idea of RecCycle is to recycle past +recommendation results offered by the provider's recommender systems. The +ingredients of RecCycle can be retrieved ``for free,'' and it greatly reduces +the cost of user-side recommendations. In the experiments, we confirm that +RecCycle performs as well as state-of-the-art user-side recommendation +algorithms while RecCycle reduces costs significantly. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2208.09864, + arXiv:2403.15757 +
+
+
+
+
+ + ☆ Towards Automated Model Design on Recommender Systems + + +
+ The increasing popularity of deep learning models has created new +opportunities for developing AI-based recommender systems. Designing +recommender systems using deep neural networks requires careful architecture +design, and further optimization demands extensive co-design efforts on jointly +optimizing model architecture and hardware. Design automation, such as +Automated Machine Learning (AutoML), is necessary to fully exploit the +potential of recommender model design, including model choices and +model-hardware co-design strategies. We introduce a novel paradigm that +utilizes weight sharing to explore abundant solution spaces. Our paradigm +creates a large supernet to search for optimal architectures and co-design +strategies to address the challenges of data multi-modality and heterogeneity +in the recommendation domain. From a model perspective, the supernet includes a +variety of operators, dense connectivity, and dimension search options. From a +co-design perspective, it encompasses versatile Processing-In-Memory (PIM) +configurations to produce hardware-efficient models. Our solution space's +scale, heterogeneity, and complexity pose several challenges, which we address +by proposing various techniques for training and evaluating the supernet. Our +crafted models show promising results on three Click-Through Rates (CTR) +prediction benchmarks, outperforming both manually designed and AutoML-crafted +models with state-of-the-art performance when focusing solely on architecture +search. From a co-design perspective, we achieve 2x FLOPs efficiency, 1.8x +energy efficiency, and 1.5x performance improvements in recommender models. + +
+
+ comment: Accepted in ACM Transactions on Recommender Systems. arXiv admin + note: substantial text overlap with arXiv:2207.07187 +
+
+
+
+
+ + ☆ AdaS&S: a One-Shot Supernet Approach for Automatic Embedding Size Search + in Deep Recommender System + + +
+ Deep Learning Recommendation Models (DLRMs) utilize the embedding layer to +represent various categorical features. Traditional DLRMs adopt unified +embedding size for all features, leading to suboptimal performance and +redundant parameters. Thus, lots of Automatic Embedding size Search (AES) works +focus on obtaining mixed embedding sizes with strong model performance. +However, previous AES works can hardly address several challenges together: (1) +The search results of embedding sizes are unstable; (2) Recommendation effect +with AES results is unsatisfactory; (3) Memory cost of embeddings is +uncontrollable. To address these challenges, we propose a novel one-shot AES +framework called AdaS&S, in which a supernet encompassing various candidate +embeddings is built and AES is performed as searching network architectures +within it. Our framework contains two main stages: In the first stage, we +decouple training parameters from searching embedding sizes, and propose the +Adaptive Sampling method to yield a well-trained supernet, which further helps +to produce stable AES results. In the second stage, to obtain embedding sizes +that benefit the model effect, we design a reinforcement learning search +process which utilizes the supernet trained previously. Meanwhile, to adapt +searching to specific resource constraint, we introduce the resource +competition penalty to balance the model effectiveness and memory cost of +embeddings. We conduct extensive experiments on public datasets to show the +superiority of AdaS&S. Our method could improve AUC by about 0.3% while saving +about 20% of model parameters. Empirical analysis also shows that the stability +of searching results in AdaS&S significantly exceeds other methods. + +
+
+
+
+
+ + ☆ Enhancing Link Prediction with Fuzzy Graph Attention Networks and + Dynamic Negative Sampling + + +
+ Link prediction is crucial for understanding complex networks but traditional +Graph Neural Networks (GNNs) often rely on random negative sampling, leading to +suboptimal performance. This paper introduces Fuzzy Graph Attention Networks +(FGAT), a novel approach integrating fuzzy rough sets for dynamic negative +sampling and enhanced node feature aggregation. Fuzzy Negative Sampling (FNS) +systematically selects high-quality negative edges based on fuzzy similarities, +improving training efficiency. FGAT layer incorporates fuzzy rough set +principles, enabling robust and discriminative node representations. +Experiments on two research collaboration networks demonstrate FGAT's superior +link prediction accuracy, outperforming state-of-the-art baselines by +leveraging the power of fuzzy rough sets for effective negative sampling and +node feature learning. + +
+
+
+
+
+ + ♻ ☆ Explicit and Implicit Semantic Ranking Framework + + +
+ The core challenge in numerous real-world applications is to match an inquiry +to the best document from a mutable and finite set of candidates. Existing +industry solutions, especially latency-constrained services, often rely on +similarity algorithms that sacrifice quality for speed. In this paper we +introduce a generic semantic learning-to-rank framework, Self-training Semantic +Cross-attention Ranking (sRank). This transformer-based framework uses linear +pairwise loss with mutable training batch sizes and achieves quality gains and +high efficiency, and has been applied effectively to show gains on two industry +tasks at Microsoft over real-world large-scale data sets: Smart Reply (SR) and +Ambient Clinical Intelligence (ACI). In Smart Reply, sRank assists live +customers with technical support by selecting the best reply from predefined +solutions based on consumer and support agent messages. It achieves 11.7% gain +in offline top-one accuracy on the SR task over the previous system, and has +enabled 38.7% time reduction in composing messages in telemetry recorded since +its general release in January 2021. In the ACI task, sRank selects relevant +historical physician templates that serve as guidance for a text summarization +model to generate higher quality medical notes. It achieves 35.5% top-one +accuracy gain, along with 46% relative ROUGE-L gain in generated medical notes. + +
+
+
+
+
+ + ♻ ☆ An Early FIRST Reproduction and Improvements to Single-Token Decoding + for Fast Listwise Reranking + + +
+ Recent advances have demonstrated that large language models (LLMs) excel as +listwise rerankers, but their high computational demands remain a barrier to +widespread adoption. Further, the traditional language modeling (LM) objective +is not ideally suited for reranking tasks. FIRST is a novel approach that +addresses these challenges by integrating a learning-to-rank objective and +leveraging the logits of only the first generated token, thereby significantly +reducing inference latency compared to traditional LLM rerankers. In this +study, we extend the evaluation of FIRST to the TREC Deep Learning datasets +(DL19-22), validating its robustness across diverse domains. We investigate the +influence of different first-stage retrievers on FIRST rerankers, observing +diminishing returns and patterns consistent with traditional LLM rerankers. +Through applying the FIRST objective to a broader range of backbone models, we +achieve effectiveness surpassing the original implementation. Our experiments +confirm that fast reranking with single-token logits does not compromise +out-of-domain reranking quality. To better quantify the computational savings +in the original study, we measure and compare latency to find a 21%-42% gain +across various models and benchmarks. Moreover, while LM training implicitly +improves zero-shot single-token reranking, our experiments also raise questions +about whether LM pre-training may hinder subsequent fine-tuning with the FIRST +objective. These findings pave the way for more efficient and effective +listwise reranking in future applications. + +
+
+
+
+
+ + ♻ ☆ Content-Based Collaborative Generation for Recommender Systems CIKM 2024 + + +
+ Generative models have emerged as a promising utility to enhance recommender +systems. It is essential to model both item content and user-item collaborative +interactions in a unified generative framework for better recommendation. +Although some existing large language model (LLM)-based methods contribute to +fusing content information and collaborative signals, they fundamentally rely +on textual language generation, which is not fully aligned with the +recommendation task. How to integrate content knowledge and collaborative +interaction signals in a generative framework tailored for item recommendation +is still an open research challenge. + In this paper, we propose content-based collaborative generation for +recommender systems, namely ColaRec. ColaRec is a sequence-to-sequence +framework which is tailored for directly generating the recommended item +identifier. Precisely, the input sequence comprises data pertaining to the +user's interacted items, and the output sequence represents the generative +identifier (GID) for the suggested item. To model collaborative signals, the +GIDs are constructed from a pretrained collaborative filtering model, and the +user is represented as the content aggregation of interacted items. To this +end, ColaRec captures both collaborative signals and content information in a +unified framework. Then an item indexing task is proposed to conduct the +alignment between the content-based semantic space and the interaction-based +collaborative space. Besides, a contrastive loss is further introduced to +ensure that items with similar collaborative GIDs have similar content +representations. To verify the effectiveness of ColaRec, we conduct experiments +on four benchmark datasets. Empirical results demonstrate the superior +performance of ColaRec. + +
+
+ comment: Accepted by CIKM 2024; GitHub: + https://github.com/Junewang0614/ColaRec +
+
+
+
+
+ + ♻ ☆ A Comparative Study on Enhancing Prediction in Social Network + Advertisement through Data Augmentation + + +
+ In the ever-evolving landscape of social network advertising, the volume and +accuracy of data play a critical role in the performance of predictive models. +However, the development of robust predictive algorithms is often hampered by +the limited size and potential bias present in real-world datasets. This study +presents and explores a generative augmentation framework of social network +advertising data. Our framework explores three generative models for data +augmentation - Generative Adversarial Networks (GANs), Variational Autoencoders +(VAEs), and Gaussian Mixture Models (GMMs) - to enrich data availability and +diversity in the context of social network advertising analytics effectiveness. +By performing synthetic extensions of the feature space, we find that through +data augmentation, the performance of various classifiers has been +quantitatively improved. Furthermore, we compare the relative performance gains +brought by each data augmentation technique, providing insights for +practitioners to select appropriate techniques to enhance model performance. +This paper contributes to the literature by showing that synthetic data +augmentation alleviates the limitations imposed by small or imbalanced datasets +in the field of social network advertising. At the same time, this article also +provides a comparative perspective on the practicality of different data +augmentation methods, thereby guiding practitioners to choose appropriate +techniques to enhance model performance. + +
+
+ comment: Accepted by 2024 4th International Conference on Machine Learning and + Intelligent Systems Engineering (MLISE) +
+
+
+
+
+ + ♻ ☆ Explainable Identification of Hate Speech towards Islam using Graph + Neural Networks NeurIPS 2023 + + +
+ Islamophobic language on online platforms fosters intolerance, making +detection and elimination crucial for promoting harmony. Traditional hate +speech detection models rely on NLP techniques like tokenization, +part-of-speech tagging, and encoder-decoder models. However, Graph Neural +Networks (GNNs), with their ability to utilize relationships between data +points, offer more effective detection and greater explainability. In this +work, we represent speeches as nodes and connect them with edges based on their +context and similarity to develop the graph. This study introduces a novel +paradigm using GNNs to identify and explain hate speech towards Islam. Our +model leverages GNNs to understand the context and patterns of hate speech by +connecting texts via pretrained NLP-generated word embeddings, achieving +state-of-the-art performance and enhancing detection accuracy while providing +valuable explanations. This highlights the potential of GNNs in combating +online hate speech and fostering a safer, more inclusive online environment. + +
+
+ comment: Accepted in: (i) NeurIPS 2023 : Muslims in ML Workshop (Non-archival) + (https://www.musiml.org/schedule/#:~:text=Azmine%20Toushik%20Wasi) (ii) EMNLP + 2024 : NLP for Positive Impact Workshop (Archival; ACL Anthology: + https://aclanthology.org/2024.nlp4pi-1.23/) +
+
+
+
+
+
+
+
+ + Machine Learning 7 + +
+
+
+ + ☆ Retrieval Augmented Time Series Forecasting + + +
+ Retrieval-augmented generation (RAG) is a central component of modern LLM +systems, particularly in scenarios where up-to-date information is crucial for +accurately responding to user queries or when queries exceed the scope of the +training data. The advent of time-series foundation models (TSFM), such as +Chronos, and the need for effective zero-shot forecasting performance across +various time-series domains motivates the question: Do benefits of RAG +similarly carry over to time series forecasting? In this paper, we advocate +that the dynamic and event-driven nature of time-series data makes RAG a +crucial component of TSFMs and introduce a principled RAG framework for +time-series forecasting, called Retrieval Augmented Forecasting (RAF). Within +RAF, we develop efficient strategies for retrieving related time-series +examples and incorporating them into forecast. Through experiments and +mechanistic studies, we demonstrate that RAF indeed improves the forecasting +accuracy across diverse time series domains and the improvement is more +significant for larger TSFM sizes. + +
+
+
+
+
+ + ☆ Deceiving Question-Answering Models: A Hybrid Word-Level Adversarial + Approach + + +
+ Deep learning underpins most of the currently advanced natural language +processing (NLP) tasks such as textual classification, neural machine +translation (NMT), abstractive summarization and question-answering (QA). +However, the robustness of the models, particularly QA models, against +adversarial attacks is a critical concern that remains insufficiently explored. +This paper introduces QA-Attack (Question Answering Attack), a novel word-level +adversarial strategy that fools QA models. Our attention-based attack exploits +the customized attention mechanism and deletion ranking strategy to identify +and target specific words within contextual passages. It creates deceptive +inputs by carefully choosing and substituting synonyms, preserving grammatical +integrity while misleading the model to produce incorrect responses. Our +approach demonstrates versatility across various question types, particularly +when dealing with extensive long textual inputs. Extensive experiments on +multiple benchmark datasets demonstrate that QA-Attack successfully deceives +baseline QA models and surpasses existing adversarial techniques regarding +success rate, semantics changes, BLEU score, fluency and grammar error rate. + +
+
+
+
+
+ + ☆ NVCiM-PT: An NVCiM-assisted Prompt Tuning Framework for Edge LLMs DATE 2025 + + +
+ Large Language Models (LLMs) deployed on edge devices, known as edge LLMs, +need to continuously fine-tune their model parameters from user-generated data +under limited resource constraints. However, most existing learning methods are +not applicable for edge LLMs because of their reliance on high resources and +low learning capacity. Prompt tuning (PT) has recently emerged as an effective +fine-tuning method for edge LLMs by only modifying a small portion of LLM +parameters, but it suffers from user domain shifts, resulting in repetitive +training and losing resource efficiency. Conventional techniques to address +domain shift issues often involve complex neural networks and sophisticated +training, which are incompatible for PT for edge LLMs. Therefore, an open +research question is how to address domain shift issues for edge LLMs with +limited resources. In this paper, we propose a prompt tuning framework for edge +LLMs, exploiting the benefits offered by non-volatile computing-in-memory +(NVCiM) architectures. We introduce a novel NVCiM-assisted PT framework, where +we narrow down the core operations to matrix-matrix multiplication, which can +then be accelerated by performing in-situ computation on NVCiM. To the best of +our knowledge, this is the first work employing NVCiM to improve the edge LLM +PT performance. + +
+
+ comment: Accepted by DATE 2025 +
+
+
+
+
+ + ☆ A Social Outcomes and Priorities centered (SOP) Framework for AI policy + + +
+ Rapid developments in AI and its adoption across various domains have +necessitated a need to build robust guardrails and risk containment plans while +ensuring equitable benefits for the betterment of society. The current +technology-centered approach has resulted in a fragmented, reactive, and +ineffective policy apparatus. This paper highlights the immediate and urgent +need to pivot to a society-centered approach to develop comprehensive, +coherent, forward-looking AI policy. To this end, we present a Social Outcomes +and Priorities centered (SOP) framework for AI policy along with proposals on +implementation of its various components. While the SOP framework is presented +from a US-centric view, the takeaways are general and applicable globally. + +
+
+
+
+
+ + ☆ Imitation Learning from Observations: An Autoregressive Mixture of + Experts Approach + + +
+ This paper presents a novel approach to imitation learning from observations, +where an autoregressive mixture of experts model is deployed to fit the +underlying policy. The parameters of the model are learned via a two-stage +framework. By leveraging the existing dynamics knowledge, the first stage of +the framework estimates the control input sequences and hence reduces the +problem complexity. At the second stage, the policy is learned by solving a +regularized maximum-likelihood estimation problem using the estimated control +input sequences. We further extend the learning procedure by incorporating a +Lyapunov stability constraint to ensure asymptotic stability of the identified +model, for accurate multi-step predictions. The effectiveness of the proposed +framework is validated using two autonomous driving datasets collected from +human demonstrations, demonstrating its practical applicability in modelling +complex nonlinear dynamics. + +
+
+
+
+
+ + ♻ ☆ PEaRL: Personalized Privacy of Human-Centric Systems using Early-Exit + Reinforcement Learning + + +
+ In the evolving landscape of human-centric systems, personalized privacy +solutions are becoming increasingly crucial due to the dynamic nature of human +interactions. Traditional static privacy models often fail to meet the diverse +and changing privacy needs of users. This paper introduces PEaRL, a system +designed to enhance privacy preservation by tailoring its approach to +individual behavioral patterns and preferences. While incorporating +reinforcement learning (RL) for its adaptability, PEaRL primarily focuses on +employing an early-exit strategy that dynamically balances privacy protection +and system utility. This approach addresses the challenges posed by the +variability and evolution of human behavior, which static privacy models +struggle to handle effectively. We evaluate PEaRL in two distinct contexts: +Smart Home environments and Virtual Reality (VR) Smart Classrooms. The +empirical results demonstrate PEaRL's capability to provide a personalized +tradeoff between user privacy and application utility, adapting effectively to +individual user preferences. On average, across both systems, PEaRL enhances +privacy protection by 31%, with a corresponding utility reduction of 24%. + +
+
+ comment: 15 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ A Unified Analysis on the Subgradient Upper Bounds for the Subgradient + Methods Minimizing Composite Nonconvex, Nonsmooth and Non-Lipschitz Functions + + +
+ This paper presents a unified analysis for the proximal subgradient method +(Prox-SubGrad) type approach to minimize an overall objective of $f(x)+r(x)$, +subject to convex constraints, where both $f$ and $r$ are weakly convex, +nonsmooth, and non-Lipschitz. Leveraging on the properties of the Moreau +envelope of weakly convex functions, we are able to relate error-bound +conditions, the growth conditions of the subgradients of the objective, and the +behavior of the proximal subgradient iterates on some remarkably broad classes +of objective functions. Various existing as well as new bounding conditions are +studied, leading to novel iteration complexity results. The terrain of our +exploration expands to stochastic proximal subgradient algorithms. + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Rendering-Oriented 3D Point Cloud Attribute Compression using Sparse + Tensor-based Transformer + + +
+ The evolution of 3D visualization techniques has fundamentally transformed +how we interact with digital content. At the forefront of this change is point +cloud technology, offering an immersive experience that surpasses traditional +2D representations. However, the massive data size of point clouds presents +significant challenges in data compression. Current methods for lossy point +cloud attribute compression (PCAC) generally focus on reconstructing the +original point clouds with minimal error. However, for point cloud +visualization scenarios, the reconstructed point clouds with distortion still +need to undergo a complex rendering process, which affects the final +user-perceived quality. In this paper, we propose an end-to-end deep learning +framework that seamlessly integrates PCAC with differentiable rendering, +denoted as rendering-oriented PCAC (RO-PCAC), directly targeting the quality of +rendered multiview images for viewing. In a differentiable manner, the impact +of the rendering process on the reconstructed point clouds is taken into +account. Moreover, we characterize point clouds as sparse tensors and propose a +sparse tensor-based transformer, called SP-Trans. By aligning with the local +density of the point cloud and utilizing an enhanced local attention mechanism, +SP-Trans captures the intricate relationships within the point cloud, further +improving feature analysis and synthesis within the framework. Extensive +experiments demonstrate that the proposed RO-PCAC achieves state-of-the-art +compression performance, compared to existing reconstruction-oriented methods, +including traditional, learning-based, and hybrid methods. + +
+
+
+
+
+ + ☆ Automatic Album Sequencing + + +
+ Album sequencing is a critical part of the album production process. +Recently, a data-driven approach was proposed that sequences general +collections of independent media by extracting the narrative essence of the +items in the collections. While this approach implies an album sequencing +technique, it is not widely accessible to a less technical audience, requiring +advanced knowledge of machine learning techniques to use. To address this, we +introduce a new user-friendly web-based tool that allows a less technical +audience to upload music tracks, execute this technique in one click, and +subsequently presents the result in a clean visualization to the user. To both +increase the number of templates available to the user and address shortcomings +of previous work, we also introduce a new direct transformer-based album +sequencing method. We find that our more direct method outperforms a random +baseline but does not reach the same performance as the narrative essence +approach. Both methods are included in our web-based user interface, and this +-- alongside a full copy of our implementation -- is publicly available at +https://github.com/dylanashley/automatic-album-sequencing + +
+
+ comment: presented as a late breaking demo in the 25th International Society + for Music Information Retrieval Conference; 3 pages in main text, 3 figures + in main text; source code available at + https://github.com/dylanashley/automatic-album-sequencing +
+
+
+
+
+ + ☆ SAV-SE: Scene-aware Audio-Visual Speech Enhancement with Selective State + Space Model + + +
+ Speech enhancement plays an essential role in various applications, and the +integration of visual information has been demonstrated to bring substantial +advantages. However, the majority of current research concentrates on the +examination of facial and lip movements, which can be compromised or entirely +inaccessible in scenarios where occlusions occur or when the camera view is +distant. Meanwhile, contextual visual cues from the surrounding environment have +been overlooked: for example, when we see a dog bark, our brain has the innate +ability to discern and filter out the barking noise. To this end, in this +paper, we introduce a novel task, i.e. SAV-SE. To the best of our knowledge, this is +the first proposal to use rich contextual information from synchronized video +as auxiliary cues to indicate the type of noise, which eventually improves the +speech enhancement performance. Specifically, we propose the VC-S$^2$E method, +which incorporates the Conformer and Mamba modules for their complementary +strengths. Extensive experiments are conducted on public MUSIC, AVSpeech and +AudioSet datasets, where the results demonstrate the superiority of VC-S$^2$E +over other competitive methods. We will make the source code publicly +available. Project demo page: https://AVSEPage.github.io/ + +
+
+
+
+
+ + ☆ Understanding Audiovisual Deepfake Detection: Techniques, Challenges, + Human Factors and Perceptual Insights + + +
+ Deep Learning has been successfully applied in diverse fields, and its impact +on deepfake detection is no exception. Deepfakes are fake yet realistic +synthetic content that can be used deceitfully for political impersonation, +phishing, slandering, or spreading misinformation. Despite extensive research +on unimodal deepfake detection, identifying complex deepfakes through joint +analysis of audio and visual streams remains relatively unexplored. To fill +this gap, this survey first provides an overview of audiovisual deepfake +generation techniques, applications, and their consequences, and then provides +a comprehensive review of state-of-the-art methods that combine audio and +visual modalities to enhance detection accuracy, summarizing and critically +analyzing their strengths and limitations. Furthermore, we discuss existing +open source datasets for a deeper understanding, which can contribute to the +research community and provide necessary information to beginners who want to +analyze deep learning-based audiovisual methods for video forensics. By +bridging the gap between unimodal and multimodal approaches, this paper aims to +improve the effectiveness of deepfake detection strategies and guide future +research in cybersecurity and media integrity. + +
+
+
+
+
+ + ☆ Harmonizing Pixels and Melodies: Maestro-Guided Film Score Generation + and Composition Style Transfer + + +
+ We introduce a film score generation framework to harmonize visual pixels and +music melodies utilizing a latent diffusion model. Our framework processes film +clips as input and generates music that aligns with a general theme while +offering the capability to tailor outputs to a specific composition style. Our +model directly produces music from video, utilizing a streamlined and efficient +tuning mechanism on ControlNet. It also integrates a film encoder adept at +understanding the film's semantic depth, emotional impact, and aesthetic +appeal. Additionally, we introduce a novel, effective yet straightforward +evaluation metric to evaluate the originality and recognizability of music +within film scores. To fill this gap for film scores, we curate a comprehensive +dataset of film videos and legendary original scores, injecting domain-specific +knowledge into our data-driven generation model. Our model outperforms existing +methodologies in creating film scores, capable of generating music that +reflects the guidance of a maestro's style, thereby redefining the benchmark +for automated film scores and laying a robust groundwork for future research in +this domain. The code and generated samples are available at +https://anonymous.4open.science/r/HPM. + +
+
+
+
+
+ + ♻ ☆ Pseudo-triplet Guided Few-shot Composed Image Retrieval + + +
+ Composed Image Retrieval (CIR) is a challenging task that aims to retrieve +the target image with a multimodal query, i.e., a reference image, and its +complementary modification text. As previous supervised or zero-shot learning +paradigms all fail to strike a good trade-off between the model's +generalization ability and retrieval performance, recent researchers have +introduced the task of few-shot CIR (FS-CIR) and proposed a textual +inversion-based network based on pretrained CLIP model to realize it. Despite +its promising performance, the approach encounters two key limitations: simply +relying on the few annotated samples for CIR model training and +indiscriminately selecting training triplets for CIR model fine-tuning. To +address these two limitations, we propose a novel two-stage pseudo triplet +guided few-shot CIR scheme, dubbed PTG-FSCIR. In the first stage, we propose an +attentive masking and captioning-based pseudo triplet generation method, to +construct pseudo triplets from pure image data and use them to fulfill the +CIR-task specific pretraining. In the second stage, we propose a challenging +triplet-based CIR fine-tuning method, where we design a pseudo modification +text-based sample challenging score estimation strategy and a robust top +range-based random sampling strategy for sampling robust challenging triplets +to promote the model fine-tuning. Notably, our scheme is plug-and-play and +compatible with any existing supervised CIR models. We test our scheme across +two backbones on three public datasets (i.e., FashionIQ, CIRR, and +Birds-to-Words), achieving maximum improvements of 13.3%, 22.2%, and 17.4% +respectively, demonstrating our scheme's efficacy. + +
+
+ comment: 10 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 13 + +
+
+
+ + ☆ Untangling Hate Speech Definitions: A Semantic Componential Analysis + Across Cultures and Domains + + +
+ Hate speech relies heavily on cultural influences, leading to varying +individual interpretations. For that reason, we propose a Semantic Componential +Analysis (SCA) framework for a cross-cultural and cross-domain analysis of hate +speech definitions. We create the first dataset of definitions derived from +five domains: online dictionaries, research papers, Wikipedia articles, +legislation, and online platforms, which are later analyzed into semantic +components. Our analysis reveals that the components differ from definition to +definition, yet many domains borrow definitions from one another without taking +into account the target culture. We conduct zero-shot model experiments using +our proposed dataset, employing three popular open-sourced LLMs to understand +the impact of different definitions on hate speech detection. Our findings +indicate that LLMs are sensitive to definitions: responses for hate speech +detection change according to the complexity of definitions used in the prompt. + +
+
+
+
+
+ + ☆ Using Generative AI and Multi-Agents to Provide Automatic Feedback + + +
+ This study investigates the use of generative AI and multi-agent systems to +provide automatic feedback in educational contexts, particularly for student +constructed responses in science assessments. The research addresses a key gap +in the field by exploring how multi-agent systems, called AutoFeedback, can +improve the quality of GenAI-generated feedback, overcoming known issues such +as over-praise and over-inference that are common in single-agent large +language models (LLMs). The study developed a multi-agent system consisting of +two AI agents: one for generating feedback and another for validating and +refining it. The system was tested on a dataset of 240 student responses, and +its performance was compared to that of a single-agent LLM. Results showed that +AutoFeedback significantly reduced the occurrence of over-praise and +over-inference errors, providing more accurate and pedagogically sound +feedback. The findings suggest that multi-agent systems can offer a more +reliable solution for generating automated feedback in educational settings, +highlighting their potential for scalable and personalized learning support. +These results have important implications for educators and researchers seeking +to leverage AI in formative assessments, offering a pathway to more effective +feedback mechanisms that enhance student learning outcomes. + +
+
+
+
+
+ + ☆ Controllable Context Sensitivity and the Knob Behind It + + +
+ When making predictions, a language model must trade off how much it relies +on its context vs. its prior knowledge. Choosing how sensitive the model is to +its context is a fundamental functionality, as it enables the model to excel at +tasks like retrieval-augmented generation and question-answering. In this +paper, we search for a knob which controls this sensitivity, determining +whether language models answer from the context or their prior knowledge. To +guide this search, we design a task for controllable context sensitivity. In +this task, we first feed the model a context (Paris is in England) and a +question (Where is Paris?); we then instruct the model to either use its prior +or contextual knowledge and evaluate whether it generates the correct answer +for both intents (either France or England). When fine-tuned on this task, +instruction-tuned versions of Llama-3.1, Mistral-v0.3, and Gemma-2 can solve it +with high accuracy (85-95%). Analyzing these high-performing models, we narrow +down which layers may be important to context sensitivity using a novel linear +time algorithm. Then, in each model, we identify a 1-D subspace in a single +layer that encodes whether the model follows context or prior knowledge. +Interestingly, while we identify this subspace in a fine-tuned model, we find +that the exact same subspace serves as an effective knob in not only that model +but also non-fine-tuned instruct and base models of that model family. Finally, +we show a strong correlation between a model's performance and how distinctly +it separates context-agreeing from context-ignoring answers in this subspace. +These results suggest a single subspace facilitates how the model chooses +between context and prior knowledge, hinting at a simple fundamental mechanism +that controls this behavior. + +
+
+
+
+
+ + ☆ Beyond Keywords: A Context-based Hybrid Approach to Mining Ethical + Concern-related App Reviews + + +
+ With the increasing proliferation of mobile applications in our everyday +experiences, the concerns surrounding ethics have surged significantly. Users +generally communicate their feedback, report issues, and suggest new +functionalities in application (app) reviews, frequently emphasizing safety, +privacy, and accountability concerns. Incorporating these reviews is essential +to developing successful products. However, app reviews related to ethical +concerns generally use domain-specific language and are expressed using a more +varied vocabulary, thus making automated ethical concern-related app review +extraction a challenging and time-consuming effort. + This study proposes a novel Natural Language Processing (NLP) based approach +that combines Natural Language Inference (NLI), which provides a deep +comprehension of language nuances, and a decoder-only (LLaMA-like) Large +Language Model (LLM) to extract ethical concern-related app reviews at scale. +Utilizing 43,647 app reviews from the mental health domain, the proposed +methodology 1) Evaluates four NLI models to extract potential privacy reviews +and compares the results of domain-specific privacy hypotheses with generic +privacy hypotheses; 2) Evaluates four LLMs for classifying app reviews to +privacy concerns; and 3) Uses the best NLI and LLM models further to extract +new privacy reviews from the dataset. Results show that the +DeBERTa-v3-base-mnli-fever-anli NLI model with domain-specific hypotheses +yields the best performance, and Llama3.1-8B-Instruct LLM performs best in the +classification of app reviews. Then, using NLI+LLM, an additional 1,008 new +privacy-related reviews were extracted that were not identified through the +keyword-based approach in previous research, thus demonstrating the +effectiveness of the proposed approach. + +
+
+
+
+
+ + ☆ Toward Optimal Search and Retrieval for RAG NeurIPS 2024 + + +
+ Retrieval-augmented generation (RAG) is a promising method for addressing +some of the memory-related challenges associated with Large Language Models +(LLMs). Two separate systems form the RAG pipeline, the retriever and the +reader, and the impact of each on downstream task performance is not +well-understood. Here, we work towards the goal of understanding how retrievers +can be optimized for RAG pipelines for common tasks such as Question Answering +(QA). We conduct experiments focused on the relationship between retrieval and +RAG performance on QA and attributed QA and unveil a number of insights useful +to practitioners developing high-performance RAG pipelines. For example, +lowering search accuracy has minor implications for RAG performance while +potentially increasing retrieval speed and memory efficiency. + +
+
+ comment: Accepted to NeurIPS 2024 Workshop ATTRIB +
+
+
+
+
+ + ♻ ☆ ChuLo: Chunk-Level Key Information Representation for Long Document + Processing + + +
+ Transformer-based models have achieved remarkable success in various Natural +Language Processing (NLP) tasks, yet their ability to handle long documents is +constrained by computational limitations. Traditional approaches, such as +truncating inputs, sparse self-attention, and chunking, attempt to mitigate +these issues, but they often lead to information loss and hinder the model's +ability to capture long-range dependencies. In this paper, we introduce ChuLo, +a novel chunk representation method for long document classification that +addresses these limitations. Our ChuLo groups input tokens using unsupervised +keyphrase extraction, emphasizing semantically important keyphrase-based chunks +to retain core document content while reducing input length. This approach +minimizes information loss and improves the efficiency of Transformer-based +models. Preserving all tokens in long document understanding, especially token +classification tasks, is especially important to ensure that fine-grained +annotations, which depend on the entire sequence context, are not lost. We +evaluate our method on multiple long document classification tasks and long +document token classification tasks, demonstrating its effectiveness through +comprehensive qualitative and quantitative analyses. + +
+
+ comment: The paper has been submitted to a conference and is currently under + review +
+
+
+
+
+ + ♻ ☆ Decomposition of surprisal: Unified computational model of ERP + components in language processing + + +
+ The functional interpretation of language-related ERP components has been a +central debate in psycholinguistics for decades. We advance an +information-theoretic model of human language processing in the brain in which +incoming linguistic input is processed at first shallowly and later with more +depth, with these two kinds of information processing corresponding to distinct +electroencephalographic signatures. Formally, we show that the information +content (surprisal) of a word in context can be decomposed into two quantities: +(A) shallow surprisal, which signals shallow processing difficulty for a word, +and corresponds with the N400 signal; and (B) deep surprisal, which reflects +the discrepancy between shallow and deep representations, and corresponds to +the P600 signal and other late positivities. Both of these quantities can be +estimated straightforwardly using modern NLP models. We validate our theory by +successfully simulating ERP patterns elicited by a variety of linguistic +manipulations in previously-reported experimental data from six experiments, +with successful novel qualitative and quantitative predictions. Our theory is +compatible with traditional cognitive theories assuming a `good-enough' shallow +representation stage, but with a precise information-theoretic formulation. The +model provides an information-theoretic model of ERP components grounded on +cognitive processes, and brings us closer to a fully-specified +neuro-computational model of language processing. + +
+
+
+
+
+ + ♻ ☆ DrAttack: Prompt Decomposition and Reconstruction Makes Powerful LLM + Jailbreakers + + +
+ The safety alignment of Large Language Models (LLMs) is vulnerable to both +manual and automated jailbreak attacks, which adversarially trigger LLMs to +output harmful content. However, current methods for jailbreaking LLMs, which +nest entire harmful prompts, are not effective at concealing malicious intent +and can be easily identified and rejected by well-aligned LLMs. This paper +discovers that decomposing a malicious prompt into separated sub-prompts can +effectively obscure its underlying malicious intent by presenting it in a +fragmented, less detectable form, thereby addressing these limitations. We +introduce an automatic prompt \textbf{D}ecomposition and +\textbf{R}econstruction framework for jailbreak \textbf{Attack} (DrAttack). +DrAttack includes three key components: (a) `Decomposition' of the original +prompt into sub-prompts, (b) `Reconstruction' of these sub-prompts implicitly +by in-context learning with semantically similar but harmless reassembling +demo, and (c) a `Synonym Search' of sub-prompts, aiming to find sub-prompts' +synonyms that maintain the original intent while jailbreaking LLMs. An +extensive empirical study across multiple open-source and closed-source LLMs +demonstrates that, with a significantly reduced number of queries, DrAttack +obtains a substantial gain of success rate over prior SOTA prompt-only +attackers. Notably, the success rate of 78.0\% on GPT-4 with merely 15 queries +surpassed previous art by 33.1\%. The project is available at +https://github.com/xirui-li/DrAttack. + +
+
+
+
+
+ + ♻ ☆ SWE-bench: Can Language Models Resolve Real-World GitHub Issues? ICLR 2024 + + +
+ Language models have outpaced our ability to evaluate them effectively, but +for their future development it is essential to study the frontier of their +capabilities. We find real-world software engineering to be a rich, +sustainable, and challenging testbed for evaluating the next generation of +language models. To this end, we introduce SWE-bench, an evaluation framework +consisting of $2,294$ software engineering problems drawn from real GitHub +issues and corresponding pull requests across $12$ popular Python repositories. +Given a codebase along with a description of an issue to be resolved, a +language model is tasked with editing the codebase to address the issue. +Resolving issues in SWE-bench frequently requires understanding and +coordinating changes across multiple functions, classes, and even files +simultaneously, calling for models to interact with execution environments, +process extremely long contexts and perform complex reasoning that goes far +beyond traditional code generation tasks. Our evaluations show that both +state-of-the-art proprietary models and our fine-tuned model SWE-Llama can +resolve only the simplest issues. The best-performing model, Claude 2, is able +to solve a mere $1.96$% of the issues. Advances on SWE-bench represent steps +towards LMs that are more practical, intelligent, and autonomous. + +
+
+ comment: Data, code, and leaderboard are available at https://www.swebench.com + ICLR 2024, https://openreview.net/forum?id=VTF8yNQM66 +
+
+
+
+
+ + ♻ ☆ Temporal Dynamics of Emotion and Cognition in Human Translation: + Integrating the Task Segment Framework and the HOF Taxonomy + + +
+ The paper develops a novel generative model of human translation processes +grounded in empirical translation process data. Assuming three processes that +unfold concurrently in the translating mind, it integrates the Task Segment +Framework (Munoz & Apfelthaler 2022) and the HOF taxonomy (Carl et al 2024) +into a coherent architecture: uninterrupted translation production is caused by +routinized/automated processes, cognitive/reflective interventions lead to +longer keystroke pauses, while emotional/affective states of the mind are +identified by distinctive gazing patterns. Utilizing data from the CRITT +Translation Process Research Database (TPR-DB), the paper illustrates how the +temporal structure of keystroke and gazing data can be related to the three +assumed hidden mental processes that are believed to cause the observable data. +The paper relates this embedded generative model with Robinson's (2023) +ideosomatic theory of translation, opening exciting, new theoretical horizons +for Cognitive Translation Studies, grounded in empirical data and evaluation. + +
+
+
+
+
+ + ♻ ☆ Simple is Effective: The Roles of Graphs and Large Language Models in + Knowledge-Graph-Based Retrieval-Augmented Generation + + +
+ Large Language Models (LLMs) demonstrate strong reasoning abilities but face +limitations such as hallucinations and outdated knowledge. Knowledge Graph +(KG)-based Retrieval-Augmented Generation (RAG) addresses these issues by +grounding LLM outputs in structured external knowledge from KGs. However, +current KG-based RAG frameworks still struggle to optimize the trade-off +between retrieval effectiveness and efficiency in identifying a suitable amount +of relevant graph information for the LLM to digest. We introduce SubgraphRAG, +extending the KG-based RAG framework that retrieves subgraphs and leverages +LLMs for reasoning and answer prediction. Our approach innovatively integrates +a lightweight multilayer perceptron with a parallel triple-scoring mechanism +for efficient and flexible subgraph retrieval while encoding directional +structural distances to enhance retrieval effectiveness. The size of retrieved +subgraphs can be flexibly adjusted to match the query's need and the downstream +LLM's capabilities. This design strikes a balance between model complexity and +reasoning power, enabling scalable and generalizable retrieval processes. +Notably, based on our retrieved subgraphs, smaller LLMs like +Llama3.1-8B-Instruct deliver competitive results with explainable reasoning, +while larger models like GPT-4o achieve state-of-the-art accuracy compared with +previous baselines -- all without fine-tuning. Extensive evaluations on the +WebQSP and CWQ benchmarks highlight SubgraphRAG's strengths in efficiency, +accuracy, and reliability by reducing hallucinations and improving response +grounding. + +
+
+ comment: Code available at https://github.com/Graph-COM/SubgraphRAG +
+
+
+
+
+ + ♻ ☆ Extrinsically-Focused Evaluation of Omissions in Medical Summarization ML4H 2024 + + +
+ Large language models (LLMs) have shown promise in safety-critical +applications such as healthcare, yet the ability to quantify performance has +lagged. An example of this challenge is in evaluating a summary of the +patient's medical record. A resulting summary can enable the provider to get a +high-level overview of the patient's health status quickly. Yet, a summary that +omits important facts about the patient's record can produce a misleading +picture. This can lead to negative consequences on medical decision-making. We +propose MED-OMIT as a metric to explore this challenge. We focus on using +provider-patient history conversations to generate a subjective (a summary of +the patient's history) as a case study. We begin by discretizing facts from the +dialogue and identifying which are omitted from the subjective. To determine +which facts are clinically relevant, we measure the importance of each fact to +a simulated differential diagnosis. We compare MED-OMIT's performance to that +of clinical experts and find broad agreement. We use MED-OMIT to evaluate LLM +performance on subjective generation and find some LLMs (gpt-4 and +llama-3.1-405b) work well with little effort, while others (e.g. Llama 2) +perform worse. + +
+
+ comment: Accepted to ML4H 2024 +
+
+
+
+
+ + ♻ ☆ Atlas-Chat: Adapting Large Language Models for Low-Resource Moroccan + Arabic Dialect + + +
+ We introduce Atlas-Chat, the first-ever collection of LLMs specifically +developed for dialectal Arabic. Focusing on Moroccan Arabic, also known as +Darija, we construct our instruction dataset by consolidating existing Darija +language resources, creating novel datasets both manually and synthetically, +and translating English instructions with stringent quality control. +Atlas-Chat-2B, 9B, and 27B models, fine-tuned on the dataset, exhibit superior +ability in following Darija instructions and performing standard NLP tasks. +Notably, our models outperform both state-of-the-art and Arabic-specialized +LLMs like LLaMa, Jais, and AceGPT, e.g., our 9B model gains a 13% performance +boost over a larger 13B model on DarijaMMLU, in our newly introduced evaluation +suite for Darija covering both discriminative and generative tasks. +Furthermore, we perform an experimental analysis of various fine-tuning +strategies and base model choices to determine optimal configurations. All our +resources are publicly accessible, and we believe our work offers comprehensive +design methodologies of instruction-tuning for low-resource languages, which +are often neglected in favor of data-rich languages by contemporary LLMs. + +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Music Discovery Dialogue Generation Using Human Intent Analysis and + Large Language Models + + +
+ A conversational music retrieval system can help users discover music that +matches their preferences through dialogue. To achieve this, a conversational +music retrieval system should seamlessly engage in multi-turn conversation by +1) understanding user queries and 2) responding with natural language and +retrieved music. A straightforward solution would be a data-driven approach +utilizing such conversation logs. However, few datasets are available for the +research and are limited in terms of volume and quality. In this paper, we +present a data generation framework for rich music discovery dialogue using a +large language model (LLM) and user intents, system actions, and musical +attributes. This is done by i) dialogue intent analysis using grounded theory, +ii) generating attribute sequences via cascading database filtering, and iii) +generating utterances using large language models. By applying this framework +to the Million Song dataset, we create LP-MusicDialog, a Large Language Model +based Pseudo Music Dialogue dataset, containing over 288k music conversations +using more than 319k music items. Our evaluation shows that the synthetic +dataset is competitive with an existing, small human dialogue dataset in terms +of dialogue consistency, item relevance, and naturalness. Furthermore, using +the dataset, we train a conversational music retrieval model and show promising +results. + +
+
+ comment: Accepted for publication at the 25th International Society for Music + Information Retrieval Conference (ISMIR 2024) +
+
+
+
+
+ + ☆ The Shapley index for music streaming platforms + + +
+ We study an index to measure the popularity of artists in music streaming +platforms. This index, which can be used to allocate the amount raised via paid +subscriptions among participating artists, is based on the Shapley value, a +centerpiece in cooperative game theory. We characterize this Shapley index +combining several axioms formalizing principles with normative appeal. This +permits to place the index in the literature, as an alternative to the +well-known (and widely used in the industry) pro-rata and user-centric indices. + +
+
+
+
+
+ + ☆ Invar-RAG: Invariant LLM-aligned Retrieval for Better Generation + + +
+ Retrieval-augmented generation (RAG) has shown impressive capability in +providing reliable answer predictions and addressing hallucination problems. A +typical RAG implementation uses powerful retrieval models to extract external +information and large language models (LLMs) to generate answers. In contrast, +recent LLM-based retrieval has gained attention for its substantial +improvements in information retrieval (IR) due to the LLMs' semantic +understanding capability. However, directly applying LLM to RAG systems +presents challenges. This may cause feature locality problems as massive +parametric knowledge can hinder effective usage of global information across +the corpus; for example, an LLM-based retriever often inputs document summaries +instead of full documents. Moreover, various pre-trained tasks in LLMs +introduce variance, further weakening performance as a retriever. + To address these issues, we propose a novel two-stage fine-tuning +architecture called Invar-RAG. In the retrieval stage, an LLM-based retriever +is constructed by integrating LoRA-based representation learning to tackle +feature locality issues. To enhance retrieval performance, we develop two +patterns (invariant and variant patterns) and an invariance loss to reduce LLM +variance. In the generation stage, a refined fine-tuning method is employed to +improve LLM accuracy in generating answers based on retrieved information. +Experimental results show that Invar-RAG significantly outperforms existing +baselines across three open-domain question answering (ODQA) datasets. Code is +available in the Supplementary Material for reproducibility. + +
+
+
+
+
+ + ☆ LLM-Assisted Relevance Assessments: When Should We Ask LLMs for Help? + + +
+ Test collections are information retrieval tools that allow researchers to +quickly and easily evaluate ranking algorithms. While test collections have +become an integral part of IR research, the process of data creation involves +significant efforts in manual annotations, which often makes it very expensive +and time-consuming. Thus, the test collections could become small when the +budget is limited, which may lead to unstable evaluations. As an alternative, +recent studies have proposed the use of large language models (LLMs) to +completely replace human assessors. However, while LLMs seem to somewhat +correlate with human judgments, they are not perfect and often show bias. +Moreover, even if a well-performing LLM or prompt is found on one dataset, +there is no guarantee that it will perform similarly in practice, due to +difference in tasks and data. Thus a complete replacement with LLMs is argued +to be too risky and not fully trustable. + Thus, in this paper, we propose \textbf{L}LM-\textbf{A}ssisted +\textbf{R}elevance \textbf{A}ssessments (\textbf{LARA}), an effective method to +balance manual annotations with LLM annotations, which helps to make a rich and +reliable test collection. We use the LLM's predicted relevance probabilities in +order to select the most profitable documents to manually annotate under a +budget constraint. While solely relying on LLM's predicted probabilities to +manually annotate performs fairly well, with theoretical reasoning, LARA guides +the human annotation process even more effectively via online calibration +learning. Then, using the calibration model learned from the limited manual +annotations, LARA debiases the LLM predictions to annotate the remaining +non-assessed data. Empirical evaluations on TREC-COVID and TREC-8 Ad Hoc +datasets show that LARA outperforms the alternative solutions under almost any +budget constraint. + +
+
+
+
+
+ + ☆ Adaptive Conditional Expert Selection Network for Multi-domain + Recommendation + + +
+ Mixture-of-Experts (MOE) has recently become the de facto standard in +Multi-domain recommendation (MDR) due to its powerful expressive ability. +However, such MOE-based methods typically employ all experts for each instance, +leading to scalability issues and low discriminability between domains and +experts. Furthermore, the design of commonly used domain-specific networks +exacerbates the scalability issues. To tackle these problems, we propose a novel +method named CESAA, which consists of a Conditional Expert Selection (CES) Module and +an Adaptive Expert Aggregation (AEA) Module. +Specifically, CES first combines a sparse gating strategy with domain-shared +experts. Then AEA utilizes mutual information loss to strengthen the +correlations between experts and specific domains, and significantly improves +the distinction between experts. As a result, only domain-shared experts and +selected domain-specific experts are activated for each instance, striking a +balance between computational efficiency and model performance. Experimental +results on both public ranking and industrial retrieval datasets verify the +effectiveness of our method in MDR tasks. + +
+
+
+
+
+ + ☆ Large Language Model in Medical Informatics: Direct Classification and + Enhanced Text Representations for Automatic ICD Coding + + +
+ Addressing the complexity of accurately classifying International +Classification of Diseases (ICD) codes from medical discharge summaries is +challenging due to the intricate nature of medical documentation. This paper +explores the use of Large Language Models (LLM), specifically the LLAMA +architecture, to enhance ICD code classification through two methodologies: +direct application as a classifier and as a generator of enriched text +representations within a Multi-Filter Residual Convolutional Neural Network +(MultiResCNN) framework. We evaluate these methods by comparing them against +state-of-the-art approaches, revealing LLAMA's potential to significantly +improve classification outcomes by providing deep contextual insights into +medical texts. + +
+
+ comment: accepted at the 2024 IEEE International Conference on Bioinformatics + and Biomedicine (BIBM 2024) +
+
+
+
+
+ + ☆ AssistRAG: Boosting the Potential of Large Language Models with an + Intelligent Information Assistant NeurIPS 2024 + + +
+ The emergence of Large Language Models (LLMs) has significantly advanced +natural language processing, but these models often generate factually +incorrect information, known as "hallucination". Initial retrieval-augmented +generation (RAG) methods like the "Retrieve-Read" framework were inadequate for +complex reasoning tasks. Subsequent prompt-based RAG strategies and Supervised +Fine-Tuning (SFT) methods improved performance but required frequent retraining +and risked altering foundational LLM capabilities. To cope with these +challenges, we propose Assistant-based Retrieval-Augmented Generation +(AssistRAG), integrating an intelligent information assistant within LLMs. This +assistant manages memory and knowledge through tool usage, action execution, +memory building, and plan specification. Using a two-phase training approach, +Curriculum Assistant Learning and Reinforced Preference Optimization, AssistRAG +enhances information retrieval and decision-making. Experiments show AssistRAG +significantly outperforms benchmarks, especially benefiting less advanced LLMs, +by providing superior reasoning capabilities and accurate responses. + +
+
+ comment: Accepted by NeurIPS 2024 (poster) +
+
+
+
+
+ + ☆ Boosting the Targeted Transferability of Adversarial Examples via + Salient Region & Weighted Feature Drop + + +
+ Deep neural networks can be vulnerable to adversarially crafted examples, +presenting significant risks to practical applications. A prevalent approach +for adversarial attacks relies on the transferability of adversarial examples, +which are generated from a substitute model and leveraged to attack unknown +black-box models. Despite various proposals aimed at improving transferability, +the success of these attacks in targeted black-box scenarios is often hindered +by the tendency for adversarial examples to overfit to the surrogate models. In +this paper, we introduce a novel framework based on Salient region & Weighted +Feature Drop (SWFD) designed to enhance the targeted transferability of +adversarial examples. Drawing from the observation that examples with higher +transferability exhibit smoother distributions in the deep-layer outputs, we +propose the weighted feature drop mechanism to modulate activation values +according to weights scaled by norm distribution, effectively addressing the +overfitting issue when generating adversarial examples. Additionally, by +leveraging salient region within the image to construct auxiliary images, our +method enables the adversarial example's features to be transferred to the +target category in a model-agnostic manner, thereby enhancing the +transferability. Comprehensive experiments confirm that our approach +outperforms state-of-the-art methods across diverse configurations. On average, +the proposed SWFD raises the attack success rate for normally trained models +and robust models by 16.31% and 7.06% respectively. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ FineTuneBench: How well do commercial fine-tuning APIs infuse knowledge + into LLMs? + + +
+ There is great interest in fine-tuning frontier large language models (LLMs) +to inject new information and update existing knowledge. While commercial LLM +fine-tuning APIs from providers such as OpenAI and Google promise flexible +adaptation for various applications, the efficacy of fine-tuning remains +unclear. In this study, we introduce FineTuneBench, an evaluation framework and +dataset for understanding how well commercial fine-tuning APIs can successfully +learn new and updated knowledge. We analyze five frontier LLMs with +commercially available fine-tuning APIs, including GPT-4o and Gemini 1.5 Pro, +on their effectiveness in two settings: (1) ingesting novel information, such +as recent news events and new people profiles, and (2) updating existing +knowledge, such as updated medical guidelines and code frameworks. Our results +reveal substantial shortcomings in all the models' abilities to effectively +learn new information through fine-tuning, with an average generalization +accuracy of 37% across all models. When updating existing knowledge, such as +incorporating medical guideline updates, commercial fine-tuning APIs show even +more limited capability (average generalization accuracy of 19%). Overall, +fine-tuning GPT-4o mini is the most effective for infusing new knowledge and +updating knowledge, followed by GPT-3.5 Turbo and GPT-4o. The fine-tuning APIs +for Gemini 1.5 Flash and Gemini 1.5 Pro are unable to learn new knowledge or +update existing knowledge. These findings underscore a major shortcoming in +using current commercial fine-tuning services to achieve reliable knowledge +infusion in common scenarios. We open source the FineTuneBench dataset at +https://github.com/kevinwu23/StanfordFineTuneBench. + +
+
+
+
+
+ + ♻ ☆ Know Your Neighborhood: General and Zero-Shot Capable Binary Function + Search Powered by Call Graphlets + + +
+ Binary code similarity detection is an important problem with applications in +areas such as malware analysis, vulnerability research and license violation +detection. This paper proposes a novel graph neural network architecture +combined with a novel graph data representation called call graphlets. A call +graphlet encodes the neighborhood around each function in a binary executable, +capturing the local and global context through a series of statistical +features. A specialized graph neural network model operates on this graph +representation, learning to map it to a feature vector that encodes semantic +binary code similarities using deep-metric learning. The proposed approach is +evaluated across five distinct datasets covering different architectures, +compiler tool chains, and optimization levels. Experimental results show that +the combination of call graphlets and the novel graph neural network +architecture achieves comparable or state-of-the-art performance compared to +baseline techniques across cross-architecture, mono-architecture and zero shot +tasks. In addition, our proposed approach also performs well when evaluated +against an out-of-domain function inlining task. The work provides a general +and effective graph neural network-based solution for conducting binary code +similarity detection. + +
+
+ comment: 13 pages, Under-Review +
+
+
+
+
+ + ♻ ☆ INQUIRE: A Natural World Text-to-Image Retrieval Benchmark NeurIPS 2024 + + +
+ We introduce INQUIRE, a text-to-image retrieval benchmark designed to +challenge multimodal vision-language models on expert-level queries. INQUIRE +includes iNaturalist 2024 (iNat24), a new dataset of five million natural world +images, along with 250 expert-level retrieval queries. These queries are paired +with all relevant images comprehensively labeled within iNat24, comprising +33,000 total matches. Queries span categories such as species identification, +context, behavior, and appearance, emphasizing tasks that require nuanced image +understanding and domain expertise. Our benchmark evaluates two core retrieval +tasks: (1) INQUIRE-Fullrank, a full dataset ranking task, and (2) +INQUIRE-Rerank, a reranking task for refining top-100 retrievals. Detailed +evaluation of a range of recent multimodal models demonstrates that INQUIRE +poses a significant challenge, with the best models failing to achieve an +mAP@50 above 50%. In addition, we show that reranking with more powerful +multimodal models can enhance retrieval performance, yet there remains a +significant margin for improvement. By focusing on scientifically-motivated +ecological challenges, INQUIRE aims to bridge the gap between AI capabilities +and the needs of real-world scientific inquiry, encouraging the development of +retrieval systems that can assist with accelerating ecological and biodiversity +research. Our dataset and code are available at +https://inquire-benchmark.github.io + +
+
+ comment: Published in NeurIPS 2024, Datasets and Benchmarks Track +
+
+
+
+
+ + ♻ ☆ CROLoss: Towards a Customizable Loss for Retrieval Models in Recommender + Systems CIKM 2022 + + +
+ In large-scale recommender systems, retrieving top N relevant candidates +accurately with resource constraints is crucial. To evaluate the performance of +such retrieval models, Recall@N, the frequency of positive samples being +retrieved in the top N ranking, is widely used. However, most of the +conventional loss functions for retrieval models such as softmax cross-entropy +and pairwise comparison methods do not directly optimize Recall@N. Moreover, +those conventional loss functions cannot be customized for the specific +retrieval size N required by each application and thus may lead to sub-optimal +performance. In this paper, we propose the Customizable Recall@N Optimization +Loss (CROLoss), a loss function that can directly optimize the Recall@N metrics +and is customizable for different choices of N. This proposed CROLoss +formulation defines a more generalized loss function space, covering most of +the conventional loss functions as special cases. Furthermore, we develop the +Lambda method, a gradient-based method that invites more flexibility and can +further boost the system performance. We evaluate the proposed CROLoss on two +public benchmark datasets. The results show that CROLoss achieves SOTA results +over conventional loss functions for both datasets with various choices of +retrieval size N. CROLoss has been deployed onto our online E-commerce +advertising platform, where a fourteen-day online A/B test demonstrated that +CROLoss contributes to a significant business revenue growth of 4.75%. + +
+
+ comment: 9 pages, 5 figures. Accepted by CIKM 2022 +
+
+
+
+
+ + ♻ ☆ Entity Extraction from High-Level Corruption Schemes via Large Language + Models + + +
+ The rise of financial crime that has been observed in recent years has +created an increasing concern around the topic and many people, organizations +and governments are more and more frequently trying to combat it. Despite the +increase of interest in this area, there is a lack of specialized datasets that +can be used to train and evaluate works that try to tackle those problems. This +article proposes a new micro-benchmark dataset for algorithms and models that +identify individuals and organizations, and their multiple writings, in news +articles, and presents an approach that assists in its creation. Experimental +efforts are also reported, using this dataset, to identify individuals and +organizations in financial-crime-related articles using various low-billion +parameter Large Language Models (LLMs). For these experiments, standard metrics +(Accuracy, Precision, Recall, F1 Score) are reported and various prompt +variants comprising the best practices of prompt engineering are tested. In +addition, to address the problem of ambiguous entity mentions, a simple, yet +effective LLM-based disambiguation method is proposed, ensuring that the +evaluation aligns with reality. Finally, the proposed approach is compared +against a widely used state-of-the-art open-source baseline, showing the +superiority of the proposed method. + +
+
+
+
+
+ + ♻ ☆ OAEI-LLM: A Benchmark Dataset for Understanding Large Language Model + Hallucinations in Ontology Matching + + +
+ Hallucinations of large language models (LLMs) commonly occur in +domain-specific downstream tasks, with no exception in ontology matching (OM). +The prevalence of using LLMs for OM raises the need for benchmarks to better +understand LLM hallucinations. The OAEI-LLM dataset is an extended version of +the Ontology Alignment Evaluation Initiative (OAEI) datasets that evaluate +LLM-specific hallucinations in OM tasks. We outline the methodology used in +dataset construction and schema extension, and provide examples of potential +use cases. + +
+
+ comment: 5 pages, 1 figure, 1 table +
+
+
+
+
+ + ♻ ☆ TF-DCon: Leveraging Large Language Models (LLMs) to Empower + Training-Free Dataset Condensation for Content-Based Recommendation + + +
+ Modern techniques in Content-based Recommendation (CBR) leverage item content +information to provide personalized services to users, but suffer from +resource-intensive training on large datasets. To address this issue, we +explore the dataset condensation for textual CBR in this paper. The goal of +dataset condensation is to synthesize a small yet informative dataset, upon +which models can achieve performance comparable to those trained on large +datasets. While existing condensation approaches are tailored to classification +tasks for continuous data like images or embeddings, direct application of them +to CBR has limitations. To bridge this gap, we investigate efficient dataset +condensation for content-based recommendation. Inspired by the remarkable +abilities of large language models (LLMs) in text comprehension and generation, +we leverage LLMs to empower the generation of textual content during +condensation. To handle the interaction data involving both users and items, we +devise a dual-level condensation method: content-level and user-level. At +content-level, we utilize LLMs to condense all contents of an item into a new +informative title. At user-level, we design a clustering-based synthesis +module, where we first utilize LLMs to extract user interests. Then, the user +interests and user embeddings are incorporated to condense users and generate +interactions for condensed users. Notably, the condensation paradigm of this +method is forward and free from iterative optimization on the synthesized +dataset. Extensive empirical findings from our study, conducted on three +authentic datasets, substantiate the efficacy of the proposed method. +Particularly, we are able to approximate up to 97% of the original performance +while reducing the dataset size by 95% (i.e., on dataset MIND). + +
+
+ comment: An updated version +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ Just Label the Repeats for In-The-Wild Audio-to-Score Alignment + + +
+ We propose an efficient workflow for high-quality offline alignment of +in-the-wild performance audio and corresponding sheet music scans (images). +Recent work on audio-to-score alignment extends dynamic time warping (DTW) to +be theoretically able to handle jumps in sheet music induced by repeat +signs-this method requires no human annotations, but we show that it often +yields low-quality alignments. As an alternative, we propose a workflow and +interface that allows users to quickly annotate jumps (by clicking on repeat +signs), requiring a small amount of human supervision but yielding much higher +quality alignments on average. Additionally, we refine audio and score feature +representations to improve alignment quality by: (1) integrating measure +detection into the score feature representation, and (2) using raw onset +prediction probabilities from a music transcription model instead of piano +roll. We propose an evaluation protocol for audio-to-score alignment that +computes the distance between the estimated and ground truth alignment in units +of measures. Under this evaluation, we find that our proposed jump annotation +workflow and improved feature representations together improve alignment +accuracy by 150% relative to prior work (33% to 82%). + +
+
+ comment: 25th International Society for Music Information Retrieval + Conference, San Francisco, 2024 +
+
+
+
+
+ + ☆ Multimodal Fusion Balancing Through Game-Theoretic Regularization + + +
+ Multimodal learning can complete the picture of information extraction by +uncovering key dependencies between data sources. However, current systems fail +to fully leverage multiple modalities for optimal performance. This has been +attributed to modality competition, where modalities strive for training +resources, leaving some underoptimized. We show that current balancing methods +struggle to train multimodal models that surpass even simple baselines, such as +ensembles. This raises the question: how can we ensure that all modalities in +multimodal training are sufficiently trained, and that learning from new +modalities consistently improves performance? This paper proposes the +Multimodal Competition Regularizer (MCR), a new loss component inspired by +mutual information (MI) decomposition designed to prevent the adverse effects +of competition in multimodal training. Our key contributions are: 1) +Introducing game-theoretic principles in multimodal learning, where each +modality acts as a player competing to maximize its influence on the final +outcome, enabling automatic balancing of the MI terms. 2) Refining lower and +upper bounds for each MI term to enhance the extraction of task-relevant unique +and shared information across modalities. 3) Suggesting latent space +permutations for conditional MI estimation, significantly improving +computational efficiency. MCR outperforms all previously suggested training +strategies and is the first to consistently improve multimodal learning beyond +the ensemble baseline, clearly demonstrating that combining modalities leads to +significant performance gains on both synthetic and large real-world datasets. + +
+
+ comment: 21 pages, 6 figures, 4 tables, 1 algorithm +
+
+
+
+
+ + ☆ Low Complexity Learning-based Lossless Event-based Compression + + +
+ Event cameras are a cutting-edge type of visual sensors that capture data by +detecting brightness changes at the pixel level asynchronously. These cameras +offer numerous benefits over conventional cameras, including high temporal +resolution, wide dynamic range, low latency, and lower power consumption. +However, the substantial data rates they produce require efficient compression +techniques, while also fulfilling other typical application requirements, such +as the ability to respond to visual changes in real-time or near real-time. +Additionally, many event-based applications demand high accuracy, making +lossless coding desirable, as it retains the full detail of the sensor data. +Learning-based methods show great potential due to their ability to model the +unique characteristics of event data thus allowing to achieve high compression +rates. This paper proposes a low-complexity lossless coding solution based on +the quadtree representation that outperforms traditional compression algorithms +in efficiency and speed, ensuring low computational complexity and minimal +delay for real-time applications. Experimental results show that the proposed +method delivers better compression ratios, i.e., with fewer bits per event, and +lower computational complexity compared to current lossless data compression +methods. + +
+
+
+
+
+ + ☆ A Hierarchical Compression Technique for 3D Gaussian Splatting + Compression + + +
+ 3D Gaussian Splatting (GS) demonstrates excellent rendering quality and +generation speed in novel view synthesis. However, substantial data size poses +challenges for storage and transmission, making 3D GS compression an essential +technology. Current 3D GS compression research primarily focuses on developing +more compact scene representations, such as converting explicit 3D GS data into +implicit forms. In contrast, compression of the GS data itself has hardly been +explored. To address this gap, we propose a Hierarchical GS Compression (HGSC) +technique. Initially, we prune unimportant Gaussians based on importance scores +derived from both global and local significance, effectively reducing +redundancy while maintaining visual quality. An Octree structure is used to +compress 3D positions. Based on the 3D GS Octree, we implement a hierarchical +attribute compression strategy by employing a KD-tree to partition the 3D GS +into multiple blocks. We apply farthest point sampling to select anchor +primitives within each block and others as non-anchor primitives with varying +Levels of Details (LoDs). Anchor primitives serve as reference points for +predicting non-anchor primitives across different LoDs to reduce spatial +redundancy. For anchor primitives, we use the region adaptive hierarchical +transform to achieve near-lossless compression of various attributes. For +non-anchor primitives, each is predicted based on the k-nearest anchor +primitives. To further minimize prediction errors, the reconstructed LoD and +anchor primitives are combined to form new anchor primitives to predict the +next LoD. Our method notably achieves superior compression quality and a +significant data size reduction of over 4.5 times compared to the +state-of-the-art compression method on small scenes datasets. + +
+
+
+
+
+ + ☆ JPEG AI Image Compression Visual Artifacts: Detection Methods and + Dataset + + +
+ Learning-based image compression methods have improved in recent years and +started to outperform traditional codecs. However, neural-network approaches +can unexpectedly introduce visual artifacts in some images. We therefore +propose methods to separately detect three types of artifacts (texture and +boundary degradation, color change, and text corruption), to localize the +affected regions, and to quantify the artifact strength. We consider only those +regions that exhibit distortion due solely to the neural compression but that a +traditional codec recovers successfully at a comparable bitrate. We employed +our methods to collect artifacts for the JPEG AI verification model with +respect to HM-18.0, the H.265 reference software. We processed about 350,000 +unique images from the Open Images dataset using different compression-quality +parameters; the result is a dataset of 46,440 artifacts validated through +crowd-sourced subjective assessment. Our proposed dataset and methods are +valuable for testing neural-network-based image codecs, identifying bugs in +these codecs, and enhancing their performance. We make source code of the +methods and the dataset publicly available. + +
+
+
+
+
+ + ☆ Loss-tolerant neural video codec aware congestion control for real time + video communication + + +
+ Because of reinforcement learning's (RL) ability to automatically create more +adaptive controlling logics beyond the hand-crafted heuristics, numerous effort +has been made to apply RL to congestion control (CC) design for real time video +communication (RTC) applications and has successfully shown promising benefits +over the rule-based RTC CCs. Online reinforcement learning is often adopted to +train the RL models so the models can directly adapt to real network +environments. However, its trial-and-error manner can also cause catastrophic +degradation of the quality of experience (QoE) of RTC application at run time. +Thus, safeguard strategies such as falling back to hand-crafted heuristics can +be used to run along with RL models to guarantee the actions explored in the +training sensible, despite that these safeguard strategies interrupt the +learning process and make it more challenging to discover optimal RL policies. + The recent emergence of loss-tolerant neural video codecs (NVC) naturally +provides a layer of protection for the online learning of RL-based congestion +control because of its resilience to packet losses, but such packet loss +resilience has not been fully exploited in prior works yet. In this paper, we +present a reinforcement learning (RL) based congestion control which can be +aware of and takes advantage of packet loss tolerance characteristic of NVCs +via reward in online RL learning. Through extensive evaluation on various +videos and network traces in a simulated environment, we demonstrate that our +NVC-aware CC running with the loss-tolerant NVC reduces the training time by +41\% compared to other prior RL-based CCs. It also boosts the mean video +quality by 0.3 to 1.6dB, lowers the tail frame delay by 3 to 200ms, and reduces +the video stalls by 20\% to 77\% in comparison with other baseline RTC CCs. + +
+
+
+
+
+ + ♻ ☆ Unmasking Illusions: Understanding Human Perception of Audiovisual + Deepfakes + + +
+ The emergence of contemporary deepfakes has attracted significant attention +in machine learning research, as artificial intelligence (AI) generated +synthetic media increases the incidence of misinterpretation and is difficult +to distinguish from genuine content. Currently, machine learning techniques +have been extensively studied for automatically detecting deepfakes. However, +human perception has been less explored. Malicious deepfakes could ultimately +cause public and social problems. Can we humans correctly perceive the +authenticity of the content of the videos we watch? The answer is obviously +uncertain; therefore, this paper aims to evaluate the human ability to discern +deepfake videos through a subjective study. We present our findings by +comparing human observers to five state-of-the-art audiovisual deepfake +detection models. To this end, we used gamification concepts to provide 110 +participants (55 native English speakers and 55 non-native English speakers) +with a web-based platform where they could access a series of 40 videos (20 real +and 20 fake) to determine their authenticity. Each participant performed the +experiment twice with the same 40 videos in different random orders. The videos +are manually selected from the FakeAVCeleb dataset. We found that all AI models +performed better than humans when evaluated on the same 40 videos. The study +also reveals that while deception is not impossible, humans tend to +overestimate their detection capabilities. Our experimental results may help +benchmark human versus machine performance, advance forensics analysis, and +enable adaptive countermeasures. + +
+
+
+
+
+ + ♻ ☆ Video Summarization: Towards Entity-Aware Captions + + +
+ Existing popular video captioning benchmarks and models deal with generic +captions devoid of specific person, place or organization named entities. In +contrast, news videos present a challenging setting where the caption requires +such named entities for meaningful summarization. As such, we propose the task +of summarizing news video directly to entity-aware captions. We also release a +large-scale dataset, VIEWS (VIdeo NEWS), to support research on this task. +Further, we propose a method that augments visual information from videos with +context retrieved from external world knowledge to generate entity-aware +captions. We demonstrate the effectiveness of our approach on three video +captioning models. We also show that our approach generalizes to existing news +image captions dataset. With all the extensive experiments and insights, we +believe we establish a solid basis for future research on this challenging +task. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ Generating Mixcode Popular Songs with Artificial Intelligence: Concepts, + Plans, and Speculations + + +
+ Music is a potent form of expression that can communicate, accentuate or even +create the emotions of an individual or a collective. Both historically and in +contemporary experiences, musical expression was and is commonly +instrumentalized for social, political and/or economic purposes. Generative +artificial intelligence provides a wealth of both opportunities and challenges +with regard to music and its role in society. This paper discusses a proposed +project integrating artificial intelligence and popular music, with the +ultimate goal of creating a powerful tool for implementing music for social +transformation, education, healthcare, and emotional well-being. Given that it +is being presented at the outset of a collaboration between a computer +scientist/data analyst and an ethnomusicologist/social anthropologist, it is +mainly conceptual and somewhat speculative in nature. + +
+
+ comment: Link to the paper:https://aimc2024.pubpub.org/pub/rdulfbve/release/1 + Published in The International Conference on AI and Musical Creativity at the + University of Oxford (2024) https://aimc2024.pubpub.org/ +
+
+
+
+
+ + ☆ Metric Learning for Tag Recommendation: Tackling Data Sparsity and Cold + Start Issues + + +
+ With the rapid growth of digital information, personalized recommendation +systems have become an indispensable part of Internet services, especially in +the fields of e-commerce, social media, and online entertainment. However, +traditional collaborative filtering and content-based recommendation methods +have limitations in dealing with data sparsity and cold start problems, +especially in the face of large-scale heterogeneous data, which makes it +difficult to meet user expectations. This paper proposes a new label +recommendation algorithm based on metric learning, which aims to overcome the +challenges of traditional recommendation systems by learning effective distance +or similarity metrics to capture the subtle differences between user +preferences and item features. Experimental results show that the algorithm +outperforms baseline methods including local response metric learning (LRML), +collaborative metric learning (CML), and adaptive tensor factorization (ATF) +based on adversarial learning on multiple evaluation metrics. In particular, it +performs particularly well in the accuracy of the first few recommended items, +while maintaining high robustness and maintaining high recommendation accuracy. + +
+
+
+
+
+ + ♻ ☆ "Knowing When You Don't Know": A Multilingual Relevance Assessment + Dataset for Robust Retrieval-Augmented Generation EMNLP 2024 + + +
+ Retrieval-Augmented Generation (RAG) grounds Large Language Model (LLM) +output by leveraging external knowledge sources to reduce factual +hallucinations. However, prior work lacks a comprehensive evaluation of +different language families, making it challenging to evaluate LLM robustness +against errors in external retrieved knowledge. To overcome this, we establish +NoMIRACL, a human-annotated dataset for evaluating LLM robustness in RAG across +18 typologically diverse languages. NoMIRACL includes both a non-relevant and a +relevant subset. Queries in the non-relevant subset contain passages judged as +non-relevant, whereas queries in the relevant subset include at least a single +judged relevant passage. We measure relevance assessment using: (i) +hallucination rate, measuring model tendency to hallucinate, when the answer is +not present in passages in the non-relevant subset, and (ii) error rate, +measuring model inaccuracy to recognize relevant passages in the relevant +subset. In our work, we observe that most models struggle to balance the two +capacities. Models such as LLAMA-2 and Orca-2 achieve over 88% hallucination +rate on the non-relevant subset. Mistral and LLAMA-3 hallucinate less but can +achieve up to a 74.9% error rate on the relevant subset. Overall, GPT-4 is +observed to provide the best tradeoff on both subsets, highlighting future work +necessary to improve LLM robustness. NoMIRACL dataset and evaluation code are +available at: https://github.com/project-miracl/nomiracl. + +
+
+ comment: EMNLP 2024 (Findings) +
+
+
+
+
+ + ♻ ☆ QDA-SQL: Questions Enhanced Dialogue Augmentation for Multi-Turn + Text-to-SQL + + +
+ Fine-tuning large language models (LLMs) for specific domain tasks has +achieved great success in Text-to-SQL tasks. However, these fine-tuned models +often face challenges with multi-turn Text-to-SQL tasks caused by ambiguous or +unanswerable questions. It is desired to enhance LLMs to handle multiple types +of questions in multi-turn Text-to-SQL tasks. To address this, we propose a +novel data augmentation method, called QDA-SQL, which generates multiple types +of multi-turn Q\&A pairs using LLMs. In QDA-SQL, we introduce a method +incorporating validation and correction mechanisms to handle complex multi-turn +Text-to-SQL tasks. Experimental results demonstrate that QDA-SQL enables +fine-tuned models to exhibit higher performance on SQL statement accuracy and +enhances their ability to handle complex, unanswerable questions in multi-turn +Text-to-SQL tasks. The generation script and test set are released at +https://github.com/mcxiaoxiao/QDA-SQL + +
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ SBI-RAG: Enhancing Math Word Problem Solving for Students through + Schema-Based Instruction and Retrieval-Augmented Generation NeurIPS'24 + + +
+ Many students struggle with math word problems (MWPs), often finding it +difficult to identify key information and select the appropriate mathematical +operations. Schema-based instruction (SBI) is an evidence-based strategy that +helps students categorize problems based on their structure, improving +problem-solving accuracy. Building on this, we propose a Schema-Based +Instruction Retrieval-Augmented Generation (SBI-RAG) framework that +incorporates a large language model (LLM). Our approach emphasizes step-by-step +reasoning by leveraging schemas to guide solution generation. We evaluate its +performance on the GSM8K dataset, comparing it with GPT-4 and GPT-3.5 Turbo, +and introduce a "reasoning score" metric to assess solution quality. Our +findings suggest that SBI-RAG enhances reasoning clarity and facilitates a more +structured problem-solving process potentially providing educational benefits +for students. + +
+
+ comment: Accepted to the 4th MATH-AI Workshop at NeurIPS'24 +
+
+
+
+
+ + ♻ ☆ LML-DAP: Language Model Learning a Dataset for Data-Augmented Prediction + + +
+ Classification tasks are typically handled using Machine Learning (ML) +models, which lack a balance between accuracy and interpretability. This paper +introduces a new approach for classification tasks using Large Language Models +(LLMs) in an explainable method. Unlike ML models, which rely heavily on data +cleaning and feature engineering, this method streamlines the process using +LLMs. This paper proposes a method called "Language Model Learning (LML)" +powered by a new method called "Data-Augmented Prediction (DAP)." The +classification is performed by LLMs using a method similar to that used by +humans who manually explore and understand the data to decide classifications. +In the process of LML, a dataset is summarized and evaluated to determine the +features leading to each label the most. In the DAP process, the system uses +the data summary and a row of the testing dataset to automatically generate a +query to retrieve relevant rows from the dataset for context-aware +classification. LML and DAP unlock new possibilities in areas that require +explainable and context-aware decisions by ensuring satisfactory accuracy even +with complex data. The system scored an accuracy above 90% in some test cases, +confirming the effectiveness and potential of the system to outperform ML +models in various scenarios. The source code is available at +https://github.com/Pro-GenAI/LML-DAP + +
+
+ comment: Made the abstract and the content clearer +
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ♻ ☆ AMD: Autoregressive Motion Diffusion AAAI2024 + + +
+ Human motion generation aims to produce plausible human motion sequences +according to various conditional inputs, such as text or audio. Despite the +feasibility of existing methods in generating motion based on short prompts and +simple motion patterns, they encounter difficulties when dealing with long +prompts or complex motions. The challenges are two-fold: 1) the scarcity of +human motion-captured data for long prompts and complex motions. 2) the high +diversity of human motions in the temporal domain and the substantial +divergence of distributions from conditional modalities, leading to a +many-to-many mapping problem when generating motion with complex and long +texts. In this work, we address these gaps by 1) elaborating the first dataset +pairing long textual descriptions and 3D complex motions (HumanLong3D), and 2) +proposing an autoregressive motion diffusion model (AMD). Specifically, AMD +integrates the text prompt at the current timestep with the text prompt and +action sequences at the previous timestep as conditional information to predict +the current action sequences in an iterative manner. Furthermore, we present +its generalization for X-to-Motion with "No Modality Left Behind", enabling the +generation of high-definition and high-fidelity human motions based on +user-defined modality input. + +
+
+ comment: accepted by AAAI2024. Official Code: + https://github.com/fluide1022/AMD +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ GuidelineGuard: An Agentic Framework for Medical Note Evaluation with + Guideline Adherence + + +
+ Although rapid advancements in Large Language Models (LLMs) are facilitating +the integration of artificial intelligence-based applications and services in +healthcare, limited research has focused on the systematic evaluation of +medical notes for guideline adherence. This paper introduces GuidelineGuard, an +agentic framework powered by LLMs that autonomously analyzes medical notes, +such as hospital discharge and office visit notes, to ensure compliance with +established healthcare guidelines. By identifying deviations from recommended +practices and providing evidence-based suggestions, GuidelineGuard helps +clinicians adhere to the latest standards from organizations like the WHO and +CDC. This framework offers a novel approach to improving documentation quality +and reducing clinical errors. + +
+
+
+
+
+ + ☆ Annotative Indexing + + +
+ This paper introduces annotative indexing, a novel framework that unifies and +generalizes traditional inverted indexes, column stores, object stores, and +graph databases. As a result, annotative indexing can provide the underlying +indexing framework for databases that support knowledge graphs, entity +retrieval, semi-structured data, and ranked retrieval. While we primarily focus +on human language data in the form of text, annotative indexing is sufficiently +general to support a range of other datatypes, and we provide examples of +SQL-like queries over a JSON store that includes numbers and dates. Taking +advantage of the flexibility of annotative indexing, we also demonstrate a +fully dynamic annotative index incorporating support for ACID properties of +transactions with hundreds of multiple concurrent readers and writers. + +
+
+
+
+
+ + ☆ KeyB2: Selecting Key Blocks is Also Important for Long Document Ranking + with Large Language Models + + +
+ The rapid development of large language models (LLMs) like Llama has +significantly advanced information retrieval (IR) systems. However, using LLMs +for long documents, as in RankLLaMA, remains challenging due to computational +complexity, especially concerning input token length. Furthermore, the internal +mechanisms of LLMs during ranking are still not fully understood. In this +paper, we first explore the internal workings of LLMs during relevance +judgement and identify that specific attention heads play a crucial role in +aligning relevant tokens. This observation inspires us to revisit the block +pre-ranking strategy used in KeyB, which remains state-of-the-art (SOTA) on the +TREC 2019 DL document ranking dataset. Building on these insights, we develop +KeyB2, an advanced long document IR approach that integrates block pre-ranking +with the performance of LLMs. KeyB2 efficiently identifies and processes the +most relevant blocks, reducing computational costs and improving ranking +effectiveness. Additionally, we introduce a new bi-encoder block matching +strategy for KeyB2. Comprehensive experiments on long-document datasets, +including TREC 2019 DL, Robust04, and MLDR-zh, show that KeyB2 outperforms +baselines like RankLLaMA and KeyB by reducing reranking time and GPU memory +usage while enhancing retrieval performance, achieving new SOTA results on TREC +2019 DL with higher NDCG@10 and MAP scores. + +
+
+
+
+
+ + ☆ Leveraging Retrieval-Augmented Generation for University Knowledge + Retrieval + + +
+ This paper introduces an innovative approach using Retrieval-Augmented +Generation (RAG) pipelines with Large Language Models (LLMs) to enhance +information retrieval and query response systems for university-related +question answering. By systematically extracting data from the university +official webpage and employing advanced prompt engineering techniques, we +generate accurate, contextually relevant responses to user queries. + We developed a comprehensive university benchmark, UniversityQuestionBench +(UQB), to rigorously evaluate our system performance, based on common key +metrics in the field of RAG pipelines, assessing accuracy and reliability +through various metrics and real-world scenarios. Our experimental results +demonstrate significant improvements in the precision and relevance of +generated responses, enhancing user experience and reducing the time required +to obtain relevant answers. In summary, this paper presents a novel application +of RAG pipelines and LLMs, supported by a meticulously prepared university +benchmark, offering valuable insights into advanced AI techniques for academic +data retrieval and setting the stage for future research in this domain. + +
+
+ comment: 6 pages, 2 figures, 1 table, Submitted to 15th IKT conference +
+
+
+
+
+ + ☆ Interpret the Internal States of Recommendation Model with Sparse + Autoencoder + + +
+ Explainable recommendation systems are important to enhance transparency, +accuracy, and fairness. Beyond result-level explanations, model-level +interpretations can provide valuable insights that allow developers to optimize +system designs and implement targeted improvements. However, most current +approaches depend on specialized model designs, which often lack generalization +capabilities. Given the various kinds of recommendation models, existing +methods have limited ability to effectively interpret them. To address this +issue, we propose RecSAE, an automatic, generalizable probing method for +interpreting the internal states of Recommendation models with Sparse +AutoEncoder. RecSAE serves as a plug-in module that does not affect original +models during interpretations, while also enabling predictable modifications to +their behaviors based on interpretation results. Firstly, we train an +autoencoder with sparsity constraints to reconstruct internal activations of +recommendation models, making the RecSAE latents more interpretable and +monosemantic than the original neuron activations. Secondly, we automated the +construction of concept dictionaries based on the relationship between latent +activations and input item sequences. Thirdly, RecSAE validates these +interpretations by predicting latent activations on new item sequences using +the concept dictionary and deriving interpretation confidence scores from +precision and recall. We demonstrate RecSAE's effectiveness on two datasets, +identifying hundreds of highly interpretable concepts from pure ID-based +models. Latent ablation studies further confirm that manipulating latent +concepts produces corresponding changes in model output behavior, underscoring +RecSAE's utility for both understanding and targeted tuning recommendation +models. Code and data are publicly available at +https://github.com/Alice1998/RecSAE. + +
+
+
+
+
+ + ☆ Snippet-based Conversational Recommender System + + +
+ Conversational Recommender Systems (CRS) engage users in interactive +dialogues to gather preferences and provide personalized recommendations. +Traditionally, CRS rely on pre-defined attributes or expensive, domain-specific +annotated datasets to guide conversations, which limits flexibility and +adaptability across domains. In this work, we introduce SnipRec, a novel CRS +that enhances dialogues and recommendations by extracting diverse expressions +and preferences from user-generated content (UGC) like customer reviews. Using +large language models, SnipRec maps user responses and UGC to concise snippets, +which are used to generate clarification questions and retrieve relevant items. +Our approach eliminates the need for domain-specific training, making it +adaptable to new domains and effective without prior knowledge of user +preferences. Extensive experiments on the Yelp dataset demonstrate the +effectiveness of snippet-based representations against document and +sentence-based representations. Additionally, SnipRec is able to improve +Hits@10 by 0.25 over the course of five conversational turns, underscoring the +efficiency of SnipRec in capturing user preferences through multi-turn +conversations. + +
+
+
+
+
+ + ♻ ☆ Lambda: Learning Matchable Prior For Entity Alignment with Unlabeled + Dangling Cases NeurIPS 2024 + + +
+ We investigate the entity alignment (EA) problem with unlabeled dangling +cases, meaning that partial entities have no counterparts in the other +knowledge graph (KG), and this type of entity remains unlabeled. To address +this challenge, we propose the framework \textit{Lambda} for dangling detection +and then entity alignment. Lambda features a GNN-based encoder called KEESA +with spectral contrastive learning for EA and a positive-unlabeled learning +algorithm for dangling detection called iPULE. iPULE offers theoretical +guarantees of unbiasedness, uniform deviation bounds, and convergence. +Experimental results demonstrate that each component contributes to overall +performances that are superior to baselines, even when baselines additionally +exploit 30\% of dangling entities labeled for training. + +
+
+ comment: Accepted in NeurIPS 2024 as a poster +
+
+
+
+
+ + ♻ ☆ End-to-end Learnable Clustering for Intent Learning in Recommendation + + +
+ Intent learning, which aims to learn users' intents for user understanding +and item recommendation, has become a hot research spot in recent years. +However, existing methods suffer from complex and cumbersome alternating +optimization, limiting performance and scalability. To this end, we propose a +novel intent learning method termed \underline{ELCRec}, by unifying behavior +representation learning into an \underline{E}nd-to-end \underline{L}earnable +\underline{C}lustering framework, for effective and efficient +\underline{Rec}ommendation. Concretely, we encode user behavior sequences and +initialize the cluster centers (latent intents) as learnable neurons. Then, we +design a novel learnable clustering module to separate different cluster +centers, thus decoupling users' complex intents. Meanwhile, it guides the +network to learn intents from behaviors by forcing behavior embeddings close to +cluster centers. This allows simultaneous optimization of recommendation and +clustering via mini-batch data. Moreover, we propose intent-assisted +contrastive learning by using cluster centers as self-supervision signals, +further enhancing mutual promotion. Both experimental results and theoretical +analyses demonstrate the superiority of ELCRec from six perspectives. Compared +to the runner-up, ELCRec improves NDCG@5 by 8.9\% and reduces computational +costs by 22.5\% on the Beauty dataset. Furthermore, due to the scalability and +universal applicability, we deploy this method on the industrial recommendation +system with 130 million page views and achieve promising results. The codes are +available on GitHub (https://github.com/yueliu1999/ELCRec). A collection +(papers, codes, datasets) of deep group recommendation/intent learning methods +is available on GitHub +(https://github.com/yueliu1999/Awesome-Deep-Group-Recommendation). + +
+
+ comment: 37 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Multi-Document Financial Question Answering using LLMs + + +
+ We propose two new methods for multi-document financial question answering. +First, a method that uses semantic tagging, and then, queries the index to get +the context (RAG_SEM). And second, a Knowledge Graph (KG_RAG) based method that +uses semantic tagging, and, retrieves knowledge graph triples from a graph +database, as context. KG_RAG uses knowledge graphs constructed using a small +model that is fine-tuned using knowledge distillation using a large teacher +model. The data consists of 18 10K reports of Apple, Microsoft, Alphabet, +NVIDIA, Amazon and Tesla for the years 2021, 2022 and 2023. The list of +questions in the data consists of 111 complex questions including many esoteric +questions that are difficult to answer and the answers are not completely +obvious. As evaluation metrics, we use overall scores as well as segmented +scores for measurement including the faithfulness, relevance, correctness, +similarity, an LLM based overall score and the rouge scores as well as a +similarity of embeddings. We find that both methods outperform plain RAG +significantly. KG_RAG outperforms RAG_SEM in four out of nine metrics. + +
+
+
+
+
+ + ☆ The effect of different feature selection methods on models created with + XGBoost + + +
+ This study examines the effect that different feature selection methods have +on models created with XGBoost, a popular machine learning algorithm with +superb regularization methods. It shows that three different ways for reducing +the dimensionality of features produce no statistically significant change in +the prediction accuracy of the model. This suggests that the traditional idea +of removing the noisy training data to make sure models do not overfit may not +apply to XGBoost. But it may still be viable in order to reduce computational +complexity. + +
+
+
+
+
+ + ☆ Mitigating Hallucination with ZeroG: An Advanced Knowledge Management + Engine + + +
+ The growth of digital documents presents significant challenges in efficient +management and knowledge extraction. Traditional methods often struggle with +complex documents, leading to issues such as hallucinations and high latency in +responses from Large Language Models (LLMs). ZeroG, an innovative approach, +significantly mitigates these challenges by leveraging knowledge distillation +and prompt tuning to enhance model performance. + ZeroG utilizes a smaller model that replicates the behavior of a larger +teacher model, ensuring contextually relevant and grounded responses, by +employing a black-box distillation approach, it creates a distilled dataset +without relying on intermediate features, optimizing computational efficiency. +This method significantly enhances accuracy and reduces response times, +providing a balanced solution for modern document management. + Incorporating advanced techniques for document ingestion and metadata +utilization, ZeroG improves the accuracy of question-and-answer systems. The +integration of graph databases and robust metadata management further +streamlines information retrieval, allowing for precise and context-aware +responses. By transforming how organizations interact with complex data, ZeroG +enhances productivity and user experience, offering a scalable solution for the +growing demands of digital document management. + +
+
+ comment: 10 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ BERTrend: Neural Topic Modeling for Emerging Trends Detection EMNLP 2024 + + +
+ Detecting and tracking emerging trends and weak signals in large, evolving +text corpora is vital for applications such as monitoring scientific +literature, managing brand reputation, surveilling critical infrastructure and +more generally to any kind of text-based event detection. Existing solutions +often fail to capture the nuanced context or dynamically track evolving +patterns over time. BERTrend, a novel method, addresses these limitations using +neural topic modeling in an online setting. It introduces a new metric to +quantify topic popularity over time by considering both the number of documents +and update frequency. This metric classifies topics as noise, weak, or strong +signals, flagging emerging, rapidly growing topics for further investigation. +Experimentation on two large real-world datasets demonstrates BERTrend's +ability to accurately detect and track meaningful weak signals while filtering +out noise, offering a comprehensive solution for monitoring emerging trends in +large-scale, evolving text corpora. The method can also be used for +retrospective analysis of past events. In addition, the use of Large Language +Models together with BERTrend offers efficient means for the interpretability +of trends of events. + +
+
+ comment: 17 pages, 12 figures, FuturED 2024: Workshop on Future of Event + Detection (CoLocated with EMNLP 2024) +
+
+
+
+
+ + ☆ Harnessing High-Level Song Descriptors towards Natural Language-Based + Music Recommendation + + +
+ Recommender systems relying on Language Models (LMs) have gained popularity +in assisting users to navigate large catalogs. LMs often exploit item +high-level descriptors, i.e. categories or consumption contexts, from training +data or user preferences. This has been proven effective in domains like movies +or products. However, in the music domain, understanding how effectively LMs +utilize song descriptors for natural language-based music recommendation is +relatively limited. In this paper, we assess LMs effectiveness in recommending +songs based on user natural language descriptions and items with descriptors +like genres, moods, and listening contexts. We formulate the recommendation +task as a dense retrieval problem and assess LMs as they become increasingly +familiar with data pertinent to the task and domain. Our findings reveal +improved performance as LMs are fine-tuned for general language similarity, +information retrieval, and mapping longer descriptions to shorter, high-level +descriptors in music. + +
+
+
+
+
+ + ☆ Why These Documents? Explainable Generative Retrieval with Hierarchical + Category Paths + + +
+ Generative retrieval has recently emerged as a new alternative of traditional +information retrieval approaches. However, existing generative retrieval +methods directly decode docid when a query is given, making it impossible to +provide users with explanations as an answer for "Why this document is +retrieved?". To address this limitation, we propose Hierarchical Category +Path-Enhanced Generative Retrieval(HyPE), which enhances explainability by +generating hierarchical category paths step-by-step before decoding docid. HyPE +leverages hierarchical category paths as explanation, progressing from broad to +specific semantic categories. This approach enables diverse explanations for +the same document depending on the query by using shared category paths between +the query and the document, and provides reasonable explanation by reflecting +the document's semantic structure through a coarse-to-fine manner. HyPE +constructs category paths with external high-quality semantic hierarchy, +leverages LLM to select appropriate candidate paths for each document, and +optimizes the generative retrieval model with path-augmented dataset. During +inference, HyPE utilizes path-aware reranking strategy to aggregate diverse +topic information, allowing the most relevant documents to be prioritized in +the final ranked list of docids. Our extensive experiments demonstrate that +HyPE not only offers a high level of explainability but also improves the +retrieval performance in the document retrieval task. + +
+
+
+
+
+ + ☆ Identifying and Decomposing Compound Ingredients in Meal Plans Using + Large Language Models KR + + +
+ This study explores the effectiveness of Large Language Models in meal +planning, focusing on their ability to identify and decompose compound +ingredients. We evaluated three models-GPT-4o, Llama-3 (70b), and Mixtral +(8x7b)-to assess their proficiency in recognizing and breaking down complex +ingredient combinations. Preliminary results indicate that while Llama-3 (70b) +and GPT-4o excel in accurate decomposition, all models encounter difficulties +with identifying essential elements like seasonings and oils. Despite strong +overall performance, variations in accuracy and completeness were observed +across models. These findings underscore LLMs' potential to enhance +personalized nutrition but highlight the need for further refinement in +ingredient decomposition. Future research should address these limitations to +improve nutritional recommendations and health outcomes. + +
+
+ comment: Comments: Presented at NeLaMKRR@KR, 2024 (arXiv:2410.05339) +
+
+
+
+
+ + ☆ IntellBot: Retrieval Augmented LLM Chatbot for Cyber Threat Knowledge + Delivery + + +
+ In the rapidly evolving landscape of cyber security, intelligent chatbots are +gaining prominence. Artificial Intelligence, Machine Learning, and Natural +Language Processing empower these chatbots to handle user inquiries and deliver +threat intelligence. This helps make cyber security knowledge readily available to +both professionals and the public. Traditional rule-based chatbots often lack +flexibility and struggle to adapt to user interactions. In contrast, Large +Language Model-based chatbots offer contextually relevant information across +multiple domains and adapt to evolving conversational contexts. In this work, +we develop IntellBot, an advanced cyber security Chatbot built on top of +cutting-edge technologies like Large Language Models and Langchain alongside a +Retrieval-Augmented Generation model to deliver superior capabilities. This +chatbot gathers information from diverse data sources to create a comprehensive +knowledge base covering known vulnerabilities, recent cyber attacks, and +emerging threats. It delivers tailored responses, serving as a primary hub for +cyber security insights. By providing instant access to relevant information +and resources, this IntellBot enhances threat intelligence, incident response, +and overall security posture, saving time and empowering users with knowledge +of cyber security best practices. Moreover, we analyzed the performance of our +copilot using a two-stage evaluation strategy. We achieved BERT score above 0.8 +by indirect approach and a cosine similarity score ranging from 0.8 to 1, which +affirms the accuracy of our copilot. Additionally, we utilized RAGAS to +evaluate the RAG model, and all evaluation metrics consistently produced scores +above 0.77, highlighting the efficacy of our system. + +
+
+
+
+
+ + ☆ Ev2R: Evaluating Evidence Retrieval in Automated Fact-Checking + + +
+ Current automated fact-checking (AFC) approaches commonly evaluate evidence +either implicitly via the predicted verdicts or by comparing retrieved evidence +with a predefined closed knowledge source, such as Wikipedia. However, these +methods suffer from limitations, resulting from their reliance on evaluation +metrics developed for different purposes and constraints imposed by closed +knowledge sources. Recent advances in natural language generation (NLG) +evaluation offer new possibilities for evidence assessment. In this work, we +introduce Ev2R, an evaluation framework for AFC that comprises three types of +approaches for evidence evaluation: reference-based, proxy-reference, and +reference-less. We evaluate their effectiveness through agreement with human +ratings and adversarial tests, and demonstrate that prompt-based scorers, +particularly those leveraging LLMs and reference evidence, outperform +traditional evaluation approaches. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Improving Multi-Domain Task-Oriented Dialogue System with Offline + Reinforcement Learning + + +
+ Task-oriented dialogue (TOD) system is designed to accomplish user-defined +tasks through dialogues. The TOD system has progressed towards end-to-end +modeling by leveraging pre-trained large language models. Fine-tuning the +pre-trained language models using only supervised learning leads to the +exposure bias and token loss problem and it deviates the models from completing +the user's task. To address these issues, we propose a TOD system that +leverages a unified pre-trained language model, GPT2, as a base model. It is +optimized using supervised learning and reinforcement learning (RL). The issues +in the TOD system are mitigated using a non-differentiable reward function. The +reward is calculated using the weighted sum of the success rate and BLEU +evaluation metrics. The success rate and BLEU metrics in reward calculation +guide the language model for user task completion while ensuring a coherent and +fluent response. Our model is acquired by fine-tuning a pre-trained model on +the dialogue-session level which comprises user utterance, belief state, system +act, and system response. Experimental results on MultiWOZ2.1 demonstrate that +our model increases the inform rate by 1.60% and the success rate by 3.17% +compared to the baseline. + +
+
+
+
+
+ + ♻ ☆ Logic Query of Thoughts: Guiding Large Language Models to Answer Complex + Logic Queries with Knowledge Graphs + + +
+ Despite the superb performance in many tasks, large language models (LLMs) +bear the risk of generating hallucination or even wrong answers when confronted +with tasks that demand the accuracy of knowledge. The issue becomes even more +noticeable when addressing logic queries that require multiple logic reasoning +steps. On the other hand, knowledge graph (KG) based question answering methods +are capable of accurately identifying the correct answers with the help of +knowledge graph, yet its accuracy could quickly deteriorate when the knowledge +graph itself is sparse and incomplete. It remains a critical challenge on how +to integrate knowledge graph reasoning with LLMs in a mutually beneficial way +so as to mitigate both the hallucination problem of LLMs as well as the +incompleteness issue of knowledge graphs. In this paper, we propose +'Logic-Query-of-Thoughts' (LGOT) which is the first of its kind to combine LLMs +with knowledge graph based logic query reasoning. LGOT seamlessly combines +knowledge graph reasoning and LLMs, effectively breaking down complex logic +queries into easy to answer subquestions. Through the utilization of both +knowledge graph reasoning and LLMs, it successfully derives answers for each +subquestion. By aggregating these results and selecting the highest quality +candidate answers for each step, LGOT achieves accurate results to complex +questions. Our experimental findings demonstrate substantial performance +enhancements, with up to 20% improvement over ChatGPT. + +
+
+
+
+
+ + ♻ ☆ Cluster-based Graph Collaborative Filtering + + +
+ Graph Convolution Networks (GCNs) have significantly succeeded in learning +user and item representations for recommendation systems. The core of their +efficacy is the ability to explicitly exploit the collaborative signals from +both the first- and high-order neighboring nodes. However, most existing +GCN-based methods overlook the multiple interests of users while performing +high-order graph convolution. Thus, the noisy information from unreliable +neighbor nodes (e.g., users with dissimilar interests) negatively impacts the +representation learning of the target node. Additionally, conducting graph +convolution operations without differentiating high-order neighbors suffers the +over-smoothing issue when stacking more layers, resulting in performance +degradation. In this paper, we aim to capture more valuable information from +high-order neighboring nodes while avoiding noise for better representation +learning of the target node. To achieve this goal, we propose a novel GCN-based +recommendation model, termed Cluster-based Graph Collaborative Filtering +(ClusterGCF). This model performs high-order graph convolution on +cluster-specific graphs, which are constructed by capturing the multiple +interests of users and identifying the common interests among them. +Specifically, we design an unsupervised and optimizable soft node clustering +approach to classify user and item nodes into multiple clusters. Based on the +soft node clustering results and the topology of the user-item interaction +graph, we assign the nodes with probabilities for different clusters to +construct the cluster-specific graphs. To evaluate the effectiveness of +ClusterGCF, we conducted extensive experiments on four publicly available +datasets. Experimental results demonstrate that our model can significantly +improve recommendation performance. + +
+
+ comment: Accepted by ACM TOIS +
+
+
+
+
+ + ♻ ☆ On Generative Agents in Recommendation SIGIR 2024 + + +
+ Recommender systems are the cornerstone of today's information dissemination, +yet a disconnect between offline metrics and online performance greatly hinders +their development. Addressing this challenge, we envision a recommendation +simulator, capitalizing on recent breakthroughs in human-level intelligence +exhibited by Large Language Models (LLMs). We propose Agent4Rec, a user +simulator in recommendation, leveraging LLM-empowered generative agents +equipped with user profile, memory, and actions modules specifically tailored +for the recommender system. In particular, these agents' profile modules are +initialized using real-world datasets (e.g. MovieLens, Steam, Amazon-Book), +capturing users' unique tastes and social traits; memory modules log both +factual and emotional memories and are integrated with an emotion-driven +reflection mechanism; action modules support a wide variety of behaviors, +spanning both taste-driven and emotion-driven actions. Each agent interacts +with personalized recommender models in a page-by-page manner, relying on a +pre-implemented collaborative filtering-based recommendation algorithm. We +delve into both the capabilities and limitations of Agent4Rec, aiming to +explore an essential research question: ``To what extent can LLM-empowered +generative agents faithfully simulate the behavior of real, autonomous humans +in recommender systems?'' Extensive and multi-faceted evaluations of Agent4Rec +highlight both the alignment and deviation between agents and user-personalized +preferences. Beyond mere performance comparison, we explore insightful +experiments, such as emulating the filter bubble effect and discovering the +underlying causal relationships in recommendation tasks. Our codes are +available at https://github.com/LehengTHU/Agent4Rec. + +
+
+ comment: SIGIR 2024 perspective paper +
+
+
+
+
+ + ♻ ☆ Feature Noise Resilient for QoS Prediction with Probabilistic Deep + Supervision + + +
+ Accurate Quality of Service (QoS) prediction is essential for enhancing user +satisfaction in web recommendation systems, yet existing prediction models +often overlook feature noise, focusing predominantly on label noise. In this +paper, we present the Probabilistic Deep Supervision Network (PDS-Net), a +robust framework designed to effectively identify and mitigate feature noise, +thereby improving QoS prediction accuracy. PDS-Net operates with a dual-branch +architecture: the main branch utilizes a decoder network to learn a +Gaussian-based prior distribution from known features, while the second branch +derives a posterior distribution based on true labels. A key innovation of +PDS-Net is its condition-based noise recognition loss function, which enables +precise identification of noisy features in objects (users or services). Once +noisy features are identified, PDS-Net refines the feature's prior +distribution, aligning it with the posterior distribution, and propagates this +adjusted distribution to intermediate layers, effectively reducing noise +interference. Extensive experiments conducted on two real-world QoS datasets +demonstrate that PDS-Net consistently outperforms existing models, achieving an +average improvement of 8.91% in MAE on Dataset D1 and 8.32% on Dataset D2 +compared to the state-of-the-art. These results highlight PDS-Net's ability to +accurately capture complex user-service relationships and handle feature noise, +underscoring its robustness and versatility across diverse QoS prediction +environments. + +
+
+
+
+
+ + ♻ ☆ Limpeh ga li gong: Challenges in Singlish Annotations + + +
+ Singlish, or Colloquial Singapore English, is a language formed from oral and +social communication within multicultural Singapore. In this work, we work on a +fundamental Natural Language Processing (NLP) task: Parts-Of-Speech (POS) +tagging of Singlish sentences. For our analysis, we build a parallel Singlish +dataset containing direct English translations and POS tags, with translation +and POS annotation done by native Singlish speakers. Our experiments show that +automatic transition- and transformer- based taggers perform with only $\sim +80\%$ accuracy when evaluated against human-annotated POS labels, suggesting +that there is indeed room for improvement on computational analysis of the +language. We provide an exposition of challenges in Singlish annotation: its +inconsistencies in form and semantics, the highly context-dependent particles +of the language, its structurally unique expressions, and the variation of the +language on different mediums. Our task definition, resultant labels and +results reflect the challenges in analysing colloquial languages formulated +from a variety of dialects, and pave the way for future studies beyond POS +tagging. + +
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ On the Role of Noise in AudioVisual Integration: Evidence from + Artificial Neural Networks that Exhibit the McGurk Effect + + +
+ Humans are able to fuse information from both auditory and visual modalities +to help with understanding speech. This is frequently demonstrated through a +phenomenon known as the McGurk Effect, during which a listener is presented +with incongruent auditory and visual speech that fuse together into the percept +of an illusory intermediate phoneme. Building on a recent framework that +proposes how to address developmental 'why' questions using artificial neural +networks, we evaluated a set of recent artificial neural networks trained on +audiovisual speech by testing them with audiovisually incongruent words +designed to elicit the McGurk effect. We compared networks trained on clean +speech to those trained on noisy speech, and discovered that training with +noisy speech led to an increase in both visual responses and McGurk responses +across all models. Furthermore, we observed that systematically increasing the +level of auditory noise during ANN training also increased the amount of +audiovisual integration up to a point, but at extreme noise levels, this +integration failed to develop. These results suggest that excessive noise +exposure during critical periods of audiovisual learning may negatively +influence the development of audiovisual speech integration. This work also +demonstrates that the McGurk effect reliably emerges untrained from the +behaviour of both supervised and unsupervised networks. This supports the +notion that artificial neural networks might be useful models for certain +aspects of perception and cognition. + +
+
+
+
+
+ + ☆ Interdisciplinary Translations: Sensory Perception as a Universal + Language + + +
+ This paper investigates sensory perception's pivotal role as a universal +communicative bridge across varied cultures and disciplines, and how it +manifests its value in the study of media art, human computer interaction and +artificial intelligence. By analyzing its function in non-verbal communication +through interactive systems, and drawing on the interpretive model in +translation studies where "sense" acts as a mediation between two languages, +this paper illustrates how interdisciplinary communication in media art and +human-computer interaction is afforded by the abstract language of human +sensory perception. Specific examples from traditional art, interactive media +art, HCI, communication, and translation studies demonstrate how sensory +feedback translates and conveys meaning across diverse modalities of expression +and how it fosters connections between humans, art, and technology. Pertaining +to this topic, this paper analyzes the impact of sensory feedback systems in +designing interactive experiences, and reveals the guiding role of sensory +perception in the design philosophy of AI systems. Overall, the study aims to +broaden the understanding of sensory perception's role in communication, +highlighting its significance in the evolution of interactive experiences and +its capacity to unify art, science, and the human experience. + +
+
+ comment: This paper has been accepted to the International Symposium of + Electronic Arts 2024, and the proceedings version will be available at + https://isea-archives.siggraph.org/publications/ with DOI to be added once + published +
+
+
+
+
+ + ☆ Rate-aware Compression for NeRF-based Volumetric Video ACM MM 2024 + + +
+ The neural radiance fields (NeRF) have advanced the development of 3D +volumetric video technology, but the large data volumes they involve pose +significant challenges for storage and transmission. To address these problems, +the existing solutions typically compress these NeRF representations after the +training stage, leading to a separation between representation training and +compression. In this paper, we try to directly learn a compact NeRF +representation for volumetric video in the training stage based on the proposed +rate-aware compression framework. Specifically, for volumetric video, we use a +simple yet effective modeling strategy to reduce temporal redundancy for the +NeRF representation. Then, during the training phase, an implicit entropy model +is utilized to estimate the bitrate of the NeRF representation. This entropy +model is then encoded into the bitstream to assist in the decoding of the NeRF +representation. This approach enables precise bitrate estimation, thereby +leading to a compact NeRF representation. Furthermore, we propose an adaptive +quantization strategy and learn the optimal quantization step for the NeRF +representations. Finally, the NeRF representation can be optimized by using the +rate-distortion trade-off. Our proposed compression framework can be used for +different representations and experimental results demonstrate that our +approach significantly reduces the storage size with marginal distortion and +achieves state-of-the-art rate-distortion performance for volumetric video on +the HumanRF and ReRF datasets. Compared to the previous state-of-the-art method +TeTriRF, we achieved an approximately -80% BD-rate on the HumanRF dataset and +-60% BD-rate on the ReRF dataset. + +
+
+ comment: Accepted by ACM MM 2024 (Oral) +
+
+
+
+
+ + ☆ Content-Adaptive Rate-Quality Curve Prediction Model in Media Processing + System + + +
+ In streaming media services, video transcoding is a common practice to +alleviate bandwidth demands. Unfortunately, traditional methods employing a +uniform rate factor (RF) across all videos often result in significant +inefficiencies. Content-adaptive encoding (CAE) techniques address this by +dynamically adjusting encoding parameters based on video content +characteristics. However, existing CAE methods are often tightly coupled with +specific encoding strategies, leading to inflexibility. In this paper, we +propose a model that predicts both RF-quality and RF-bitrate curves, which can +be utilized to derive a comprehensive bitrate-quality curve. This approach +facilitates flexible adjustments to the encoding strategy without necessitating +model retraining. The model leverages codec features, content features, and +anchor features to predict the bitrate-quality curve accurately. Additionally, +we introduce an anchor suspension method to enhance prediction accuracy. +Experiments confirm that the actual quality metric (VMAF) of the compressed +video stays within 1 of the target, achieving an accuracy of 99.14%. By +incorporating our quality improvement strategy with the rate-quality curve +prediction model, we conducted online A/B tests, obtaining both +0.107% +improvements in video views and video completions and +0.064% app duration +time. Our model has been deployed on the Xiaohongshu App. + +
+
+ comment: Accepted by IEEE VCIP 2024 (Oral) +
+
+
+
+
+ + ♻ ☆ pTSE-T: Presentation Target Speaker Extraction using Unaligned Text Cues + + +
+ TSE(Target Speaker Extraction) aims to extract the clean speech of the target +speaker in an audio mixture, thus eliminating irrelevant background noise and +speech. While prior work has explored various auxiliary cues including +pre-recorded speech, visual information (e.g., lip motions and gestures), and +spatial information, the acquisition and selection of such strong cues are +infeasible in many practical scenarios. Unlike all existing work, in this +paper, we condition the TSE algorithm on semantic cues extracted from limited +and unaligned text content, such as condensed points from a presentation slide. +This method is particularly useful in scenarios like meetings, poster sessions, +or lecture presentations, where acquiring other cues in real-time is +challenging. To this end, we design two different networks. Specifically, our +proposed TPE fuses audio features with content-based semantic cues to +facilitate time-frequency mask generation to filter out extraneous noise, while +another proposal, namely TSR, employs the contrastive learning technique to +associate blindly separated speech signals with semantic cues. The experimental +results show the efficacy in accurately identifying the target speaker by +utilizing semantic cues derived from limited and unaligned text, resulting in +SI-SDRi of 12.16 dB, SDRi of 12.66 dB, PESQi of 0.830 and STOIi of 0.150, +respectively. Dataset and source code will be publicly available. Project demo +page: https://slideTSE.github.io/. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 12 + +
+
+
+ + ☆ Orbit: A Framework for Designing and Evaluating Multi-objective Rankers + + +
+ Machine learning in production needs to balance multiple objectives: This is +particularly evident in ranking or recommendation models, where conflicting +objectives such as user engagement, satisfaction, diversity, and novelty must +be considered at the same time. However, designing multi-objective rankers is +inherently a dynamic wicked problem -- there is no single optimal solution, and +the needs evolve over time. Effective design requires collaboration between +cross-functional teams and careful analysis of a wide range of information. In +this work, we introduce Orbit, a conceptual framework for Objective-centric +Ranker Building and Iteration. The framework places objectives at the center of +the design process, to serve as boundary objects for communication and guide +practitioners for design and evaluation. We implement Orbit as an interactive +system, which enables stakeholders to interact with objective spaces directly +and supports real-time exploration and evaluation of design trade-offs. We +evaluate Orbit through a user study involving twelve industry practitioners, +showing that it supports efficient design space exploration, leads to more +informed decision-making, and enhances awareness of the inherent trade-offs of +multiple objectives. Orbit (1) opens up new opportunities of an +objective-centric design process for any multi-objective ML models, as well as +(2) sheds light on future designs that push practitioners to go beyond a narrow +metric-centric or example-centric mindset. + +
+
+
+
+
+ + ☆ Lightning IR: Straightforward Fine-tuning and Inference of + Transformer-based Language Models for Information Retrieval WSDM'25 + + +
+ A wide range of transformer-based language models have been proposed for +information retrieval tasks. However, fine-tuning and inference of these models +is often complex and requires substantial engineering effort. This paper +introduces Lightning IR, a PyTorch Lightning-based framework for fine-tuning +and inference of transformer-based language models for information retrieval. +Lightning IR provides a modular and extensible architecture that supports all +stages of an information retrieval pipeline: from fine-tuning and indexing to +searching and re-ranking. It is designed to be straightforward to use, +scalable, and reproducible. Lightning IR is available as open-source: +https://github.com/webis-de/lightning-ir. + +
+
+ comment: Accepted as a demo at WSDM'25 +
+
+
+
+
+ + ☆ Self-Calibrated Listwise Reranking with Large Language Models + + +
+ Large language models (LLMs), with advanced linguistic capabilities, have +been employed in reranking tasks through a sequence-to-sequence approach. In +this paradigm, multiple passages are reranked in a listwise manner and a +textual reranked permutation is generated. However, due to the limited context +window of LLMs, this reranking paradigm requires a sliding window strategy to +iteratively handle larger candidate sets. This not only increases computational +costs but also restricts the LLM from fully capturing all the comparison +information for all candidates. To address these challenges, we propose a novel +self-calibrated listwise reranking method, which aims to leverage LLMs to +produce global relevance scores for ranking. To achieve it, we first propose +the relevance-aware listwise reranking framework, which incorporates explicit +list-view relevance scores to improve reranking efficiency and enable global +comparison across the entire candidate set. Second, to ensure the comparability +of the computed scores, we propose self-calibrated training that uses +point-view relevance assessments generated internally by the LLM itself to +calibrate the list-view relevance assessments. Extensive experiments and +comprehensive analysis on the BEIR benchmark and TREC Deep Learning Tracks +demonstrate the effectiveness and efficiency of our proposed method. + +
+
+
+
+
+ + ☆ Best Practices for Distilling Large Language Models into BERT for Web + Search Ranking + + +
+ Recent studies have highlighted the significant potential of Large Language +Models (LLMs) as zero-shot relevance rankers. These methods predominantly +utilize prompt learning to assess the relevance between queries and documents +by generating a ranked list of potential documents. Despite their promise, the +substantial costs associated with LLMs pose a significant challenge for their +direct implementation in commercial search systems. To overcome this barrier +and fully exploit the capabilities of LLMs for text ranking, we explore +techniques to transfer the ranking expertise of LLMs to a more compact model +similar to BERT, using a ranking loss to enable the deployment of less +resource-intensive models. Specifically, we enhance the training of LLMs +through Continued Pre-Training, taking the query as input and the clicked title +and summary as output. We then proceed with supervised fine-tuning of the LLM +using a rank loss, assigning the final token as a representative of the entire +sentence. Given the inherent characteristics of autoregressive language models, +only the final token can encapsulate all preceding tokens. Additionally, +we introduce a hybrid point-wise and margin MSE loss to transfer the ranking +knowledge from LLMs to smaller models like BERT. This method creates a viable +solution for environments with strict resource constraints. Both offline and +online evaluations have confirmed the efficacy of our approach, and our model +has been successfully integrated into a commercial web search engine as of +February 2024. + +
+
+ comment: Arxiv Version +
+
+
+
+
+ + ☆ Leveraging LLMs to Enable Natural Language Search on Go-to-market + Platforms + + +
+ Enterprise searches require users to have complex knowledge of queries, +configurations, and metadata, rendering it difficult for them to access +information as needed. Most go-to-market (GTM) platforms utilize advanced +search, an interface that enables users to filter queries by various fields +using categories or keywords, which, historically, however, has proven to be +exceedingly cumbersome, as users are faced with seemingly hundreds of options, +fields, and buttons. Consequently, querying with natural language has long been +ideal, a notion further empowered by Large Language Models (LLMs). + In this paper, we implement and evaluate a solution for the Zoominfo product +for sellers, which prompts the LLM with natural language, producing search +fields through entity extraction that are then converted into a search query. +The intermediary search fields offer numerous advantages for each query, +including the elimination of syntax errors, simpler ground truths, and an +intuitive format for the LLM to interpret. + We paired this pipeline with many advanced prompt engineering strategies, +featuring an intricate system message, few-shot prompting, chain-of-thought +(CoT) reasoning, and execution refinement. Furthermore, we manually created the +ground truth for 500+ natural language queries, enabling the supervised +fine-tuning of Llama-3-8B-Instruct and the introduction of sophisticated +numerical metrics. + Comprehensive experiments with closed, open source, and fine-tuned LLM models +were conducted through exact, Jaccard, cosine, and semantic similarity on +individual search entities to demonstrate the efficacy of our approach. +Overall, the most accurate closed model had an average accuracy of 97% per +query, with only one field performing under 90%, with comparable results +observed from the fine-tuned models. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Towards Competitive Search Relevance For Inference-Free Learned Sparse + Retrievers + + +
+ Learned sparse retrieval, which can efficiently perform retrieval through +mature inverted-index engines, has garnered growing attention in recent years. +Particularly, the inference-free sparse retrievers are attractive as they +eliminate online model inference in the retrieval phase, thereby avoiding huge +computational cost, offering reasonable throughput and latency. However, even +the state-of-the-art (SOTA) inference-free sparse models lag far behind in +terms of search relevance when compared to both sparse and dense siamese +models. Towards competitive search relevance for inference-free sparse +retrievers, we argue that they deserve dedicated training methods rather than +using the same ones as siamese encoders. In this paper, we propose two different +approaches for performance improvement. First, we introduce the IDF-aware FLOPS +loss, which introduces Inverted Document Frequency (IDF) to the sparsification +of representations. We find that it mitigates the negative impact of the FLOPS +regularization on search relevance, allowing the model to achieve a better +balance between accuracy and efficiency. Moreover, we propose a heterogeneous +ensemble knowledge distillation framework that combines siamese dense and +sparse retrievers to generate supervisory signals during the pre-training +phase. The ensemble framework of dense and sparse retrievers capitalizes on +their respective strengths, providing a strong upper bound for knowledge +distillation. To reconcile the diverse feedback from heterogeneous supervisors, we +normalize and then aggregate the outputs of the teacher models to eliminate +score scale differences. On the BEIR benchmark, our model outperforms the existing +SOTA inference-free sparse model by 3.3 NDCG@10 points. It exhibits +search relevance comparable to siamese sparse retrievers and client-side +latency only 1.1x that of BM25. + +
+
+
+
+
+ + ☆ The Concatenator: A Bayesian Approach To Real Time Concatenative + Musaicing + + +
+ We present ``The Concatenator,'' a real time system for audio-guided +concatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or +``audio mosaicing'') technique, we concatenate a set number of windows within a +corpus of audio to re-create the harmonic and percussive aspects of a target +audio stream. Unlike Driedger's NMF-based technique, however, we instead use an +explicitly Bayesian point of view, where corpus window indices are hidden +states and the target audio stream is an observation. We use a particle filter +to infer the best hidden corpus states in real-time. Our transition model +includes a tunable parameter to control the time-continuity of corpus grains, +and our observation model allows users to prioritize how quickly windows change +to match the target. Because the computational complexity of the system is +independent of the corpus size, our system scales to corpora that are hours +long, which is an important feature in the age of vast audio data collections. +Within The Concatenator module itself, composers can vary grain length, fit to +target, and pitch shift in real time while reacting to the sounds they hear, +enabling them to rapidly iterate ideas. To conclude our work, we evaluate our +system with extensive quantitative tests of the effects of parameters, as well +as a qualitative evaluation with artistic insights. Based on the quality of the +results, we believe the real-time capability unlocks new avenues for musical +expression and control, suitable for live performance and modular synthesis +integration, which furthermore represents an essential breakthrough in +concatenative synthesis technology. + +
+
+ comment: 12 pages, 6 figures, Accepted for Publication in The International + Society for Music Information Retrieval Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ RRADistill: Distilling LLMs' Passage Ranking Ability for Document + Re-Ranking of Long-Tail Queries in a Search Engine EMNLP 2024 + + +
+ Large Language Models (LLMs) excel at understanding the semantic +relationships between queries and documents, even with lengthy and complex +long-tail queries. These queries are challenging for feedback-based rankings +due to sparse user engagement and limited feedback, making LLMs' ranking +ability highly valuable. However, the large size and slow inference of LLMs +necessitate the development of smaller, more efficient models (sLLMs). +Recently, integrating ranking label generation into distillation techniques has +become crucial, but existing methods underutilize LLMs' capabilities and are +cumbersome. Our research, RRADistill: Re-Ranking Ability Distillation, proposes +an efficient label generation pipeline and novel sLLM training methods for both +encoder and decoder models. We introduce an encoder-based method using a Term +Control Layer to capture term matching signals and a decoder-based model with a +ranking layer for enhanced understanding. A/B testing on a Korean-based search +platform validates the effectiveness of our approach in improving re-ranking +for long-tail queries. + +
+
+ comment: Accepted to EMNLP 2024 Industry Track. First two authors contributed + equally +
+
+
+
+
+ + ♻ ☆ On Softmax Direct Preference Optimization for Recommendation NeurIPS 2024 + + +
+ Recommender systems aim to predict personalized rankings based on user +preference data. With the rise of Language Models (LMs), LM-based recommenders +have been widely explored due to their extensive world knowledge and powerful +reasoning abilities. Most of the LM-based recommenders convert historical +interactions into language prompts, pairing with a positive item as the target +response and fine-tuning LM with a language modeling loss. However, the current +objective fails to fully leverage preference data and is not optimized for +personalized ranking tasks, which hinders the performance of LM-based +recommenders. Inspired by the current advancement of Direct Preference +Optimization (DPO) in human preference alignment and the success of softmax +loss in recommendations, we propose Softmax-DPO (S-DPO) to instill ranking +information into the LM to help LM-based recommenders distinguish preferred +items from negatives, rather than solely focusing on positives. Specifically, +we incorporate multiple negatives in user preference data and devise an +alternative version of DPO loss tailored for LM-based recommenders, which is +extended from the traditional full-ranking Plackett-Luce (PL) model to partial +rankings and connected to softmax sampling strategies. Theoretically, we bridge +S-DPO with the softmax loss over negative sampling and find that it has an +inherent benefit of mining hard negatives, which assures its exceptional +capabilities in recommendation tasks. Empirically, extensive experiments +conducted on three real-world datasets demonstrate the superiority of S-DPO to +effectively model user preference and further boost recommendation performance +while providing better rewards for preferred items. Our codes are available at +https://github.com/chenyuxin1999/S-DPO. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ♻ ☆ Talking the Talk Does Not Entail Walking the Walk: On the Limits of + Large Language Models in Lexical Entailment Recognition EMNLP-2024 + + +
+ Verbs form the backbone of language, providing the structure and meaning to +sentences. Yet, their intricate semantic nuances pose a longstanding challenge. +Understanding verb relations through the concept of lexical entailment is +crucial for comprehending sentence meanings and grasping verb dynamics. This +work investigates the capabilities of eight Large Language Models in +recognizing lexical entailment relations among verbs through differently +devised prompting strategies and zero-/few-shot settings over verb pairs from +two lexical databases, namely WordNet and HyperLex. Our findings unveil that +the models can tackle the lexical entailment recognition task with moderately +good performance, although with varying degrees of effectiveness and under +different conditions. Also, utilizing few-shot prompting can enhance the +models' performance. However, perfectly solving the task arises as an unmet +challenge for all examined LLMs, which raises an urgent need for further research +developments on this topic. + +
+
+ comment: Accepted for publication at The 2024 Conference on Empirical Methods + in Natural Language Processing (EMNLP-2024) - Findings +
+
+
+
+
+ + ♻ ☆ ChartifyText: Automated Chart Generation from Data-Involved Texts via + LLM + + +
+ Text documents with numerical values involved are widely used in various +applications such as scientific research, economy, public health and +journalism. However, it is difficult for readers to quickly interpret such +data-involved texts and gain deep insights. To fill this research gap, this +work aims to automatically generate charts to accurately convey the underlying +data and ideas to readers, which is essentially a challenging task. The +challenges originate from text ambiguities, intrinsic sparsity and uncertainty +of data in text documents, and subjective sentiment differences. Specifically, +we propose ChartifyText, a novel fully-automated approach that leverages Large +Language Models (LLMs) to convert complex data-involved texts to expressive +charts. It consists of two major modules: tabular data inference and expressive +chart generation. The tabular data inference module employs systematic prompt +engineering to guide the LLM (e.g., GPT-4) to infer table data, where data +ranges, uncertainties, missing data values and corresponding subjective +sentiments are explicitly considered. The expressive chart generation module +augments standard charts with intuitive visual encodings and concise texts to +accurately convey the underlying data and insights. We extensively evaluate the +effectiveness of ChartifyText on real-world data-involved text documents +through case studies, in-depth interviews with three visualization experts, and +a carefully-designed user study with 15 participants. The results demonstrate +the usefulness and effectiveness of ChartifyText in helping readers efficiently +and effectively make sense of data-involved texts. + +
+
+
+
+
+ + ♻ ☆ LightRAG: Simple and Fast Retrieval-Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) systems enhance large language models +(LLMs) by integrating external knowledge sources, enabling more accurate and +contextually relevant responses tailored to user needs. However, existing RAG +systems have significant limitations, including reliance on flat data +representations and inadequate contextual awareness, which can lead to +fragmented answers that fail to capture complex inter-dependencies. To address +these challenges, we propose LightRAG, which incorporates graph structures into +text indexing and retrieval processes. This innovative framework employs a +dual-level retrieval system that enhances comprehensive information retrieval +from both low-level and high-level knowledge discovery. Additionally, the +integration of graph structures with vector representations facilitates +efficient retrieval of related entities and their relationships, significantly +improving response times while maintaining contextual relevance. This +capability is further enhanced by an incremental update algorithm that ensures +the timely integration of new data, allowing the system to remain effective and +responsive in rapidly changing data environments. Extensive experimental +validation demonstrates considerable improvements in retrieval accuracy and +efficiency compared to existing approaches. We have made our LightRAG +open-source and available at the link: https://github.com/HKUDS/LightRAG. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ A multi-purpose automatic editing system based on lecture semantics for + remote education + + +
+ Remote teaching has become popular recently due to its convenience and +safety, especially under extreme circumstances like a pandemic. However, online +students usually have a poor experience since the information acquired from the +views provided by the broadcast platforms is limited. One potential solution is +to show more camera views simultaneously, but it is technically challenging and +distracting for the viewers. Therefore, an automatic multi-camera +directing/editing system, which aims at selecting the most concerned view at +each time instance to guide the attention of online students, is in urgent +demand. However, existing systems mostly make simple assumptions and focus on +tracking the position of the speaker instead of the real lecture semantics, and +therefore have limited capacities to deliver optimal information flow. To this +end, this paper proposes an automatic multi-purpose editing system based on the +lecture semantics, which can both direct the multiple video streams for +real-time broadcasting and edit the optimal video offline for review purposes. +Our system directs the views by semantically analyzing the class events while +following the professional directing rules, mimicking a human director to +capture the regions of interest from the viewpoint of the onsite students. We +conduct both qualitative and quantitative analyses to verify the effectiveness +of the proposed system and its components. + +
+
+
+
+
+ + ☆ Continuous Sign Language Recognition System using Deep Learning with + MediaPipe Holistic + + +
+ Sign languages are the language of hearing-impaired people who use visuals +like the hand, facial, and body movements for communication. There are +different signs and gestures representing alphabets, words, and phrases. +Nowadays approximately 300 sign languages are being practiced worldwide such as +American Sign Language (ASL), Chinese Sign Language (CSL), Indian Sign Language +(ISL), and many more. Sign languages are dependent on the vocal language of a +place. Unlike vocal or spoken languages, there are no helping words in sign +language like is, am, are, was, were, will, be, etc. As only a limited +population is well-versed in sign language, this lack of familiarity of sign +language hinders hearing-impaired people from communicating freely and easily +with everyone. This issue can be addressed by a sign language recognition (SLR) +system which has the capability to translate the sign language into vocal +language. In this paper, a continuous SLR system is proposed using a deep +learning model employing Long Short-Term Memory (LSTM), trained and tested on +an ISL primary dataset. This dataset is created using MediaPipe Holistic +pipeline for tracking face, hand, and body movements and collecting landmarks. +The system recognizes the signs and gestures in real-time with 88.23% accuracy. + +
+
+ comment: 14 pages, 4 figures, Wireless Pers Commun +
+
+
+
+
+ + ☆ The Concatenator: A Bayesian Approach To Real Time Concatenative + Musaicing + + +
+ We present ``The Concatenator,'' a real time system for audio-guided +concatenative synthesis. Similarly to Driedger et al.'s ``musaicing'' (or +``audio mosaicing'') technique, we concatenate a set number of windows within a +corpus of audio to re-create the harmonic and percussive aspects of a target +audio stream. Unlike Driedger's NMF-based technique, however, we instead use an +explicitly Bayesian point of view, where corpus window indices are hidden +states and the target audio stream is an observation. We use a particle filter +to infer the best hidden corpus states in real-time. Our transition model +includes a tunable parameter to control the time-continuity of corpus grains, +and our observation model allows users to prioritize how quickly windows change +to match the target. Because the computational complexity of the system is +independent of the corpus size, our system scales to corpora that are hours +long, which is an important feature in the age of vast audio data collections. +Within The Concatenator module itself, composers can vary grain length, fit to +target, and pitch shift in real time while reacting to the sounds they hear, +enabling them to rapidly iterate ideas. To conclude our work, we evaluate our +system with extensive quantitative tests of the effects of parameters, as well +as a qualitative evaluation with artistic insights. Based on the quality of the +results, we believe the real-time capability unlocks new avenues for musical +expression and control, suitable for live performance and modular synthesis +integration, which furthermore represents an essential breakthrough in +concatenative synthesis technology. + +
+
+ comment: 12 pages, 6 figures, Accepted for Publication in The International + Society for Music Information Retrieval Proceedings, 2024 +
+
+
+
+
+ + ♻ ☆ PIAST: A Multimodal Piano Dataset with Audio, Symbolic and Text + + +
+ While piano music has become a significant area of study in Music Information +Retrieval (MIR), there is a notable lack of datasets for piano solo music with +text labels. To address this gap, we present PIAST (PIano dataset with Audio, +Symbolic, and Text), a piano music dataset. Utilizing a piano-specific taxonomy +of semantic tags, we collected 9,673 tracks from YouTube and added human +annotations for 2,023 tracks by music experts, resulting in two subsets: +PIAST-YT and PIAST-AT. Both include audio, text, tag annotations, and +transcribed MIDI utilizing state-of-the-art piano transcription and beat +tracking models. Among many possible tasks with the multi-modal dataset, we +conduct music tagging and retrieval using both audio and MIDI data and report +baseline performances to demonstrate its potential as a valuable resource for +MIR research. + +
+
+ comment: Accepted for publication at the 3rd Workshop on NLP for Music and + Audio (NLP4MusA 2024) +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ dsld: A Socially Relevant Tool for Teaching Statistics + + +
+ The growing power of data science can play a crucial role in addressing +social discrimination, necessitating nuanced understanding and effective +mitigation strategies of potential biases. Data Science Looks At Discrimination +(dsld) is an R and Python package designed to provide users with a +comprehensive toolkit of statistical and graphical methods for assessing +possible discrimination related to protected groups, such as race, gender, and +age. Our software offers techniques for discrimination analysis by identifying +and mitigating confounding variables, along with methods for reducing bias in +predictive models. + In educational settings, dsld offers instructors powerful tools to teach +important statistical principles through motivating real world examples of +discrimination analysis. The inclusion of an 80-page Quarto book further +supports users, from statistics educators to legal professionals, in +effectively applying these analytical tools to real world scenarios. + +
+
+ comment: To be submitted to the Journal of Statistics and Data Science + Education +
+
+
+
+
+ + ☆ Reproducible Hybrid Time-Travel Retrieval in Evolving Corpora + + +
+ There are settings in which reproducibility of ranked lists is desirable, +such as when extracting a subset of an evolving document corpus for downstream +research tasks or in domains such as patent retrieval or in medical systematic +reviews, with high reproducibility expectations. However, as global term +statistics change when documents change or are added to a corpus, queries using +typical ranked retrieval models are not even reproducible for the parts of the +document corpus that have not changed. Thus, Boolean retrieval frequently +remains the mechanism of choice in such settings. + We present a hybrid retrieval system combining Lucene for fast retrieval with +a column-store-based retrieval system maintaining a versioned and time-stamped +index. The latter component allows re-execution of previously posed queries +resulting in the same ranked list and further allows for time-travel queries +over evolving collections, such as web archives, while maintaining the original +ranking. Thus, retrieval results in evolving document collections are fully +reproducible even when document collections and thus term statistics change. + +
+
+
+
+
+ + ☆ Fine-Grained Guidance for Retrievers: Leveraging LLMs' Feedback in + Retrieval-Augmented Generation + + +
+ Retrieval-Augmented Generation (RAG) has proven to be an effective method for +mitigating hallucination issues inherent in large language models (LLMs). +Previous approaches typically train retrievers based on semantic similarity, +lacking optimization for RAG. More recent works have proposed aligning +retrievers with the preference signals of LLMs. However, these preference +signals are often difficult for dense retrievers, which typically have weaker +language capabilities, to understand and learn effectively. Drawing inspiration +from pedagogical theories like Guided Discovery Learning, we propose a novel +framework, FiGRet (Fine-grained Guidance for Retrievers), which leverages the +language capabilities of LLMs to construct examples from a more granular, +information-centric perspective to guide the learning of retrievers. +Specifically, our method utilizes LLMs to construct easy-to-understand examples +from samples where the retriever performs poorly, focusing on three learning +objectives highly relevant to the RAG scenario: relevance, comprehensiveness, +and purity. These examples serve as scaffolding to ultimately align the +retriever with the LLM's preferences. Furthermore, we employ a dual curriculum +learning strategy and leverage the reciprocal feedback between LLM and +retriever to further enhance the performance of the RAG system. A series of +experiments demonstrate that our proposed framework enhances the performance of +RAG systems equipped with different retrievers and is applicable to various +LLMs. + +
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ Lexicalization Is All You Need: Examining the Impact of Lexical + Knowledge in a Compositional QALD System + + +
+ In this paper, we examine the impact of lexicalization on Question Answering +over Linked Data (QALD). It is well known that one of the key challenges in +interpreting natural language questions with respect to SPARQL lies in bridging +the lexical gap, that is mapping the words in the query to the correct +vocabulary elements. We argue in this paper that lexicalization, that is +explicit knowledge about the potential interpretations of a word with respect +to the given vocabulary, significantly eases the task and increases the +performance of QA systems. Towards this goal, we present a compositional QA +system that can leverage explicit lexical knowledge in a compositional manner +to infer the meaning of a question in terms of a SPARQL query. We show that +such a system, given lexical knowledge, has a performance well beyond current +QA systems, achieving up to a $35.8\%$ increase in the micro $F_1$ score +compared to the best QA system on QALD-9. This shows the importance and +potential of including explicit lexical knowledge. In contrast, we show that +LLMs have limited abilities to exploit lexical knowledge, with only marginal +improvements compared to a version without lexical knowledge. This shows that +LLMs have no ability to compositionally interpret a question on the basis of +the meaning of its parts, a key feature of compositional approaches. Taken +together, our work shows new avenues for QALD research, emphasizing the +importance of lexicalization and compositionality. + +
+
+ comment: 24th International Conference on Knowledge Engineering and Knowledge + Management (EKAW 2024), November 26-28, 2024, Amsterdam, The Netherlands +
+
+
+
+
+ + ☆ Data Fusion of Synthetic Query Variants With Generative Large Language + Models SIGIR + + +
+ Considering query variance in information retrieval (IR) experiments is +beneficial for retrieval effectiveness. Especially ranking ensembles based on +different topically related queries retrieve better results than rankings based +on a single query alone. Recently, generative instruction-tuned Large Language +Models (LLMs) improved on a variety of different tasks in capturing human +language. To this end, this work explores the feasibility of using synthetic +query variants generated by instruction-tuned LLMs in data fusion experiments. +More specifically, we introduce a lightweight, unsupervised, and cost-efficient +approach that exploits principled prompting and data fusion techniques. In our +experiments, LLMs produce more effective queries when provided with additional +context information on the topic. Furthermore, our analysis based on four TREC +newswire benchmarks shows that data fusion based on synthetic query variants is +significantly better than baselines with single queries and also outperforms +pseudo-relevance feedback methods. We publicly share the code and query +datasets with the community as resources for follow-up studies. + +
+
+ comment: The definitive version of record was published in SIGIR-AP '24 +
+
+
+
+
+ + ☆ The Essence of the Essence from the Web:The Metasearch Engine + + +
+ The exponential growth of information sources on the web and, in turn, the +continuing technological progress in searching for information by using tools +like search engines make it difficult for the user to know which tool +is best for their query and which tool is not. This is where the metasearch engine +comes into play, reducing the user's burden by dispatching queries to multiple +search engines in parallel and refining the results of these search engines to +give the best of the best by doing a superior job on their side. These engines do +not own a database of Web pages; rather, they send search terms to the databases +maintained by the search engine companies, get back results from all the search +engines queried and then compile the results to be presented to the user. In +this paper, we describe the working of a typical metasearch engine and then +present a comparative study of traditional search engines and metasearch +engines on the basis of different parameters and show how metasearch engines +are better than the other search engines. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SEGMN: A Structure-Enhanced Graph Matching Network for Graph Similarity + Learning + + +
+ Graph similarity computation (GSC) aims to quantify the similarity score +between two graphs. Although recent GSC methods based on graph neural networks +(GNNs) take advantage of intra-graph structures in message passing, few of them +fully utilize the structures presented by edges to boost the representation of +their connected nodes. Moreover, previous cross-graph node embedding matching +lacks the perception of the overall structure of the graph pair, due to the +fact that the node representations from GNNs are confined to the intra-graph +structure, causing the unreasonable similarity score. Intuitively, the +cross-graph structure represented in the assignment graph is helpful to rectify +the inappropriate matching. Therefore, we propose a structure-enhanced graph +matching network (SEGMN). Equipped with a dual embedding learning module and a +structure perception matching module, SEGMN achieves structure enhancement in +both embedding learning and cross-graph matching. The dual embedding learning +module incorporates adjacent edge representation into each node to achieve a +structure-enhanced representation. The structure perception matching module +achieves cross-graph structure enhancement through assignment graph +convolution. The similarity score of each cross-graph node pair can be +rectified by aggregating messages from structurally relevant node pairs. +Experimental results on benchmark datasets demonstrate that SEGMN outperforms +the state-of-the-art GSC methods in the GED regression task, and the structure +perception matching module is plug-and-play, which can further improve the +performance of the baselines by up to 25%. + +
+
+
+
+
+ + ☆ Advanced RAG Models with Graph Structures: Optimizing Complex Knowledge + Reasoning and Text Generation + + +
+ This study aims to optimize the existing retrieval-augmented generation model +(RAG) by introducing a graph structure to improve the performance of the model +in dealing with complex knowledge reasoning tasks. The traditional RAG model +has the problem of insufficient processing efficiency when facing complex graph +structure information (such as knowledge graphs, hierarchical relationships, +etc.), which affects the quality and consistency of the generated results. This +study proposes a scheme to process graph structure data by combining graph +neural network (GNN), so that the model can capture the complex relationship +between entities, thereby improving the knowledge consistency and reasoning +ability of the generated text. The experiment used the Natural Questions (NQ) +dataset and compared it with multiple existing generation models. The results +show that the graph-based RAG model proposed in this paper is superior to the +traditional generation model in terms of quality, knowledge consistency, and +reasoning ability, especially when dealing with tasks that require +multi-dimensional reasoning. Through the combination of the enhancement of the +retrieval module and the graph neural network, the model in this study can +better handle complex knowledge background information and has broad potential +value in multiple practical application scenarios. + +
+
+
+
+
+ + ♻ ☆ ContextIQ: A Multimodal Expert-Based Video Retrieval System for + Contextual Advertising WACV 2025 + + +
+ Contextual advertising serves ads that are aligned to the content that the +user is viewing. The rapid growth of video content on social platforms and +streaming services, along with privacy concerns, has increased the need for +contextual advertising. Placing the right ad in the right context creates a +seamless and pleasant ad viewing experience, resulting in higher audience +engagement and, ultimately, better ad monetization. From a technology +standpoint, effective contextual advertising requires a video retrieval system +capable of understanding complex video content at a very granular level. +Current text-to-video retrieval models based on joint multimodal training +demand large datasets and computational resources, limiting their practicality +and lacking the key functionalities required for ad ecosystem integration. We +introduce ContextIQ, a multimodal expert-based video retrieval system designed +specifically for contextual advertising. ContextIQ utilizes modality-specific +experts-video, audio, transcript (captions), and metadata such as objects, +actions, emotion, etc.-to create semantically rich video representations. We +show that our system, without joint training, achieves better or comparable +results to state-of-the-art models and commercial solutions on multiple +text-to-video retrieval benchmarks. Our ablation studies highlight the benefits +of leveraging multiple modalities for enhanced video retrieval accuracy instead +of using a vision-language model alone. Furthermore, we show how video +retrieval systems such as ContextIQ can be used for contextual advertising in +an ad ecosystem while also addressing concerns related to brand safety and +filtering inappropriate content. + +
+
+ comment: Accepted at WACV 2025 +
+
+
+
+
+ + ♻ ☆ PersianRAG: A Retrieval-Augmented Generation System for Persian Language + + +
+ Retrieval augmented generation (RAG) models, which integrate large-scale +pre-trained generative models with external retrieval mechanisms, have shown +significant success in various natural language processing (NLP) tasks. +However, applying RAG models in Persian language as a low-resource language, +poses distinct challenges. These challenges primarily involve the +preprocessing, embedding, retrieval, prompt construction, language modeling, +and response evaluation of the system. In this paper, we address the challenges +towards implementing a real-world RAG system for Persian language called +PersianRAG. We propose novel solutions to overcome these obstacles and evaluate +our approach using several Persian benchmark datasets. Our experimental results +demonstrate the capability of the PersianRAG framework to enhance question +answering task in Persian. + +
+
+
+
+
+ + ♻ ☆ Self-Compositional Data Augmentation for Scientific Keyphrase Generation + + +
+ State-of-the-art models for keyphrase generation require large amounts of +training data to achieve good performance. However, obtaining keyphrase-labeled +documents can be challenging and costly. To address this issue, we present a +self-compositional data augmentation method. More specifically, we measure the +relatedness of training documents based on their shared keyphrases, and combine +similar documents to generate synthetic samples. The advantage of our method +lies in its ability to create additional training samples that keep domain +coherence, without relying on external data or resources. Our results on +multiple datasets spanning three different domains, demonstrate that our method +consistently improves keyphrase generation. A qualitative analysis of the +generated keyphrases for the Computer Science domain confirms this improvement +towards their representativity property. + +
+
+ comment: Accepted to JCDL 2024. This is the author's version of the work. It + is posted here for your personal use. Not for redistribution. The definitive + version was published in the proceedings of the 2024 ACM/IEEE Joint + Conference on Digital Libraries (JCDL 24) + https://doi.org/10.1145/3677389.3702504 +
+
+
+
+
+ + ♻ ☆ CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray + Report Labeling + + +
+ Free-text radiology reports present a rich data source for various medical +tasks, but effectively labeling these texts remains challenging. Traditional +rule-based labeling methods fall short of capturing the nuances of diverse +free-text patterns. Moreover, models using expert-annotated data are limited by +data scarcity and pre-defined classes, impacting their performance, flexibility +and scalability. To address these issues, our study offers three main +contributions: 1) We demonstrate the potential of GPT as an adept labeler using +carefully designed prompts. 2) Utilizing only the data labeled by GPT, we +trained a BERT-based labeler, CheX-GPT, which operates faster and more +efficiently than its GPT counterpart. 3) To benchmark labeler performance, we +introduced a publicly available expert-annotated test set, MIMIC-500, +comprising 500 cases from the MIMIC validation set. Our findings demonstrate +that CheX-GPT not only excels in labeling accuracy over existing models, but +also showcases superior efficiency, flexibility, and scalability, supported by +our introduction of the MIMIC-500 dataset for robust benchmarking. Code and +models are available at https://github.com/Soombit-ai/CheXGPT. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Real-Time Adaptive Multi-Stream GPU System for Online Approximate + Nearest Neighborhood Search CIKM'24 + + +
+ In recent years, Approximate Nearest Neighbor Search (ANNS) has played a +pivotal role in modern search and recommendation systems, especially in +emerging LLM applications like Retrieval-Augmented Generation. There is a +growing exploration into harnessing the parallel computing capabilities of GPUs +to meet the substantial demands of ANNS. However, existing systems primarily +focus on offline scenarios, overlooking the distinct requirements of online +applications that necessitate real-time insertion of new vectors. This +limitation renders such systems inefficient for real-world scenarios. Moreover, +previous architectures struggled to effectively support real-time insertion due +to their reliance on serial execution streams. In this paper, we introduce a +novel Real-Time Adaptive Multi-Stream GPU ANNS System (RTAMS-GANNS). Our +architecture achieves its objectives through three key advancements: 1) We +initially examined the real-time insertion mechanisms in existing GPU ANNS +systems and discovered their reliance on repetitive copying and memory +allocation, which significantly hinders real-time effectiveness on GPUs. As a +solution, we introduce a dynamic vector insertion algorithm based on memory +blocks, which includes in-place rearrangement. 2) To enable real-time vector +insertion in parallel, we introduce a multi-stream parallel execution mode, +which differs from existing systems that operate serially within a single +stream. Our system utilizes a dynamic resource pool, allowing multiple streams +to execute concurrently without additional execution blocking. 3) Through +extensive experiments and comparisons, our approach effectively handles varying +QPS levels across different datasets, reducing latency by up to 40%-80%. The +proposed system has also been deployed in real-world industrial search and +recommendation systems, serving hundreds of millions of users daily, and has +achieved good results. + +
+
+ comment: Accepted by CIKM'24, V2 fixes some typos +
+
+
+
+
+ + ♻ ☆ ELASTIC: Efficient Linear Attention for Sequential Interest Compression + + +
+ State-of-the-art sequential recommendation models heavily rely on +transformer's attention mechanism. However, the quadratic computational and +memory complexities of self attention have limited its scalability for modeling +users' long range behaviour sequences. To address this problem, we propose +ELASTIC, an Efficient Linear Attention for SequenTial Interest Compression, +requiring only linear time complexity and decoupling model capacity from +computational cost. Specifically, ELASTIC introduces a fixed length interest +experts with linear dispatcher attention mechanism which compresses the +long-term behaviour sequences to a significantly more compact representation +which reduces up to 90% GPU memory usage with x2.7 inference speed up. The +proposed linear dispatcher attention mechanism significantly reduces the +quadratic complexity and makes the model feasible for adequately modeling +extremely long sequences. Moreover, in order to retain the capacity for +modeling various user interests, ELASTIC initializes a vast learnable interest +memory bank and sparsely retrieves compressed user's interests from the memory +with a negligible computational overhead. The proposed interest memory +retrieval technique significantly expands the cardinality of available interest +space while keeping the same computational cost, thereby striking a trade-off +between recommendation accuracy and efficiency. To validate the effectiveness +of our proposed ELASTIC, we conduct extensive experiments on various public +datasets and compare it with several strong sequential recommenders. +Experimental results demonstrate that ELASTIC consistently outperforms +baselines by a significant margin and also highlight the computational +efficiency of ELASTIC when modeling long sequences. We will make our +implementation code publicly available. + +
+
+ comment: We hereby withdraw this paper from arXiv due to incomplete + experiments. Upon further review, we have determined that additional + experimental work is necessary to fully validate our findings and conclusions +
+
+
+
+
+
+
+
+ + Multimedia 11 + +
+
+
+ + ☆ Harmful YouTube Video Detection: A Taxonomy of Online Harm and MLLMs as + Alternative Annotators + + +
+ Short video platforms, such as YouTube, Instagram, or TikTok, are used by +billions of users globally. These platforms expose users to harmful content, +ranging from clickbait or physical harms to misinformation or online hate. Yet, +detecting harmful videos remains challenging due to an inconsistent +understanding of what constitutes harm and limited resources and mental tolls +involved in human annotation. As such, this study advances measures and methods +to detect harm in video content. First, we develop a comprehensive taxonomy for +online harm on video platforms, categorizing it into six categories: +Information, Hate and harassment, Addictive, Clickbait, Sexual, and Physical +harms. Next, we establish multimodal large language models as reliable +annotators of harmful videos. We analyze 19,422 YouTube videos using 14 image +frames, 1 thumbnail, and text metadata, comparing the accuracy of crowdworkers +(Mturk) and GPT-4-Turbo with domain expert annotations serving as the gold +standard. Our results demonstrate that GPT-4-Turbo outperforms crowdworkers in +both binary classification (harmful vs. harmless) and multi-label harm +categorization tasks. Methodologically, this study extends the application of +LLMs to multi-label and multi-modal contexts beyond text annotation and binary +classification. Practically, our study contributes to online harm mitigation by +guiding the definitions and identification of harmful content on video +platforms. + +
+
+
+
+
+ + ☆ Long-Form Text-to-Music Generation with Adaptive Prompts: A Case of + Study in Tabletop Role-Playing Games Soundtracks + + +
+ This paper investigates the capabilities of text-to-audio music generation +models in producing long-form music with prompts that change over time, +focusing on soundtrack generation for Tabletop Role-Playing Games (TRPGs). We +introduce Babel Bardo, a system that uses Large Language Models (LLMs) to +transform speech transcriptions into music descriptions for controlling a +text-to-music model. Four versions of Babel Bardo were compared in two TRPG +campaigns: a baseline using direct speech transcriptions, and three LLM-based +versions with varying approaches to music description generation. Evaluations +considered audio quality, story alignment, and transition smoothness. Results +indicate that detailed music descriptions improve audio quality while +maintaining consistency across consecutive descriptions enhances story +alignment and transition smoothness. + +
+
+ comment: Paper accepted at the LAMIR 2024 workshop +
+
+
+
+
+ + ☆ Inter-Frame Coding for Dynamic Meshes via Coarse-to-Fine Anchor Mesh + Generation + + +
+ In the current Video-based Dynamic Mesh Coding (V-DMC) standard, inter-frame +coding is restricted to mesh frames with constant topology. Consequently, +temporal redundancy is not fully leveraged, resulting in suboptimal compression +efficacy. To address this limitation, this paper introduces a novel +coarse-to-fine scheme to generate anchor meshes for frames with time-varying +topology. Initially, we generate a coarse anchor mesh using an octree-based +nearest neighbor search. Motion estimation compensates for regions with +significant motion changes during this process. However, the quality of the +coarse mesh is low due to its suboptimal vertices. To enhance details, the fine +anchor mesh is further optimized using the Quadric Error Metrics (QEM) +algorithm to calculate more precise anchor points. The inter-frame anchor mesh +generated herein retains the connectivity of the reference base mesh, while +concurrently preserving superior quality. Experimental results show that our +method achieves 7.2% ~ 10.3% BD-rate gain compared to the existing V-DMC test +model version 7. + +
+
+
+
+
+ + ☆ Both Text and Images Leaked! A Systematic Analysis of Multimodal LLM + Data Contamination + + +
+ The rapid progression of multimodal large language models (MLLMs) has +demonstrated superior performance on various multimodal benchmarks. However, +the issue of data contamination during training creates challenges in +performance evaluation and comparison. While numerous methods exist for +detecting dataset contamination in large language models (LLMs), they are less +effective for MLLMs due to their various modalities and multiple training +phases. In this study, we introduce a multimodal data contamination detection +framework, MM-Detect, designed for MLLMs. Our experimental results indicate +that MM-Detect is sensitive to varying degrees of contamination and can +highlight significant performance improvements due to leakage of the training +set of multimodal benchmarks. Furthermore, we also explore the possibility of +contamination originating from the pre-training phase of LLMs used by MLLMs and +the fine-tuning phase of MLLMs, offering new insights into the stages at which +contamination may be introduced. + +
+
+
+
+
+ + ☆ Diversify, Contextualize, and Adapt: Efficient Entropy Modeling for + Neural Image Codec NeurIPS 2024 + + +
+ Designing a fast and effective entropy model is challenging but essential for +practical application of neural codecs. Beyond spatial autoregressive entropy +models, more efficient backward adaptation-based entropy models have been +recently developed. They not only reduce decoding time by using smaller number +of modeling steps but also maintain or even improve rate--distortion +performance by leveraging more diverse contexts for backward adaptation. +Despite their significant progress, we argue that their performance has been +limited by the simple adoption of the design convention for forward adaptation: +using only a single type of hyper latent representation, which does not provide +sufficient contextual information, especially in the first modeling step. In +this paper, we propose a simple yet effective entropy modeling framework that +leverages sufficient contexts for forward adaptation without compromising on +bit-rate. Specifically, we introduce a strategy of diversifying hyper latent +representations for forward adaptation, i.e., using two additional types of +contexts along with the existing single type of context. In addition, we +present a method to effectively use the diverse contexts for contextualizing +the current elements to be encoded/decoded. By addressing the limitation of the +previous approach, our proposed framework leads to significant performance +improvements. Experimental results on popular datasets show that our proposed +framework consistently improves rate--distortion performance across various +bit-rate regions, e.g., 3.73% BD-rate gain over the state-of-the-art baseline +on the Kodak dataset. + +
+
+ comment: NeurIPS 2024 +
+
+
+
+
+ + ☆ Investigating Conceptual Blending of a Diffusion Model for Improving + Nonword-to-Image Generation ACM MM 2024 + + +
+ Text-to-image diffusion models sometimes depict blended concepts in the +generated images. One promising use case of this effect would be the +nonword-to-image generation task which attempts to generate images intuitively +imaginable from a non-existing word (nonword). To realize nonword-to-image +generation, an existing study focused on associating nonwords with +similar-sounding words. Since each nonword can have multiple similar-sounding +words, generating images containing their blended concepts would increase +intuitiveness, facilitating creative activities and promoting computational +psycholinguistics. Nevertheless, no existing study has quantitatively evaluated +this effect in either diffusion models or the nonword-to-image generation +paradigm. Therefore, this paper first analyzes the conceptual blending in a +pretrained diffusion model, Stable Diffusion. The analysis reveals that a high +percentage of generated images depict blended concepts when inputting an +embedding interpolating between the text embeddings of two text prompts +referring to different concepts. Next, this paper explores the best text +embedding space conversion method of an existing nonword-to-image generation +framework to ensure both the occurrence of conceptual blending and image +generation quality. We compare the conventional direct prediction approach with +the proposed method that combines $k$-nearest neighbor search and linear +regression. Evaluation reveals that the enhanced accuracy of the embedding +space conversion by the proposed method improves the image generation quality, +while the emergence of conceptual blending could be attributed mainly to the +specific dimensions of the high-dimensional text embedding space. + +
+
+ comment: Paper accepted at ACM MM 2024 (doi: 10.1145/3664647.3681202) with + supplementary materials concatenated +
+
+
+
+
+ + ♻ ☆ Efficiently Collecting Training Dataset for 2D Object Detection by + Online Visual Feedback + + +
+ Training deep-learning-based vision systems requires the manual annotation of +a significant number of images. Such manual annotation is highly time-consuming +and labor-intensive. Although previous studies have attempted to eliminate the +effort required for annotation, the effort required for image collection was +retained. To address this, we propose a human-in-the-loop dataset collection +method that uses a web application. To counterbalance the workload and +performance by encouraging the collection of multi-view object image datasets +in an enjoyable manner, thereby amplifying motivation, we propose three types +of online visual feedback features to track the progress of the collection +status. Our experiments thoroughly investigated the impact of each feature on +collection performance and quality of operation. The results suggested the +feasibility of annotation and object detection. + +
+
+ comment: 13 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Transferable Learned Image Compression-Resistant Adversarial + Perturbations BMVC 2024 + + +
+ Adversarial attacks can readily disrupt the image classification system, +revealing the vulnerability of DNN-based recognition tasks. While existing +adversarial perturbations are primarily applied to uncompressed images or +compressed images by the traditional image compression method, i.e., JPEG, +limited studies have investigated the robustness of models for image +classification in the context of DNN-based image compression. With the rapid +evolution of advanced image compression, DNN-based learned image compression +has emerged as the promising approach for transmitting images in many +security-critical applications, such as cloud-based face recognition and +autonomous driving, due to its superior performance over traditional +compression. Therefore, there is a pressing need to fully investigate the +robustness of a classification system post-processed by learned image +compression. To bridge this research gap, we explore the adversarial attack on +a new pipeline that targets image classification models that utilize learned +image compressors as pre-processing modules. Furthermore, to enhance the +transferability of perturbations across various quality levels and +architectures of learned image compression models, we introduce a saliency +score-based sampling method to enable the fast generation of transferable +perturbation. Extensive experiments with popular attack methods demonstrate the +enhanced transferability of our proposed method when attacking images that have +been post-processed with different learned image compression models. + +
+
+ comment: Accepted by BMVC 2024 +
+
+
+
+
+ + ♻ ☆ Routing Experts: Learning to Route Dynamic Experts in Multi-modal Large + Language Models + + +
+ Recently, mixture of experts (MoE) has become a popular paradigm for +achieving the trade-off between modal capacity and efficiency of multi-modal +large language models (MLLMs). Different from previous efforts, we are +dedicated to exploring the dynamic expert path in an already existing MLLM and +show that a standard MLLM can be also a mixture of experts. To approach this +target, we propose a novel dynamic expert scheme for MLLMs, termed Routing +Experts (RoE), which can achieve example-dependent optimal path routing without +obvious structure tweaks. Meanwhile, a new regularization of structure sparsity +is also introduced to enforce MLLMs to learn more short-cut inference, ensuring +the efficiency. In addition, we also realize the first attempt of aligning the +training and inference schemes of MLLMs in terms of network routing. To +validate RoE, we apply it to a set of latest MLLMs, including LLaVA-1.5, +LLaVA-HR and VILA, and conduct extensive experiments on a bunch of VL +benchmarks. The experiment results not only show the great advantages of our +RoE in improving MLLMs' efficiency, but also yield obvious advantages over +MoE-LLaVA in both performance and speed, e.g., an average performance gain of +3.3% on 5 benchmarks while being faster. + +
+
+
+
+
+ + ♻ ☆ Beyond Single-Audio: Advancing Multi-Audio Processing in Audio Large + Language Models EMNLP24 + + +
+ Various audio-LLMs (ALLMs) have been explored recently for tackling different +audio tasks simultaneously using a single, unified model. While existing +evaluations of ALLMs primarily focus on single-audio tasks, real-world +applications often involve processing multiple audio streams simultaneously. To +bridge this gap, we propose the first multi-audio evaluation (MAE) benchmark +that consists of 20 datasets from 11 multi-audio tasks encompassing both speech +and sound scenarios. Comprehensive experiments on MAE demonstrate that the +existing ALLMs, while being powerful in comprehending primary audio elements in +individual audio inputs, struggle to handle multi-audio scenarios. To this +end, we propose a novel multi-audio-LLM (MALLM) to capture audio context among +multiple similar audios using discriminative learning on our proposed synthetic +data. The results demonstrate that the proposed MALLM outperforms all baselines +and achieves high data efficiency using synthetic data without requiring human +annotations. The proposed MALLM opens the door for ALLMs towards multi-audio +processing era and brings us closer to replicating human auditory capabilities +in machines. + +
+
+ comment: EMNLP24 Findings. Data available at + https://github.com/MatthewCYM/MALLM +
+
+
+
+
+ + ♻ ☆ Document Parsing Unveiled: Techniques, Challenges, and Prospects for + Structured Information Extraction + + +
+ Document parsing is essential for converting unstructured and semi-structured +documents-such as contracts, academic papers, and invoices-into structured, +machine-readable data. Document parsing extracts reliable structured data from +unstructured inputs, providing huge convenience for numerous applications. +Especially with recent achievements in Large Language Models, document parsing +plays an indispensable role in both knowledge base construction and training +data generation. This survey presents a comprehensive review of the current +state of document parsing, covering key methodologies, from modular pipeline +systems to end-to-end models driven by large vision-language models. Core +components such as layout detection, content extraction (including text, +tables, and mathematical expressions), and multi-modal data integration are +examined in detail. Additionally, this paper discusses the challenges faced by +modular document parsing systems and vision-language models in handling complex +layouts, integrating multiple modules, and recognizing high-density text. It +emphasizes the importance of developing larger and more diverse datasets and +outlines future research directions. + +
+
+
+
+
+
+
+ + + + + + diff --git a/index.js b/index.js new file mode 100644 index 00000000..69f5da7b --- /dev/null +++ b/index.js @@ -0,0 +1,39 @@ +/* Expand/Collapse with TAB key */ +var expanded = false; +document.onkeydown = function (e) { + if (e.keyCode === 9) { + expanded = !expanded; + document.querySelectorAll("details").forEach(detail => detail.open = expanded); + return false; + } +}; + +/* Switch Theme */ +const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]'); + +function switchTheme(e) { + if (e.target.checked) { + document.documentElement.setAttribute('data-theme', 'light'); + document.getElementById("theme-icon").className = "ri-sun-line"; + localStorage.setItem('theme', 'light'); // remember the chosen theme; read back below on load + } else { + document.documentElement.setAttribute('data-theme', 'dark'); + document.getElementById("theme-icon").className = "ri-moon-line"; + localStorage.setItem('theme', 'dark'); // remember the chosen theme; read back below on load + } +} + +toggleSwitch.addEventListener('change', switchTheme, false); +const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null; +if (currentTheme) { + document.documentElement.setAttribute('data-theme', currentTheme); + if (currentTheme === 'light') { + toggleSwitch.checked = true; + } +} + +const timestamp = document.getElementById("build-timestamp"); +const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString(); + +const badge = document.getElementById("build-timestamp-badge"); +// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`