From 7d11284482b9bfb1a29eb5a6a13e8729da4ec724 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Tue, 23 Jan 2024 05:23:17 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 78999 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 79394 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..3a11b441 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2024-01-16T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.08374v1","updated":"2024-01-16T14:00:28Z","published":"2024-01-16T14:00:28Z","title":"Cross-lingual neural fuzzy matching for exploiting target-language\n monolingual corpora in computer-aided translation","summary":" Computer-aided translation (CAT) tools based on translation memories (MT)\nplay a prominent role in the translation workflow of professional translators.\nHowever, the reduced availability of in-domain TMs, as compared to in-domain\nmonolingual corpora, limits its adoption for a number of translation tasks. In\nthis paper, we introduce a novel neural approach aimed at overcoming this\nlimitation by exploiting not only TMs, but also in-domain target-language (TL)\nmonolingual corpora, and still enabling a similar functionality to that offered\nby conventional TM-based CAT tools. Our approach relies on cross-lingual\nsentence embeddings to retrieve translation proposals from TL monolingual\ncorpora, and on a neural model to estimate their post-editing effort. The paper\npresents an automatic evaluation of these techniques on four language pairs\nthat shows that our approach can successfully exploit monolingual texts in a\nTM-based CAT environment, increasing the amount of useful translation\nproposals, and that our neural model for estimating the post-editing effort\nenables the combination of translation proposals obtained from monolingual\ncorpora and from TMs in the usual way. A human evaluation performed on a single\nlanguage pair confirms the results of the automatic evaluation and seems to\nindicate that the translation proposals retrieved with our approach are more\nuseful than what the automatic evaluation shows.\n","authors":["Miquel Esplà-Gomis","Víctor M. Sánchez-Cartagena","Juan Antonio Pérez-Ortiz","Felipe Sánchez-Martínez"],"pdf_url":"https://arxiv.org/pdf/2401.08374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08367v1","updated":"2024-01-16T13:52:25Z","published":"2024-01-16T13:52:25Z","title":"Morphology and Syntax of the Tamil Language","summary":" This paper provides an overview of the morphology and syntax of the Tamil\nlanguage, focusing on its contemporary usage. The paper also highlights the\ncomplexity and richness of Tamil in terms of its morphological and syntactic\nfeatures, which will be useful for linguists analysing the language and\nconducting comparative studies. In addition, the paper will be useful for those\ndeveloping computational resources for the Tamil language. It is proven as a\nrule-based morphological analyser cum generator and a computational grammar for\nTamil have already been developed based on this paper. 
To enhance accessibility\nfor a broader audience, the analysis is conducted without relying on any\nspecific grammatical formalism.\n","authors":["Kengatharaiyer Sarveswaran"],"pdf_url":"https://arxiv.org/pdf/2401.08367v1.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2401.08358v1","updated":"2024-01-16T13:36:07Z","published":"2024-01-16T13:36:07Z","title":"Hallucination Detection and Hallucination Mitigation: An Investigation","summary":" Large language models (LLMs), including ChatGPT, Bard, and Llama, have\nachieved remarkable successes over the last two years in a range of different\napplications. In spite of these successes, there exist concerns that limit the\nwide application of LLMs. A key problem is the problem of hallucination.\nHallucination refers to the fact that in addition to correct responses, LLMs\ncan also generate seemingly correct but factually incorrect responses. This\nreport aims to present a comprehensive review of the current literature on both\nhallucination detection and hallucination mitigation. We hope that this report\ncan serve as a good reference for both engineers and researchers who are\ninterested in LLMs and applying them to real world tasks.\n","authors":["Junliang Luo","Tianyu Li","Di Wu","Michael Jenkin","Steve Liu","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2401.08358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06355v2","updated":"2024-01-16T13:35:20Z","published":"2023-12-11T13:03:39Z","title":"Linguistic and Structural Basis of Engineering Design Knowledge","summary":" Artefact descriptions are the primary carriers of engineering design\nknowledge that is both an outcome and a driver of the design process. While an\nartefact could be described in different connotations, the design process\nrequires a description to embody engineering design knowledge, which is\nexpressed in the text through intricate placement of entities and\nrelationships. As large-language models learn from all kinds of text merely as\na sequence of characters/tokens, these are yet to generate text that embodies\nexplicit engineering design facts. Existing ontological design theories are\nless likely to guide the large-language models whose applications are currently\nlimited to ideation and learning purposes. In this article, we explicate\nengineering design knowledge as knowledge graphs from a large sample of 33,881\npatent documents. We examine the constituents of these knowledge graphs to\nunderstand the linguistic and structural basis of engineering design knowledge.\nIn terms of linguistic basis, we observe that entities and relationships could\nbe generalised to 64 and 24 linguistic syntaxes. While relationships mainly\ncapture attributes ('of'), structure ('in', 'with'), purpose ('to', 'for'),\nhierarchy ('include'), exemplification ('such as'), and behaviour ('to',\n'from'), the hierarchical relationships could specifically be identified using\n75 unique syntaxes. To understand the structural basis, we draw inspiration\nfrom various studies on biological/ecological networks and discover motifs from\npatent knowledge graphs. We identify four 3-node and four 4-node patterns that\ncould further be converged and simplified into sequence [->...->], aggregation\n[->...<-], and hierarchy [<-...->]. Expected to guide large-language model\nbased design tools, we propose few regulatory precepts for concretising\nabstract entities and relationships within subgraphs, while explicating\nhierarchical structures.\n","authors":["L. 
Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2312.06355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08350v1","updated":"2024-01-16T13:30:09Z","published":"2024-01-16T13:30:09Z","title":"Salute the Classic: Revisiting Challenges of Machine Translation in the\n Age of Large Language Models","summary":" The evolution of Neural Machine Translation (NMT) has been significantly\ninfluenced by six core challenges (Koehn and Knowles, 2017), which have acted\nas benchmarks for progress in this field. This study revisits these challenges,\noffering insights into their ongoing relevance in the context of advanced Large\nLanguage Models (LLMs): domain mismatch, amount of parallel data, rare word\nprediction, translation of long sentences, attention model as word alignment,\nand sub-optimal beam search. Our empirical findings indicate that LLMs\neffectively lessen the reliance on parallel data for major languages in the\npretraining phase. Additionally, the LLM-based translation system significantly\nenhances the translation of long sentences that contain approximately 80 words\nand shows the capability to translate documents of up to 512 words. However,\ndespite these significant improvements, the challenges of domain mismatch and\nprediction of rare words persist. While the challenges of word alignment and\nbeam search, specifically associated with NMT, may not apply to LLMs, we\nidentify three new challenges for LLMs in translation tasks: inference\nefficiency, translation of low-resource languages in the pretraining phase, and\nhuman-aligned evaluation. The datasets and models are released at\nhttps://github.com/pangjh3/LLM4MT.\n","authors":["Jianhui Pang","Fanghua Ye","Longyue Wang","Dian Yu","Derek F. Wong","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2401.08350v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2401.08326v1","updated":"2024-01-16T12:45:15Z","published":"2024-01-16T12:45:15Z","title":"RoTBench: A Multi-Level Benchmark for Evaluating the Robustness of Large\n Language Models in Tool Learning","summary":" Tool learning has generated widespread interest as a vital means of\ninteraction between Large Language Models (LLMs) and the physical world.\nCurrent research predominantly emphasizes LLMs' capacity to utilize tools in\nwell-structured environments while overlooking their stability when confronted\nwith the inevitable noise of the real world. To bridge this gap, we introduce\nRoTBench, a multi-level benchmark for evaluating the robustness of LLMs in tool\nlearning. Specifically, we establish five external environments, each featuring\nvarying levels of noise (i.e., Clean, Slight, Medium, Heavy, and Union),\nproviding an in-depth analysis of the model's resilience across three critical\nphases: tool selection, parameter identification, and content filling.\nExperiments involving six widely-used models underscore the urgent necessity\nfor enhancing the robustness of LLMs in tool learning. For instance, the\nperformance of GPT-4 even drops significantly from 80.00 to 58.10 when there is\nno substantial change in manual accuracy. More surprisingly, the noise\ncorrection capability inherent in the GPT family paradoxically impedes its\nadaptability in the face of mild noise. In light of these findings, we propose\nRoTTuning, a strategy that enriches the diversity of training environments to\nbolster the robustness of LLMs in tool learning. 
The code and data are\navailable at https://github.com/Junjie-Ye/RoTBench.\n","authors":["Junjie Ye","Yilong Wu","Songyang Gao","Sixian Li","Guanyu Li","Xiaoran Fan","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08315v1","updated":"2024-01-16T12:30:56Z","published":"2024-01-16T12:30:56Z","title":"Application of LLM Agents in Recruitment: A Novel Framework for Resume\n Screening","summary":" The automation of resume screening is a crucial aspect of the recruitment\nprocess in organizations. Automated resume screening systems often encompass a\nrange of natural language processing (NLP) tasks. The advent of Large Language\nModels (LLMs) has notably enhanced the efficacy of these systems, showcasing\ntheir robust generalization abilities across diverse language-related tasks.\nAccompanying these developments are various agents based on LLMs, which\nfacilitate their application in practical scenarios. This paper introduces a\nnovel LLM-based agent framework for resume screening, aimed at enhancing\nefficiency and time management in recruitment processes. Our framework is\ndistinct in its ability to efficiently summarize and grade each resume from a\nlarge dataset. Moreover, it utilizes LLM agents for decision-making,\ndetermining which candidates receive job offers, or which ones to bring in for\ninterviews. To evaluate our framework, we constructed a dataset from actual\nresumes and conducted simulate a resume screening process. Subsequently, the\noutcomes of the simulation experiment were compared and subjected to detailed\nanalysis. The results demonstrate that our automated resume screening framework\nis 11 times faster than traditional manual methods. Furthermore, by fine-tuning\nthe LLMs, we observed a significant improvement in the F1 score, reaching\n87.73\\%, during the resume sentence classification phase. In the resume\nsummarization and grading phase, our fine-tuned model surpassed the baseline\nperformance of the GPT-3.5 model. Analysis of the decision-making efficacy of\nthe LLM agents in the final offer stage further underscores the potential of\nLLM agents in transforming resume screening processes.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2401.08315v1.pdf","comment":"Under review, 14 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.08309v1","updated":"2024-01-16T12:10:49Z","published":"2024-01-16T12:10:49Z","title":"Anchor function: a type of benchmark functions for studying language\n models","summary":" Understanding transformer-based language models is becoming increasingly\ncrucial, particularly as they play pivotal roles in advancing towards\nartificial general intelligence. However, language model research faces\nsignificant challenges, especially for academic research groups with\nconstrained resources. These challenges include complex data structures,\nunknown target functions, high computational costs and memory requirements, and\na lack of interpretability in the inference process, etc. Drawing a parallel to\nthe use of simple models in scientific research, we propose the concept of an\nanchor function. This is a type of benchmark function designed for studying\nlanguage models in learning tasks that follow an \"anchor-key\" pattern. By\nutilizing the concept of an anchor function, we can construct a series of\nfunctions to simulate various language tasks. 
The anchor function plays a role\nanalogous to that of mice in diabetes research, particularly suitable for\nacademic research. We demonstrate the utility of the anchor function with an\nexample, revealing two basic operations by attention structures in language\nmodels: shifting tokens and broadcasting one token from one position to many\npositions. These operations are also commonly observed in large language\nmodels. The anchor function framework, therefore, opens up a series of valuable\nand accessible research questions for further exploration, especially for\ntheoretical study.\n","authors":["Zhongwang Zhang","Zhiwei Wang","Junjie Yao","Zhangchen Zhou","Xiaolong Li","Weinan E","Zhi-Qin John Xu"],"pdf_url":"https://arxiv.org/pdf/2401.08309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04350v2","updated":"2024-01-16T12:07:27Z","published":"2023-12-07T15:12:12Z","title":"CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language\n Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsights into the causal reasoning abilities of LLMs. Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v2.pdf","comment":"NeurIPS 2023; updated with CLadder dataset v1.5"},{"id":"http://arxiv.org/abs/2401.08295v1","updated":"2024-01-16T11:45:03Z","published":"2024-01-16T11:45:03Z","title":"DAPT: A Dual Attention Framework for Parameter-Efficient Continual\n Learning of Large Language Models","summary":" The continual learning (CL) ability is vital for deploying large language\nmodels (LLMs) in the dynamic world. Based on parameter-efficient tuning (PET),\nexisting methods devise the learning module and the selection module to handle\nthe challenges of catastrophic forgetting (CF) and knowledge transfer (KT) in\nCL. The learning module allocates separate PET blocks for each continually\nemerged task and the selection module function to choose the correct one for\nthe input at testing time. However, there are limitations in their deigns of\nboth modules and they ignore the potential of aligning the two module to\naddress CF and KT simultaneously. 
To this end, we propose a novel Dual\nAttention Framework , to align the PET learning and selection via the Dual\nAttentive Learning\\&Selection module. Extensive Experiments on two CL\nbenchmarks demonstrate the superiority of DAPT to resist CF and facilitate KT\nat the same time. Moreover, DAPT exhibits the superiority when we scale it to\ndifferent model sizes (from 770M to 11B) and unseen tasks.\n","authors":["Weixiang Zhao","Shilong Wang","Yulin Hu","Yanyan Zhao","Bing Qin","Xuanyu Zhang","Qing Yang","Dongliang Xu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2401.08295v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2401.08294v1","updated":"2024-01-16T11:39:09Z","published":"2024-01-16T11:39:09Z","title":"Inferflow: an Efficient and Highly Configurable Inference Engine for\n Large Language Models","summary":" We present Inferflow, an efficient and highly configurable inference engine\nfor large language models (LLMs). With Inferflow, users can serve most of the\ncommon transformer models by simply modifying some lines in corresponding\nconfiguration files, without writing a single line of source code. Compared\nwith most existing inference engines, Inferflow has some key features. First,\nby implementing a modular framework of atomic build-blocks and technologies,\nInferflow is compositionally generalizable to new models. Second, 3.5-bit\nquantization is introduced in Inferflow as a tradeoff between 3-bit and 4-bit\nquantization. Third, hybrid model partitioning for multi-GPU inference is\nintroduced in Inferflow to better balance inference speed and throughput than\nthe existing partition-by-layer and partition-by-tensor strategies.\n","authors":["Shuming Shi","Enbo Zhao","Deng Cai","Leyang Cui","Xinting Huang","Huayang Li"],"pdf_url":"https://arxiv.org/pdf/2401.08294v1.pdf","comment":"Technical report of Inferflow"},{"id":"http://arxiv.org/abs/2401.08276v1","updated":"2024-01-16T10:58:07Z","published":"2024-01-16T10:58:07Z","title":"AesBench: An Expert Benchmark for Multimodal Large Language Models on\n Image Aesthetics Perception","summary":" With collective endeavors, multimodal large language models (MLLMs) are\nundergoing a flourishing development. However, their performances on image\naesthetics perception remain indeterminate, which is highly desired in\nreal-world applications. An obvious obstacle lies in the absence of a specific\nbenchmark to evaluate the effectiveness of MLLMs on aesthetic perception. This\nblind groping may impede the further development of more advanced MLLMs with\naesthetic perception capacity. To address this dilemma, we propose AesBench, an\nexpert benchmark aiming to comprehensively evaluate the aesthetic perception\ncapacities of MLLMs through elaborate design across dual facets. (1) We\nconstruct an Expert-labeled Aesthetics Perception Database (EAPD), which\nfeatures diversified image contents and high-quality annotations provided by\nprofessional aesthetic experts. (2) We propose a set of integrative criteria to\nmeasure the aesthetic perception abilities of MLLMs from four perspectives,\nincluding Perception (AesP), Empathy (AesE), Assessment (AesA) and\nInterpretation (AesI). Extensive experimental results underscore that the\ncurrent MLLMs only possess rudimentary aesthetic perception ability, and there\nis still a significant gap between MLLMs and humans. We hope this work can\ninspire the community to engage in deeper explorations on the aesthetic\npotentials of MLLMs. 
Source data will be available at\nhttps://github.com/yipoh/AesBench.\n","authors":["Yipo Huang","Quan Yuan","Xiangfei Sheng","Zhichao Yang","Haoning Wu","Pengfei Chen","Yuzhe Yang","Leida Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2401.08276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08273v1","updated":"2024-01-16T10:53:11Z","published":"2024-01-16T10:53:11Z","title":"Large Language Models are Null-Shot Learners","summary":" This paper presents null-shot prompting. Null-shot prompting exploits\nhallucination in large language models (LLMs) by instructing LLMs to utilize\ninformation from the \"Examples\" section that never exists within the provided\ncontext to perform a task. While reducing hallucination is crucial and\nnon-negligible for daily and critical uses of LLMs, we propose that in the\ncurrent landscape in which these LLMs still hallucinate, it is possible, in\nfact, to exploit hallucination to increase performance in performing tasks\ncompared to standard zero-shot prompting. Experiments with six LLMs show\nimprovements in performance across the majority of eight datasets, including\nreading comprehension, arithmetic reasoning, and closed-book question\nanswering. The observed inconsistency in increased relative performance across\nLLMs also potentially indicates a different degree of inherent hallucination in\neach model. These differences show that it is possible to utilize null-shot\nprompting as a way to detect degrees of hallucination in LLMs using existing\nbenchmarking datasets. We also perform ablation studies, including\nexperimenting with a modified version of null-shot prompting that incorporates\nideas from zero-shot chain-of-thought prompting, which shows different trends\nof results.\n","authors":["Pittawat Taveekitworachai","Febri Abdullah","Ruck Thawonmas"],"pdf_url":"https://arxiv.org/pdf/2401.08273v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2401.08255v1","updated":"2024-01-16T10:14:27Z","published":"2024-01-16T10:14:27Z","title":"A Generative Adversarial Attack for Multilingual Text Classifiers","summary":" Current adversarial attack algorithms, where an adversary changes a text to\nfool a victim model, have been repeatedly shown to be effective against text\nclassifiers. These attacks, however, generally assume that the victim model is\nmonolingual and cannot be used to target multilingual victim models, a\nsignificant limitation given the increased use of these models. For this\nreason, in this work we propose an approach to fine-tune a multilingual\nparaphrase model with an adversarial objective so that it becomes able to\ngenerate effective adversarial examples against multilingual classifiers. The\ntraining objective incorporates a set of pre-trained models to ensure text\nquality and language consistency of the generated text. In addition, all the\nmodels are suitably connected to the generator by vocabulary-mapping matrices,\nallowing for full end-to-end differentiability of the overall training\npipeline. The experimental validation over two multilingual datasets and five\nlanguages has shown the effectiveness of the proposed approach compared to\nexisting baselines, particularly in terms of query efficiency. 
We also provide\na detailed analysis of the generated attacks and discuss limitations and\nopportunities for future research.\n","authors":["Tom Roth","Inigo Jauregi Unanue","Alsharif Abuadbba","Massimo Piccardi"],"pdf_url":"https://arxiv.org/pdf/2401.08255v1.pdf","comment":"AAAI-24 Workshop on Artificial Intelligence for Cyber Security (AICS)"},{"id":"http://arxiv.org/abs/2312.12108v2","updated":"2024-01-16T10:03:57Z","published":"2023-12-19T12:32:27Z","title":"Knowledge Graph Error Detection with Contrastive Confidence Adaption","summary":" Knowledge graphs (KGs) often contain various errors. Previous works on\ndetecting errors in KGs mainly rely on triplet embedding from graph structure.\nWe conduct an empirical study and find that these works struggle to\ndiscriminate noise from semantically-similar correct triplets. In this paper,\nwe propose a KG error detection model CCA to integrate both textual and graph\nstructural information from triplet reconstruction for better distinguishing\nsemantics. We design interactive contrastive learning to capture the\ndifferences between textual and structural patterns. Furthermore, we construct\nrealistic datasets with semantically-similar noise and adversarial noise.\nExperimental results demonstrate that CCA outperforms state-of-the-art\nbaselines, especially in detecting semantically-similar noise and adversarial\nnoise.\n","authors":["Xiangyu Liu","Yang Liu","Wei Hu"],"pdf_url":"https://arxiv.org/pdf/2312.12108v2.pdf","comment":"Accepted in the 38th AAAI Conference on Artificial Intelligence (AAAI\n 2024)"},{"id":"http://arxiv.org/abs/2305.10818v2","updated":"2024-01-16T10:03:54Z","published":"2023-05-18T08:56:05Z","title":"Diffusion Language Models Generation Can Be Halted Early","summary":" Diffusion Language models (DLMs) are a promising avenue for text generation\ndue to their practical properties on tractable controllable generation. They\nalso have the advantage of not having to predict text autoregressively.\nHowever, despite these notable features, DLMs have not yet reached the\nperformance levels of their Autoregressive counterparts. One of the ways to\nreduce the performance gap between these two types of language models is to\nspeed up the generation of DLMs. Therefore, we propose a pioneering methodology\nto address this issue in this work. It enables the execution of more generation\nsteps within a given time frame, potentially leading to higher-quality outputs.\nSpecifically, our methods estimate DLMs completeness of text generation and\nallow adaptive halting of the generation process. We test and refine our\nmethods on Plaid, SSD, and CDCD DLMs and create a cohesive perspective on their\ngeneration workflows. Finally, we confirm that our methods allow halting Plaid,\nSSD, and CDCD models and decrease the generation time by $10$-$40$% without a\ndrop in the quality of model samples.\n","authors":["Sofia Maria Lo Cicero Vaina","Nikita Balagansky","Daniil Gavrilov"],"pdf_url":"https://arxiv.org/pdf/2305.10818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17727v2","updated":"2024-01-16T09:07:37Z","published":"2023-05-28T13:54:09Z","title":"Learning a Structural Causal Model for Intuition Reasoning in\n Conversation","summary":" Reasoning, a crucial aspect of NLP research, has not been adequately\naddressed by prevailing models including Large Language Model. Conversation\nreasoning, as a critical component of it, remains largely unexplored due to the\nabsence of a well-designed cognitive model. 
In this paper, inspired by\nintuition theory on conversation cognition, we develop a conversation cognitive\nmodel (CCM) that explains how each utterance receives and activates channels of\ninformation recursively. Besides, we algebraically transformed CCM into a\nstructural causal model (SCM) under some mild assumptions, rendering it\ncompatible with various causal discovery methods. We further propose a\nprobabilistic implementation of the SCM for utterance-level relation reasoning.\nBy leveraging variational inference, it explores substitutes for implicit\ncauses, addresses the issue of their unobservability, and reconstructs the\ncausal representations of utterances through the evidence lower bounds.\nMoreover, we constructed synthetic and simulated datasets incorporating\nimplicit causes and complete cause labels, alleviating the current situation\nwhere all available datasets are implicit-causes-agnostic. Extensive\nexperiments demonstrate that our proposed method significantly outperforms\nexisting methods on synthetic, simulated, and real-world datasets. Finally, we\nanalyze the performance of CCM under latent confounders and propose theoretical\nideas for addressing this currently unresolved issue.\n","authors":["Hang Chen","Bingyu Liao","Jing Luo","Wenjing Zhu","Xinyu Yang"],"pdf_url":"https://arxiv.org/pdf/2305.17727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08206v1","updated":"2024-01-16T08:44:29Z","published":"2024-01-16T08:44:29Z","title":"Generative Multi-Modal Knowledge Retrieval with Large Language Models","summary":" Knowledge retrieval with multi-modal queries plays a crucial role in\nsupporting knowledge-intensive multi-modal applications. However, existing\nmethods face challenges in terms of their effectiveness and training\nefficiency, especially when it comes to training and integrating multiple\nretrievers to handle multi-modal queries. In this paper, we propose an\ninnovative end-to-end generative framework for multi-modal knowledge retrieval.\nOur framework takes advantage of the fact that large language models (LLMs) can\neffectively serve as virtual knowledge bases, even when trained with limited\ndata. We retrieve knowledge via a two-step process: 1) generating knowledge\nclues related to the queries, and 2) obtaining the relevant document by\nsearching databases using the knowledge clue. In particular, we first introduce\nan object-aware prefix-tuning technique to guide multi-grained visual learning.\nThen, we align multi-grained visual features into the textual feature space of\nthe LLM, employing the LLM to capture cross-modal interactions. Subsequently,\nwe construct instruction data with a unified format for model training.\nFinally, we propose the knowledge-guided generation strategy to impose prior\nconstraints in the decoding steps, thereby promoting the generation of\ndistinctive knowledge clues. 
Through experiments conducted on three benchmarks,\nwe demonstrate significant improvements ranging from 3.0% to 14.6% across all\nevaluation metrics when compared to strong baselines.\n","authors":["Xinwei Long","Jiali Zeng","Fandong Meng","Zhiyuan Ma","Kaiyan Zhang","Bowen Zhou","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.08206v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2305.17547v3","updated":"2024-01-16T08:27:38Z","published":"2023-05-27T18:30:54Z","title":"Translatotron 3: Speech to Speech Translation with Monolingual Data","summary":" This paper presents Translatotron 3, a novel approach to unsupervised direct\nspeech-to-speech translation from monolingual speech-text datasets by combining\nmasked autoencoder, unsupervised embedding mapping, and back-translation.\nExperimental results in speech-to-speech translation tasks between Spanish and\nEnglish show that Translatotron 3 outperforms a baseline cascade system,\nreporting $18.14$ BLEU points improvement on the synthesized\nUnpaired-Conversational dataset. In contrast to supervised approaches that\nnecessitate real paired data, or specialized modeling to replicate\npara-/non-linguistic information such as pauses, speaking rates, and speaker\nidentity, Translatotron 3 showcases its capability to retain it. Audio samples\ncan be found at http://google-research.github.io/lingvo-lab/translatotron3\n","authors":["Eliya Nachmani","Alon Levkovitch","Yifan Ding","Chulayuth Asawaroengchai","Heiga Zen","Michelle Tadmor Ramanovich"],"pdf_url":"https://arxiv.org/pdf/2305.17547v3.pdf","comment":"To appear in ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.11193v7","updated":"2024-01-16T08:12:46Z","published":"2023-12-18T13:40:16Z","title":"\"Paraphrasing The Original Text\" Makes High Accuracy Long-Context QA","summary":" Most open-source generative language models currently have a context window\nof no more than 4k, limiting their ability when facing long text. Many previous\nefforts have tried to extend the context window of models, but their actual\neffects have been found to be very limited. To address this issue, we\ntheoretically analyze the effectiveness of the long-context training data and\nfind that long-context training requires \"effective\" data rather than simply\n\"long\" data, which is rarely noticed in previous studies. Thus, we propose\nadding \"original text paraphrasing\" to enhance the effectiveness of the data.\nThe model trained on our re-fined dataset obtains excellent long-context\ncapabilities and achieves state-of-the-art accuracy on multi-document retrieval\nand QA tasks among models of comparable scales. The model and training data\nhave been made available on\nHuggingFace(https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and\nWiseModel(https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k).\n","authors":["Yijiong Yu"],"pdf_url":"https://arxiv.org/pdf/2312.11193v7.pdf","comment":"Chinese version of this paper can be downloaded from\n (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/)"},{"id":"http://arxiv.org/abs/2310.10788v2","updated":"2024-01-16T08:09:15Z","published":"2023-10-16T19:50:01Z","title":"Self-Supervised Models of Speech Infer Universal Articulatory Kinematics","summary":" Self-Supervised Learning (SSL) based models of speech have shown remarkable\nperformance on a range of downstream tasks. 
These state-of-the-art models have\nremained blackboxes, but many recent studies have begun \"probing\" models like\nHuBERT, to correlate their internal representations to different aspects of\nspeech. In this paper, we show \"inference of articulatory kinematics\" as\nfundamental property of SSL models, i.e., the ability of these models to\ntransform acoustics into the causal articulatory dynamics underlying the speech\nsignal. We also show that this abstraction is largely overlapping across the\nlanguage of the data used to train the model, with preference to the language\nwith similar phonological system. Furthermore, we show that with simple affine\ntransformations, Acoustic-to-Articulatory inversion (AAI) is transferrable\nacross speakers, even across genders, languages, and dialects, showing the\ngeneralizability of this property. Together, these results shed new light on\nthe internals of SSL models that are critical to their superior performance,\nand open up new avenues into language-agnostic universal models for speech\nengineering, that are interpretable and grounded in speech science.\n","authors":["Cheol Jun Cho","Abdelrahman Mohamed","Alan W Black","Gopala K. Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2310.10788v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08190v1","updated":"2024-01-16T08:08:01Z","published":"2024-01-16T08:08:01Z","title":"MARIO: MAth Reasoning with code Interpreter Output -- A Reproducible\n Pipeline","summary":" Large language models (LLMs) have seen considerable advancements in natural\nlanguage understanding tasks, yet there remains a gap to bridge before\nattaining true artificial general intelligence, especially concerning\nshortcomings in mathematical reasoning capabilities. We postulate that the\ninherent nature of LLM training, which focuses on predicting probabilities of\nnext token, presents challenges in effectively modeling mathematical reasoning\nthat demands exact calculations, both from data-driven and theoretical\nstandpoints. In this paper, we address this challenge by enriching the data\nlandscape and introducing a novel math dataset, enhanced with a capability to\nutilize a Python code interpreter. This dataset is derived from GSM8K and MATH\nand has been further refined through a combination of GPT-4 annotations, human\nreview, and self-training processes, where the errors in the original GSM8K\ntraining set have been fixed. Additionally, we propose a tentative, easily\nreplicable protocol for the fine-tuning of math-specific LLMs, which has led to\na significant improvement in the performance of a 7B-parameter LLM on the GSM8K\nand MATH datasets. We are committed to advancing the field of mathematical\nreasoning in LLMs and, to that end, we have made the model checkpoints and will\nmake the dataset publicly available. We hope this will facilitate further\nresearch and development within the community.\n","authors":["Minpeng Liao","Wei Luo","Chengxi Li","Jing Wu","Kai Fan"],"pdf_url":"https://arxiv.org/pdf/2401.08190v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.08189v1","updated":"2024-01-16T08:04:50Z","published":"2024-01-16T08:04:50Z","title":"PRewrite: Prompt Rewriting with Reinforcement Learning","summary":" Prompt engineering is critical for the development of LLM-based applications.\nHowever, it is usually done manually in a \"trial and error\" fashion. This\nmanual procedure can be time consuming, ineffective, and the generated prompts\nare, in a lot of cases, sub-optimal. 
Even for the prompts which seemingly work\nwell, there is always a lingering question: can the prompts be made better with\nfurther modifications?\n To address these questions, in this paper, we investigate prompt engineering\nautomation. We consider a specific use case scenario in which developers/users\nhave drafted initial prompts, but lack the time/expertise to optimize them. We\npropose PRewrite, an automated tool to rewrite these drafts and to generate\nhighly effective new prompts. PRewrite is based on the Reinforcement Learning\n(RL) framework which allows for end-to-end optimization and our design allows\nthe RL search to happen in a large action space. The automated tool leverages\nmanually crafted prompts as starting points which makes the rewriting procedure\nmore guided and efficient. The generated prompts are human readable, and\nself-explanatory, unlike some of those in previous works. We conducted\nextensive experiments on diverse datasets and found that the prompts generated\nwith this new method not only outperform professionally crafted prompts, but\nalso prompts generated with other previously proposed methods.\n","authors":["Weize Kong","Spurthi Amba Hombaiah","Mingyang Zhang","Qiaozhu Mei","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2401.08189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03220v4","updated":"2024-01-16T07:12:32Z","published":"2023-11-06T16:03:46Z","title":"ALYMPICS: LLM Agents Meet Game Theory -- Exploring Strategic\n Decision-Making with AI Agents","summary":" This paper introduces Alympics (Olympics for Agents), a systematic simulation\nframework utilizing Large Language Model (LLM) agents for game theory research.\nAlympics creates a versatile platform for studying complex game theory\nproblems, bridging the gap between theoretical game theory and empirical\ninvestigations by providing a controlled environment for simulating human-like\nstrategic interactions with LLM agents. In our pilot case study, the \"Water\nAllocation Challenge,\" we explore Alympics through a challenging strategic game\nfocused on the multi-round auction on scarce survival resources. This study\ndemonstrates the framework's ability to qualitatively and quantitatively\nanalyze game determinants, strategies, and outcomes. Additionally, we conduct a\ncomprehensive human assessment and an in-depth evaluation of LLM agents in\nstrategic decision-making scenarios. Our findings not only expand the\nunderstanding of LLM agents' proficiency in emulating human strategic behavior\nbut also highlight their potential in advancing game theory knowledge, thereby\nenriching our understanding of both game theory and empowering further research\ninto strategic decision-making domains with LLM agents. Codes, prompts, and all\nrelated resources are available at https://github.com/microsoft/Alympics.\n","authors":["Shaoguang Mao","Yuzhe Cai","Yan Xia","Wenshan Wu","Xun Wang","Fengyi Wang","Tao Ge","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2311.03220v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06400v2","updated":"2024-01-16T06:01:48Z","published":"2024-01-12T06:49:49Z","title":"Generalizing Visual Question Answering from Synthetic to Human-Written\n Questions via a Chain of QA with a Large Language Model","summary":" Visual question answering (VQA) is a task where an image is given, and a\nseries of questions are asked about the image. 
To build an efficient VQA\nalgorithm, a large amount of QA data is required which is very expensive.\nGenerating synthetic QA pairs based on templates is a practical way to obtain\ndata. However, VQA models trained on those data do not perform well on complex,\nhuman-written questions. To address this issue, we propose a new method called\n{\\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a\nsequence of QA interactions between a large language model and a VQA model\ntrained on synthetic data to reason and derive logical answers for\nhuman-written questions. We tested the effectiveness of CoQAH on two types of\nhuman-written VQA datasets for 3D-rendered and chest X-ray images and found\nthat it achieved state-of-the-art accuracy in both types of data. Notably,\nCoQAH outperformed general vision-language models, VQA models, and medical\nfoundation models with no finetuning.\n","authors":["Taehee Kim","Yeongjae Cho","Heejun Shin","Yohan Jo","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2401.06400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10803v2","updated":"2024-01-16T05:54:49Z","published":"2023-10-16T20:05:36Z","title":"SD-HuBERT: Sentence-Level Self-Distillation Induces Syllabic\n Organization in HuBERT","summary":" Data-driven unit discovery in self-supervised learning (SSL) of speech has\nembarked on a new era of spoken language processing. Yet, the discovered units\noften remain in phonetic space and the units beyond phonemes are largely\nunderexplored. Here, we demonstrate that a syllabic organization emerges in\nlearning sentence-level representation of speech. In particular, we adopt\n\"self-distillation\" objective to fine-tune the pretrained HuBERT with an\naggregator token that summarizes the entire sentence. Without any supervision,\nthe resulting model draws definite boundaries in speech, and the\nrepresentations across frames exhibit salient syllabic structures. We\ndemonstrate that this emergent structure largely corresponds to the ground\ntruth syllables. Furthermore, we propose a new benchmark task, Spoken Speech\nABX, for evaluating sentence-level representation of speech. When compared to\nprevious models, our model outperforms in both unsupervised syllable discovery\nand learning sentence-level representation. Together, we demonstrate that the\nself-distillation of HuBERT gives rise to syllabic organization without relying\non external labels or modalities, and potentially provides novel data-driven\nunits for spoken language modeling.\n","authors":["Cheol Jun Cho","Abdelrahman Mohamed","Shang-Wen Li","Alan W Black","Gopala K. Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2310.10803v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10005v2","updated":"2024-01-16T05:43:20Z","published":"2023-05-17T07:23:46Z","title":"DinoSR: Self-Distillation and Online Clustering for Self-supervised\n Speech Representation Learning","summary":" In this paper, we introduce self-distillation and online clustering for\nself-supervised speech representation learning (DinoSR) which combines masked\nlanguage modeling, self-distillation, and online clustering. We show that these\nconcepts complement each other and result in a strong representation learning\nmodel for speech. DinoSR first extracts contextualized embeddings from the\ninput audio with a teacher network, then runs an online clustering system on\nthe embeddings to yield a machine-discovered phone inventory, and finally uses\nthe discretized tokens to guide a student network. 
We show that DinoSR\nsurpasses previous state-of-the-art performance in several downstream tasks,\nand provide a detailed analysis of the model and the learned discrete units.\n","authors":["Alexander H. Liu","Heng-Jui Chang","Michael Auli","Wei-Ning Hsu","James R. Glass"],"pdf_url":"https://arxiv.org/pdf/2305.10005v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08089v1","updated":"2024-01-16T03:28:29Z","published":"2024-01-16T03:28:29Z","title":"A Study on Training and Developing Large Language Models for Behavior\n Tree Generation","summary":" This paper presents an innovative exploration of the application potential of\nlarge language models (LLM) in addressing the challenging task of automatically\ngenerating behavior trees (BTs) for complex tasks. The conventional manual BT\ngeneration method is inefficient and heavily reliant on domain expertise. On\nthe other hand, existing automatic BT generation technologies encounter\nbottlenecks related to task complexity, model adaptability, and reliability. In\norder to overcome these challenges, we propose a novel methodology that\nleverages the robust representation and reasoning abilities of LLMs. The core\ncontribution of this paper lies in the design of a BT generation framework\nbased on LLM, which encompasses the entire process, from data synthesis and\nmodel training to application developing and data verification. Synthetic data\nis introduced to train the BT generation model (BTGen model), enhancing its\nunderstanding and adaptability to various complex tasks, thereby significantly\nimproving its overall performance. In order to ensure the effectiveness and\nexecutability of the generated BTs, we emphasize the importance of data\nverification and introduce a multilevel verification strategy. Additionally, we\nexplore a range of agent design and development schemes with LLM as the central\nelement. We hope that the work in this paper may provide a reference for the\nresearchers who are interested in BT generation based on LLMs.\n","authors":["Fu Li","Xueying Wang","Bin Li","Yunlong Wu","Yanzhen Wang","Xiaodong Yi"],"pdf_url":"https://arxiv.org/pdf/2401.08089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08088v1","updated":"2024-01-16T03:28:26Z","published":"2024-01-16T03:28:26Z","title":"Enhancing Document-level Translation of Large Language Model via\n Translation Mixed-instructions","summary":" Existing large language models (LLMs) for machine translation are typically\nfine-tuned on sentence-level translation instructions and achieve satisfactory\nperformance at the sentence level. However, when applied to document-level\ntranslation, these models face a significant challenge, particularly when\ndealing with documents containing over 512 tokens. This challenge arises from\nthe issue of sentence-level coverage, where subsequent sentences in the\ndocument remain untranslated. As a result, the document-level translation\ncapability of LLMs fine-tuned on sentence-level translation instructions is\nsignificantly limited. We conjecture that the primary cause of LLMs' weak\ndocument-level translation performance is the absence of document-to-document\nmapping ability. To address the issue, we propose an approach that combines\nsentence-level and document-level translation instructions of varying lengths\nto fine-tune LLMs. Our proposed translation mixed-instructions enable LLMs\n(Llama-2~7B and 13B) to maintain consistent translation performance from the\nsentence level to documents containing as many as 2048 tokens. 
Extensive\nexperimental results show that the proposed approach significantly enhances the\ndocument-level translation capabilities of LLMs on 10 language pairs,\neffectively mitigating the sentence-level coverage issue in document-level\ntranslation. Experimentation on discourse phenomena has demonstrated that our\ndocument-level translation approach significantly improves translation quality,\nboth in terms of BLEU score and discourse coherence.\n","authors":["Yachao Li","Junhui Li","Jing Jiang","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.08088v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2308.07272v2","updated":"2024-01-16T03:22:15Z","published":"2023-08-14T16:58:50Z","title":"Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt\n Generation for Few-shot Learning","summary":" Prompt-based pre-trained language models (PLMs) paradigm have succeeded\nsubstantially in few-shot natural language processing (NLP) tasks. However,\nprior discrete prompt optimization methods require expert knowledge to design\nthe base prompt set and identify high-quality prompts, which is costly,\ninefficient, and subjective. Meanwhile, existing continuous prompt optimization\nmethods improve the performance by learning the ideal prompts through the\ngradient information of PLMs, whose high computational cost, and low\nreadability and generalizability are often concerning. To address the research\ngap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt\nOptimization ($DP_2O$) method. We first design a multi-round dialogue alignment\nstrategy for readability prompt set generation based on GPT-4. Furthermore, we\npropose an efficient prompt screening metric to identify high-quality prompts\nwith linear complexity. Finally, we construct a reinforcement learning (RL)\nframework based on policy gradients to match the prompts to inputs optimally.\nBy training a policy network with only 0.67% of the PLM parameter size on the\ntasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)\nmethod by 1.52% in accuracy on average on four open-source datasets. Moreover,\nsubsequent experiments also demonstrate that $DP_2O$ has good universality,\nrobustness, and generalization ability.\n","authors":["Chengzhengxu Li","Xiaoming Liu","Yichen Wang","Duyi Li","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07272v2.pdf","comment":"AAAI 2024 Main Track"},{"id":"http://arxiv.org/abs/2401.08047v1","updated":"2024-01-16T02:00:17Z","published":"2024-01-16T02:00:17Z","title":"Incremental Extractive Opinion Summarization Using Cover Trees","summary":" Extractive opinion summarization involves automatically producing a summary\nof text about an entity (e.g., a product's reviews) by extracting\nrepresentative sentences that capture prevalent opinions in the review set.\nTypically, in online marketplaces user reviews accrue over time, and opinion\nsummaries need to be updated periodically to provide customers with up-to-date\ninformation. In this work, we study the task of extractive opinion\nsummarization in an incremental setting, where the underlying review set\nevolves over time. 
Many of the state-of-the-art extractive opinion\nsummarization approaches are centrality-based, such as CentroidRank.\nCentroidRank performs extractive summarization by selecting a subset of review\nsentences closest to the centroid in the representation space as the summary.\nHowever, these methods are not capable of operating efficiently in an\nincremental setting, where reviews arrive one at a time. In this paper, we\npresent an efficient algorithm for accurately computing the CentroidRank\nsummaries in an incremental setting. Our approach, CoverSumm, relies on\nindexing review representations in a cover tree and maintaining a reservoir of\ncandidate summary review sentences. CoverSumm's efficacy is supported by a\ntheoretical and empirical analysis of running time. Empirically, on a diverse\ncollection of data (both real and synthetically created to illustrate scaling\nconsiderations), we demonstrate that CoverSumm is up to 25x faster than\nbaseline methods, and capable of adapting to nuanced changes in data\ndistribution. We also conduct human evaluations of the generated summaries and\nfind that CoverSumm is capable of producing informative summaries consistent\nwith the underlying review set.\n","authors":["Somnath Basu Roy Chowdhury","Nicholas Monath","Avinava Dubey","Manzil Zaheer","Andrew McCallum","Amr Ahmed","Snigdha Chaturvedi"],"pdf_url":"https://arxiv.org/pdf/2401.08047v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.08046v1","updated":"2024-01-16T01:58:36Z","published":"2024-01-16T01:58:36Z","title":"Enhancing Robustness of LLM-Synthetic Text Detectors for Academic\n Writing: A Comprehensive Analysis","summary":" The emergence of large language models (LLMs), such as Generative Pre-trained\nTransformer 4 (GPT-4) used by ChatGPT, has profoundly impacted the academic and\nbroader community. While these models offer numerous advantages in terms of\nrevolutionizing work and study methods, they have also garnered significant\nattention due to their potential negative consequences. One example is\ngenerating academic reports or papers with little to no human contribution.\nConsequently, researchers have focused on developing detectors to address the\nmisuse of LLMs. However, most existing methods prioritize achieving higher\naccuracy on restricted datasets, neglecting the crucial aspect of\ngeneralizability. This limitation hinders their practical application in\nreal-life scenarios where reliability is paramount. In this paper, we present a\ncomprehensive analysis of the impact of prompts on the text generated by LLMs\nand highlight the potential lack of robustness in one of the current\nstate-of-the-art GPT detectors. To mitigate these issues concerning the misuse\nof LLMs in academic writing, we propose a reference-based Siamese detector\nnamed Synthetic-Siamese which takes a pair of texts, one as the inquiry and the\nother as the reference. Our method effectively addresses the lack of robustness\nof previous detectors (OpenAI detector and DetectGPT) and significantly\nimproves the baseline performances in realistic academic writing scenarios by\napproximately 67% to 95%.\n","authors":["Zhicheng Dou","Yuchen Guo","Ching-Chun Chang","Huy H. 
Nguyen","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2401.08046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08038v1","updated":"2024-01-16T01:27:26Z","published":"2024-01-16T01:27:26Z","title":"Calpric: Inclusive and Fine-grain Labeling of Privacy Policies with\n Crowdsourcing and Active Learning","summary":" A significant challenge to training accurate deep learning models on privacy\npolicies is the cost and difficulty of obtaining a large and comprehensive set\nof training data. To address these challenges, we present Calpric , which\ncombines automatic text selection and segmentation, active learning and the use\nof crowdsourced annotators to generate a large, balanced training set for\nprivacy policies at low cost. Automated text selection and segmentation\nsimplifies the labeling task, enabling untrained annotators from crowdsourcing\nplatforms, like Amazon's Mechanical Turk, to be competitive with trained\nannotators, such as law students, and also reduces inter-annotator agreement,\nwhich decreases labeling cost. Having reliable labels for training enables the\nuse of active learning, which uses fewer training samples to efficiently cover\nthe input space, further reducing cost and improving class and data category\nbalance in the data set. The combination of these techniques allows Calpric to\nproduce models that are accurate over a wider range of data categories, and\nprovide more detailed, fine-grain labels than previous work. Our crowdsourcing\nprocess enables Calpric to attain reliable labeled data at a cost of roughly\n$0.92-$1.71 per labeled text segment. Calpric 's training process also\ngenerates a labeled data set of 16K privacy policy text segments across 9 Data\ncategories with balanced positive and negative samples.\n","authors":["Wenjun Qiu","David Lie","Lisa Austin"],"pdf_url":"https://arxiv.org/pdf/2401.08038v1.pdf","comment":"published at USENIX Security 2023; associated website:\n https://www.usenix.org/conference/usenixsecurity23/presentation/qiu"},{"id":"http://arxiv.org/abs/2401.03642v2","updated":"2024-01-16T01:05:59Z","published":"2024-01-08T03:14:24Z","title":"A Content-Based Novelty Measure for Scholarly Publications: A Proof of\n Concept","summary":" Novelty, akin to gene mutation in evolution, opens possibilities for\nscholarly advancement. Although peer review remains the gold standard for\nevaluating novelty in scholarly communication and resource allocation, the vast\nvolume of submissions necessitates an automated measure of scholarly novelty.\nAdopting a perspective that views novelty as the atypical combination of\nexisting knowledge, we introduce an information-theoretic measure of novelty in\nscholarly publications. This measure quantifies the degree of 'surprise'\nperceived by a language model that represents the word distribution of\nscholarly discourse. The proposed measure is accompanied by face and construct\nvalidity evidence; the former demonstrates correspondence to scientific common\nsense, and the latter is endorsed through alignment with novelty evaluations\nfrom a select panel of domain experts. Additionally, characterized by its\ninterpretability, fine granularity, and accessibility, this measure addresses\ngaps prevalent in existing methods. 
We believe this measure holds great\npotential to benefit editors, stakeholders, and policymakers, and it provides a\nreliable lens for examining the relationship between novelty and academic\ndynamics such as creativity, interdisciplinarity, and scientific advances.\n","authors":["Haining Wang"],"pdf_url":"https://arxiv.org/pdf/2401.03642v2.pdf","comment":"Accepted for publication in the proceedings of iConference2024"},{"id":"http://arxiv.org/abs/2401.08026v1","updated":"2024-01-16T00:47:36Z","published":"2024-01-16T00:47:36Z","title":"JustiLM: Few-shot Justification Generation for Explainable Fact-Checking\n of Real-world Claims","summary":" Justification is an explanation that supports the veracity assigned to a\nclaim in fact-checking. However, the task of justification generation is\npreviously oversimplified as summarization of fact-check article authored by\nfact-checkers. Therefore, we propose a realistic approach to generate\njustification based on retrieved evidence. We present a new benchmark dataset\ncalled ExClaim for \\underline{Ex}plainable fact-checking of real-world\n\\underline{Claim}s, and introduce JustiLM, a novel few-shot\n\\underline{Justi}fication generation based on retrieval-augmented\n\\underline{L}anguage \\underline{M}odel by using fact-check articles as\nauxiliary resource during training only. Experiments show that JustiLM achieves\npromising performance in justification generation compared to strong baselines,\nand can also enhance veracity classification with a straightforward extension.\n","authors":["Fengzhu Zeng","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2401.08026v1.pdf","comment":"Accepted in TACL. This is a pre-MIT Press publication version"},{"id":"http://arxiv.org/abs/2401.08025v1","updated":"2024-01-16T00:46:29Z","published":"2024-01-16T00:46:29Z","title":"Self-Imagine: Effective Unimodal Reasoning with Multimodal Models using\n Self-Imagination","summary":" The potential of Vision-Language Models (\\textsc{vlm}s) often remains\nunderutilized in handling complex text-based problems, particularly when these\nproblems could benefit from visual representation. Resonating with humans'\nability to solve complex text-based problems by (1) creating a visual diagram\nfrom the problem and (2) deducing what steps they need to take to solve it, we\npropose \\textsc{Self-Imagine}. We leverage a single Vision-Language Model\n(\\textsc{vlm}) to generate a structured representation of the question using\nHTML, then render the HTML as an image, and finally use the same \\vlm to answer\nthe question using both the question and the image. Our approach does not\nrequire any additional training data or training. We evaluate our approach in\nthree mathematics tasks and nine general-purpose reasoning tasks using\nstate-of-the-art \\textsc{vlm}. 
Our approach boosts the performance of\n\\textsc{vlm} on all math tasks (\\gsm: +4.62\\%; \\asdiv: +4.49\\%; \\svamp:\n+9.30\\%) and the majority of the general-purpose reasoning tasks by 0.4\\% to\n13.20\\% while achieving comparable performance in other tasks.\n Code and data at https://github.com/snat1505027/self-imagine .\n","authors":["Syeda Nahida Akter","Aman Madaan","Sangwu Lee","Yiming Yang","Eric Nyberg"],"pdf_url":"https://arxiv.org/pdf/2401.08025v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.08887v1","updated":"2024-01-16T23:50:26Z","published":"2024-01-16T23:50:26Z","title":"NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant\n Meeting Transcription","summary":" We introduce the first Natural Office Talkers in Settings of Far-field Audio\nRecordings (``NOTSOFAR-1'') Challenge alongside datasets and baseline system.\nThe challenge focuses on distant speaker diarization and automatic speech\nrecognition (DASR) in far-field meeting scenarios, with single-channel and\nknown-geometry multi-channel tracks, and serves as a launch platform for two\nnew datasets: First, a benchmarking dataset of 315 meetings, averaging 6\nminutes each, capturing a broad spectrum of real-world acoustic conditions and\nconversational dynamics. It is recorded across 30 conference rooms, featuring\n4-8 attendees and a total of 35 unique speakers. Second, a 1000-hour simulated\ntraining dataset, synthesized with enhanced authenticity for real-world\ngeneralization, incorporating 15,000 real acoustic transfer functions. The\ntasks focus on single-device DASR, where multi-channel devices always share the\nsame known geometry. This is aligned with common setups in actual conference\nrooms, and avoids technical complexities associated with multi-device tasks. It\nalso allows for the development of geometry-specific solutions. The NOTSOFAR-1\nChallenge aims to advance research in the field of distant conversational\nspeech recognition, providing key resources to unlock the potential of\ndata-driven methods, which we believe are currently constrained by the absence\nof comprehensive high-quality training and benchmarking datasets.\n","authors":["Alon Vinnikov","Amir Ivry","Aviv Hurvitz","Igor Abramovski","Sharon Koubi","Ilya Gurvich","Shai Pe`er","Xiong Xiao","Benjamin Martinez Elizalde","Naoyuki Kanda","Xiaofei Wang","Shalev Shaer","Stav Yagev","Yossi Asher","Sunit Sivasankaran","Yifan Gong","Min Tang","Huaming Wang","Eyal Krupka"],"pdf_url":"https://arxiv.org/pdf/2401.08887v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2401.08851v1","updated":"2024-01-16T21:56:27Z","published":"2024-01-16T21:56:27Z","title":"Using i-vectors for subject-independent cross-session EEG transfer\n learning","summary":" Cognitive load classification is the task of automatically determining an\nindividual's utilization of working memory resources during performance of a\ntask based on physiologic measures such as electroencephalography (EEG). In\nthis paper, we follow a cross-disciplinary approach, where tools and\nmethodologies from speech processing are used to tackle this problem. The\ncorpus we use was released publicly in 2021 as part of the first passive\nbrain-computer interface competition on cross-session workload estimation. We\npresent our approach which used i-vector-based neural network classifiers to\naccomplish inter-subject cross-session EEG transfer learning, achieving 18%\nrelative improvement over equivalent subject-dependent models. 
We also report\nexperiments showing how our subject-independent models perform competitively on\nheld-out subjects and improve with additional subject data, suggesting that\nsubject-dependent training is not required for effective cognitive load\ndetermination.\n","authors":["Jonathan Lasko","Jeff Ma","Mike Nicoletti","Jonathan Sussman-Fort","Sooyoung Jeong","William Hartmann"],"pdf_url":"https://arxiv.org/pdf/2401.08851v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2305.00969v6","updated":"2024-01-16T21:49:20Z","published":"2023-05-01T17:56:32Z","title":"CryCeleb: A Speaker Verification Dataset Based on Infant Cry Sounds","summary":" This paper describes the Ubenwa CryCeleb dataset - a labeled collection of\ninfant cries - and the accompanying CryCeleb 2023 task, which is a public\nspeaker verification challenge based on cry sounds. We released more than 6\nhours of manually segmented cry sounds from 786 newborns for academic use,\naiming to encourage research in infant cry analysis. The inaugural public\ncompetition attracted 59 participants, 11 of whom improved the baseline\nperformance. The top-performing system achieved a significant improvement\nscoring 25.8% equal error rate, which is still far from the performance of\nstate-of-the-art adult speaker verification systems. Therefore, we believe\nthere is room for further research on this dataset, potentially extending\nbeyond the verification task.\n","authors":["David Budaghyan","Charles C. Onu","Arsenii Gorin","Cem Subakan","Doina Precup"],"pdf_url":"https://arxiv.org/pdf/2305.00969v6.pdf","comment":"To appear in ICASSP 2024"},{"id":"http://arxiv.org/abs/2303.00915v2","updated":"2024-01-16T21:42:24Z","published":"2023-03-02T02:20:04Z","title":"BiomedCLIP: a multimodal biomedical foundation model pretrained from\n fifteen million scientific image-text pairs","summary":" Biomedical data is inherently multimodal, comprising physical measurements\nand natural language narratives. A generalist biomedical AI model needs to\nsimultaneously process different modalities of data, including text and images.\nTherefore, training an effective generalist biomedical model requires\nhigh-quality multimodal data, such as parallel image-text pairs. Here, we\npresent PMC-15M, a novel dataset that is two orders of magnitude larger than\nexisting biomedical multimodal datasets such as MIMIC-CXR, and spans a diverse\nrange of biomedical image types. PMC-15M contains 15 million biomedical\nimage-text pairs collected from 4.4 million scientific articles. Based on\nPMC-15M, we have pretrained BiomedCLIP, a multimodal foundation model, with\ndomain-specific adaptations tailored to biomedical vision-language processing.\nWe conducted extensive experiments and ablation studies on standard biomedical\nimaging tasks from retrieval to classification to visual question-answering\n(VQA). BiomedCLIP achieved new state-of-the-art results in a wide range of\nstandard datasets, substantially outperforming prior approaches. Intriguingly,\nby large-scale pretraining on diverse biomedical image types, BiomedCLIP even\noutperforms state-of-the-art radiology-specific models such as BioViL in\nradiology-specific tasks such as RSNA pneumonia detection. In summary,\nBiomedCLIP is a fully open-access foundation model that achieves\nstate-of-the-art performance on various biomedical tasks, paving the way for\ntransformative multimodal biomedical discovery and applications. 
We release our\nmodels at https://aka.ms/biomedclip to facilitate future research in multimodal\nbiomedical AI.\n","authors":["Sheng Zhang","Yanbo Xu","Naoto Usuyama","Hanwen Xu","Jaspreet Bagga","Robert Tinn","Sam Preston","Rajesh Rao","Mu Wei","Naveen Valluri","Cliff Wong","Andrea Tupini","Yu Wang","Matt Mazzola","Swadheen Shukla","Lars Liden","Jianfeng Gao","Matthew P. Lungren","Tristan Naumann","Sheng Wang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2303.00915v2.pdf","comment":"The models are released at https://aka.ms/biomedclip"},{"id":"http://arxiv.org/abs/2401.08835v1","updated":"2024-01-16T21:16:12Z","published":"2024-01-16T21:16:12Z","title":"Improving ASR Contextual Biasing with Guided Attention","summary":" In this paper, we propose a Guided Attention (GA) auxiliary training loss,\nwhich improves the effectiveness and robustness of automatic speech recognition\n(ASR) contextual biasing without introducing additional parameters. A common\nchallenge in previous literature is that the word error rate (WER) reduction\nbrought by contextual biasing diminishes as the number of bias phrases\nincreases. To address this challenge, we employ a GA loss as an additional\ntraining objective besides the Transducer loss. The proposed GA loss aims to\nteach the cross attention how to align bias phrases with text tokens or audio\nframes. Compared to studies with similar motivations, the proposed loss\noperates directly on the cross attention weights and is easier to implement.\nThrough extensive experiments based on Conformer Transducer with Contextual\nAdapter, we demonstrate that the proposed method not only leads to a lower WER\nbut also retains its effectiveness as the number of bias phrases increases.\nSpecifically, the GA loss decreases the WER of rare vocabularies by up to 19.2%\non LibriSpeech compared to the contextual biasing baseline, and up to 49.3%\ncompared to a vanilla Transducer.\n","authors":["Jiyang Tang","Kwangyoun Kim","Suwon Shon","Felix Wu","Prashant Sridhar","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2401.08835v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08833v1","updated":"2024-01-16T21:13:22Z","published":"2024-01-16T21:13:22Z","title":"Revisiting Self-supervised Learning of Speech Representation from a\n Mutual Information Perspective","summary":" Existing studies on self-supervised speech representation learning have\nfocused on developing new training methods and applying pre-trained models for\ndifferent applications. However, the quality of these models is often measured\nby the performance of different downstream tasks. How well the representations\naccess the information of interest is less studied. In this work, we take a\ncloser look into existing self-supervised methods of speech from an\ninformation-theoretic perspective. We aim to develop metrics using mutual\ninformation to help practical problems such as model design and selection. We\nuse linear probes to estimate the mutual information between the target\ninformation and learned representations, showing another insight into the\naccessibility to the target information from speech representations. Further,\nwe explore the potential of evaluating representations in a self-supervised\nfashion, where we estimate the mutual information between different parts of\nthe data without using any labels. 
Finally, we show that both supervised and\nunsupervised measures echo the performance of the models on layer-wise linear\nprobing and speech recognition.\n","authors":["Alexander H. Liu","Sung-Lin Yeh","James Glass"],"pdf_url":"https://arxiv.org/pdf/2401.08833v1.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08825v1","updated":"2024-01-16T20:57:36Z","published":"2024-01-16T20:57:36Z","title":"AiGen-FoodReview: A Multimodal Dataset of Machine-Generated Restaurant\n Reviews and Images on Social Media","summary":" Online reviews in the form of user-generated content (UGC) significantly\nimpact consumer decision-making. However, the pervasive issue of not only human\nfake content but also machine-generated content challenges UGC's reliability.\nRecent advances in Large Language Models (LLMs) may pave the way to fabricate\nindistinguishable fake generated content at a much lower cost. Leveraging\nOpenAI's GPT-4-Turbo and DALL-E-2 models, we craft AiGen-FoodReview, a\nmulti-modal dataset of 20,144 restaurant review-image pairs divided into\nauthentic and machine-generated. We explore unimodal and multimodal detection\nmodels, achieving 99.80% multimodal accuracy with FLAVA. We use attributes from\nreadability and photographic theories to score reviews and images,\nrespectively, demonstrating their utility as hand-crafted features in scalable\nand interpretable detection models, with comparable performance. The paper\ncontributes by open-sourcing the dataset and releasing fake review detectors,\nrecommending its use in unimodal and multimodal fake review detection tasks,\nand evaluating linguistic and visual features in synthetic versus authentic\ndata.\n","authors":["Alessandro Gambetti","Qiwei Han"],"pdf_url":"https://arxiv.org/pdf/2401.08825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06408v2","updated":"2024-01-16T19:35:28Z","published":"2024-01-12T07:10:10Z","title":"AboutMe: Using Self-Descriptions in Webpages to Document the Effects of\n English Pretraining Data Filters","summary":" Large language models' (LLMs) abilities are drawn from their pretraining\ndata, and model development begins with data curation. However, decisions\naround what data is retained or removed during this initial stage are\nunder-scrutinized. In our work, we ground web text, which is a popular\npretraining data source, to its social and geographic contexts. We create a new\ndataset of 10.3 million self-descriptions of website creators, and extract\ninformation about who they are and where they are from: their topical\ninterests, social roles, and geographic affiliations. Then, we conduct the\nfirst study investigating how ten \"quality\" and English language identification\n(langID) filters affect webpages that vary along these social dimensions. Our\nexperiments illuminate a range of implicit preferences in data curation: we\nshow that some quality classifiers act like topical domain filters, and langID\ncan overlook English content from some regions of the world. 
Overall, we hope\nthat our work will encourage a new line of research on pretraining data\ncuration practices and its social implications.\n","authors":["Li Lucy","Suchin Gururangan","Luca Soldaini","Emma Strubell","David Bamman","Lauren Klein","Jesse Dodge"],"pdf_url":"https://arxiv.org/pdf/2401.06408v2.pdf","comment":"28 pages, 13 figures"},{"id":"http://arxiv.org/abs/2308.01497v3","updated":"2024-01-16T19:00:56Z","published":"2023-08-03T01:46:27Z","title":"Large Language Model Displays Emergent Ability to Interpret Novel\n Literary Metaphors","summary":" Recent advances in the performance of large language models (LLMs) have\nsparked debate over whether, given sufficient training, high-level human\nabilities emerge in such generic forms of artificial intelligence (AI). Despite\nthe exceptional performance of LLMs on a wide range of tasks involving natural\nlanguage processing and reasoning, there has been sharp disagreement as to\nwhether their abilities extend to more creative human abilities. A core example\nis the ability to interpret novel metaphors. Given the enormous and non curated\ntext corpora used to train LLMs, a serious obstacle to designing tests is the\nrequirement of finding novel yet high quality metaphors that are unlikely to\nhave been included in the training data. Here we assessed the ability of GPT4,\na state of the art large language model, to provide natural-language\ninterpretations of novel literary metaphors drawn from Serbian poetry and\ntranslated into English. Despite exhibiting no signs of having been exposed to\nthese metaphors previously, the AI system consistently produced detailed and\nincisive interpretations. Human judges, blind to the fact that an AI model was\ninvolved, rated metaphor interpretations generated by GPT4 as superior to those\nprovided by a group of college students. In interpreting reversed metaphors,\nGPT4, as well as humans, exhibited signs of sensitivity to the Gricean\ncooperative principle. In addition, for several novel English poems GPT4\nproduced interpretations that were rated as excellent or good by a human\nliterary critic. These results indicate that LLMs such as GPT4 have acquired an\nemergent ability to interpret complex metaphors, including those embedded in\nnovel poems.\n","authors":["Nicholas Ichien","Dušan Stamenković","Keith J. Holyoak"],"pdf_url":"https://arxiv.org/pdf/2308.01497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08772v1","updated":"2024-01-16T19:00:10Z","published":"2024-01-16T19:00:10Z","title":"HuixiangDou: Overcoming Group Chat Scenarios with LLM-based Technical\n Assistance","summary":" In this work, we present HuixiangDou, a technical assistant powered by Large\nLanguage Models (LLM). This system is designed to assist algorithm developers\nby providing insightful responses to questions related to open-source algorithm\nprojects, such as computer vision and deep learning projects from OpenMMLab. We\nfurther explore the integration of this assistant into the group chats of\ninstant messaging (IM) tools such as WeChat and Lark. Through several iterative\nimprovements and trials, we have developed a sophisticated technical chat\nassistant capable of effectively answering users' technical questions without\ncausing message flooding. 
This paper's contributions include: 1) Designing an\nalgorithm pipeline specifically for group chat scenarios; 2) Verifying the\nreliable performance of text2vec in task rejection; 3) Identifying three\ncritical requirements for LLMs in technical-assistant-like products, namely\nscoring ability, In-Context Learning (ICL), and Long Context. We have made the\nsoftware and source code available at https://github.com/internlm/huixiangdou\nto aid in future research and application. HuixiangDou is applicable to any\ngroup chat within IM tools.\n","authors":["Huanjun Kong","Songyang Zhang","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08772v1.pdf","comment":"Technical report, 11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.08577v1","updated":"2024-01-16T18:59:45Z","published":"2024-01-16T18:59:45Z","title":"MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in\n 3D World","summary":" Human beings possess the capability to multiply a melange of multisensory\ncues while actively exploring and interacting with the 3D world. Current\nmulti-modal large language models, however, passively absorb sensory data as\ninputs, lacking the capacity to actively interact with the objects in the 3D\nenvironment and dynamically collect their multisensory information. To usher in\nthe study of this area, we propose MultiPLY, a multisensory embodied large\nlanguage model that could incorporate multisensory interactive data, including\nvisual, audio, tactile, and thermal information into large language models,\nthereby establishing the correlation among words, actions, and percepts. To\nthis end, we first collect Multisensory Universe, a large-scale multisensory\ninteraction dataset comprising 500k data by deploying an LLM-powered embodied\nagent to engage with the 3D environment. To perform instruction tuning with\npre-trained LLM on such generated data, we first encode the 3D scene as\nabstracted object-centric representations and then introduce action tokens\ndenoting that the embodied agent takes certain actions within the environment,\nas well as state tokens that represent the multisensory state observations of\nthe agent at each time step. In the inference time, MultiPLY could generate\naction tokens, instructing the agent to take the action in the environment and\nobtain the next multisensory state observation. The observation is then\nappended back to the LLM via state tokens to generate subsequent text or action\ntokens. We demonstrate that MultiPLY outperforms baselines by a large margin\nthrough a diverse set of embodied tasks involving object retrieval, tool use,\nmultisensory captioning, and task decomposition.\n","authors":["Yining Hong","Zishuo Zheng","Peihao Chen","Yian Wang","Junyan Li","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2401.08577v1.pdf","comment":"Project page: https://vis-www.cs.umass.edu/multiply"},{"id":"http://arxiv.org/abs/2401.08743v1","updated":"2024-01-16T18:59:24Z","published":"2024-01-16T18:59:24Z","title":"MMToM-QA: Multimodal Theory of Mind Question Answering","summary":" Theory of Mind (ToM), the ability to understand people's minds, is an\nessential ingredient for developing machines with human-level social\nintelligence. Recent machine learning models, particularly large language\nmodels, seem to show some aspects of ToM understanding. However, existing ToM\nbenchmarks use unimodal datasets - either video or text. Human ToM, on the\nother hand, is more than video or text understanding. 
People can flexibly\nreason about another person's mind based on conceptual representations (e.g.,\ngoals, beliefs, plans) extracted from any available data, which can include\nvisual cues, linguistic narratives, or both. To address this, we introduce a\nmultimodal Theory of Mind question answering (MMToM-QA) benchmark. MMToM-QA\ncomprehensively evaluates machine ToM both on multimodal data and on different\nkinds of unimodal data about a person's activity in a household environment. To\nengineer multimodal ToM capacity, we propose a novel method, BIP-ALM (Bayesian\nInverse Planning Accelerated by Language Models). BIP-ALM extracts unified\nrepresentations from multimodal data and utilizes language models for scalable\nBayesian inverse planning. We conducted a systematic comparison of human\nperformance, BIP-ALM, and state-of-the-art models, including GPT-4. The\nexperiments demonstrate that large language models and large multimodal models\nstill lack robust ToM capacity. BIP-ALM, on the other hand, shows promising\nresults, by leveraging the power of both model-based mental inference and\nlanguage models.\n","authors":["Chuanyang Jin","Yutong Wu","Jing Cao","Jiannan Xiang","Yen-Ling Kuo","Zhiting Hu","Tomer Ullman","Antonio Torralba","Joshua B. Tenenbaum","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2401.08743v1.pdf","comment":"27 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2310.11454v2","updated":"2024-01-16T18:59:22Z","published":"2023-10-17T17:59:46Z","title":"VeRA: Vector-based Random Matrix Adaptation","summary":" Low-rank adaptation (LoRA) is a popular method that reduces the number of\ntrainable parameters when finetuning large language models, but still faces\nacute storage challenges when scaling to even larger models or deploying\nnumerous per-user or per-task adapted models. In this work, we present\nVector-based Random Matrix Adaptation (VeRA), which significantly reduces the\nnumber of trainable parameters compared to LoRA, yet maintains the same\nperformance. It achieves this by using a single pair of low-rank matrices\nshared across all layers and learning small scaling vectors instead. We\ndemonstrate its effectiveness on the GLUE and E2E benchmarks, image\nclassification tasks, and show its application in instruction-tuning of 7B and\n13B language models.\n","authors":["Dawid J. Kopiczko","Tijmen Blankevoort","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2310.11454v2.pdf","comment":"Accepted at ICLR 2024, website: https://dkopi.github.io/vera"},{"id":"http://arxiv.org/abs/2401.08574v1","updated":"2024-01-16T18:58:37Z","published":"2024-01-16T18:58:37Z","title":"Deductive Closure Training of Language Models for Coherence, Accuracy,\n and Updatability","summary":" While language models (LMs) can sometimes generate factually correct text and\nestimate truth values of individual claims, these generally do not reflect a\nglobally coherent, manipulable model of the world. As a consequence, current\nLMs also generate incorrect or nonsensical content, and are difficult to edit\nand bring up to date. We present a method called Deductive Closure Training\n(DCT) that uses LMs themselves to identify implications of (and contradictions\nwithin) the text that they generate, yielding an efficient self-supervised\nprocedure for improving LM factuality. 
Given a collection of seed documents,\nDCT prompts LMs to generate additional text implied by these documents, reason\nglobally about the correctness of this generated text, and finally fine-tune on\ntext inferred to be correct. Given seed documents from a trusted source, DCT\nprovides a tool for supervised model updating; if seed documents are sampled\nfrom the LM itself, DCT enables fully unsupervised fine-tuning for improved\ncoherence and accuracy. Across the CREAK, MQUaKE, and Reversal Curse datasets,\nsupervised DCT improves LM fact verification and text generation accuracy by\n3-26%; on CREAK fully unsupervised DCT improves verification accuracy by 12%.\nThese results show that LMs' reasoning capabilities during inference can be\nleveraged during training to improve their reliability.\n","authors":["Afra Feyza Akyürek","Ekin Akyürek","Leshem Choshen","Derry Wijaya","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2401.08574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08567v1","updated":"2024-01-16T18:52:27Z","published":"2024-01-16T18:52:27Z","title":"Connect, Collapse, Corrupt: Learning Cross-Modal Tasks with Uni-Modal\n Data","summary":" Building cross-modal applications is challenging due to limited paired\nmulti-modal data. Recent works have shown that leveraging a pre-trained\nmulti-modal contrastive representation space enables cross-modal tasks to be\nlearned from uni-modal data. This is based on the assumption that contrastive\noptimization makes embeddings from different modalities interchangeable.\nHowever, this assumption is under-explored due to the poorly understood\ngeometry of the multi-modal contrastive space, where a modality gap exists. In\nour study, we provide a theoretical explanation of this space's geometry and\nintroduce a three-step method, $C^3$ (Connect, Collapse, Corrupt), to bridge\nthe modality gap, enhancing the interchangeability of embeddings. Our $C^3$\nmethod significantly improves cross-modal learning from uni-modal data,\nachieving state-of-the-art results on zero-shot image / audio / video\ncaptioning and text-to-image generation.\n","authors":["Yuhui Zhang","Elaine Sui","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2401.08567v1.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08565v1","updated":"2024-01-16T18:49:55Z","published":"2024-01-16T18:49:55Z","title":"Tuning Language Models by Proxy","summary":" Despite the general capabilities of large pretrained language models, they\nconsistently benefit from further adaptation to better achieve desired\nbehaviors. However, tuning these models has become increasingly\nresource-intensive, or impossible when model weights are private. We introduce\nproxy-tuning, a lightweight decoding-time algorithm that operates on top of\nblack-box LMs to achieve the result of directly tuning the model, but by\naccessing only its prediction over the output vocabulary. Our method instead\ntunes a smaller LM, then applies the difference between the predictions of the\nsmall tuned and untuned LMs to shift the original predictions of the base model\nin the direction of tuning, while retaining the benefits of larger scale\npretraining. In experiments, when we apply proxy-tuning to Llama2-70B using\nproxies of only 7B size, we can close 88% of the gap between Llama2-70B and its\ntruly-tuned chat version, when evaluated across knowledge, reasoning, and\nsafety benchmarks. 
Interestingly, when tested on TruthfulQA, proxy-tuned models\nare actually more truthful than directly tuned models, possibly because\ndecoding-time guidance better retains the model's factual knowledge. We then\ndemonstrate the generality of proxy-tuning by applying it for domain adaptation\non code, and task-specific finetuning on question-answering and math problems.\nOur work demonstrates the promise of using small tuned LMs to efficiently\ncustomize large, potentially proprietary LMs through decoding-time guidance.\n","authors":["Alisa Liu","Xiaochuang Han","Yizhong Wang","Yulia Tsvetkov","Yejin Choi","Noah A. Smith"],"pdf_url":"https://arxiv.org/pdf/2401.08565v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2401.08537v1","updated":"2024-01-16T17:59:54Z","published":"2024-01-16T17:59:54Z","title":"Spatial Entity Resolution between Restaurant Locations and\n Transportation Destinations in Southeast Asia","summary":" As a tech company, Grab has expanded from transportation to food delivery,\naiming to serve Southeast Asia with hyperlocalized applications. Information\nabout places as transportation destinations can help to improve our knowledge\nabout places as restaurants, so long as the spatial entity resolution problem\nbetween these datasets can be solved. In this project, we attempted to\nrecognize identical place entities from databases of Points-of-Interest (POI)\nand GrabFood restaurants, using their spatial and textual attributes, i.e.,\nlatitude, longitude, place name, and street address.\n Distance metrics were calculated for these attributes and fed to tree-based\nclassifiers. POI-restaurant matching was conducted separately for Singapore,\nPhilippines, Indonesia, and Malaysia. Experimental estimates demonstrate that a\nmatching POI can be found for over 35% of restaurants in these countries. As\npart of these estimates, test datasets were manually created, and RandomForest,\nAdaBoost, Gradient Boosting, and XGBoost perform well, with most accuracy,\nprecision, and recall scores close to or higher than 90% for matched vs.\nunmatched classification. To the authors' knowledge, there are no previous\npublished scientific papers devoted to matching of spatial entities for the\nSoutheast Asia region.\n","authors":["Emily Gao","Dominic Widdows"],"pdf_url":"https://arxiv.org/pdf/2401.08537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04925v2","updated":"2024-01-16T17:40:14Z","published":"2024-01-10T04:37:38Z","title":"The Impact of Reasoning Step Length on Large Language Models","summary":" Chain of Thought (CoT) is significant in improving the reasoning abilities of\nlarge language models (LLMs). However, the correlation between the\neffectiveness of CoT and the length of reasoning steps in prompts remains\nlargely unknown. To shed light on this, we have conducted several empirical\nexperiments to explore the relations. Specifically, we design experiments that\nexpand and compress the rationale reasoning steps within CoT demonstrations,\nwhile keeping all other factors constant. We have the following key findings.\nFirst, the results indicate that lengthening the reasoning steps in prompts,\neven without adding new information into the prompt, considerably enhances\nLLMs' reasoning abilities across multiple datasets. Alternatively, shortening\nthe reasoning steps, even while preserving the key information, significantly\ndiminishes the reasoning abilities of models. 
This finding highlights the\nimportance of the number of steps in CoT prompts and provides practical\nguidance to make better use of LLMs' potential in complex problem-solving\nscenarios. Second, we also investigated the relationship between the\nperformance of CoT and the rationales used in demonstrations. Surprisingly, the\nresult shows that even incorrect rationales can yield favorable outcomes if\nthey maintain the requisite length of inference. Third, we observed that the\nadvantages of increasing reasoning steps are task-dependent: simpler tasks\nrequire fewer steps, whereas complex tasks gain significantly from longer\ninference sequences.\n","authors":["Mingyu Jin","Qinkai Yu","Dong shu","Haiyan Zhao","Wenyue Hua","Yanda Meng","Yongfeng Zhang","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2401.04925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08517v1","updated":"2024-01-16T17:31:35Z","published":"2024-01-16T17:31:35Z","title":"Supporting Student Decisions on Learning Recommendations: An LLM-Based\n Chatbot with Knowledge Graph Contextualization for Conversational\n Explainability and Mentoring","summary":" Student commitment towards a learning recommendation is not separable from\ntheir understanding of the reasons it was recommended to them; and their\nability to modify it based on that understanding. Among explainability\napproaches, chatbots offer the potential to engage the student in a\nconversation, similar to a discussion with a peer or a mentor. The capabilities\nof chatbots, however, are still not sufficient to replace a human mentor,\ndespite the advancements of generative AI (GenAI) and large language models\n(LLM). Therefore, we propose an approach to utilize chatbots as mediators of\nthe conversation and sources of limited and controlled generation of\nexplanations, to harvest the potential of LLMs while reducing their potential\nrisks at the same time. The proposed LLM-based chatbot supports students in\nunderstanding learning-paths recommendations. We use a knowledge graph (KG) as\na human-curated source of information, to regulate the LLM's output through\ndefining its prompt's context. A group chat approach is developed to connect\nstudents with human mentors, either on demand or in cases that exceed the\nchatbot's pre-defined tasks. We evaluate the chatbot with a user study, to\nprovide a proof-of-concept and highlight the potential requirements and\nlimitations of utilizing chatbots in conversational explainability.\n","authors":["Hasan Abu-Rasheed","Mohamad Hussam Abdulsalam","Christian Weber","Madjid Fathi"],"pdf_url":"https://arxiv.org/pdf/2401.08517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02333v2","updated":"2024-01-16T17:18:35Z","published":"2024-01-04T16:16:14Z","title":"Beyond Extraction: Contextualising Tabular Data for Efficient\n Summarisation by Language Models","summary":" The conventional use of the Retrieval-Augmented Generation (RAG) architecture\nhas proven effective for retrieving information from diverse documents.\nHowever, challenges arise in handling complex table queries, especially within\nPDF documents containing intricate tabular structures. This research introduces\nan innovative approach to enhance the accuracy of complex table queries in\nRAG-based systems. Our methodology involves storing PDFs in the retrieval\ndatabase and extracting tabular content separately. The extracted tables\nundergo a process of context enrichment, concatenating headers with\ncorresponding values. 
To ensure a comprehensive understanding of the enriched\ndata, we employ a fine-tuned version of the Llama-2-chat language model for\nsummarisation within the RAG architecture. Furthermore, we augment the tabular\ndata with contextual sense using the ChatGPT 3.5 API through a one-shot prompt.\nThis enriched data is then fed into the retrieval database alongside other\nPDFs. Our approach aims to significantly improve the precision of complex table\nqueries, offering a promising solution to a longstanding challenge in\ninformation retrieval.\n","authors":["Uday Allu","Biddwan Ahmed","Vishesh Tripathi"],"pdf_url":"https://arxiv.org/pdf/2401.02333v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08511v1","updated":"2024-01-16T17:15:08Z","published":"2024-01-16T17:15:08Z","title":"The Gaps between Pre-train and Downstream Settings in Bias Evaluation\n and Debiasing","summary":" The output tendencies of Pre-trained Language Models (PLM) vary markedly\nbefore and after Fine-Tuning (FT) due to the updates to the model parameters.\nThese divergences in output tendencies result in a gap in the social biases of\nPLMs. For example, there exists a low correlation between intrinsic bias scores\nof a PLM and its extrinsic bias scores under FT-based debiasing methods.\nAdditionally, applying FT-based debiasing methods to a PLM leads to a decline\nin performance in downstream tasks. On the other hand, PLMs trained on large\ndatasets can learn without parameter updates via In-Context Learning (ICL)\nusing prompts. ICL induces smaller changes to PLMs compared to FT-based\ndebiasing methods. Therefore, we hypothesize that the gap observed in\npre-trained and FT models does not hold true for debiasing methods that use\nICL. In this study, we demonstrate that ICL-based debiasing methods show a\nhigher correlation between intrinsic and extrinsic bias scores compared to\nFT-based methods. Moreover, the performance degradation due to debiasing is\nalso lower in the ICL case compared to that in the FT case.\n","authors":["Masahiro Kaneko","Danushka Bollegala","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2401.08511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08508v1","updated":"2024-01-16T17:11:11Z","published":"2024-01-16T17:11:11Z","title":"EmoLLMs: A Series of Emotional Large Language Models and Annotation\n Tools for Comprehensive Affective Analysis","summary":" Sentiment analysis and emotion detection are important research topics in\nnatural language processing (NLP) and benefit many downstream tasks. With the\nwidespread application of LLMs, researchers have started exploring the\napplication of LLMs based on instruction-tuning in the field of sentiment\nanalysis. However, these models only focus on single aspects of affective\nclassification tasks (e.g. sentimental polarity or categorical emotions), and\noverlook the regression tasks (e.g. sentiment strength or emotion intensity),\nwhich leads to poor performance in downstream tasks. The main reason is the\nlack of comprehensive affective instruction tuning datasets and evaluation\nbenchmarks, which cover various affective classification and regression tasks.\nMoreover, although emotional information is useful for downstream tasks,\nexisting downstream datasets lack high-quality and comprehensive affective\nannotations. 
In this paper, we propose EmoLLMs, the first series of\nopen-sourced instruction-following LLMs for comprehensive affective analysis\nbased on fine-tuning various LLMs with instruction data, the first multi-task\naffective analysis instruction dataset (AAID) with 234K data samples based on\nvarious classification and regression tasks to support LLM instruction tuning,\nand a comprehensive affective evaluation benchmark (AEB) with 14 tasks from\nvarious sources and domains to test the generalization ability of LLMs. We\npropose a series of EmoLLMs by fine-tuning LLMs with AAID to solve various\naffective instruction tasks. We compare our model with a variety of LLMs on\nAEB, where our models outperform all other open-sourced LLMs, and surpass\nChatGPT and GPT-4 in most tasks, which shows that the series of EmoLLMs achieve\nthe ChatGPT-level and GPT-4-level generalization capabilities on affective\nanalysis tasks, and demonstrates our models can be used as affective annotation\ntools.\n","authors":["Zhiwei Liu","Kailai Yang","Tianlin Zhang","Qianqian Xie","Zeping Yu","Sophia Ananiadou"],"pdf_url":"https://arxiv.org/pdf/2401.08508v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.08500v1","updated":"2024-01-16T17:00:36Z","published":"2024-01-16T17:00:36Z","title":"Code Generation with AlphaCodium: From Prompt Engineering to Flow\n Engineering","summary":" Code generation problems differ from common natural language problems - they\nrequire matching the exact syntax of the target language, identifying happy\npaths and edge cases, paying attention to numerous small details in the problem\nspec, and addressing other code-specific issues and requirements. Hence, many\nof the optimizations and tricks that have been successful in natural language\ngeneration may not be effective for code tasks. In this work, we propose a new\napproach to code generation by LLMs, which we call AlphaCodium - a test-based,\nmulti-stage, code-oriented iterative flow, that improves the performances of\nLLMs on code problems. We tested AlphaCodium on a challenging code generation\ndataset called CodeContests, which includes competitive programming problems\nfrom platforms such as Codeforces. The proposed flow consistently and\nsignificantly improves results. On the validation set, for example, GPT-4\naccuracy (pass@5) increased from 19% with a single well-designed direct prompt\nto 44% with the AlphaCodium flow. Many of the principles and best practices\nacquired in this work, we believe, are broadly applicable to general code\ngeneration tasks. Full implementation is available at:\nhttps://github.com/Codium-ai/AlphaCodium\n","authors":["Tal Ridnik","Dedy Kredo","Itamar Friedman"],"pdf_url":"https://arxiv.org/pdf/2401.08500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08495v1","updated":"2024-01-16T16:52:00Z","published":"2024-01-16T16:52:00Z","title":"The Effect of Group Status on the Variability of Group Representations\n in LLM-generated Text","summary":" Large Language Models (LLMs) have become pervasive in everyday life, yet\ntheir inner workings remain opaque. While scholarly efforts have demonstrated\nLLMs' propensity to reproduce biases in their training data, they have\nprimarily focused on the association of social groups with stereotypic\nattributes. 
In this paper, we extend this line of inquiry to investigate a bias\nakin to the social-psychological phenomenon where socially dominant groups are\nperceived to be less homogeneous than socially subordinate groups as it is\nreproduced by LLMs. We had ChatGPT, a state-of-the-art LLM, generate a\ndiversity of texts about intersectional group identities and compared text\nhomogeneity. We consistently find that LLMs portray African, Asian, and\nHispanic Americans as more homogeneous than White Americans. They also portray\nwomen as more homogeneous than men, but these differences are small. Finally,\nwe find that the effect of gender differs across racial/ethnic groups such that\nthe effect of gender is consistent within African and Hispanic Americans but\nnot within Asian and White Americans. We speculate possible sources of this\nbias in LLMs and posit that the bias has the potential to amplify biases in\nfuture LLM training and to reinforce stereotypes.\n","authors":["Messi H. J. Lee","Jacob M. Montgomery","Calvin K. Lai"],"pdf_url":"https://arxiv.org/pdf/2401.08495v1.pdf","comment":"Presented at the Socially Responsible Language Modelling Research\n (SoLaR) Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.16349v2","updated":"2024-01-16T16:51:33Z","published":"2023-09-28T11:18:20Z","title":"Human Feedback is not Gold Standard","summary":" Human feedback has become the de facto standard for evaluating the\nperformance of Large Language Models, and is increasingly being used as a\ntraining objective. However, it is not clear which properties of a generated\noutput this single `preference' score captures. We hypothesise that preference\nscores are subjective and open to undesirable biases. We critically analyse the\nuse of human feedback for both training and evaluation, to verify whether it\nfully captures a range of crucial error criteria. We find that while preference\nscores have fairly good coverage, they under-represent important aspects like\nfactuality. We further hypothesise that both preference scores and error\nannotation may be affected by confounders, and leverage instruction-tuned\nmodels to generate outputs that vary along two possible confounding dimensions:\nassertiveness and complexity. We find that the assertiveness of an output skews\nthe perceived rate of factuality errors, indicating that human annotations are\nnot a fully reliable evaluation metric or training objective. Finally, we offer\npreliminary evidence that using human feedback as a training objective\ndisproportionately increases the assertiveness of model outputs. We encourage\nfuture work to carefully consider whether preference scores are well aligned\nwith the desired objective.\n","authors":["Tom Hosking","Phil Blunsom","Max Bartolo"],"pdf_url":"https://arxiv.org/pdf/2309.16349v2.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08491v1","updated":"2024-01-16T16:49:39Z","published":"2024-01-16T16:49:39Z","title":"Contrastive Perplexity for Controlled Generation: An Application in\n Detoxifying Large Language Models","summary":" The generation of undesirable and factually incorrect content of large\nlanguage models poses a significant challenge and remains largely an unsolved\nissue. This paper studies the integration of a contrastive learning objective\nfor fine-tuning LLMs for implicit knowledge editing and controlled text\ngeneration. Optimizing the training objective entails aligning text\nperplexities in a contrastive fashion. 
To facilitate training the model in a\nself-supervised fashion, we leverage an off-the-shelf LLM for training data\ngeneration. We showcase applicability in the domain of detoxification. Herein,\nthe proposed approach leads to a significant decrease in the generation of\ntoxic content while preserving general utility for downstream tasks such as\ncommonsense reasoning and reading comprehension. The proposed approach is\nconceptually simple but empirically powerful.\n","authors":["Tassilo Klein","Moin Nabi"],"pdf_url":"https://arxiv.org/pdf/2401.08491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13310v3","updated":"2024-01-16T16:24:36Z","published":"2023-03-23T14:44:47Z","title":"SwissBERT: The Multilingual Language Model for Switzerland","summary":" We present SwissBERT, a masked language model created specifically for\nprocessing Switzerland-related text. SwissBERT is a pre-trained model that we\nadapted to news articles written in the national languages of Switzerland --\nGerman, French, Italian, and Romansh. We evaluate SwissBERT on natural language\nunderstanding tasks related to Switzerland and find that it tends to outperform\nprevious models on these tasks, especially when processing contemporary news\nand/or Romansh Grischun. Since SwissBERT uses language adapters, it may be\nextended to Swiss German dialects in future work. The model and our open-source\ncode are publicly released at https://github.com/ZurichNLP/swissbert.\n","authors":["Jannis Vamvas","Johannes Graën","Rico Sennrich"],"pdf_url":"https://arxiv.org/pdf/2303.13310v3.pdf","comment":"SwissText 2023 [v3: Changed template because the proceedings moved to\n a different publisher. Same content.]"},{"id":"http://arxiv.org/abs/2401.08461v1","updated":"2024-01-16T16:11:35Z","published":"2024-01-16T16:11:35Z","title":"Decentralised Emergence of Robust and Adaptive Linguistic Conventions in\n Populations of Autonomous Agents Grounded in Continuous Worlds","summary":" This paper introduces a methodology through which a population of autonomous\nagents can establish a linguistic convention that enables them to refer to\narbitrary entities that they observe in their environment. The linguistic\nconvention emerges in a decentralised manner through local communicative\ninteractions between pairs of agents drawn from the population. The convention\nconsists of symbolic labels (word forms) associated to concept representations\n(word meanings) that are grounded in a continuous feature space. The concept\nrepresentations of each agent are individually constructed yet compatible on a\ncommunicative level. 
Through a range of experiments, we show (i) that the\nmethodology enables a population to converge on a communicatively effective,\ncoherent and human-interpretable linguistic convention, (ii) that it is\nnaturally robust against sensor defects in individual agents, (iii) that it can\neffectively deal with noisy observations, uncalibrated sensors and\nheteromorphic populations, (iv) that the method is adequate for continual\nlearning, and (v) that the convention self-adapts to changes in the environment\nand communicative needs of the agents.\n","authors":["Jérôme Botoko Ekila","Jens Nevens","Lara Verheyen","Katrien Beuls","Paul Van Eecke"],"pdf_url":"https://arxiv.org/pdf/2401.08461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08460v1","updated":"2024-01-16T16:09:56Z","published":"2024-01-16T16:09:56Z","title":"Reinforcement Learning for Conversational Question Answering over\n Knowledge Graph","summary":" Conversational question answering (ConvQA) over law knowledge bases (KBs)\ninvolves answering multi-turn natural language questions about law and finding\nanswers in the law knowledge base. Although many methods have been\nproposed, existing law knowledge base ConvQA models assume that the input\nquestion is clear and perfectly reflects the user's intention. However, in the\nreal world, input questions are noisy and inexplicit. This makes it hard for the\nmodel to find the correct answer in the law knowledge base. In this paper, we try to\nuse reinforcement learning to solve this problem. The reinforcement learning\nagent can automatically learn how to find the answer based on the input\nquestion and the conversation history, even when the input question is\ninexplicit. We test the proposed method on several real-world datasets and the\nresults show the effectiveness of the proposed model.\n","authors":["Mi Wu"],"pdf_url":"https://arxiv.org/pdf/2401.08460v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17482v2","updated":"2024-01-16T16:03:31Z","published":"2023-12-29T06:05:19Z","title":"MosaicBERT: A Bidirectional Encoder Optimized for Fast Pretraining","summary":" Although BERT-style encoder models are heavily used in NLP research, many\nresearchers do not pretrain their own BERTs from scratch due to the high cost\nof training. In the past half-decade since BERT first rose to prominence, many\nadvances have been made with other transformer architectures and training\nconfigurations that have yet to be systematically incorporated into BERT. Here,\nwe introduce MosaicBERT, a BERT-style encoder architecture and training recipe\nthat is empirically optimized for fast pretraining. This efficient architecture\nincorporates FlashAttention, Attention with Linear Biases (ALiBi), Gated Linear\nUnits (GLU), a module to dynamically remove padded tokens, and low precision\nLayerNorm into the classic transformer encoder block. The training recipe\nincludes a 30% masking ratio for the Masked Language Modeling (MLM) objective,\nbfloat16 precision, and vocabulary size optimized for GPU throughput, in\naddition to best-practices from RoBERTa and other encoder models. When\npretrained from scratch on the C4 dataset, this base model achieves a\ndownstream average GLUE (dev) score of 79.6 in 1.13 hours on 8 A100 80 GB GPUs\nat a cost of roughly $20. We plot extensive accuracy vs. pretraining speed\nPareto curves and show that MosaicBERT base and large are consistently Pareto\noptimal when compared to a competitive BERT base and large. 
This empirical\nspeed-up in pretraining enables researchers and engineers to pretrain custom\nBERT-style models at low cost instead of finetuning existing generic models.\nWe open source our model weights and code.\n","authors":["Jacob Portes","Alex Trott","Sam Havens","Daniel King","Abhinav Venigalla","Moin Nadeem","Nikhil Sardana","Daya Khudia","Jonathan Frankle"],"pdf_url":"https://arxiv.org/pdf/2312.17482v2.pdf","comment":"10 pages, 4 figures in main text. 25 pages total"},{"id":"http://arxiv.org/abs/2305.16801v3","updated":"2024-01-16T15:56:35Z","published":"2023-05-26T10:30:23Z","title":"Motion-Based Sign Language Video Summarization using Curvature and\n Torsion","summary":" An interesting problem in many video-based applications is the generation of\nshort synopses by selecting the most informative frames, a procedure which is\nknown as video summarization. For sign language videos the benefits of using\nthe $t$-parameterized counterpart of the curvature of the 2-D signer's wrist\ntrajectory to identify keyframes have been recently reported in the\nliterature. In this paper we extend these ideas by modeling the 3-D hand motion\nthat is extracted from each frame of the video. To this end we propose a new\ninformative function based on the $t$-parameterized curvature and torsion of\nthe 3-D trajectory. The method to characterize video frames as keyframes\ndepends on whether the motion occurs in 2-D or 3-D space. Specifically, in the\ncase of 3-D motion we look for the maxima of the harmonic mean of the curvature\nand torsion of the target's trajectory; in the planar motion case we seek\nthe maxima of the trajectory's curvature. The proposed 3-D feature is\nexperimentally evaluated in applications of sign language videos on (1)\nobjective measures using ground-truth keyframe annotations, (2) human-based\nevaluation of understanding, and (3) gloss classification, and the results\nobtained are promising.\n","authors":["Evangelos G. Sartinas","Emmanouil Z. Psarakis","Dimitrios I. Kosmopoulos"],"pdf_url":"https://arxiv.org/pdf/2305.16801v3.pdf","comment":"This work is under consideration at Pattern Recognition Letters for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2309.16540v2","updated":"2024-01-16T15:36:40Z","published":"2023-09-28T15:53:44Z","title":"Unsupervised Pretraining for Fact Verification by Language Model\n Distillation","summary":" Fact verification aims to verify a claim using evidence from a trustworthy\nknowledge base. To address this challenge, algorithms must produce features for\nevery claim that are both semantically meaningful, and compact enough to find a\nsemantic alignment with the source information. In contrast to previous work,\nwhich tackled the alignment problem by learning over annotated corpora of\nclaims and their corresponding labels, we propose SFAVEL (Self-supervised Fact\nVerification via Language Model Distillation), a novel unsupervised pretraining\nframework that leverages pre-trained language models to distil self-supervised\nfeatures into high-quality claim-fact alignments without the need for\nannotations. This is enabled by a novel contrastive loss function that\nencourages features to attain high-quality claim and evidence alignments whilst\npreserving the semantic relationships across the corpora. 
Notably, we present\nresults that achieve a new state-of-the-art on FB15k-237 (+5.3% Hits@1) and\nFEVER (+8% accuracy) with linear evaluation.\n","authors":["Adrián Bazaga","Pietro Liò","Gos Micklem"],"pdf_url":"https://arxiv.org/pdf/2309.16540v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08429v1","updated":"2024-01-16T15:16:34Z","published":"2024-01-16T15:16:34Z","title":"Machine Translation with Large Language Models: Prompt Engineering for\n Persian, English, and Russian Directions","summary":" Generative large language models (LLMs) have demonstrated exceptional\nproficiency in various natural language processing (NLP) tasks, including\nmachine translation, question answering, text summarization, and natural\nlanguage understanding.\n To further enhance the performance of LLMs in machine translation, we\nconducted an investigation into two popular prompting methods and their\ncombination, focusing on cross-language combinations of Persian, English, and\nRussian. We employed n-shot feeding and tailored prompting frameworks. Our\nfindings indicate that multilingual LLMs like PaLM exhibit human-like machine\ntranslation outputs, enabling superior fine-tuning of desired translation\nnuances in accordance with style guidelines and linguistic considerations.\nThese models also excel in processing and applying prompts. However, the choice\nof language model, machine translation task, and the specific source and target\nlanguages necessitate certain considerations when adopting prompting frameworks\nand utilizing n-shot in-context learning.\n Furthermore, we identified errors and limitations inherent in popular LLMs as\nmachine translation tools and categorized them based on various linguistic\nmetrics. This typology of errors provides valuable insights for utilizing LLMs\neffectively and offers methods for designing prompts for in-context learning.\nOur report aims to contribute to the advancement of machine translation with\nLLMs by improving both the accuracy and reliability of evaluation metrics.\n","authors":["Nooshin Pourkamali","Shler Ebrahim Sharifi"],"pdf_url":"https://arxiv.org/pdf/2401.08429v1.pdf","comment":"34 pages, 46 figures"},{"id":"http://arxiv.org/abs/2401.08420v1","updated":"2024-01-16T15:07:09Z","published":"2024-01-16T15:07:09Z","title":"Ask the experts: sourcing high-quality datasets for nutritional\n counselling through Human-AI collaboration","summary":" Large Language Models (LLMs), with their flexible generation abilities, can\nbe powerful data sources in domains with few or no available corpora. However,\nproblems like hallucinations and biases limit such applications. In this case\nstudy, we pick nutrition counselling, a domain lacking any public resource, and\nshow that high-quality datasets can be gathered by combining LLMs,\ncrowd-workers and nutrition experts. We first crowd-source and cluster a novel\ndataset of diet-related issues, then work with experts to prompt ChatGPT into\nproducing related supportive text. Finally, we let the experts evaluate the\nsafety of the generated text. We release HAI-coaching, the first\nexpert-annotated nutrition counselling dataset containing ~2.4K dietary\nstruggles from crowd workers, and ~97K related supportive texts generated by\nChatGPT. 
Extensive analysis shows that ChatGPT while producing highly fluent\nand human-like text, also manifests harmful behaviours, especially in sensitive\ntopics like mental health, making it unsuitable for unsupervised use.\n","authors":["Simone Balloccu","Ehud Reiter","Vivek Kumar","Diego Reforgiato Recupero","Daniele Riboni"],"pdf_url":"https://arxiv.org/pdf/2401.08420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08417v1","updated":"2024-01-16T15:04:51Z","published":"2024-01-16T15:04:51Z","title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM\n Performance in Machine Translation","summary":" Moderate-sized large language models (LLMs) -- those with 7B or 13B\nparameters -- exhibit promising machine translation (MT) performance. However,\neven the top-performing 13B LLM-based translation models, like ALMA, does not\nmatch the performance of state-of-the-art conventional encoder-decoder\ntranslation models or larger-scale LLMs such as GPT-4. In this study, we bridge\nthis performance gap. We first assess the shortcomings of supervised\nfine-tuning for LLMs in the MT task, emphasizing the quality issues present in\nthe reference data, despite being human-generated. Then, in contrast to SFT\nwhich mimics reference translations, we introduce Contrastive Preference\nOptimization (CPO), a novel approach that trains models to avoid generating\nadequate but not perfect translations. Applying CPO to ALMA models with only\n22K parallel sentences and 12M parameters yields significant improvements. The\nresulting model, called ALMA-R, can match or exceed the performance of the WMT\ncompetition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.\n","authors":["Haoran Xu","Amr Sharaf","Yunmo Chen","Weiting Tan","Lingfeng Shen","Benjamin Van Durme","Kenton Murray","Young Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08417v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08406v1","updated":"2024-01-16T14:44:47Z","published":"2024-01-16T14:44:47Z","title":"RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on\n Agriculture","summary":" There are two common ways in which developers are incorporating proprietary\nand domain-specific data when building applications of Large Language Models\n(LLMs): Retrieval-Augmented Generation (RAG) and Fine-Tuning. RAG augments the\nprompt with the external data, while fine-Tuning incorporates the additional\nknowledge into the model itself. However, the pros and cons of both approaches\nare not well understood. In this paper, we propose a pipeline for fine-tuning\nand RAG, and present the tradeoffs of both for multiple popular LLMs, including\nLlama2-13B, GPT-3.5, and GPT-4. Our pipeline consists of multiple stages,\nincluding extracting information from PDFs, generating questions and answers,\nusing them for fine-tuning, and leveraging GPT-4 for evaluating the results. We\npropose metrics to assess the performance of different stages of the RAG and\nfine-Tuning pipeline. We conduct an in-depth study on an agricultural dataset.\nAgriculture as an industry has not seen much penetration of AI, and we study a\npotentially disruptive application - what if we could provide location-specific\ninsights to a farmer? Our results show the effectiveness of our dataset\ngeneration pipeline in capturing geographic-specific knowledge, and the\nquantitative and qualitative benefits of RAG and fine-tuning. We see an\naccuracy increase of over 6 p.p. 
when fine-tuning the model and this is\ncumulative with RAG, which increases accuracy by 5 p.p. further. In one\nparticular experiment, we also demonstrate that the fine-tuned model leverages\ninformation from across geographies to answer specific questions, increasing\nanswer similarity from 47% to 72%. Overall, the results point to how systems\nbuilt using LLMs can be adapted to respond and incorporate knowledge across a\ndimension that is critical for a specific industry, paving the way for further\napplications of LLMs in other industrial domains.\n","authors":["Aman Gupta","Anup Shirgaonkar","Angels de Luis Balaguer","Bruno Silva","Daniel Holstein","Dawei Li","Jennifer Marsman","Leonardo O. Nunes","Mahsa Rouzbahman","Morris Sharp","Nick Mecklenburg","Rafael Padilha","Ranveer Chandra","Renato Luiz de Freitas Cunha","Roberto de M. Estevão Filho","Ryan Tsang","Sara Malvar","Swati Sharma","Todd Hendry","Vijay Aski","Vijetha Vijayendran","Vinamra Benara"],"pdf_url":"https://arxiv.org/pdf/2401.08406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05596v2","updated":"2024-01-16T14:42:45Z","published":"2024-01-11T00:03:36Z","title":"POMP: Probability-driven Meta-graph Prompter for LLMs in Low-resource\n Unsupervised Neural Machine Translation","summary":" Low-resource languages (LRLs) face challenges in supervised neural machine\ntranslation due to limited parallel data, prompting research into unsupervised\nmethods. Unsupervised neural machine translation (UNMT) methods, including\nback-translation, transfer learning, and pivot-based translation, offer\npractical solutions for LRL translation, but they are hindered by issues like\nsynthetic data noise, language bias, and error propagation, which can\npotentially be mitigated by Large Language Models (LLMs). LLMs have advanced\nNMT with in-context learning (ICL) and supervised fine-tuning methods, but\ninsufficient training data results in poor performance in LRLs. We argue that\nLLMs can mitigate the linguistic noise with auxiliary languages to improve\ntranslations in LRLs. In this paper, we propose Probability-driven Meta-graph\nPrompter (POMP), a novel approach employing a dynamic, sampling-based graph of\nmultiple auxiliary languages to enhance LLMs' translation capabilities for\nLRLs. POMP involves constructing a directed acyclic meta-graph for each source\nlanguage, from which we dynamically sample multiple paths to prompt LLMs to\nmitigate the linguistic noise and improve translations during training. We use\nthe BLEURT metric to evaluate the translations and back-propagate rewards,\nestimated by scores, to update the probabilities of auxiliary languages in the\npaths. Our experiments show significant improvements in the translation quality\nof three LRLs, demonstrating the effectiveness of our approach.\n","authors":["Shilong Pan","Zhiliang Tian","Liang Ding","Zhen Huang","Zhihua Wen","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2401.05596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08396v1","updated":"2024-01-16T14:41:20Z","published":"2024-01-16T14:41:20Z","title":"Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine","summary":" Recent studies indicate that Generative Pre-trained Transformer 4 with Vision\n(GPT-4V) outperforms human physicians in medical challenge tasks. However,\nthese evaluations primarily focused on the accuracy of multi-choice questions\nalone. 
Our study extends the current scope by conducting a comprehensive\nanalysis of GPT-4V's rationales of image comprehension, recall of medical\nknowledge, and step-by-step multimodal reasoning when solving New England\nJournal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test\nthe knowledge and diagnostic capabilities of medical professionals. Evaluation\nresults confirmed that GPT-4V outperforms human physicians regarding\nmulti-choice accuracy (88.0% vs. 77.0%, p=0.034). GPT-4V also performs well in\ncases where physicians incorrectly answer, with over 80% accuracy. However, we\ndiscovered that GPT-4V frequently presents flawed rationales in cases where it\nmakes the correct final choices (27.3%), most prominent in image comprehension\n(21.6%). Regardless of GPT-4V's high accuracy in multi-choice questions, our\nfindings emphasize the necessity for further in-depth evaluations of its\nrationales before integrating such models into clinical workflows.\n","authors":["Qiao Jin","Fangyuan Chen","Yiliang Zhou","Ziyang Xu","Justin M. Cheung","Robert Chen","Ronald M. Summers","Justin F. Rousseau","Peiyun Ni","Marc J Landsman","Sally L. Baxter","Subhi J. Al'Aref","Yijia Li","Michael F. Chiang","Yifan Peng","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.08396v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08392v1","updated":"2024-01-16T14:33:09Z","published":"2024-01-16T14:33:09Z","title":"DoraemonGPT: Toward Understanding Dynamic Scenes with Large Language\n Models","summary":" The field of AI agents is advancing at an unprecedented rate due to the\ncapabilities of large language models (LLMs). However, LLM-driven visual agents\nmainly focus on solving tasks for the image modality, which limits their\nability to understand the dynamic nature of the real world, making it still far\nfrom real-life applications, e.g., guiding students in laboratory experiments\nand identifying their mistakes. Considering the video modality better reflects\nthe ever-changing and perceptually intensive nature of real-world scenarios, we\ndevise DoraemonGPT, a comprehensive and conceptually elegant system driven by\nLLMs to handle dynamic video tasks. Given a video with a question/task,\nDoraemonGPT begins by converting the input video with massive content into a\nsymbolic memory that stores \\textit{task-related} attributes. This structured\nrepresentation allows for spatial-temporal querying and reasoning by sub-task\ntools, resulting in concise and relevant intermediate results. Recognizing that\nLLMs have limited internal knowledge when it comes to specialized domains\n(e.g., analyzing the scientific principles underlying experiments), we\nincorporate plug-and-play tools to assess external knowledge and address tasks\nacross different domains. Moreover, we introduce a novel LLM-driven planner\nbased on Monte Carlo Tree Search to efficiently explore the large planning\nspace for scheduling various tools. The planner iteratively finds feasible\nsolutions by backpropagating the result's reward, and multiple solutions can be\nsummarized into an improved final answer. 
We extensively evaluate DoraemonGPT\nin dynamic scenes and provide in-the-wild showcases demonstrating its ability\nto handle more complex questions than previous studies.\n","authors":["Zongxin Yang","Guikun Chen","Xiaodi Li","Wenguan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08392v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14053v2","updated":"2024-01-16T14:03:10Z","published":"2023-10-21T16:14:56Z","title":"Beyond Accuracy: Evaluating Self-Consistency of Code Large Language\n Models with IdentityChain","summary":" Code Large Language Models (Code LLMs) are being increasingly employed in\nreal-life applications, so evaluating them is critical. While the conventional\naccuracy evaluates the performance of Code LLMs on a set of individual tasks,\ntheir self-consistency across different tasks is overlooked. Intuitively, a\ntrustworthy model should be self-consistent when generating natural language\nspecifications for its own code and generating code for its own specifications.\nFailure to preserve self-consistency reveals a lack of understanding of the\nshared semantics underlying natural language and programming language, and\ntherefore undermines the trustworthiness of a model. In this paper, we first\nformally define the self-consistency of Code LLMs and then design a framework,\nIdentityChain, which effectively and efficiently evaluates the self-consistency\nand conventional accuracy of a model at the same time. We study eleven Code\nLLMs and show that they fail to preserve self-consistency, which is indeed a\ndistinct aspect from conventional accuracy. Furthermore, we show that\nIdentityChain can be used as a model debugging tool to expose weaknesses of\nCode LLMs by demonstrating three major weaknesses that we identify in current\nmodels using IdentityChain. Our code is available at\nhttps://github.com/marcusm117/IdentityChain.\n","authors":["Marcus J. Min","Yangruibo Ding","Luca Buratti","Saurabh Pujar","Gail Kaiser","Suman Jana","Baishakhi Ray"],"pdf_url":"https://arxiv.org/pdf/2310.14053v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2310.05628v3","updated":"2024-01-16T14:02:07Z","published":"2023-10-09T11:34:41Z","title":"Glitter or Gold? Deriving Structured Insights from Sustainability\n Reports via Large Language Models","summary":" Over the last decade, several regulatory bodies have started requiring the\ndisclosure of non-financial information from publicly listed companies, in\nlight of the investors' increasing attention to Environmental, Social, and\nGovernance (ESG) issues. Publicly released information on sustainability\npractices is often disclosed in diverse, unstructured, and multi-modal\ndocumentation. This poses a challenge in efficiently gathering and aligning the\ndata into a unified framework to derive insights related to Corporate Social\nResponsibility (CSR). Thus, using Information Extraction (IE) methods becomes\nan intuitive choice for delivering insightful and actionable data to\nstakeholders. In this study, we employ Large Language Models (LLMs), In-Context\nLearning, and the Retrieval-Augmented Generation (RAG) paradigm to extract\nstructured insights related to ESG aspects from companies' sustainability\nreports. We then leverage graph-based representations to conduct statistical\nanalyses concerning the extracted insights. 
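The self-consistency check in the IdentityChain entry above amounts to a code-to-specification-to-code round trip. A minimal sketch, assuming a hypothetical `model.generate` text-completion wrapper, is:

```python
def round_trip(model, code, steps=2):
    # Alternate code -> natural-language spec -> code and keep the chain.
    # Semantic drift between successive programs signals a self-consistency failure.
    chain = [code]
    current = code
    for _ in range(steps):
        spec = model.generate("Describe precisely what this function does:\n" + current)
        current = model.generate("Write a Python function that does the following:\n" + spec)
        chain.append(current)
    return chain
```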
These analyses revealed that ESG\ncriteria cover a wide range of topics, exceeding 500, often beyond those\nconsidered in existing categorizations, and are addressed by companies through\na variety of initiatives. Moreover, disclosure similarities emerged among\ncompanies from the same region or sector, validating ongoing hypotheses in the\nESG literature. Lastly, by incorporating additional company attributes into our\nanalyses, we investigated which factors impact the most on companies' ESG\nratings, showing that ESG disclosure affects the obtained ratings more than\nother financial or company data.\n","authors":["Marco Bronzini","Carlo Nicolini","Bruno Lepri","Andrea Passerini","Jacopo Staiano"],"pdf_url":"https://arxiv.org/pdf/2310.05628v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14324v2","updated":"2024-01-16T13:53:56Z","published":"2023-09-25T17:52:09Z","title":"Towards General-Purpose Text-Instruction-Guided Voice Conversion","summary":" This paper introduces a novel voice conversion (VC) model, guided by text\ninstructions such as \"articulate slowly with a deep tone\" or \"speak in a\ncheerful boyish voice\". Unlike traditional methods that rely on reference\nutterances to determine the attributes of the converted speech, our model adds\nversatility and specificity to voice conversion. The proposed VC model is a\nneural codec language model which processes a sequence of discrete codes,\nresulting in the code sequence of converted speech. It utilizes text\ninstructions as style prompts to modify the prosody and emotional information\nof the given speech. In contrast to previous approaches, which often rely on\nemploying separate encoders like prosody and content encoders to handle\ndifferent aspects of the source speech, our model handles various information\nof speech in an end-to-end manner. Experiments have demonstrated the impressive\ncapabilities of our model in comprehending instructions and delivering\nreasonable results.\n","authors":["Chun-Yi Kuan","Chen An Li","Tsu-Yuan Hsu","Tse-Yang Lin","Ho-Lam Chung","Kai-Wei Chang","Shuo-yiin Chang","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2309.14324v2.pdf","comment":"Accepted to ASRU 2023"},{"id":"http://arxiv.org/abs/2102.01223v3","updated":"2024-01-16T11:50:49Z","published":"2021-02-01T23:11:57Z","title":"Inducing Meaningful Units from Character Sequences with Dynamic Capacity\n Slot Attention","summary":" Characters do not convey meaning, but sequences of characters do. We propose\nan unsupervised distributional method to learn the abstract meaningful units in\na sequence of characters. Rather than segmenting the sequence, our Dynamic\nCapacity Slot Attention model discovers continuous representations of the\nobjects in the sequence, extending an architecture for object discovery in\nimages. 
We train our model on different languages and evaluate the quality of\nthe obtained representations with forward and reverse probing classifiers.\nThese experiments show that our model succeeds in discovering units which are\nsimilar to those proposed previously in form, content and level of abstraction,\nand which show promise for capturing meaningful information at a higher level\nof abstraction.\n","authors":["Melika Behjati","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2102.01223v3.pdf","comment":"Accepted to TMLR 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.08357v1","updated":"2024-01-16T13:35:28Z","published":"2024-01-16T13:35:28Z","title":"SAMF: Small-Area-Aware Multi-focus Image Fusion for Object Detection","summary":" Existing multi-focus image fusion (MFIF) methods often fail to preserve the\nuncertain transition region and detect small focus areas within large defocused\nregions accurately. To address this issue, this study proposes a new\nsmall-area-aware MFIF algorithm for enhancing object detection capability.\nFirst, we enhance the pixel attributes within the small focus and boundary\nregions, which are subsequently combined with visual saliency detection to\nobtain the pre-fusion results used to discriminate the distribution of focused\npixels. To accurately ensure pixel focus, we consider the source image as a\ncombination of focused, defocused, and uncertain regions and propose a\nthree-region segmentation strategy. Finally, we design an effective pixel\nselection rule to generate segmentation decision maps and obtain the final\nfusion results. Experiments demonstrated that the proposed method can\naccurately detect small and smooth focus areas while improving object detection\nperformance, outperforming existing methods in both subjective and objective\nevaluations. The source code is available at https://github.com/ixilai/SAMF.\n","authors":["Xilai Li","Xiaosong Li","Haishu Tan","Jinyang Li"],"pdf_url":"https://arxiv.org/pdf/2401.08357v1.pdf","comment":"Accepted to International Conference on Acoustics, Speech and Signal\n Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.08345v1","updated":"2024-01-16T13:23:51Z","published":"2024-01-16T13:23:51Z","title":"Multi-view Distillation based on Multi-modal Fusion for Few-shot Action\n Recognition(CLIP-$\\mathrm{M^2}$DF)","summary":" In recent years, few-shot action recognition has attracted increasing\nattention. It generally adopts the paradigm of meta-learning. In this field,\novercoming the overlapping distribution of classes and outliers is still a\nchallenging problem based on limited samples. We believe the combination of\nMulti-modal and Multi-view can improve this issue depending on information\ncomplementarity. Therefore, we propose a method of Multi-view Distillation\nbased on Multi-modal Fusion. Firstly, a Probability Prompt Selector for the\nquery is constructed to generate probability prompt embedding based on the\ncomparison score between the prompt embeddings of the support and the visual\nembedding of the query. Secondly, we establish a Multi-view. In each view, we\nfuse the prompt embedding as consistent information with visual and the global\nor local temporal context to overcome the overlapping distribution of classes\nand outliers. Thirdly, we perform the distance fusion for the Multi-view and\nthe mutual distillation of matching ability from one to another, enabling the\nmodel to be more robust to the distribution bias. 
Our code is available at the\nURL: \\url{https://github.com/cofly2014/MDMF}.\n","authors":["Fei Guo","YiKang Wang","Han Qi","WenPing Jin","Li Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.08345v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08332v1","updated":"2024-01-16T12:53:42Z","published":"2024-01-16T12:53:42Z","title":"Generative Denoise Distillation: Simple Stochastic Noises Induce\n Efficient Knowledge Transfer for Dense Prediction","summary":" Knowledge distillation is the process of transferring knowledge from a more\npowerful large model (teacher) to a simpler counterpart (student). Numerous\ncurrent approaches involve the student imitating the knowledge of the teacher\ndirectly. However, redundancy still exists in the learned representations\nthrough these prevalent methods, which tend to learn each spatial location's\nfeatures indiscriminately. To derive a more compact representation (concept\nfeature) from the teacher, inspired by human cognition, we suggest an\ninnovative method, termed Generative Denoise Distillation (GDD), where\nstochastic noises are added to the concept feature of the student to embed them\ninto the generated instance feature from a shallow network. Then, the generated\ninstance feature is aligned with the knowledge of the instance from the\nteacher. We extensively experiment with object detection, instance\nsegmentation, and semantic segmentation to demonstrate the versatility and\neffectiveness of our method. Notably, GDD achieves new state-of-the-art\nperformance in the tasks mentioned above. We have achieved substantial\nimprovements in semantic segmentation by enhancing PspNet and DeepLabV3, both\nof which are based on ResNet-18, resulting in mIoU scores of 74.67 and 77.69,\nrespectively, surpassing their previous scores of 69.85 and 73.20 on the\nCityscapes dataset of 20 categories. The source code of GDD is available at\nhttps://github.com/ZhgLiu/GDD.\n","authors":["Zhaoge Liu","Xiaohao Xu","Yunkang Cao","Weiming Shen"],"pdf_url":"https://arxiv.org/pdf/2401.08332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08328v1","updated":"2024-01-16T12:48:52Z","published":"2024-01-16T12:48:52Z","title":"Un-Mixing Test-Time Normalization Statistics: Combatting Label Temporal\n Correlation","summary":" In an era where test-time adaptation methods increasingly rely on the nuanced\nmanipulation of batch normalization (BN) parameters, one critical assumption\noften goes overlooked: that of independently and identically distributed\n(i.i.d.) test batches with respect to unknown labels. This assumption\nculminates in biased estimates of BN statistics and jeopardizes system\nstability under non-i.i.d. conditions. This paper pioneers a departure from the\ni.i.d. paradigm by introducing a groundbreaking strategy termed \"Un-Mixing\nTest-Time Normalization Statistics\" (UnMix-TNS). UnMix-TNS re-calibrates the\ninstance-wise statistics used to normalize each instance in a batch by mixing\nit with multiple unmixed statistics components, thus inherently simulating the\ni.i.d. environment. The key lies in our innovative online unmixing procedure,\nwhich persistently refines these statistics components by drawing upon the\nclosest instances from an incoming test batch. Remarkably generic in its\ndesign, UnMix-TNS seamlessly integrates with an array of state-of-the-art\ntest-time adaptation methods and pre-trained architectures equipped with BN\nlayers. 
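A rough sketch of the statistics-mixing idea in the UnMix-TNS entry above: normalize each test instance with a blend of its own statistics and the closest stored statistics component, then refine that component online. The blending weight, the nearest-component rule, and the update momentum are illustrative assumptions rather than the paper's exact procedure.

```python
import numpy as np

def mix_normalization_stats(x, comp_mu, comp_var, alpha=0.5, momentum=0.1):
    # x: (C, H, W) feature map of one test instance
    # comp_mu, comp_var: (K, C) stored statistics components
    inst_mu = x.mean(axis=(1, 2))
    inst_var = x.var(axis=(1, 2))
    k = np.argmin(np.linalg.norm(comp_mu - inst_mu, axis=1))   # closest component
    mu = alpha * inst_mu + (1.0 - alpha) * comp_mu[k]
    var = alpha * inst_var + (1.0 - alpha) * comp_var[k]
    x_hat = (x - mu[:, None, None]) / np.sqrt(var[:, None, None] + 1e-5)
    # refine the selected component with the incoming instance statistics
    comp_mu[k] = (1.0 - momentum) * comp_mu[k] + momentum * inst_mu
    comp_var[k] = (1.0 - momentum) * comp_var[k] + momentum * inst_var
    return x_hat
```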
Empirical evaluations corroborate the robustness of UnMix-TNS under\nvaried scenarios ranging from single to continual and mixed domain shifts.\nUnMix-TNS stands out when handling test data streams with temporal correlation,\nincluding those with corrupted real-world non-i.i.d. streams, sustaining its\nefficacy even with minimal batch sizes and individual samples. Our results set\na new standard for test-time adaptation, demonstrating significant improvements\nin both stability and performance across multiple benchmarks.\n","authors":["Devavrat Tomar","Guillaume Vray","Jean-Philippe Thiran","Behzad Bozorgtabar"],"pdf_url":"https://arxiv.org/pdf/2401.08328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01352v2","updated":"2024-01-16T11:32:48Z","published":"2023-11-02T16:00:32Z","title":"Deep learning based Image Compression for Microscopy Images: An\n Empirical Study","summary":" With the fast development of modern microscopes and bioimaging techniques, an\nunprecedentedly large amount of imaging data are being generated, stored,\nanalyzed, and even shared through networks. The size of the data poses great\nchallenges for current data infrastructure. One common way to reduce the data\nsize is by image compression. This present study analyzes classic and deep\nlearning based image compression methods, and their impact on deep learning\nbased image processing models. Deep learning based label-free prediction models\n(i.e., predicting fluorescent images from bright field images) are used as an\nexample application for comparison and analysis. Effective image compression\nmethods could help reduce the data size significantly without losing necessary\ninformation, and therefore reduce the burden on data management infrastructure\nand permit fast transmission through the network for data sharing or cloud\ncomputing. To compress images in such a wanted way, multiple classical lossy\nimage compression techniques are compared to several AI-based compression\nmodels provided by and trained with the CompressAI toolbox using python. These\ndifferent compression techniques are compared in compression ratio, multiple\nimage similarity measures and, most importantly, the prediction accuracy from\nlabel-free models on compressed images. We found that AI-based compression\ntechniques largely outperform the classic ones and will minimally affect the\ndownstream label-free task in 2D cases. In the end, we hope the present study\ncould shed light on the potential of deep learning based image compression and\nthe impact of image compression on downstream deep learning based image\nanalysis models.\n","authors":["Yu Zhou","Jan Sollmann","Jianxu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.01352v2.pdf","comment":"- Update github link; - correct the author name; - update the table\n (correct some errors during calculation); - update the implementation detail\n section and the discussion section"},{"id":"http://arxiv.org/abs/2401.08281v1","updated":"2024-01-16T11:12:36Z","published":"2024-01-16T11:12:36Z","title":"The Faiss library","summary":" Vector databases manage large collections of embedding vectors. As AI\napplications are growing rapidly, so are the number of embeddings that need to\nbe stored and indexed. The Faiss library is dedicated to vector similarity\nsearch, a core functionality of vector databases. Faiss is a toolkit of\nindexing methods and related primitives used to search, cluster, compress and\ntransform vectors. 
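For readers unfamiliar with the library, a minimal Faiss usage example (exact L2 search with `IndexFlatL2`) looks like the following; the dimensionality and random data are placeholders.

```python
import numpy as np
import faiss

d = 64                                                   # embedding dimension
xb = np.random.random((10_000, d)).astype("float32")     # database vectors
xq = np.random.random((5, d)).astype("float32")          # query vectors

index = faiss.IndexFlatL2(d)           # exact (brute-force) L2 index
index.add(xb)                          # add database vectors
distances, ids = index.search(xq, 4)   # 4 nearest neighbours per query
```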
This paper first describes the tradeoff space of vector\nsearch, then the design principles of Faiss in terms of structure, approach to\noptimization and interfacing. We benchmark key features of the library and\ndiscuss a few selected applications to highlight its broad applicability.\n","authors":["Matthijs Douze","Alexandr Guzhva","Chengqi Deng","Jeff Johnson","Gergely Szilvasy","Pierre-Emmanuel Mazaré","Maria Lomeli","Lucas Hosseini","Hervé Jégou"],"pdf_url":"https://arxiv.org/pdf/2401.08281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08276v1","updated":"2024-01-16T10:58:07Z","published":"2024-01-16T10:58:07Z","title":"AesBench: An Expert Benchmark for Multimodal Large Language Models on\n Image Aesthetics Perception","summary":" With collective endeavors, multimodal large language models (MLLMs) are\nundergoing a flourishing development. However, their performances on image\naesthetics perception remain indeterminate, which is highly desired in\nreal-world applications. An obvious obstacle lies in the absence of a specific\nbenchmark to evaluate the effectiveness of MLLMs on aesthetic perception. This\nblind groping may impede the further development of more advanced MLLMs with\naesthetic perception capacity. To address this dilemma, we propose AesBench, an\nexpert benchmark aiming to comprehensively evaluate the aesthetic perception\ncapacities of MLLMs through elaborate design across dual facets. (1) We\nconstruct an Expert-labeled Aesthetics Perception Database (EAPD), which\nfeatures diversified image contents and high-quality annotations provided by\nprofessional aesthetic experts. (2) We propose a set of integrative criteria to\nmeasure the aesthetic perception abilities of MLLMs from four perspectives,\nincluding Perception (AesP), Empathy (AesE), Assessment (AesA) and\nInterpretation (AesI). Extensive experimental results underscore that the\ncurrent MLLMs only possess rudimentary aesthetic perception ability, and there\nis still a significant gap between MLLMs and humans. We hope this work can\ninspire the community to engage in deeper explorations on the aesthetic\npotentials of MLLMs. Source data will be available at\nhttps://github.com/yipoh/AesBench.\n","authors":["Yipo Huang","Quan Yuan","Xiangfei Sheng","Zhichao Yang","Haoning Wu","Pengfei Chen","Yuzhe Yang","Leida Li","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2401.08276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08275v1","updated":"2024-01-16T10:54:37Z","published":"2024-01-16T10:54:37Z","title":"Modeling Spoof Noise by De-spoofing Diffusion and its Application in\n Face Anti-spoofing","summary":" Face anti-spoofing is crucial for ensuring the security and reliability of\nface recognition systems. Several existing face anti-spoofing methods utilize\nGAN-like networks to detect presentation attacks by estimating the noise\npattern of a spoof image and recovering the corresponding genuine image. But\nGAN's limited face appearance space results in the denoised faces cannot cover\nthe full data distribution of genuine faces, thereby undermining the\ngeneralization performance of such methods. In this work, we present a\npioneering attempt to employ diffusion models to denoise a spoof image and\nrestore the genuine image. The difference between these two images is\nconsidered as the spoof noise, which can serve as a discriminative cue for face\nanti-spoofing. 
We evaluate our proposed method on several intra-testing and\ninter-testing protocols, where the experimental results showcase the\neffectiveness of our method in achieving competitive performance in terms of\nboth accuracy and generalization.\n","authors":["Bin Zhang","Xiangyu Zhu","Xiaoyu Zhang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2401.08275v1.pdf","comment":"Accepted by IJCB2023"},{"id":"http://arxiv.org/abs/2401.08272v1","updated":"2024-01-16T10:51:55Z","published":"2024-01-16T10:51:55Z","title":"Siamese Content-based Search Engine for a More Transparent Skin and\n Breast Cancer Diagnosis through Histological Imaging","summary":" Computer-Aided Diagnosis (CAD) has advanced digital pathology with Deep\nLearning (DL)-based tools to assist pathologists in decision-making.\nContent-Based Histopathological Image Retrieval (CBHIR) is a novel tool to seek\nhighly correlated patches in terms of similarity in histopathological features.\nIn this work, we propose two CBHIR approaches on breast (Breast-twins) and\nskin cancer (Skin-twins) data sets for robust and accurate patch-level\nretrieval, integrating a custom-built Siamese network as a feature extractor.\nThe proposed Siamese network is able to generalize to unseen images by\nfocusing on the similar histopathological features of the input pairs. The\nproposed CBHIR approaches are evaluated on the Breast (public) and Skin\n(private) data sets with top-K accuracy. Finding the optimal value of K is\nchallenging; moreover, as K increases, the dissimilarity between the\nquery and the returned images grows, which might mislead pathologists.\nTo the best of the authors' knowledge, this paper is the first to tackle this\nissue on histopathological images by evaluating the top-1 retrieved\nimages. The Breast-twins model achieves an F1-score of 70% at top-1,\nexceeding other state-of-the-art methods even at larger values of K such\nas 5 and 400. Skin-twins surpasses the recently proposed Convolutional Auto\nEncoder (CAE) by 67% in precision. In addition, the Skin-twins model\ntackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential\n(STUMP) to assist pathologists with retrieving the top-K images and their\ncorresponding labels. Thus, this approach can offer a more explainable CAD tool\nto pathologists in terms of transparency, trustworthiness, and reliability,\namong other characteristics.\n","authors":["Zahra Tabatabaei","Adrián Colomer","Javier Oliver Moll","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2401.08272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08263v1","updated":"2024-01-16T10:35:01Z","published":"2024-01-16T10:35:01Z","title":"Multi-Technique Sequential Information Consistency For Dynamic Visual\n Place Recognition In Changing Environments","summary":" Visual place recognition (VPR) is an essential component of robot navigation\nand localization systems that allows them to identify a place using only image\ndata. VPR is challenging due to the significant changes in a place's appearance\ndriven by different daily illumination, seasonal weather variations and diverse\nviewpoints. 
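Returning to the Siamese patch-retrieval entry above, the top-K step it evaluates can be sketched as cosine-similarity ranking over embedded patches; the choice of cosine similarity and the normalization are assumptions, since the abstract does not fix them.

```python
import numpy as np

def top_k_retrieval(query_feat, db_feats, db_labels, k=5):
    # query_feat: (D,) embedding of the query patch
    # db_feats: (N, D) embeddings of database patches; db_labels: length-N labels
    q = query_feat / (np.linalg.norm(query_feat) + 1e-8)
    db = db_feats / (np.linalg.norm(db_feats, axis=1, keepdims=True) + 1e-8)
    sims = db @ q
    order = np.argsort(-sims)[:k]
    return [(int(i), db_labels[i], float(sims[i])) for i in order]
```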
Currently, no single VPR technique excels in every environmental\ncondition, each exhibiting unique benefits and shortcomings, and therefore\ncombining multiple techniques can achieve more reliable VPR performance.\nPresent multi-method approaches either rely on online ground-truth information,\nwhich is often not available, or on brute-force technique combination,\npotentially lowering performance with high variance technique sets. Addressing\nthese shortcomings, we propose a VPR system dubbed Multi-Sequential Information\nConsistency (MuSIC) which leverages sequential information to select the most\ncohesive technique on an online per-frame basis. For each technique in a set,\nMuSIC computes their respective sequential consistencies by analysing the\nframe-to-frame continuity of their top match candidates, which are then\ndirectly compared to select the optimal technique for the current query image.\nThe use of sequential information to select between VPR methods results in an\noverall VPR performance increase across different benchmark datasets, while\navoiding the need for extra ground-truth of the runtime environment.\n","authors":["Bruno Arcanjo","Bruno Ferrarini","Michael Milford","Klaus D. McDonald-Maier","Shoaib Ehsan"],"pdf_url":"https://arxiv.org/pdf/2401.08263v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.14247"},{"id":"http://arxiv.org/abs/2401.08256v1","updated":"2024-01-16T10:18:57Z","published":"2024-01-16T10:18:57Z","title":"Multitask Learning in Minimally Invasive Surgical Vision: A Review","summary":" Minimally invasive surgery (MIS) has revolutionized many procedures and led\nto reduced recovery time and risk of patient injury. However, MIS poses\nadditional complexity and burden on surgical teams. Data-driven surgical vision\nalgorithms are thought to be key building blocks in the development of future\nMIS systems with improved autonomy. Recent advancements in machine learning and\ncomputer vision have led to successful applications in analyzing videos\nobtained from MIS with the promise of alleviating challenges in MIS videos.\nSurgical scene and action understanding encompasses multiple related tasks\nthat, when solved individually, can be memory-intensive, inefficient, and fail\nto capture task relationships. Multitask learning (MTL), a learning paradigm\nthat leverages information from multiple related tasks to improve performance\nand aid generalization, is wellsuited for fine-grained and high-level\nunderstanding of MIS data. This review provides an overview of the current\nstate-of-the-art MTL systems that leverage videos obtained from MIS. Beyond\nlisting published approaches, we discuss the benefits and limitations of these\nMTL systems. Moreover, this manuscript presents an analysis of the literature\nfor various application fields of MTL in MIS, including those with large\nmodels, highlighting notable trends, new directions of research, and\ndevelopments.\n","authors":["Oluwatosin Alabi","Tom Vercauteren","Miaojing Shi"],"pdf_url":"https://arxiv.org/pdf/2401.08256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08232v1","updated":"2024-01-16T09:33:29Z","published":"2024-01-16T09:33:29Z","title":"Multi-scale 2D Temporal Map Diffusion Models for Natural Language Video\n Localization","summary":" Natural Language Video Localization (NLVL), grounding phrases from natural\nlanguage descriptions to corresponding video segments, is a complex yet\ncritical task in video understanding. 
Despite ongoing advancements, many\nexisting solutions lack the capability to globally capture temporal dynamics of\nthe video data. In this study, we present a novel approach to NLVL that aims to\naddress this issue. Our method involves the direct generation of a global 2D\ntemporal map via a conditional denoising diffusion process, based on the input\nvideo and language query. The main challenges are the inherent sparsity and\ndiscontinuity of a 2D temporal map in devising the diffusion decoder. To\naddress these challenges, we introduce a multi-scale technique and develop an\ninnovative diffusion decoder. Our approach effectively encapsulates the\ninteraction between the query and video data across various time scales.\nExperiments on the Charades and DiDeMo datasets underscore the potency of our\ndesign.\n","authors":["Chongzhi Zhang","Mingyuan Zhang","Zhiyang Teng","Jiayi Li","Xizhou Zhu","Lewei Lu","Ziwei Liu","Aixin Sun"],"pdf_url":"https://arxiv.org/pdf/2401.08232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14153v4","updated":"2024-01-16T08:57:11Z","published":"2023-06-25T07:40:39Z","title":"DomainStudio: Fine-Tuning Diffusion Models for Domain-Driven Image\n Generation using Limited Data","summary":" Denoising diffusion probabilistic models (DDPMs) have been proven capable of\nsynthesizing high-quality images with remarkable diversity when trained on\nlarge amounts of data. Typical diffusion models and modern large-scale\nconditional generative models like text-to-image generative models are\nvulnerable to overfitting when fine-tuned on extremely limited data. Existing\nworks have explored subject-driven generation using a reference set containing\na few images. However, few prior works explore DDPM-based domain-driven\ngeneration, which aims to learn the common features of target domains while\nmaintaining diversity. This paper proposes a novel DomainStudio approach to\nadapt DDPMs pre-trained on large-scale source datasets to target domains using\nlimited data. It is designed to keep the diversity of subjects provided by\nsource domains and get high-quality and diverse adapted samples in target\ndomains. We propose to keep the relative distances between adapted samples to\nachieve considerable generation diversity. In addition, we further enhance the\nlearning of high-frequency details for better generation quality. Our approach\nis compatible with both unconditional and conditional diffusion models. This\nwork makes the first attempt to realize unconditional few-shot image generation\nwith diffusion models, achieving better quality and greater diversity than\ncurrent state-of-the-art GAN-based approaches. Moreover, this work also\nsignificantly relieves overfitting for conditional generation and realizes\nhigh-quality domain-driven generation, further expanding the applicable\nscenarios of modern large-scale text-to-image models.\n","authors":["Jingyuan Zhu","Huimin Ma","Jiansheng Chen","Jian Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.14153v4.pdf","comment":"extended from DDPM-PA (arXiv:2211.03264), 33 pages, 34 figures. arXiv\n admin note: substantial text overlap with arXiv:2211.03264"},{"id":"http://arxiv.org/abs/2401.08212v1","updated":"2024-01-16T08:56:52Z","published":"2024-01-16T08:56:52Z","title":"Human vs. 
LMMs: Exploring the Discrepancy in Emoji Interpretation and\n Usage in Digital Communication","summary":" Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when\nprocessing multimodal information, especially in the context of social media,\nhas garnered immense interest due to its broad potential and far-reaching\nimplications. Emojis, as one of the most unique aspects of digital\ncommunication, are pivotal in enriching and often clarifying the emotional and\ntonal dimensions. Yet, there is a notable gap in understanding how these\nadvanced models, such as GPT-4V, interpret and employ emojis in the nuanced\ncontext of online interaction. This study intends to bridge this gap by\nexamining the behavior of GPT-4V in replicating human-like use of emojis. The\nfindings reveal a discernible discrepancy between human and GPT-4V behaviors,\nlikely due to the subjective nature of human interpretation and the limitations\nof GPT-4V's English-centric training, suggesting cultural biases and inadequate\nrepresentation of non-English cultures.\n","authors":["Hanjia Lyu","Weihong Qi","Zhongyu Wei","Jiebo Luo"],"pdf_url":"https://arxiv.org/pdf/2401.08212v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08210v1","updated":"2024-01-16T08:54:21Z","published":"2024-01-16T08:54:21Z","title":"ModelNet-O: A Large-Scale Synthetic Dataset for Occlusion-Aware Point\n Cloud Classification","summary":" Recently, 3D point cloud classification has made significant progress with\nthe help of many datasets. However, these datasets do not reflect the\nincomplete nature of real-world point clouds caused by occlusion, which limits\nthe practical application of current methods. To bridge this gap, we propose\nModelNet-O, a large-scale synthetic dataset of 123,041 samples that emulate\nreal-world point clouds with self-occlusion caused by scanning from monocular\ncameras. ModelNet-O is 10 times larger than existing datasets and offers more\nchallenging cases to evaluate the robustness of existing methods. Our\nobservation on ModelNet-O reveals that well-designed sparse structures can\npreserve structural information of point clouds under occlusion, motivating us\nto propose a robust point cloud processing method that leverages a critical\npoint sampling (CPS) strategy in a multi-level manner. We term our method\nPointMLS. Through extensive experiments, we demonstrate that our PointMLS\nachieves state-of-the-art results on ModelNet-O and competitive results on\nregular datasets, and it is robust and effective. More experiments also\ndemonstrate the robustness and effectiveness of PointMLS.\n","authors":["Zhongbin Fang","Xia Li","Xiangtai Li","Shen Zhao","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08210v1.pdf","comment":"Project page: https://github.com/fanglaosi/PointMLS"},{"id":"http://arxiv.org/abs/2401.08209v1","updated":"2024-01-16T08:50:44Z","published":"2024-01-16T08:50:44Z","title":"Transcending the Limit of Local Window: Advanced Super-Resolution\n Transformer with Adaptive Token Dictionary","summary":" Single Image Super-Resolution is a classic computer vision problem that\ninvolves estimating high-resolution (HR) images from low-resolution (LR) ones.\nAlthough deep neural networks (DNNs), especially Transformers for\nsuper-resolution, have seen significant advancements in recent years,\nchallenges still remain, particularly in limited receptive field caused by\nwindow-based self-attention. 
To address these issues, we introduce a group of\nauxiliary Adapeive Token Dictionary to SR Transformer and establish an ATD-SR\nmethod. The introduced token dictionary could learn prior information from\ntraining data and adapt the learned prior to specific testing image through an\nadaptive refinement step. The refinement strategy could not only provide global\ninformation to all input tokens but also group image tokens into categories.\nBased on category partitions, we further propose a category-based\nself-attention mechanism designed to leverage distant but similar tokens for\nenhancing input features. The experimental results show that our method\nachieves the best performance on various single image super-resolution\nbenchmarks.\n","authors":["Leheng Zhang","Yawei Li","Xingyu Zhou","Xiaorui Zhao","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.08209v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.12600v2","updated":"2024-01-16T08:47:04Z","published":"2023-10-19T09:11:23Z","title":"FUSC: Fetal Ultrasound Semantic Clustering of Second Trimester Scans\n Using Deep Self-supervised Learning","summary":" Ultrasound is the primary imaging modality in clinical practice during\npregnancy. More than 140M fetuses are born yearly, resulting in numerous scans.\nThe availability of a large volume of fetal ultrasound scans presents the\nopportunity to train robust machine learning models. However, the abundance of\nscans also has its challenges, as manual labeling of each image is needed for\nsupervised methods. Labeling is typically labor-intensive and requires\nexpertise to annotate the images accurately. This study presents an\nunsupervised approach for automatically clustering ultrasound images into a\nlarge range of fetal views, reducing or eliminating the need for manual\nlabeling. Our Fetal Ultrasound Semantic Clustering (FUSC) method is developed\nusing a large dataset of 88,063 images and further evaluated on an additional\nunseen dataset of 8,187 images achieving over 92% clustering purity. The result\nof our investigation hold the potential to significantly impact the field of\nfetal ultrasound imaging and pave the way for more advanced automated labeling\nsolutions. Finally, we make the code and the experimental setup publicly\navailable to help advance the field.\n","authors":["Hussain Alasmawi","Leanne Bricker","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2310.12600v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07327v2","updated":"2024-01-16T08:40:21Z","published":"2023-12-12T14:43:09Z","title":"Adaptive Confidence Multi-View Hashing for Multimedia Retrieval","summary":" The multi-view hash method converts heterogeneous data from multiple views\ninto binary hash codes, which is one of the critical technologies in multimedia\nretrieval. However, the current methods mainly explore the complementarity\namong multiple views while lacking confidence learning and fusion. Moreover, in\npractical application scenarios, the single-view data contain redundant noise.\nTo conduct the confidence learning and eliminate unnecessary noise, we propose\na novel Adaptive Confidence Multi-View Hashing (ACMVH) method. First, a\nconfidence network is developed to extract useful information from various\nsingle-view features and remove noise information. Furthermore, an adaptive\nconfidence multi-view network is employed to measure the confidence of each\nview and then fuse multi-view features through a weighted summation. 
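The confidence-weighted fusion in the ACMVH entry above can be sketched as below; the per-view confidence head and the normalization are illustrative, not the paper's exact architecture.

```python
import torch
import torch.nn as nn

class ConfidenceWeightedFusion(nn.Module):
    # Fuse multi-view features by a learned per-view confidence weight.
    def __init__(self, dim, num_views):
        super().__init__()
        self.confidence = nn.ModuleList(
            [nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid()) for _ in range(num_views)]
        )

    def forward(self, views):                   # views: list of (B, dim) tensors
        weights = [conf(v) for conf, v in zip(self.confidence, views)]   # (B, 1) each
        total = torch.stack(weights, dim=0).sum(dim=0) + 1e-8
        fused = sum(w * v for w, v in zip(weights, views)) / total       # weighted sum
        return fused
```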
Lastly, a\ndilation network is designed to further enhance the feature representation of\nthe fused features. To the best of our knowledge, we pioneer the application of\nconfidence learning into the field of multimedia retrieval. Extensive\nexperiments on two public datasets show that the proposed ACMVH performs better\nthan state-of-the-art methods (maximum increase of 3.24%). The source code is\navailable at https://github.com/HackerHyper/ACMVH.\n","authors":["Jian Zhu","Yu Cui","Zhangmin Huang","Xingyu Li","Lei Liu","Lingfang Zeng","Li-Rong Dai"],"pdf_url":"https://arxiv.org/pdf/2312.07327v2.pdf","comment":"accepted by International Conference on Acoustics, Speech and Signal\n Processing 2024(ICASSP2024)"},{"id":"http://arxiv.org/abs/2312.14198v2","updated":"2024-01-16T08:18:08Z","published":"2023-12-21T01:56:34Z","title":"ZeroShape: Regression-based Zero-shot Shape Reconstruction","summary":" We study the problem of single-image zero-shot 3D shape reconstruction.\nRecent works learn zero-shot shape reconstruction through generative modeling\nof 3D assets, but these models are computationally expensive at train and\ninference time. In contrast, the traditional approach to this problem is\nregression-based, where deterministic models are trained to directly regress\nthe object shape. Such regression methods possess much higher computational\nefficiency than generative methods. This raises a natural question: is\ngenerative modeling necessary for high performance, or conversely, are\nregression-based approaches still competitive? To answer this, we design a\nstrong regression-based model, called ZeroShape, based on the converging\nfindings in this field and a novel insight. We also curate a large real-world\nevaluation benchmark, with objects from three different real-world 3D datasets.\nThis evaluation benchmark is more diverse and an order of magnitude larger than\nwhat prior works use to quantitatively evaluate their models, aiming at\nreducing the evaluation variance in our field. We show that ZeroShape not only\nachieves superior performance over state-of-the-art methods, but also\ndemonstrates significantly higher computational and data efficiency.\n","authors":["Zixuan Huang","Stefan Stojanov","Anh Thai","Varun Jampani","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2312.14198v2.pdf","comment":"Project page: https://zixuanh.com/projects/zeroshape.html"},{"id":"http://arxiv.org/abs/2401.08194v1","updated":"2024-01-16T08:16:10Z","published":"2024-01-16T08:16:10Z","title":"End-to-End Optimized Image Compression with the Frequency-Oriented\n Transform","summary":" Image compression constitutes a significant challenge amidst the era of\ninformation explosion. Recent studies employing deep learning methods have\ndemonstrated the superior performance of learning-based image compression\nmethods over traditional codecs. However, an inherent challenge associated with\nthese methods lies in their lack of interpretability. Following an analysis of\nthe varying degrees of compression degradation across different frequency\nbands, we propose the end-to-end optimized image compression model facilitated\nby the frequency-oriented transform. The proposed end-to-end image compression\nmodel consists of four components: spatial sampling, frequency-oriented\ntransform, entropy estimation, and frequency-aware fusion. The\nfrequency-oriented transform separates the original image signal into distinct\nfrequency bands, aligning with the human-interpretable concept. 
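As a toy stand-in for the learned frequency-oriented transform described above, the snippet below splits a grayscale image into low, mid, and high frequency bands with radial FFT masks; transmitting only a subset of bands then gives a crude form of scalable coding. The cutoff values are arbitrary assumptions.

```python
import numpy as np

def split_frequency_bands(img, cutoffs=(0.1, 0.3)):
    # img: 2-D grayscale array; returns [low, mid, high] spatial-domain bands
    f = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.mgrid[-h // 2:h - h // 2, -w // 2:w - w // 2]
    radius = np.sqrt((yy / (h / 2)) ** 2 + (xx / (w / 2)) ** 2)   # normalized frequency
    bounds = (0.0,) + tuple(cutoffs) + (np.inf,)
    bands = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        mask = (radius >= lo) & (radius < hi)
        bands.append(np.real(np.fft.ifft2(np.fft.ifftshift(f * mask))))
    return bands
```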
Leveraging the\nnon-overlapping hypothesis, the model enables scalable coding through the\nselective transmission of arbitrary frequency components. Extensive experiments\nare conducted to demonstrate that our model outperforms all traditional codecs\nincluding next-generation standard H.266/VVC on MS-SSIM metric. Moreover,\nvisual analysis tasks (i.e., object detection and semantic segmentation) are\nconducted to verify the proposed compression method could preserve semantic\nfidelity besides signal-level precision.\n","authors":["Yuefeng Zhang","Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2401.08194v1.pdf","comment":"25 pages, accepted by MVAP"},{"id":"http://arxiv.org/abs/2305.20089v2","updated":"2024-01-16T08:10:46Z","published":"2023-05-31T17:59:26Z","title":"Learning Explicit Contact for Implicit Reconstruction of Hand-held\n Objects from Monocular Images","summary":" Reconstructing hand-held objects from monocular RGB images is an appealing\nyet challenging task. In this task, contacts between hands and objects provide\nimportant cues for recovering the 3D geometry of the hand-held objects. Though\nrecent works have employed implicit functions to achieve impressive progress,\nthey ignore formulating contacts in their frameworks, which results in\nproducing less realistic object meshes. In this work, we explore how to model\ncontacts in an explicit way to benefit the implicit reconstruction of hand-held\nobjects. Our method consists of two components: explicit contact prediction and\nimplicit shape reconstruction. In the first part, we propose a new subtask of\ndirectly estimating 3D hand-object contacts from a single image. The part-level\nand vertex-level graph-based transformers are cascaded and jointly learned in a\ncoarse-to-fine manner for more accurate contact probabilities. In the second\npart, we introduce a novel method to diffuse estimated contact states from the\nhand mesh surface to nearby 3D space and leverage diffused contact\nprobabilities to construct the implicit neural representation for the\nmanipulated object. Benefiting from estimating the interaction patterns between\nthe hand and the object, our method can reconstruct more realistic object\nmeshes, especially for object parts that are in contact with hands. Extensive\nexperiments on challenging benchmarks show that the proposed method outperforms\nthe current state of the arts by a great margin. Our code is publicly available\nat https://junxinghu.github.io/projects/hoi.html.\n","authors":["Junxing Hu","Hongwen Zhang","Zerui Chen","Mengcheng Li","Yunlong Wang","Yebin Liu","Zhenan Sun"],"pdf_url":"https://arxiv.org/pdf/2305.20089v2.pdf","comment":"Accepted to AAAI 2024.Code and model available at\n https://junxinghu.github.io/projects/hoi.html"},{"id":"http://arxiv.org/abs/2311.04071v3","updated":"2024-01-16T08:04:14Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. 
To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves the\nstate-of-the-art performance over single-step non-adversarial generation. Our\ncode is available at https://github.com/DJ-LYH/EC-VAE.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v3.pdf","comment":"update results"},{"id":"http://arxiv.org/abs/2401.08185v1","updated":"2024-01-16T08:01:09Z","published":"2024-01-16T08:01:09Z","title":"DPAFNet:Dual Path Attention Fusion Network for Single Image Deraining","summary":" Rainy weather will have a significant impact on the regular operation of the\nimaging system. Based on this premise, image rain removal has always been a\npopular branch of low-level visual tasks, especially methods using deep neural\nnetworks. However, most neural networks are but-branched, such as only using\nconvolutional neural networks or Transformers, which is unfavourable for the\nmultidimensional fusion of image features. In order to solve this problem, this\npaper proposes a dual-branch attention fusion network. Firstly, a two-branch\nnetwork structure is proposed. Secondly, an attention fusion module is proposed\nto selectively fuse the features extracted by the two branches rather than\nsimply adding them. Finally, complete ablation experiments and sufficient\ncomparison experiments prove the rationality and effectiveness of the proposed\nmethod.\n","authors":["Bingcai Wei"],"pdf_url":"https://arxiv.org/pdf/2401.08185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08178v1","updated":"2024-01-16T07:51:00Z","published":"2024-01-16T07:51:00Z","title":"Key-point Guided Deformable Image Manipulation Using Diffusion Model","summary":" In this paper, we introduce a Key-point-guided Diffusion probabilistic Model\n(KDM) that gains precise control over images by manipulating the object's\nkey-point. We propose a two-stage generative model incorporating an optical\nflow map as an intermediate output. By doing so, a dense pixel-wise\nunderstanding of the semantic relation between the image and sparse key point\nis configured, leading to more realistic image generation. Additionally, the\nintegration of optical flow helps regulate the inter-frame variance of\nsequential images, demonstrating an authentic sequential image generation. 
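A generic version of the selective two-branch fusion from the DPAFNet entry above (gated blending instead of plain addition) might look like this; the gating module is an assumption rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class AttentionFusion(nn.Module):
    # Blend a CNN branch and a Transformer branch with a learned per-channel gate.
    def __init__(self, channels):
        super().__init__()
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(2 * channels, channels, kernel_size=1),
            nn.Sigmoid(),
        )

    def forward(self, cnn_feat, trans_feat):   # both (B, C, H, W)
        g = self.gate(torch.cat([cnn_feat, trans_feat], dim=1))
        return g * cnn_feat + (1.0 - g) * trans_feat
```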
The\nKDM is evaluated with diverse key-point conditioned image synthesis tasks,\nincluding facial image generation, human pose synthesis, and echocardiography\nvideo prediction, demonstrating the KDM is proving consistency enhanced and\nphoto-realistic images compared with state-of-the-art models.\n","authors":["Seok-Hwan Oh","Guil Jung","Myeong-Gee Kim","Sang-Yun Kim","Young-Min Kim","Hyeon-Jik Lee","Hyuk-Sool Kwon","Hyeon-Min Bae"],"pdf_url":"https://arxiv.org/pdf/2401.08178v1.pdf","comment":"Code is released at\n https://github.com/joseph9337/Key-point-Guided-Deformable-Image-Manipulation-Using-Diffusion-Mode"},{"id":"http://arxiv.org/abs/2401.08174v1","updated":"2024-01-16T07:33:22Z","published":"2024-01-16T07:33:22Z","title":"Completely Occluded and Dense Object Instance Segmentation Using Box\n Prompt-Based Segmentation Foundation Models","summary":" Completely occluded and dense object instance segmentation (IS) is an\nimportant and challenging task. Although current amodal IS methods can predict\ninvisible regions of occluded objects, they are difficult to directly predict\ncompletely occluded objects. For dense object IS, existing box-based methods\nare overly dependent on the performance of bounding box detection. In this\npaper, we propose CFNet, a coarse-to-fine IS framework for completely occluded\nand dense objects, which is based on box prompt-based segmentation foundation\nmodels (BSMs). Specifically, CFNet first detects oriented bounding boxes (OBBs)\nto distinguish instances and provide coarse localization information. Then, it\npredicts OBB prompt-related masks for fine segmentation. To predict completely\noccluded object instances, CFNet performs IS on occluders and utilizes prior\ngeometric properties, which overcomes the difficulty of directly predicting\ncompletely occluded object instances. Furthermore, based on BSMs, CFNet reduces\nthe dependence on bounding box detection performance, improving dense object IS\nperformance. Moreover, we propose a novel OBB prompt encoder for BSMs. To make\nCFNet more lightweight, we perform knowledge distillation on it and introduce a\nGaussian smoothing method for teacher targets. Experimental results demonstrate\nthat CFNet achieves the best performance on both industrial and publicly\navailable datasets.\n","authors":["Zhen Zhou","Junfeng Fan","Yunkai Ma","Sihan Zhao","Fengshui Jing","Min Tan"],"pdf_url":"https://arxiv.org/pdf/2401.08174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08171v1","updated":"2024-01-16T07:26:26Z","published":"2024-01-16T07:26:26Z","title":"Deep Linear Array Pushbroom Image Restoration: A Degradation Pipeline\n and Jitter-Aware Restoration Network","summary":" Linear Array Pushbroom (LAP) imaging technology is widely used in the realm\nof remote sensing. However, images acquired through LAP always suffer from\ndistortion and blur because of camera jitter. Traditional methods for restoring\nLAP images, such as algorithms estimating the point spread function (PSF),\nexhibit limited performance. To tackle this issue, we propose a Jitter-Aware\nRestoration Network (JARNet), to remove the distortion and blur in two stages.\nIn the first stage, we formulate an Optical Flow Correction (OFC) block to\nrefine the optical flow of the degraded LAP images, resulting in pre-corrected\nimages where most of the distortions are alleviated. 
In the second stage, for\nfurther enhancement of the pre-corrected images, we integrate two jitter-aware\ntechniques within the Spatial and Frequency Residual (SFRes) block: 1)\nintroducing Coordinate Attention (CoA) to the SFRes block in order to capture\nthe jitter state in orthogonal direction; 2) manipulating image features in\nboth spatial and frequency domains to leverage local and global priors.\nAdditionally, we develop a data synthesis pipeline, which applies Continue\nDynamic Shooting Model (CDSM) to simulate realistic degradation in LAP images.\nBoth the proposed JARNet and LAP image synthesis pipeline establish a\nfoundation for addressing this intricate challenge. Extensive experiments\ndemonstrate that the proposed two-stage method outperforms state-of-the-art\nimage restoration models. Code is available at\nhttps://github.com/JHW2000/JARNet.\n","authors":["Zida Chen","Ziran Zhang","Haoying Li","Menghao Li","Yueting Chen","Qi Li","Huajun Feng","Zhihai Xu","Shiqi Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08171v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.08154v1","updated":"2024-01-16T06:53:03Z","published":"2024-01-16T06:53:03Z","title":"Learned Image Compression with ROI-Weighted Distortion and Bit\n Allocation","summary":" This one page paper describes our method for the track of image compression.\nTo achieve better perceptual quality, we use the adversarial loss to generate\nrealistic textures, use region of interest (ROI) mask to guide the bit\nallocation for different regions. Our Team name is TLIC.\n","authors":["Wei Jiang","Yongqi Zhai","Hangyu Li","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08154v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.08140v1","updated":"2024-01-16T06:19:18Z","published":"2024-01-16T06:19:18Z","title":"ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Process","summary":" Neural radiance fields (NeRFs) have gained popularity across various\napplications. However, they face challenges in the sparse view setting, lacking\nsufficient constraints from volume rendering. Reconstructing and understanding\na 3D scene from sparse and unconstrained cameras is a long-standing problem in\nclassical computer vision with diverse applications. While recent works have\nexplored NeRFs in sparse, unconstrained view scenarios, their focus has been\nprimarily on enhancing reconstruction and novel view synthesis. Our approach\ntakes a broader perspective by posing the question: \"from where has each point\nbeen seen?\" -- which gates how well we can understand and reconstruct it. In\nother words, we aim to determine the origin or provenance of each 3D point and\nits associated information under sparse, unconstrained views. We introduce\nProvNeRF, a model that enriches a traditional NeRF representation by\nincorporating per-point provenance, modeling likely source locations for each\npoint. We achieve this by extending implicit maximum likelihood estimation\n(IMLE) for stochastic processes. Notably, our method is compatible with any\npre-trained NeRF model and the associated training camera poses. 
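A rough sketch of processing image features in both the spatial and the frequency domain, in the spirit of the SFRes block described in the JARNet summary above. The specific layers, the FFT-based branch, and the residual combination are assumptions made here for illustration.

```python
import torch
import torch.nn as nn

class SpatialFrequencyBlock(nn.Module):
    """Combine a local (spatial) conv branch with a global (frequency) branch."""
    def __init__(self, channels: int):
        super().__init__()
        self.spatial = nn.Conv2d(channels, channels, 3, padding=1)
        # Real and imaginary parts are stacked, so the frequency conv sees 2*C channels.
        self.freq = nn.Conv2d(2 * channels, 2 * channels, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        spatial_out = self.spatial(x)                 # local prior
        f = torch.fft.rfft2(x, norm="ortho")          # global (frequency) prior
        f = torch.cat([f.real, f.imag], dim=1)
        f = self.freq(f)
        real, imag = torch.chunk(f, 2, dim=1)
        freq_out = torch.fft.irfft2(torch.complex(real, imag), s=x.shape[-2:], norm="ortho")
        return x + spatial_out + freq_out             # residual combination

if __name__ == "__main__":
    block = SpatialFrequencyBlock(16)
    print(block(torch.randn(2, 16, 64, 64)).shape)  # torch.Size([2, 16, 64, 64])
```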
We demonstrate\nthat modeling per-point provenance offers several advantages, including\nuncertainty estimation, criteria-based view selection, and improved novel view\nsynthesis, compared to state-of-the-art methods.\n","authors":["Kiyohiro Nakayama","Mikaela Angelina Uy","Yang You","Ke Li","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2401.08140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06400v2","updated":"2024-01-16T06:01:48Z","published":"2024-01-12T06:49:49Z","title":"Generalizing Visual Question Answering from Synthetic to Human-Written\n Questions via a Chain of QA with a Large Language Model","summary":" Visual question answering (VQA) is a task where an image is given, and a\nseries of questions are asked about the image. To build an efficient VQA\nalgorithm, a large amount of QA data is required which is very expensive.\nGenerating synthetic QA pairs based on templates is a practical way to obtain\ndata. However, VQA models trained on those data do not perform well on complex,\nhuman-written questions. To address this issue, we propose a new method called\n{\\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a\nsequence of QA interactions between a large language model and a VQA model\ntrained on synthetic data to reason and derive logical answers for\nhuman-written questions. We tested the effectiveness of CoQAH on two types of\nhuman-written VQA datasets for 3D-rendered and chest X-ray images and found\nthat it achieved state-of-the-art accuracy in both types of data. Notably,\nCoQAH outperformed general vision-language models, VQA models, and medical\nfoundation models with no finetuning.\n","authors":["Taehee Kim","Yeongjae Cho","Heejun Shin","Yohan Jo","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2401.06400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06506v2","updated":"2024-01-16T05:44:45Z","published":"2024-01-12T11:02:12Z","title":"Frequency Masking for Universal Deepfake Detection","summary":" We study universal deepfake detection. Our goal is to detect synthetic images\nfrom a range of generative AI approaches, particularly from emerging ones which\nare unseen during training of the deepfake detector. Universal deepfake\ndetection requires outstanding generalization capability. Motivated by recently\nproposed masked image modeling which has demonstrated excellent generalization\nin self-supervised pre-training, we make the first attempt to explore masked\nimage modeling for universal deepfake detection. We study spatial and frequency\ndomain masking in training deepfake detectors. Based on empirical analysis, we\npropose a novel deepfake detector via frequency masking. Our focus on frequency\ndomain is different from the majority, which primarily target spatial domain\ndetection. Our comparative analyses reveal substantial performance gains over\nexisting methods. Code and models are publicly available.\n","authors":["Chandler Timm Doloriel","Ngai-Man Cheung"],"pdf_url":"https://arxiv.org/pdf/2401.06506v2.pdf","comment":"Accepted to IEEE ICASSP-2024"},{"id":"http://arxiv.org/abs/2401.08123v1","updated":"2024-01-16T05:37:08Z","published":"2024-01-16T05:37:08Z","title":"The Devil is in the Details: Boosting Guided Depth Super-Resolution via\n Rethinking Cross-Modal Alignment and Aggregation","summary":" Guided depth super-resolution (GDSR) involves restoring missing depth details\nusing the high-resolution RGB image of the same scene. 
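An illustrative (not the authors') sketch of frequency-domain masking as a training-time augmentation, echoing the universal deepfake detection summary above: zero out a random subset of 2D FFT coefficients and invert the transform. The mask ratio and random mask shape are assumptions.

```python
import torch

def frequency_mask(img: torch.Tensor, mask_ratio: float = 0.15) -> torch.Tensor:
    """img: (C, H, W) float tensor; returns the image with random frequencies removed."""
    spec = torch.fft.fft2(img)
    spec = torch.fft.fftshift(spec, dim=(-2, -1))          # low frequencies to the center
    keep = (torch.rand(img.shape[-2], img.shape[-1]) > mask_ratio).to(spec.dtype)
    spec = spec * keep                                      # drop ~mask_ratio of coefficients
    spec = torch.fft.ifftshift(spec, dim=(-2, -1))
    return torch.fft.ifft2(spec).real

if __name__ == "__main__":
    x = torch.rand(3, 128, 128)
    print(frequency_mask(x).shape)  # torch.Size([3, 128, 128])
```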
Previous approaches have\nstruggled with the heterogeneity and complementarity of the multi-modal inputs,\nand neglected the issues of modal misalignment, geometrical misalignment, and\nfeature selection. In this study, we rethink some essential components in GDSR\nnetworks and propose a simple yet effective Dynamic Dual Alignment and\nAggregation network (D2A2). D2A2 mainly consists of 1) a dynamic dual alignment\nmodule that adapts to alleviate the modal misalignment via a learnable domain\nalignment block and geometrically align cross-modal features by learning the\noffset; and 2) a mask-to-pixel feature aggregate module that uses the gated\nmechanism and pixel attention to filter out irrelevant texture noise from RGB\nfeatures and combine the useful features with depth features. By combining the\nstrengths of RGB and depth features while minimizing disturbance introduced by\nthe RGB image, our method with simple reuse and redesign of basic components\nachieves state-of-the-art performance on multiple benchmark datasets. The code\nis available at https://github.com/JiangXinni/D2A2.\n","authors":["Xinni Jiang","Zengsheng Kuang","Chunle Guo","Ruixun Zhang","Lei Cai","Xiao Fan","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2401.08123v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08117v1","updated":"2024-01-16T05:10:50Z","published":"2024-01-16T05:10:50Z","title":"E2HQV: High-Quality Video Generation from Event Camera via\n Theory-Inspired Model-Aided Deep Learning","summary":" The bio-inspired event cameras or dynamic vision sensors are capable of\nasynchronously capturing per-pixel brightness changes (called event-streams) in\nhigh temporal resolution and high dynamic range. However, the non-structural\nspatial-temporal event-streams make it challenging for providing intuitive\nvisualization with rich semantic information for human vision. It calls for\nevents-to-video (E2V) solutions which take event-streams as input and generate\nhigh quality video frames for intuitive visualization. However, current\nsolutions are predominantly data-driven without considering the prior knowledge\nof the underlying statistics relating event-streams and video frames. It highly\nrelies on the non-linearity and generalization capability of the deep neural\nnetworks, thus, is struggling on reconstructing detailed textures when the\nscenes are complex. In this work, we propose \\textbf{E2HQV}, a novel E2V\nparadigm designed to produce high-quality video frames from events. This\napproach leverages a model-aided deep learning framework, underpinned by a\ntheory-inspired E2V model, which is meticulously derived from the fundamental\nimaging principles of event cameras. To deal with the issue of state-reset in\nthe recurrent components of E2HQV, we also design a temporal shift embedding\nmodule to further improve the quality of the video frames. 
Comprehensive\nevaluations on the real world event camera datasets validate our approach, with\nE2HQV, notably outperforming state-of-the-art approaches, e.g., surpassing the\nsecond best by over 40\\% for some evaluation metrics.\n","authors":["Qiang Qu","Yiran Shen","Xiaoming Chen","Yuk Ying Chung","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08117v1.pdf","comment":"Accepted in AAAI2024"},{"id":"http://arxiv.org/abs/2401.08115v1","updated":"2024-01-16T05:05:08Z","published":"2024-01-16T05:05:08Z","title":"No-Clean-Reference Image Super-Resolution: Application to Electron\n Microscopy","summary":" The inability to acquire clean high-resolution (HR) electron microscopy (EM)\nimages over a large brain tissue volume hampers many neuroscience studies. To\naddress this challenge, we propose a deep-learning-based image super-resolution\n(SR) approach to computationally reconstruct clean HR 3D-EM with a large field\nof view (FoV) from noisy low-resolution (LR) acquisition. Our contributions are\nI) Investigating training with no-clean references for $\\ell_2$ and $\\ell_1$\nloss functions; II) Introducing a novel network architecture, named EMSR, for\nenhancing the resolution of LR EM images while reducing inherent noise; and,\nIII) Comparing different training strategies including using acquired LR and HR\nimage pairs, i.e., real pairs with no-clean references contaminated with real\ncorruptions, the pairs of synthetic LR and acquired HR, as well as acquired LR\nand denoised HR pairs. Experiments with nine brain datasets showed that\ntraining with real pairs can produce high-quality super-resolved results,\ndemonstrating the feasibility of training with non-clean references for both\nloss functions. Additionally, comparable results were observed, both visually\nand numerically, when employing denoised and noisy references for training.\nMoreover, utilizing the network trained with synthetically generated LR images\nfrom HR counterparts proved effective in yielding satisfactory SR results, even\nin certain cases, outperforming training with real pairs. The proposed SR\nnetwork was compared quantitatively and qualitatively with several established\nSR techniques, showcasing either the superiority or competitiveness of the\nproposed method in mitigating noise while recovering fine details.\n","authors":["Mohammad Khateri","Morteza Ghahremani","Alejandra Sierra","Jussi Tohka"],"pdf_url":"https://arxiv.org/pdf/2401.08115v1.pdf","comment":"14 pages, 12 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2401.08111v1","updated":"2024-01-16T04:42:54Z","published":"2024-01-16T04:42:54Z","title":"Mobile Contactless Palmprint Recognition: Use of Multiscale, Multimodel\n Embeddings","summary":" Contactless palmprints are comprised of both global and local discriminative\nfeatures. Most prior work focuses on extracting global features or local\nfeatures alone for palmprint matching, whereas this research introduces a novel\nframework that combines global and local features for enhanced palmprint\nmatching accuracy. Leveraging recent advancements in deep learning, this study\nintegrates a vision transformer (ViT) and a convolutional neural network (CNN)\nto extract complementary local and global features. Next, a mobile-based,\nend-to-end palmprint recognition system is developed, referred to as Palm-ID.\nOn top of the ViT and CNN features, Palm-ID incorporates a palmprint\nenhancement module and efficient dimensionality reduction (for faster\nmatching). 
Palm-ID balances the trade-off between accuracy and latency,\nrequiring just 18ms to extract a template of size 516 bytes, which can be\nefficiently searched against a 10,000 palmprint gallery in 0.33ms on an AMD\nEPYC 7543 32-Core CPU utilizing 128-threads. Cross-database matching protocols\nand evaluations on large-scale operational datasets demonstrate the robustness\nof the proposed method, achieving a TAR of 98.06% at FAR=0.01% on a newly\ncollected, time-separated dataset. To show a practical deployment of the\nend-to-end system, the entire recognition pipeline is embedded within a mobile\ndevice for enhanced user privacy and security.\n","authors":["Steven A. Grosz","Akash Godbole","Anil K. Jain"],"pdf_url":"https://arxiv.org/pdf/2401.08111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07932v2","updated":"2024-01-16T04:39:59Z","published":"2023-10-11T23:04:07Z","title":"What Matters to You? Towards Visual Representation Alignment for Robot\n Learning","summary":" When operating in service of people, robots need to optimize rewards aligned\nwith end-user preferences. Since robots will rely on raw perceptual inputs like\nRGB images, their rewards will inevitably use visual representations. Recently\nthere has been excitement in using representations from pre-trained visual\nmodels, but key to making these work in robotics is fine-tuning, which is\ntypically done via proxy tasks like dynamics prediction or enforcing temporal\ncycle-consistency. However, all these proxy tasks bypass the human's input on\nwhat matters to them, exacerbating spurious correlations and ultimately leading\nto robot behaviors that are misaligned with user preferences. In this work, we\npropose that robots should leverage human feedback to align their visual\nrepresentations with the end-user and disentangle what matters for the task. We\npropose Representation-Aligned Preference-based Learning (RAPL), a method for\nsolving the visual representation alignment problem and visual reward learning\nproblem through the lens of preference-based learning and optimal transport.\nAcross experiments in X-MAGICAL and in robotic manipulation, we find that\nRAPL's reward consistently generates preferred robot behaviors with high sample\nefficiency, and shows strong zero-shot generalization when the visual\nrepresentation is learned from a different embodiment than the robot's.\n","authors":["Ran Tian","Chenfeng Xu","Masayoshi Tomizuka","Jitendra Malik","Andrea Bajcsy"],"pdf_url":"https://arxiv.org/pdf/2310.07932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07586v3","updated":"2024-01-16T04:36:23Z","published":"2023-12-11T02:40:40Z","title":"Characteristic Guidance: Non-linear Correction for Diffusion Model at\n Large Guidance Scale","summary":" Popular guidance for denoising diffusion probabilistic model (DDPM) linearly\ncombines distinct conditional models together to provide enhanced control over\nsamples. However, this approach overlooks nonlinear effects that become\nsignificant when guidance scale is large. To address this issue, we propose\ncharacteristic guidance, a sampling method that provides first-principle\nnon-linear correction for classifier-free guided DDPMs. Such correction forces\nthe guided DDPMs to respect the Fokker-Planck equation of their underlying\ndiffusion process, in a way that is training-free, derivative-free, and\ncompatible with existing sampling methods. 
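For context on the characteristic-guidance summary above, a toy sketch of the standard linear (classifier-free) guidance it builds on: the guided noise prediction is a weighted combination of conditional and unconditional model outputs. The `model` callable and the guidance scale are placeholders, not a real library API, and this shows only the linear baseline, not the paper's nonlinear correction.

```python
import torch

def guided_noise(model, x_t, t, cond, guidance_scale: float = 7.5) -> torch.Tensor:
    """Classifier-free guidance: blend unconditional and conditional predictions."""
    eps_uncond = model(x_t, t, cond=None)      # unconditional prediction
    eps_cond = model(x_t, t, cond=cond)        # conditional prediction
    # Linear combination; large scales are where nonlinear effects become significant.
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

if __name__ == "__main__":
    # Placeholder model standing in for a denoising network.
    fake_model = lambda x, t, cond=None: x * (0.5 if cond is None else 1.0)
    x = torch.randn(1, 3, 8, 8)
    print(guided_noise(fake_model, x, t=10, cond="a photo").shape)
```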
Experiments show that characteristic\nguidance enhances control and reduces color and exposure issues in image\ngeneration, proving effective in diverse applications ranging from latent space\nsampling to solving physics problems like magnet phase transitions.\n","authors":["Candi Zheng","Yuan Lan"],"pdf_url":"https://arxiv.org/pdf/2312.07586v3.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.08107v1","updated":"2024-01-16T04:28:09Z","published":"2024-01-16T04:28:09Z","title":"Deep Shape-Texture Statistics for Completely Blind Image Quality\n Evaluation","summary":" Opinion-Unaware Blind Image Quality Assessment (OU-BIQA) models aim to\npredict image quality without training on reference images and subjective\nquality scores. Thereinto, image statistical comparison is a classic paradigm,\nwhile the performance is limited by the representation ability of visual\ndescriptors. Deep features as visual descriptors have advanced IQA in recent\nresearch, but they are discovered to be highly texture-biased and lack of\nshape-bias. On this basis, we find out that image shape and texture cues\nrespond differently towards distortions, and the absence of either one results\nin an incomplete image representation. Therefore, to formulate a well-round\nstatistical description for images, we utilize the shapebiased and\ntexture-biased deep features produced by Deep Neural Networks (DNNs)\nsimultaneously. More specifically, we design a Shape-Texture Adaptive Fusion\n(STAF) module to merge shape and texture information, based on which we\nformulate qualityrelevant image statistics. The perceptual quality is\nquantified by the variant Mahalanobis Distance between the inner and outer\nShape-Texture Statistics (DSTS), wherein the inner and outer statistics\nrespectively describe the quality fingerprints of the distorted image and\nnatural images. The proposed DSTS delicately utilizes shape-texture statistical\nrelations between different data scales in the deep domain, and achieves\nstate-of-the-art (SOTA) quality prediction performance on images with\nartificial and authentic distortions.\n","authors":["Yixuan Li","Peilin Chen","Hanwei Zhu","Keyan Ding","Leida Li","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00436v3","updated":"2024-01-16T04:17:56Z","published":"2023-12-31T09:24:28Z","title":"Diff-PCR: Diffusion-Based Correspondence Searching in Doubly Stochastic\n Matrix Space for Point Cloud Registration","summary":" Efficiently finding optimal correspondences between point clouds is crucial\nfor solving both rigid and non-rigid point cloud registration problems.\nExisting methods often rely on geometric or semantic feature embedding to\nestablish correspondences and estimate transformations or flow fields.\nRecently, state-of-the-art methods have employed RAFT-like iterative updates to\nrefine the solution. However, these methods have certain limitations. Firstly,\ntheir iterative refinement design lacks transparency, and their iterative\nupdates follow a fixed path during the refinement process, which can lead to\nsuboptimal results. Secondly, these methods overlook the importance of refining\nor optimizing correspondences (or matching matrices) as a precursor to solving\ntransformations or flow fields. They typically compute candidate\ncorrespondences based on distances in the point feature space. 
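A small numerical sketch of the statistical comparison described in the shape-texture statistics summary above: a Mahalanobis-style distance between a test image's feature statistic and a reference corpus. The random features below are stand-ins; the actual descriptors would come from shape- and texture-biased DNNs.

```python
import numpy as np

def mahalanobis_distance(x: np.ndarray, ref: np.ndarray) -> float:
    """x: (D,) test statistic; ref: (N, D) reference feature samples."""
    mu = ref.mean(axis=0)
    cov = np.cov(ref, rowvar=False) + 1e-6 * np.eye(ref.shape[1])  # regularized covariance
    diff = x - mu
    return float(np.sqrt(diff @ np.linalg.inv(cov) @ diff))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    natural_stats = rng.normal(size=(500, 8))    # pretend "outer" (natural-image) statistics
    distorted_stat = rng.normal(size=8) + 2.0    # pretend "inner" (distorted-image) statistic
    print(round(mahalanobis_distance(distorted_stat, natural_stats), 3))
```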
However, they\nonly project the candidate matching matrix into some matrix space once with\nSinkhorn or dual softmax operations to obtain final correspondences. This\none-shot projected matching matrix may be far from the globally optimal one,\nand these approaches do not consider the distribution of the target matching\nmatrix. In this paper, we propose a novel approach that exploits the Denoising\nDiffusion Model to predict a searching gradient for the optimal matching matrix\nwithin the Doubly Stochastic Matrix Space. During the reverse denoising\nprocess, our method iteratively searches for better solutions along this\ndenoising gradient, which points towards the maximum likelihood direction of\nthe target matching matrix. Our method offers flexibility by allowing the\nsearch to start from any initial matching matrix provided by the online\nbackbone or white noise. Experimental evaluations on the 3DMatch/3DLoMatch and\n4DMatch/4DLoMatch datasets demonstrate the effectiveness of our newly designed\nframework.\n","authors":["Qianliang Wu","Haobo Jiang","Yaqing Ding","Lei Luo","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00436v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08105v1","updated":"2024-01-16T04:16:46Z","published":"2024-01-16T04:16:46Z","title":"Hardware Acceleration for Real-Time Wildfire Detection Onboard Drone\n Networks","summary":" Early wildfire detection in remote and forest areas is crucial for minimizing\ndevastation and preserving ecosystems. Autonomous drones offer agile access to\nremote, challenging terrains, equipped with advanced imaging technology that\ndelivers both high-temporal and detailed spatial resolution, making them\nvaluable assets in the early detection and monitoring of wildfires. However,\nthe limited computation and battery resources of Unmanned Aerial Vehicles\n(UAVs) pose significant challenges in implementing robust and efficient image\nclassification models. Current works in this domain often operate offline,\nemphasizing the need for solutions that can perform inference in real time,\ngiven the constraints of UAVs. To address these challenges, this paper aims to\ndevelop a real-time image classification and fire segmentation model. It\npresents a comprehensive investigation into hardware acceleration using the\nJetson Nano P3450 and the implications of TensorRT, NVIDIA's high-performance\ndeep-learning inference library, on fire classification accuracy and speed. The\nstudy includes implementations of Quantization Aware Training (QAT), Automatic\nMixed Precision (AMP), and post-training mechanisms, comparing them against the\nlatest baselines for fire segmentation and classification. All experiments\nutilize the FLAME dataset - an image dataset collected by low-altitude drones\nduring a prescribed forest fire. This work contributes to the ongoing efforts\nto enable real-time, on-board wildfire detection capabilities for UAVs,\naddressing speed and the computational and energy constraints of these crucial\nmonitoring systems. The results show a 13% increase in classification speed\ncompared to similar models without hardware optimization. 
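An illustrative sketch of the Sinkhorn normalization mentioned in the point-cloud registration summary above, which iteratively projects a score matrix towards the doubly stochastic matrix space. The fixed iteration count and the simple (non log-domain) formulation are assumptions for clarity.

```python
import torch

def sinkhorn(scores: torch.Tensor, n_iters: int = 20) -> torch.Tensor:
    """scores: (N, N) similarity matrix; returns an approximately doubly stochastic matrix."""
    p = torch.exp(scores)
    for _ in range(n_iters):
        p = p / p.sum(dim=1, keepdim=True)   # normalize rows
        p = p / p.sum(dim=0, keepdim=True)   # normalize columns
    return p

if __name__ == "__main__":
    m = sinkhorn(torch.randn(5, 5))
    print(m.sum(dim=0))  # each close to 1
    print(m.sum(dim=1))  # each close to 1
```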
Comparatively, loss\nand accuracy are within 1.225% of the original values.\n","authors":["Austin Briley","Fatemeh Afghah"],"pdf_url":"https://arxiv.org/pdf/2401.08105v1.pdf","comment":"6 pages, 7 figures, NETROBOTICS conference submission"},{"id":"http://arxiv.org/abs/2401.08100v1","updated":"2024-01-16T04:01:49Z","published":"2024-01-16T04:01:49Z","title":"KTVIC: A Vietnamese Image Captioning Dataset on the Life Domain","summary":" Image captioning is a crucial task with applications in a wide range of\ndomains, including healthcare and education. Despite extensive research on\nEnglish image captioning datasets, the availability of such datasets for\nVietnamese remains limited, with only two existing datasets. In this study, we\nintroduce KTVIC, a comprehensive Vietnamese Image Captioning dataset focused on\nthe life domain, covering a wide range of daily activities. This dataset\ncomprises 4,327 images and 21,635 Vietnamese captions, serving as a valuable\nresource for advancing image captioning in the Vietnamese language. We conduct\nexperiments using various deep neural networks as the baselines on our dataset,\nevaluating them using the standard image captioning metrics, including BLEU,\nMETEOR, CIDEr, and ROUGE. Our findings underscore the effectiveness of the\nproposed dataset and its potential contributions to the field of image\ncaptioning in the Vietnamese context.\n","authors":["Anh-Cuong Pham","Van-Quang Nguyen","Thi-Hong Vuong","Quang-Thuy Ha"],"pdf_url":"https://arxiv.org/pdf/2401.08100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08099v1","updated":"2024-01-16T03:59:07Z","published":"2024-01-16T03:59:07Z","title":"Inpainting Normal Maps for Lightstage data","summary":" This study introduces a novel method for inpainting normal maps using a\ngenerative adversarial network (GAN). Normal maps, often derived from a\nlightstage, are crucial in performance capture but can have obscured areas due\nto movement (e.g., by arms, hair, or props). Inpainting fills these missing\nareas with plausible data. Our approach extends previous general image\ninpainting techniques, employing a bow tie-like generator network and a\ndiscriminator network, with alternating training phases. The generator aims to\nsynthesize images aligning with the ground truth and deceive the discriminator,\nwhich differentiates between real and processed images. Periodically, the\ndiscriminator undergoes retraining to enhance its ability to identify processed\nimages. Importantly, our method adapts to the unique characteristics of normal\nmap data, necessitating modifications to the loss function. We utilize a cosine\nloss instead of mean squared error loss for generator training. Limited\ntraining data availability, even with synthetic datasets, demands significant\naugmentation, considering the specific nature of the input data. This includes\nappropriate image flipping and in-plane rotations to accurately alter normal\nvectors. Throughout training, we monitored key metrics such as average loss,\nStructural Similarity Index Measure (SSIM), and Peak Signal-to-Noise Ratio\n(PSNR) for the generator, along with average loss and accuracy for the\ndiscriminator. Our findings suggest that the proposed model effectively\ngenerates high-quality, realistic inpainted normal maps, suitable for\nperformance capture applications. 
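A minimal sketch of a cosine loss over normal vectors, as mentioned in the normal-map inpainting summary above (used in place of mean squared error for generator training). The (B, 3, H, W) layout and the unit-normalization step are assumptions.

```python
import torch
import torch.nn.functional as F

def normal_cosine_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """pred, target: (B, 3, H, W) normal maps; returns mean (1 - cosine similarity)."""
    pred = F.normalize(pred, dim=1)
    target = F.normalize(target, dim=1)
    cos = (pred * target).sum(dim=1)   # per-pixel cosine similarity of normal vectors
    return (1.0 - cos).mean()

if __name__ == "__main__":
    p, t = torch.randn(2, 3, 64, 64), torch.randn(2, 3, 64, 64)
    print(normal_cosine_loss(p, t).item())
```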
These results establish a foundation for\nfuture research, potentially involving more advanced networks and comparisons\nwith inpainting of source images used to create the normal maps.\n","authors":["Hancheng Zuo","Bernard Tiddeman"],"pdf_url":"https://arxiv.org/pdf/2401.08099v1.pdf","comment":"8 pages, 4 figures, CGVC Conference, The Eurographics Association"},{"id":"http://arxiv.org/abs/2307.02251v3","updated":"2024-01-16T03:38:44Z","published":"2023-07-05T12:49:02Z","title":"RanPAC: Random Projections and Pre-trained Models for Continual Learning","summary":" Continual learning (CL) aims to incrementally learn different tasks (such as\nclassification) in a non-stationary data stream without forgetting old ones.\nMost CL works focus on tackling catastrophic forgetting under a\nlearning-from-scratch paradigm. However, with the increasing prominence of\nfoundation models, pre-trained models equipped with informative representations\nhave become available for various downstream requirements. Several CL methods\nbased on pre-trained models have been explored, either utilizing pre-extracted\nfeatures directly (which makes bridging distribution gaps challenging) or\nincorporating adaptors (which may be subject to forgetting). In this paper, we\npropose a concise and effective approach for CL with pre-trained models. Given\nthat forgetting occurs during parameter updating, we contemplate an alternative\napproach that exploits training-free random projectors and class-prototype\naccumulation, which thus bypasses the issue. Specifically, we inject a frozen\nRandom Projection layer with nonlinear activation between the pre-trained\nmodel's feature representations and output head, which captures interactions\nbetween features with expanded dimensionality, providing enhanced linear\nseparability for class-prototype-based CL. We also demonstrate the importance\nof decorrelating the class-prototypes to reduce the distribution disparity when\nusing pre-trained representations. These techniques prove to be effective and\ncircumvent the problem of forgetting for both class- and domain-incremental\ncontinual learning. Compared to previous methods applied to pre-trained\nViT-B/16 models, we reduce final error rates by between 20% and 62% on seven\nclass-incremental benchmarks, despite not using any rehearsal memory. We\nconclude that the full potential of pre-trained models for simple, effective,\nand fast CL has not hitherto been fully tapped. Code is at\ngithub.com/RanPAC/RanPAC.\n","authors":["Mark D. McDonnell","Dong Gong","Amin Parveneh","Ehsan Abbasnejad","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2307.02251v3.pdf","comment":"32 pages, 11 figures"},{"id":"http://arxiv.org/abs/2305.19556v2","updated":"2024-01-16T03:26:22Z","published":"2023-05-31T04:50:32Z","title":"Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation","summary":" Talking face generation is the challenging task of synthesizing a natural and\nrealistic face that requires accurate synchronization with a given audio. Due\nto co-articulation, where an isolated phone is influenced by the preceding or\nfollowing phones, the articulation of a phone varies upon the phonetic context.\nTherefore, modeling lip motion with the phonetic context can generate more\nspatio-temporally aligned lip movement. In this respect, we investigate the\nphonetic context in generating lip motion for talking face generation. 
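A loose sketch of the idea summarized in the RanPAC entry above: a frozen random projection with a nonlinearity expands pre-trained features, and classification is done against accumulated class prototypes. The dimensions, the ReLU choice, and the use of plain mean prototypes (rather than the decorrelated variant the paper describes) are assumptions.

```python
import torch

torch.manual_seed(0)
feat_dim, proj_dim, n_classes = 384, 2048, 10
W = torch.randn(feat_dim, proj_dim)                 # frozen random projection, never trained

def project(features: torch.Tensor) -> torch.Tensor:
    return torch.relu(features @ W)                 # nonlinear random expansion

# Accumulate one prototype per class from (pretend) pre-trained features.
features = torch.randn(200, feat_dim)
labels = torch.randint(0, n_classes, (200,))
protos = torch.stack([project(features[labels == c]).mean(dim=0) for c in range(n_classes)])

# Classify a query by nearest prototype (cosine similarity), with no parameter updates.
query = project(torch.randn(1, feat_dim))
sims = torch.nn.functional.cosine_similarity(query, protos)
print("predicted class:", int(sims.argmax()))
```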
We\npropose Context-Aware Lip-Sync framework (CALS), which explicitly leverages\nphonetic context to generate lip movement of the target face. CALS is comprised\nof an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained\nbased on masked learning to map each phone to a contextualized lip motion unit.\nThe contextualized lip motion unit then guides the latter in synthesizing a\ntarget identity with context-aware lip motion. From extensive experiments, we\nverify that simply exploiting the phonetic context in the proposed CALS\nframework effectively enhances spatio-temporal alignment. We also demonstrate\nthe extent to which the phonetic context assists in lip synchronization and\nfind the effective window size for lip generation to be approximately 1.2\nseconds.\n","authors":["Se Jin Park","Minsu Kim","Jeongsoo Choi","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2305.19556v2.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2401.08086v1","updated":"2024-01-16T03:25:12Z","published":"2024-01-16T03:25:12Z","title":"Spatial-Semantic Collaborative Cropping for User Generated Content","summary":" A large amount of User Generated Content (UGC) is uploaded to the Internet\ndaily and displayed to people world-widely through the client side (e.g.,\nmobile and PC). This requires the cropping algorithms to produce the aesthetic\nthumbnail within a specific aspect ratio on different devices. However,\nexisting image cropping works mainly focus on landmark or landscape images,\nwhich fail to model the relations among the multi-objects with the complex\nbackground in UGC. Besides, previous methods merely consider the aesthetics of\nthe cropped images while ignoring the content integrity, which is crucial for\nUGC cropping. In this paper, we propose a Spatial-Semantic Collaborative\ncropping network (S2CNet) for arbitrary user generated content accompanied by a\nnew cropping benchmark. Specifically, we first mine the visual genes of the\npotential objects. Then, the suggested adaptive attention graph recasts this\ntask as a procedure of information association over visual nodes. The\nunderlying spatial and semantic relations are ultimately centralized to the\ncrop candidate through differentiable message passing, which helps our network\nefficiently to preserve both the aesthetics and the content integrity.\nExtensive experiments on the proposed UGCrop5K and other public datasets\ndemonstrate the superiority of our approach over state-of-the-art counterparts.\nOur project is available at https://github.com/suyukun666/S2CNet.\n","authors":["Yukun Su","Yiwen Cao","Jingliang Deng","Fengyun Rao","Qingyao Wu"],"pdf_url":"https://arxiv.org/pdf/2401.08086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08083v1","updated":"2024-01-16T03:21:42Z","published":"2024-01-16T03:21:42Z","title":"UV-SAM: Adapting Segment Anything Model for Urban Village Identification","summary":" Urban villages, defined as informal residential areas in or around urban\ncenters, are characterized by inadequate infrastructures and poor living\nconditions, closely related to the Sustainable Development Goals (SDGs) on\npoverty, adequate housing, and sustainable cities. Traditionally, governments\nheavily depend on field survey methods to monitor the urban villages, which\nhowever are time-consuming, labor-intensive, and possibly delayed. Thanks to\nwidely available and timely updated satellite images, recent studies develop\ncomputer vision techniques to detect urban villages efficiently. 
However,\nexisting studies either focus on simple urban village image classification or\nfail to provide accurate boundary information. To accurately identify urban\nvillage boundaries from satellite images, we harness the power of the vision\nfoundation model and adapt the Segment Anything Model (SAM) to urban village\nsegmentation, named UV-SAM. Specifically, UV-SAM first leverages a small-sized\nsemantic segmentation model to produce mixed prompts for urban villages,\nincluding mask, bounding box, and image representations, which are then fed\ninto SAM for fine-grained boundary identification. Extensive experimental\nresults on two datasets in China demonstrate that UV-SAM outperforms existing\nbaselines, and identification results over multiple years show that both the\nnumber and area of urban villages are decreasing over time, providing deeper\ninsights into the development trends of urban villages and sheds light on the\nvision foundation models for sustainable cities. The dataset and codes of this\nstudy are available at https://github.com/tsinghua-fib-lab/UV-SAM.\n","authors":["Xin Zhang","Yu Liu","Yuming Lin","Qingming Liao","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2401.08083v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.08079v1","updated":"2024-01-16T03:09:45Z","published":"2024-01-16T03:09:45Z","title":"Adversarial Masking Contrastive Learning for vein recognition","summary":" Vein recognition has received increasing attention due to its high security\nand privacy. Recently, deep neural networks such as Convolutional neural\nnetworks (CNN) and Transformers have been introduced for vein recognition and\nachieved state-of-the-art performance. Despite the recent advances, however,\nexisting solutions for finger-vein feature extraction are still not optimal due\nto scarce training image samples. To overcome this problem, in this paper, we\npropose an adversarial masking contrastive learning (AMCL) approach, that\ngenerates challenging samples to train a more robust contrastive learning model\nfor the downstream palm-vein recognition task, by alternatively optimizing the\nencoder in the contrastive learning model and a set of latent variables. First,\na huge number of masks are generated to train a robust generative adversarial\nnetwork (GAN). The trained generator transforms a latent variable from the\nlatent variable space into a mask space. Then, we combine the trained generator\nwith a contrastive learning model to obtain our AMCL, where the generator\nproduces challenging masking images to increase the contrastive loss and the\ncontrastive learning model is trained based on the harder images to learn a\nmore robust feature representation. After training, the trained encoder in the\ncontrastive learning model is combined with a classification layer to build a\nclassifier, which is further fine-tuned on labeled training data for vein\nrecognition. The experimental results on three databases demonstrate that our\napproach outperforms existing contrastive learning approaches in terms of\nimproving identification accuracy of vein classifiers and achieves\nstate-of-the-art recognition results.\n","authors":["Huafeng Qin","Yiquan Wu","Mounim A. 
El-Yacoubi","Jun Wang","Guangxiang Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08068v1","updated":"2024-01-16T02:51:47Z","published":"2024-01-16T02:51:47Z","title":"Representation Learning on Event Stream via an Elastic Net-incorporated\n Tensor Network","summary":" Event cameras are neuromorphic sensors that capture asynchronous and sparse\nevent stream when per-pixel brightness changes. The state-of-the-art processing\nmethods for event signals typically aggregate events into a frame or a grid.\nHowever, events are dense in time, these works are limited to local information\nof events due to the stacking. In this paper, we present a novel spatiotemporal\nrepresentation learning method which can capture the global correlations of all\nevents in the event stream simultaneously by tensor decomposition. In addition,\nwith the events are sparse in space, we propose an Elastic Net-incorporated\ntensor network (ENTN) model to obtain more spatial and temporal details about\nevent stream. Empirically, the results indicate that our method can represent\nthe spatiotemporal correlation of events with high quality, and can achieve\neffective results in applications like filtering noise compared with the\nstate-of-the-art methods.\n","authors":["Beibei Yang","Weiling Li","Yan Fang"],"pdf_url":"https://arxiv.org/pdf/2401.08068v1.pdf","comment":"7 pages, 3 figure"},{"id":"http://arxiv.org/abs/2401.08066v1","updated":"2024-01-16T02:49:52Z","published":"2024-01-16T02:49:52Z","title":"Achieve Fairness without Demographics for Dermatological Disease\n Diagnosis","summary":" In medical image diagnosis, fairness has become increasingly crucial. Without\nbias mitigation, deploying unfair AI would harm the interests of the\nunderprivileged population and potentially tear society apart. Recent research\naddresses prediction biases in deep learning models concerning demographic\ngroups (e.g., gender, age, and race) by utilizing demographic (sensitive\nattribute) information during training. However, many sensitive attributes\nnaturally exist in dermatological disease images. If the trained model only\ntargets fairness for a specific attribute, it remains unfair for other\nattributes. Moreover, training a model that can accommodate multiple sensitive\nattributes is impractical due to privacy concerns. To overcome this, we propose\na method enabling fair predictions for sensitive attributes during the testing\nphase without using such information during training. Inspired by prior work\nhighlighting the impact of feature entanglement on fairness, we enhance the\nmodel features by capturing the features related to the sensitive and target\nattributes and regularizing the feature entanglement between corresponding\nclasses. This ensures that the model can only classify based on the features\nrelated to the target attribute without relying on features associated with\nsensitive attributes, thereby improving fairness and accuracy. Additionally, we\nuse disease masks from the Segment Anything Model (SAM) to enhance the quality\nof the learned feature. 
Experimental results demonstrate that the proposed\nmethod can improve fairness in classification compared to state-of-the-art\nmethods in two dermatological disease datasets.\n","authors":["Ching-Hao Chiu","Yu-Jen Chen","Yawen Wu","Yiyu Shi","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2401.08066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08061v1","updated":"2024-01-16T02:42:45Z","published":"2024-01-16T02:42:45Z","title":"Augmenting Ground-Level PM2.5 Prediction via Kriging-Based Pseudo-Label\n Generation","summary":" Fusing abundant satellite data with sparse ground measurements constitutes a\nmajor challenge in climate modeling. To address this, we propose a strategy to\naugment the training dataset by introducing unlabeled satellite images paired\nwith pseudo-labels generated through a spatial interpolation technique known as\nordinary kriging, thereby making full use of the available satellite data\nresources. We show that the proposed data augmentation strategy helps enhance\nthe performance of the state-of-the-art convolutional neural network-random\nforest (CNN-RF) model by a reasonable amount, resulting in a noteworthy\nimprovement in spatial correlation and a reduction in prediction error.\n","authors":["Lei Duan","Ziyang Jiang","David Carlson"],"pdf_url":"https://arxiv.org/pdf/2401.08061v1.pdf","comment":"8 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change\n with Machine Learning"},{"id":"http://arxiv.org/abs/2401.08058v1","updated":"2024-01-16T02:26:29Z","published":"2024-01-16T02:26:29Z","title":"Toward Clinically Trustworthy Deep Learning: Applying Conformal\n Prediction to Intracranial Hemorrhage Detection","summary":" As deep learning (DL) continues to demonstrate its ability in radiological\ntasks, it is critical that we optimize clinical DL solutions to include safety.\nOne of the principal concerns in the clinical adoption of DL tools is trust.\nThis study aims to apply conformal prediction as a step toward trustworthiness\nfor DL in radiology. This is a retrospective study of 491 non-contrast head CTs\nfrom the CQ500 dataset, in which three senior radiologists annotated slices\ncontaining intracranial hemorrhage (ICH). The dataset was split into definite\nand challenging subsets, where challenging images were defined to those in\nwhich there was disagreement among readers. A DL model was trained on 146\npatients (10,815 slices) from the definite data (training dataset) to perform\nICH localization and classification for five classes of ICH. To develop an\nuncertainty-aware DL model, 1,546 cases of the definite data (calibration\ndataset) was used for Mondrian conformal prediction (MCP). The\nuncertainty-aware DL model was tested on 8,401 definite and challenging cases\nto assess its ability to identify challenging cases. After the MCP procedure,\nthe model achieved an F1 score of 0.920 for ICH classification on the test\ndataset. Additionally, it correctly identified 6,837 of the 6,856 total\nchallenging cases as challenging (99.7% accuracy). It did not incorrectly label\nany definite cases as challenging. The uncertainty-aware ICH detector performs\non par with state-of-the-art models. MCP's performance in detecting challenging\ncases demonstrates that it is useful in automated ICH detection and promising\nfor trustworthiness in radiological DL.\n","authors":["Cooper Gamble","Shahriar Faghani","Bradley J. 
Erickson"],"pdf_url":"https://arxiv.org/pdf/2401.08058v1.pdf","comment":"14 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.08056v1","updated":"2024-01-16T02:14:33Z","published":"2024-01-16T02:14:33Z","title":"Robust Tiny Object Detection in Aerial Images amidst Label Noise","summary":" Precise detection of tiny objects in remote sensing imagery remains a\nsignificant challenge due to their limited visual information and frequent\noccurrence within scenes. This challenge is further exacerbated by the\npractical burden and inherent errors associated with manual annotation:\nannotating tiny objects is laborious and prone to errors (i.e., label noise).\nTraining detectors for such objects using noisy labels often leads to\nsuboptimal performance, with networks tending to overfit on noisy labels. In\nthis study, we address the intricate issue of tiny object detection under noisy\nlabel supervision. We systematically investigate the impact of various types of\nnoise on network training, revealing the vulnerability of object detectors to\nclass shifts and inaccurate bounding boxes for tiny objects. To mitigate these\nchallenges, we propose a DeNoising Tiny Object Detector (DN-TOD), which\nincorporates a Class-aware Label Correction (CLC) scheme to address class\nshifts and a Trend-guided Learning Strategy (TLS) to handle bounding box noise.\nCLC mitigates inaccurate class supervision by identifying and filtering out\nclass-shifted positive samples, while TLS reduces noisy box-induced erroneous\nsupervision through sample reweighting and bounding box regeneration.\nAdditionally, Our method can be seamlessly integrated into both one-stage and\ntwo-stage object detection pipelines. Comprehensive experiments conducted on\nsynthetic (i.e., noisy AI-TOD-v2.0 and DOTA-v2.0) and real-world (i.e., AI-TOD)\nnoisy datasets demonstrate the robustness of DN-TOD under various types of\nlabel noise. Notably, when applied to the strong baseline RFLA, DN-TOD exhibits\na noteworthy performance improvement of 4.9 points under 40% mixed noise.\nDatasets, codes, and models will be made publicly available.\n","authors":["Haoran Zhu","Chang Xu","Wen Yang","Ruixiang Zhang","Yan Zhang","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2401.08056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08053v1","updated":"2024-01-16T02:10:13Z","published":"2024-01-16T02:10:13Z","title":"SCoFT: Self-Contrastive Fine-Tuning for Equitable Image Generation","summary":" Accurate representation in media is known to improve the well-being of the\npeople who consume it. Generative image models trained on large web-crawled\ndatasets such as LAION are known to produce images with harmful stereotypes and\nmisrepresentations of cultures. We improve inclusive representation in\ngenerated images by (1) engaging with communities to collect a culturally\nrepresentative dataset that we call the Cross-Cultural Understanding Benchmark\n(CCUB) and (2) proposing a novel Self-Contrastive Fine-Tuning (SCoFT) method\nthat leverages the model's known biases to self-improve. SCoFT is designed to\nprevent overfitting on small datasets, encode only high-level information from\nthe data, and shift the generated distribution away from misrepresentations\nencoded in a pretrained model. 
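A simplified, illustrative sketch of class-conditional ("Mondrian") conformal prediction in the spirit of the intracranial hemorrhage summary above: per-class nonconformity thresholds are calibrated on held-out data, and a test case whose prediction set is not a single class can be flagged as challenging. The toy scores, the 90% coverage level, and the omission of finite-sample corrections are assumptions.

```python
import numpy as np

def mondrian_thresholds(cal_probs, cal_labels, alpha=0.1):
    """Per-class quantile of nonconformity scores (1 - probability of the true class)."""
    thresholds = {}
    for c in np.unique(cal_labels):
        scores = 1.0 - cal_probs[cal_labels == c, c]
        thresholds[c] = np.quantile(scores, 1 - alpha)
    return thresholds

def prediction_set(test_prob, thresholds):
    return [int(c) for c, t in thresholds.items() if 1.0 - test_prob[c] <= t]

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    cal_probs = rng.dirichlet(np.ones(3), size=200)   # toy calibration softmax outputs
    cal_labels = cal_probs.argmax(axis=1)             # toy calibration labels
    thr = mondrian_thresholds(cal_probs, cal_labels)
    test_prob = np.array([0.40, 0.35, 0.25])
    ps = prediction_set(test_prob, thr)
    print("prediction set:", ps, "| flag as challenging:", len(ps) != 1)
```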
Our user study conducted on 51 participants from\n5 different countries based on their self-selected national cultural\naffiliation shows that fine-tuning on CCUB consistently generates images with\nhigher cultural relevance and fewer stereotypes when compared to the Stable\nDiffusion baseline, which is further improved with our SCoFT technique.\n","authors":["Zhixuan Liu","Peter Schaldenbrand","Beverley-Claire Okogwu","Wenxuan Peng","Youngsik Yun","Andrew Hundt","Jihie Kim","Jean Oh"],"pdf_url":"https://arxiv.org/pdf/2401.08053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08049v1","updated":"2024-01-16T02:02:44Z","published":"2024-01-16T02:02:44Z","title":"EmoTalker: Emotionally Editable Talking Face Generation via Diffusion\n Model","summary":" In recent years, the field of talking faces generation has attracted\nconsiderable attention, with certain methods adept at generating virtual faces\nthat convincingly imitate human expressions. However, existing methods face\nchallenges related to limited generalization, particularly when dealing with\nchallenging identities. Furthermore, methods for editing expressions are often\nconfined to a singular emotion, failing to adapt to intricate emotions. To\novercome these challenges, this paper proposes EmoTalker, an emotionally\neditable portraits animation approach based on the diffusion model. EmoTalker\nmodifies the denoising process to ensure preservation of the original\nportrait's identity during inference. To enhance emotion comprehension from\ntext input, Emotion Intensity Block is introduced to analyze fine-grained\nemotions and strengths derived from prompts. Additionally, a crafted dataset is\nharnessed to enhance emotion comprehension within prompts. Experiments show the\neffectiveness of EmoTalker in generating high-quality, emotionally customizable\nfacial expressions.\n","authors":["Bingyuan Zhang","Xulong Zhang","Ning Cheng","Jun Yu","Jing Xiao","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08049v1.pdf","comment":"Accepted by 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP2024)"},{"id":"http://arxiv.org/abs/2205.00932v3","updated":"2024-01-16T02:00:14Z","published":"2022-05-02T14:27:35Z","title":"Understanding CNNs from excitations","summary":" Saliency maps have proven to be a highly efficacious approach for explicating\nthe decisions of Convolutional Neural Networks. However, extant methodologies\npredominantly rely on gradients, which constrain their ability to explicate\ncomplex models. Furthermore, such approaches are not fully adept at leveraging\nnegative gradient information to improve interpretive veracity. In this study,\nwe present a novel concept, termed positive and negative excitation, which\nenables the direct extraction of positive and negative excitation for each\nlayer, thus enabling complete layer-by-layer information utilization sans\ngradients. To organize these excitations into final saliency maps, we introduce\na double-chain backpropagation procedure. A comprehensive experimental\nevaluation, encompassing both binary classification and multi-classification\ntasks, was conducted to gauge the effectiveness of our proposed method.\nEncouragingly, the results evince that our approach offers a significant\nimprovement over the state-of-the-art methods in terms of salient pixel\nremoval, minor pixel removal, and inconspicuous adversarial perturbation\ngeneration guidance. 
Additionally, we verify the correlation between positive\nand negative excitations.\n","authors":["Zijian Ying","Qianmu Li","Zhichao Lian","Jun Hou","Tong Lin","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2205.00932v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10771v3","updated":"2024-01-16T01:58:56Z","published":"2023-04-21T06:41:17Z","title":"A Revisit of the Normalized Eight-Point Algorithm and A Self-Supervised\n Deep Solution","summary":" The normalized eight-point algorithm has been widely viewed as the\ncornerstone in two-view geometry computation, where the seminal Hartley's\nnormalization has greatly improved the performance of the direct linear\ntransformation algorithm. A natural question is, whether there exists and how\nto find other normalization methods that may further improve the performance as\nper each input sample. In this paper, we provide a novel perspective and\npropose two contributions to this fundamental problem: 1) we revisit the\nnormalized eight-point algorithm and make a theoretical contribution by\npresenting the existence of different and better normalization algorithms; 2)\nwe introduce a deep convolutional neural network with a self-supervised\nlearning strategy for normalization. Given eight pairs of correspondences, our\nnetwork directly predicts the normalization matrices, thus learning to\nnormalize each input sample. Our learning-based normalization module can be\nintegrated with both traditional (e.g., RANSAC) and deep learning frameworks\n(affording good interpretability) with minimal effort. Extensive experiments on\nboth synthetic and real images demonstrate the effectiveness of our proposed\napproach.\n","authors":["Bin Fan","Yuchao Dai","Yongduek Seo","Mingyi He"],"pdf_url":"https://arxiv.org/pdf/2304.10771v3.pdf","comment":"Accepted by Visual Intelligence"},{"id":"http://arxiv.org/abs/2401.08045v1","updated":"2024-01-16T01:57:24Z","published":"2024-01-16T01:57:24Z","title":"Forging Vision Foundation Models for Autonomous Driving: Challenges,\n Methodologies, and Opportunities","summary":" The rise of large foundation models, trained on extensive datasets, is\nrevolutionizing the field of AI. Models such as SAM, DALL-E2, and GPT-4\nshowcase their adaptability by extracting intricate patterns and performing\neffectively across diverse tasks, thereby serving as potent building blocks for\na wide range of AI applications. Autonomous driving, a vibrant front in AI\napplications, remains challenged by the lack of dedicated vision foundation\nmodels (VFMs). The scarcity of comprehensive training data, the need for\nmulti-sensor integration, and the diverse task-specific architectures pose\nsignificant obstacles to the development of VFMs in this field. This paper\ndelves into the critical challenge of forging VFMs tailored specifically for\nautonomous driving, while also outlining future directions. Through a\nsystematic analysis of over 250 papers, we dissect essential techniques for VFM\ndevelopment, including data preparation, pre-training strategies, and\ndownstream task adaptation. Moreover, we explore key advancements such as NeRF,\ndiffusion models, 3D Gaussian Splatting, and world models, presenting a\ncomprehensive roadmap for future research. 
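A brief sketch of the classical Hartley normalization referenced in the eight-point-algorithm summary above: translate points to zero mean and scale them so the average distance from the origin is sqrt(2). This is the standard textbook step, not the learned normalization the paper proposes.

```python
import numpy as np

def hartley_normalize(pts: np.ndarray):
    """pts: (N, 2) image points; returns normalized points and the 3x3 transform T."""
    centroid = pts.mean(axis=0)
    centered = pts - centroid
    mean_dist = np.mean(np.linalg.norm(centered, axis=1))
    s = np.sqrt(2.0) / mean_dist
    T = np.array([[s, 0, -s * centroid[0]],
                  [0, s, -s * centroid[1]],
                  [0, 0, 1.0]])
    pts_h = np.hstack([pts, np.ones((pts.shape[0], 1))])
    return (T @ pts_h.T).T[:, :2], T

if __name__ == "__main__":
    pts = np.random.default_rng(0).uniform(0, 640, size=(8, 2))
    norm_pts, T = hartley_normalize(pts)
    print(np.mean(np.linalg.norm(norm_pts, axis=1)))  # approximately sqrt(2)
```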
To empower researchers, we have\nbuilt and maintained https://github.com/zhanghm1995/Forge_VFM4AD, an\nopen-access repository constantly updated with the latest advancements in\nforging VFMs for autonomous driving.\n","authors":["Xu Yan","Haiming Zhang","Yingjie Cai","Jingming Guo","Weichao Qiu","Bin Gao","Kaiqiang Zhou","Yue Zhao","Huan Jin","Jiantao Gao","Zhen Li","Lihui Jiang","Wei Zhang","Hongbo Zhang","Dengxin Dai","Bingbing Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08045v1.pdf","comment":"Github Repo: https://github.com/zhanghm1995/Forge_VFM4AD"},{"id":"http://arxiv.org/abs/2401.08043v1","updated":"2024-01-16T01:48:45Z","published":"2024-01-16T01:48:45Z","title":"Cross-Modal Semi-Dense 6-DoF Tracking of an Event Camera in Challenging\n Conditions","summary":" Vision-based localization is a cost-effective and thus attractive solution\nfor many intelligent mobile platforms. However, its accuracy and especially\nrobustness still suffer from low illumination conditions, illumination changes,\nand aggressive motion. Event-based cameras are bio-inspired visual sensors that\nperform well in HDR conditions and have high temporal resolution, and thus\nprovide an interesting alternative in such challenging scenarios. While purely\nevent-based solutions currently do not yet produce satisfying mapping results,\nthe present work demonstrates the feasibility of purely event-based tracking if\nan alternative sensor is permitted for mapping. The method relies on geometric\n3D-2D registration of semi-dense maps and events, and achieves highly reliable\nand accurate cross-modal tracking results. Practically relevant scenarios are\ngiven by depth camera-supported tracking or map-based localization with a\nsemi-dense map prior created by a regular image-based visual SLAM or\nstructure-from-motion system. Conventional edge-based 3D-2D alignment is\nextended by a novel polarity-aware registration that makes use of signed\ntime-surface maps (STSM) obtained from event streams. We furthermore introduce\na novel culling strategy for occluded points. Both modifications increase the\nspeed of the tracker and its robustness against occlusions or large view-point\nvariations. The approach is validated on many real datasets covering the\nabove-mentioned challenging conditions, and compared against similar solutions\nrealised with regular cameras.\n","authors":["Yi-Fan Zuo","Wanting Xu","Xia Wang","Yifu Wang","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2401.08043v1.pdf","comment":"accepted by IEEE Transactions on Robotics (T-RO). arXiv admin note:\n text overlap with arXiv:2202.02556"},{"id":"http://arxiv.org/abs/2401.00241v2","updated":"2024-01-16T01:23:13Z","published":"2023-12-30T14:11:08Z","title":"Image Super-resolution Reconstruction Network based on Enhanced Swin\n Transformer via Alternating Aggregation of Local-Global Features","summary":" The Swin Transformer image super-resolution reconstruction network only\nrelies on the long-range relationship of window attention and shifted window\nattention to explore features. This mechanism has two limitations. On the one\nhand, it only focuses on global features while ignoring local features. On the\nother hand, it is only concerned with spatial feature interactions while\nignoring channel features and channel interactions, thus limiting its\nnon-linear mapping ability. To address the above limitations, this paper\nproposes enhanced Swin Transformer modules via alternating aggregation of\nlocal-global features. 
In the local feature aggregation stage, we introduce a\nshift convolution to realize the interaction between local spatial information\nand channel information. Then, a block sparse global perception module is\nintroduced in the global feature aggregation stage. In this module, we\nreorganize the spatial information first, then send the recombination\ninformation into a multi-layer perceptron unit to implement the global\nperception. After that, a multi-scale self-attention module and a low-parameter\nresidual channel attention module are introduced to realize information\naggregation at different scales. Finally, the proposed network is validated on\nfive publicly available datasets. The experimental results show that the\nproposed network outperforms the other state-of-the-art super-resolution\nnetworks.\n","authors":["Yuming Huang","Yingpin Chen","Changhui Wu","Hanrong Xie","Binhui Song","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2401.00241v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08036v1","updated":"2024-01-16T01:12:24Z","published":"2024-01-16T01:12:24Z","title":"3D Lane Detection from Front or Surround-View using Joint-Modeling &\n Matching","summary":" 3D lanes offer a more comprehensive understanding of the road surface\ngeometry than 2D lanes, thereby providing crucial references for driving\ndecisions and trajectory planning. While many efforts aim to improve prediction\naccuracy, we recognize that an efficient network can bring results closer to\nlane modeling. However, if the modeling data is imprecise, the results might\nnot accurately capture the real-world scenario. Therefore, accurate lane\nmodeling is essential to align prediction results closely with the environment.\nThis study centers on efficient and accurate lane modeling, proposing a joint\nmodeling approach that combines Bezier curves and interpolation methods.\nFurthermore, based on this lane modeling approach, we developed a Global2Local\nLane Matching method with Bezier Control-Point and Key-Point, which serve as a\ncomprehensive solution that leverages hierarchical features with two\nmathematical models to ensure a precise match. We also introduce a novel 3D\nSpatial Constructor, representing an exploration of 3D surround-view lane\ndetection research. The framework is suitable for front-view or surround-view\n3D lane detection. By directly outputting the key points of lanes in 3D space,\nit overcomes the limitations of anchor-based methods, enabling accurate\nprediction of closed-loop or U-shaped lanes and effective adaptation to complex\nroad conditions. This innovative method establishes a new benchmark in\nfront-view 3D lane detection on the Openlane dataset and achieves competitive\nperformance in surround-view 2D lane detection on the Argoverse2 dataset.\n","authors":["Haibin Zhou","Jun Chang","Tao Lu","Huabing Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.08036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08035v1","updated":"2024-01-16T01:08:19Z","published":"2024-01-16T01:08:19Z","title":"BanglaNet: Bangla Handwritten Character Recognition using Ensembling of\n Convolutional Neural Network","summary":" Handwritten character recognition is a crucial task because of its abundant\napplications. The recognition task of Bangla handwritten characters is\nespecially challenging because of the cursive nature of Bangla characters and\nthe presence of compound characters with more than one way of writing. 
In this\npaper, a classification model based on the ensembling of several Convolutional\nNeural Networks (CNN), namely, BanglaNet is proposed to classify Bangla basic\ncharacters, compound characters, numerals, and modifiers. Three different\nmodels based on the idea of state-of-the-art CNN models like Inception, ResNet,\nand DenseNet have been trained with both augmented and non-augmented inputs.\nFinally, all these models are averaged or ensembled to get the finishing model.\nRigorous experimentation on three benchmark Bangla handwritten characters\ndatasets, namely, CMATERdb, BanglaLekha-Isolated, and Ekush has exhibited\nsignificant recognition accuracies compared to some recent CNN-based research.\nThe top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and\nthe top-3 accuracies are 99.79%, 99.74%, and 99.56% for CMATERdb,\nBanglaLekha-Isolated, and Ekush datasets respectively.\n","authors":["Chandrika Saha","Md. Mostafijur Rahman"],"pdf_url":"https://arxiv.org/pdf/2401.08035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08023v1","updated":"2024-01-16T00:29:05Z","published":"2024-01-16T00:29:05Z","title":"Spatial Channel State Information Prediction with Generative AI: Towards\n Holographic Communication and Digital Radio Twin","summary":" As 5G technology becomes increasingly established, the anticipation for 6G is\ngrowing, which promises to deliver faster and more reliable wireless\nconnections via cutting-edge radio technologies. However, efficient management\nmethod of the large-scale antenna arrays deployed by those radio technologies\nis crucial. Traditional management methods are mainly reactive, usually based\non feedback from users to adapt to the dynamic wireless channel. However, a\nmore promising approach lies in the prediction of spatial channel state\ninformation (spatial-CSI), which is an all-inclusive channel characterization\nand consists of all the feasible line-of-sight (LoS) and non-line-of-sight\n(NLoS) paths between the transmitter (Tx) and receiver (Rx), with the\nthree-dimension (3D) trajectory, attenuation, phase shift, delay, and\npolarization of each path. Advances in hardware and neural networks make it\npossible to predict such spatial-CSI using precise environmental information,\nand further look into the possibility of holographic communication, which\nimplies complete control over every aspect of the radio waves emitted. Based on\nthe integration of holographic communication and digital twin, we proposed a\nnew framework, digital radio twin, which takes advantages from both the digital\nworld and deterministic control over radio waves, supporting a wide range of\nhigh-level applications. 
As a preliminary attempt towards this visionary\ndirection, in this paper, we explore the use of generative artificial\nintelligence (AI) to pinpoint the valid paths in a given environment,\ndemonstrating promising results, and highlighting the potential of this\napproach in driving forward the evolution of 6G wireless communication\ntechnologies.\n","authors":["Lihao Zhang","Haijian Sun","Yong Zeng","Rose Qingyang Hu"],"pdf_url":"https://arxiv.org/pdf/2401.08023v1.pdf","comment":"submitted to IEEE for potential publication"},{"id":"http://arxiv.org/abs/2401.08017v1","updated":"2024-01-16T00:01:23Z","published":"2024-01-16T00:01:23Z","title":"Small Object Detection by DETR via Information Augmentation and Adaptive\n Feature Fusion","summary":" The main challenge for small object detection algorithms is to ensure\naccuracy while pursuing real-time performance. The RT-DETR model performs well\nin real-time object detection, but performs poorly in small object detection\naccuracy. In order to compensate for the shortcomings of the RT-DETR model in\nsmall object detection, two key improvements are proposed in this study.\nFirstly, The RT-DETR utilises a Transformer that receives input solely from the\nfinal layer of Backbone features. This means that the Transformer's input only\nreceives semantic information from the highest level of abstraction in the Deep\nNetwork, and ignores detailed information such as edges, texture or color\ngradients that are critical to the location of small objects at lower levels of\nabstraction. Including only deep features can introduce additional background\nnoise. This can have a negative impact on the accuracy of small object\ndetection. To address this issue, we propose the fine-grained path augmentation\nmethod. This method helps to locate small objects more accurately by providing\ndetailed information to the deep network. So, the input to the transformer\ncontains both semantic and detailed information. Secondly, In RT-DETR, the\ndecoder takes feature maps of different levels as input after concatenating\nthem with equal weight. However, this operation is not effective in dealing\nwith the complex relationship of multi-scale information captured by feature\nmaps of different sizes. Therefore, we propose an adaptive feature fusion\nalgorithm that assigns learnable parameters to each feature map from different\nlevels. This allows the model to adaptively fuse feature maps from different\nlevels and effectively integrate feature information from different scales.\nThis enhances the model's ability to capture object features at different\nscales, thereby improving the accuracy of detecting small objects.\n","authors":["Ji Huang","Hui Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08876v1","updated":"2024-01-16T23:19:30Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir lack of interpretability makes uncertainty quantification challenging. We\ninvestigate the effects of presenting conformal prediction\nsets$\\unicode{x2013}$a method for generating valid confidence sets in\ndistribution-free uncertainty quantification$\\unicode{x2013}$to express\nuncertainty in AI-advised decision-making. 
Through a large pre-registered\nexperiment, we compare the utility of conformal prediction sets to displays of\nTop-1 and Top-k predictions for AI-advised image labeling. We find that the\nutility of prediction sets for accuracy varies with the difficulty of the task:\nwhile they result in accuracy on par with or less than Top-1 and Top-k displays\nfor easy images, prediction sets excel at assisting humans in labeling\nout-of-distribution (OOD) images especially when the set size is small. Our\nresults empirically pinpoint the practical challenges of conformal prediction\nsets and provide implications on how to incorporate them for real-world\ndecision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v1.pdf","comment":"28 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2401.08868v1","updated":"2024-01-16T22:46:29Z","published":"2024-01-16T22:46:29Z","title":"B-Cos Aligned Transformers Learn Human-Interpretable Features","summary":" Vision Transformers (ViTs) and Swin Transformers (Swin) are currently\nstate-of-the-art in computational pathology. However, domain experts are still\nreluctant to use these models due to their lack of interpretability. This is\nnot surprising, as critical decisions need to be transparent and\nunderstandable. The most common approach to understanding transformers is to\nvisualize their attention. However, attention maps of ViTs are often\nfragmented, leading to unsatisfactory explanations. Here, we introduce a novel\narchitecture called the B-cos Vision Transformer (BvT) that is designed to be\nmore interpretable. It replaces all linear transformations with the B-cos\ntransform to promote weight-input alignment. In a blinded study, medical\nexperts clearly ranked BvTs above ViTs, suggesting that our network is better\nat capturing biomedically relevant structures. This is also true for the B-cos\nSwin Transformer (Bwin). Compared to the Swin Transformer, it even improves the\nF1-score by up to 4.7% on two public datasets.\n","authors":["Manuel Tran","Amal Lahiani","Yashin Dicente Cid","Melanie Boxberg","Peter Lienemann","Christian Matek","Sophia J. Wagner","Fabian J. Theis","Eldad Klaiman","Tingying Peng"],"pdf_url":"https://arxiv.org/pdf/2401.08868v1.pdf","comment":"Accepted at MICCAI 2023 (oral)"},{"id":"http://arxiv.org/abs/2401.08865v1","updated":"2024-01-16T22:36:23Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. 
We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v1.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2305.14243v5","updated":"2024-01-16T22:34:04Z","published":"2023-05-23T16:58:55Z","title":"Training Transitive and Commutative Multimodal Transformers with LoReTTa","summary":" Training multimodal foundation models is challenging due to the limited\navailability of multimodal datasets. While many public datasets pair images\nwith text, few combine images with audio or text with audio. Even rarer are\ndatasets that align all three modalities at once. Critical domains such as\nhealthcare, infrastructure, or transportation are particularly affected by\nmissing modalities. This makes it difficult to integrate all modalities into a\nlarge pre-trained neural network that can be used out-of-the-box or fine-tuned\nfor different downstream tasks. We introduce LoReTTa (Linking mOdalities with a\ntRansitive and commutativE pre-Training sTrAtegy) to address this understudied\nproblem. Our self-supervised framework unifies causal modeling and masked\nmodeling with the rules of commutativity and transitivity. This allows us to\ntransition within and between modalities. As a result, our pre-trained models\nare better at exploring the true underlying joint probability distribution.\nGiven a dataset containing only the disjoint combinations (A, B) and (B, C),\nLoReTTa can model the relation A <-> C with A <-> B <-> C. In particular, we\nshow that a transformer pre-trained with LoReTTa can handle any mixture of\nmodalities at inference time, including the never-seen pair (A, C) and the\ntriplet (A, B, C). We extensively evaluate our approach on a synthetic,\nmedical, and reinforcement learning dataset. Across different domains, our\nuniversal multimodal transformer consistently outperforms strong baselines such\nas GPT, BERT, and CLIP on tasks involving the missing modality tuple.\n","authors":["Manuel Tran","Yashin Dicente Cid","Amal Lahiani","Fabian J. Theis","Tingying Peng","Eldad Klaiman"],"pdf_url":"https://arxiv.org/pdf/2305.14243v5.pdf","comment":"Accepted at NeurIPS 2023 (poster). 
Camera-ready version"},{"id":"http://arxiv.org/abs/2401.08860v1","updated":"2024-01-16T22:22:09Z","published":"2024-01-16T22:22:09Z","title":"Cross-Level Multi-Instance Distillation for Self-Supervised Fine-Grained\n Visual Categorization","summary":" High-quality annotation of fine-grained visual categories demands great\nexpert knowledge, which is taxing and time consuming. Alternatively, learning\nfine-grained visual representation from enormous unlabeled images (e.g.,\nspecies, brands) by self-supervised learning becomes a feasible solution.\nHowever, recent researches find that existing self-supervised learning methods\nare less qualified to represent fine-grained categories. The bottleneck lies in\nthat the pre-text representation is built from every patch-wise embedding,\nwhile fine-grained categories are only determined by several key patches of an\nimage. In this paper, we propose a Cross-level Multi-instance Distillation\n(CMD) framework to tackle the challenge. Our key idea is to consider the\nimportance of each image patch in determining the fine-grained pre-text\nrepresentation by multiple instance learning. To comprehensively learn the\nrelation between informative patches and fine-grained semantics, the\nmulti-instance knowledge distillation is implemented on both the region/image\ncrop pairs from the teacher and student net, and the region-image crops inside\nthe teacher / student net, which we term as intra-level multi-instance\ndistillation and inter-level multi-instance distillation. Extensive experiments\non CUB-200-2011, Stanford Cars and FGVC Aircraft show that the proposed method\noutperforms the contemporary method by upto 10.14% and existing\nstate-of-the-art self-supervised learning approaches by upto 19.78% on both\ntop-1 accuracy and Rank-1 retrieval metric.\n","authors":["Qi Bi","Wei Ji","Jingjun Yi","Haolan Zhan","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2401.08860v1.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2312.10165v2","updated":"2024-01-16T21:47:36Z","published":"2023-12-15T19:22:21Z","title":"Test-Time Domain Adaptation by Learning Domain-Aware Batch Normalization","summary":" Test-time domain adaptation aims to adapt the model trained on source domains\nto unseen target domains using a few unlabeled images. Emerging research has\nshown that the label and domain information is separately embedded in the\nweight matrix and batch normalization (BN) layer. Previous works normally\nupdate the whole network naively without explicitly decoupling the knowledge\nbetween label and domain. As a result, it leads to knowledge interference and\ndefective distribution adaptation. In this work, we propose to reduce such\nlearning interference and elevate the domain knowledge learning by only\nmanipulating the BN layer. However, the normalization step in BN is\nintrinsically unstable when the statistics are re-estimated from a few samples.\nWe find that ambiguities can be greatly reduced when only updating the two\naffine parameters in BN while keeping the source domain statistics. To further\nenhance the domain knowledge extraction from unlabeled data, we construct an\nauxiliary branch with label-independent self-supervised learning (SSL) to\nprovide supervision. Moreover, we propose a bi-level optimization based on\nmeta-learning to enforce the alignment of two learning objectives of auxiliary\nand main branches. The goal is to use the auxiliary branch to adapt the domain\nand benefit main task for subsequent inference. 
Our method keeps the same\ncomputational cost at inference as the auxiliary branch can be thoroughly\ndiscarded after adaptation. Extensive experiments show that our method\noutperforms the prior works on five WILDS real-world domain shift datasets. Our\nmethod can also be integrated with methods with label-dependent optimization to\nfurther push the performance boundary. Our code is available at\nhttps://github.com/ynanwu/MABN.\n","authors":["Yanan Wu","Zhixiang Chi","Yang Wang","Konstantinos N. Plataniotis","Songhe Feng"],"pdf_url":"https://arxiv.org/pdf/2312.10165v2.pdf","comment":"AAAI2024(Oral), see this https URL: https://github.com/ynanwu/MABN"},{"id":"http://arxiv.org/abs/2401.08847v1","updated":"2024-01-16T21:45:08Z","published":"2024-01-16T21:45:08Z","title":"RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and\n Efficiency Assessment of Medical Image Segmentation Models","summary":" Deep learning techniques, despite their potential, often suffer from a lack\nof reproducibility and generalizability, impeding their clinical adoption.\nImage segmentation is one of the critical tasks in medical image analysis, in\nwhich one or several regions/volumes of interest should be annotated. This\npaper introduces the RIDGE checklist, a framework for assessing the\nReproducibility, Integrity, Dependability, Generalizability, and Efficiency of\ndeep learning-based medical image segmentation models. The checklist serves as\na guide for researchers to enhance the quality and transparency of their work,\nensuring that segmentation models are not only scientifically sound but also\nclinically relevant.\n","authors":["Farhad Maleki","Linda Moy","Reza Forghani","Tapotosh Ghosh","Katie Ovens","Steve Langer","Pouria Rouzrokh","Bardia Khosravi","Ali Ganjizadeh","Daniel Warren","Roxana Daneshjou","Mana Moassefi","Atlas Haddadi Avval","Susan Sotardi","Neil Tenenholtz","Felipe Kitamura","Timothy Kline"],"pdf_url":"https://arxiv.org/pdf/2401.08847v1.pdf","comment":"20 pages, 1 Figure, 1 Table"},{"id":"http://arxiv.org/abs/2303.00915v2","updated":"2024-01-16T21:42:24Z","published":"2023-03-02T02:20:04Z","title":"BiomedCLIP: a multimodal biomedical foundation model pretrained from\n fifteen million scientific image-text pairs","summary":" Biomedical data is inherently multimodal, comprising physical measurements\nand natural language narratives. A generalist biomedical AI model needs to\nsimultaneously process different modalities of data, including text and images.\nTherefore, training an effective generalist biomedical model requires\nhigh-quality multimodal data, such as parallel image-text pairs. Here, we\npresent PMC-15M, a novel dataset that is two orders of magnitude larger than\nexisting biomedical multimodal datasets such as MIMIC-CXR, and spans a diverse\nrange of biomedical image types. PMC-15M contains 15 million biomedical\nimage-text pairs collected from 4.4 million scientific articles. Based on\nPMC-15M, we have pretrained BiomedCLIP, a multimodal foundation model, with\ndomain-specific adaptations tailored to biomedical vision-language processing.\nWe conducted extensive experiments and ablation studies on standard biomedical\nimaging tasks from retrieval to classification to visual question-answering\n(VQA). BiomedCLIP achieved new state-of-the-art results in a wide range of\nstandard datasets, substantially outperforming prior approaches. 
Intriguingly,\nby large-scale pretraining on diverse biomedical image types, BiomedCLIP even\noutperforms state-of-the-art radiology-specific models such as BioViL in\nradiology-specific tasks such as RSNA pneumonia detection. In summary,\nBiomedCLIP is a fully open-access foundation model that achieves\nstate-of-the-art performance on various biomedical tasks, paving the way for\ntransformative multimodal biomedical discovery and applications. We release our\nmodels at https://aka.ms/biomedclip to facilitate future research in multimodal\nbiomedical AI.\n","authors":["Sheng Zhang","Yanbo Xu","Naoto Usuyama","Hanwen Xu","Jaspreet Bagga","Robert Tinn","Sam Preston","Rajesh Rao","Mu Wei","Naveen Valluri","Cliff Wong","Andrea Tupini","Yu Wang","Matt Mazzola","Swadheen Shukla","Lars Liden","Jianfeng Gao","Matthew P. Lungren","Tristan Naumann","Sheng Wang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2303.00915v2.pdf","comment":"The models are released at https://aka.ms/biomedclip"},{"id":"http://arxiv.org/abs/2401.08840v1","updated":"2024-01-16T21:33:01Z","published":"2024-01-16T21:33:01Z","title":"Efficient Neural Representation of Volumetric Data using\n Coordinate-Based Networks","summary":" In this paper, we propose an efficient approach for the compression and\nrepresentation of volumetric data utilizing coordinate-based networks and\nmulti-resolution hash encoding. Efficient compression of volumetric data is\ncrucial for various applications, such as medical imaging and scientific\nsimulations. Our approach enables effective compression by learning a mapping\nbetween spatial coordinates and intensity values. We compare different encoding\nschemes and demonstrate the superiority of multi-resolution hash encoding in\nterms of compression quality and training efficiency. Furthermore, we leverage\noptimization-based meta-learning, specifically using the Reptile algorithm, to\nlearn weight initialization for neural representations tailored to volumetric\ndata, enabling faster convergence during optimization. Additionally, we compare\nour approach with state-of-the-art methods to showcase improved image quality\nand compression ratios. These findings highlight the potential of\ncoordinate-based networks and multi-resolution hash encoding for an efficient\nand accurate representation of volumetric data, paving the way for advancements\nin large-scale data visualization and other applications.\n","authors":["Sudarshan Devkota","Sumanta Pattanaik"],"pdf_url":"https://arxiv.org/pdf/2401.08840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03996v3","updated":"2024-01-16T21:30:52Z","published":"2023-12-07T02:23:32Z","title":"Stable Diffusion for Data Augmentation in COCO and Weed Datasets","summary":" Generative models have increasingly impacted relative tasks, from computer\nvision to interior design and other fields. Stable diffusion is an outstanding\ndiffusion model that paves the way for producing high-resolution images with\nthorough details from text prompts or reference images. It will be an\ninteresting topic about gaining improvements for small datasets with\nimage-sparse categories. This study utilized seven common categories and three\nwidespread weed species to evaluate the efficiency of a stable diffusion model.\nIn detail, Stable diffusion was used to generate synthetic images belonging to\nthese classes; three techniques (i.e., Image-to-image translation, Dreambooth,\nand ControlNet) based on stable diffusion were leveraged for image generation\nwith different focuses. 
Then, classification and detection tasks were conducted\nbased on these synthetic images, whose performance was compared to the models\ntrained on original images. Promising results have been achieved in some\nclasses. This seminal study may expedite the adaption of stable diffusion\nmodels to different fields.\n","authors":["Boyang Deng"],"pdf_url":"https://arxiv.org/pdf/2312.03996v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08837v1","updated":"2024-01-16T21:21:17Z","published":"2024-01-16T21:21:17Z","title":"Image Fusion in Remote Sensing: An Overview and Meta Analysis","summary":" Image fusion in Remote Sensing (RS) has been a consistent demand due to its\nability to turn raw images of different resolutions, sources, and modalities\ninto accurate, complete, and spatio-temporally coherent images. It greatly\nfacilitates downstream applications such as pan-sharpening, change detection,\nland-cover classification, etc. Yet, image fusion solutions are highly\ndisparate to various remote sensing problems and thus are often narrowly\ndefined in existing reviews as topical applications, such as pan-sharpening,\nand spatial-temporal image fusion. Considering that image fusion can be\ntheoretically applied to any gridded data through pixel-level operations, in\nthis paper, we expanded its scope by comprehensively surveying relevant works\nwith a simple taxonomy: 1) many-to-one image fusion; 2) many-to-many image\nfusion. This simple taxonomy defines image fusion as a mapping problem that\nturns either a single or a set of images into another single or set of images,\ndepending on the desired coherence, e.g., spectral, spatial/resolution\ncoherence, etc. We show that this simple taxonomy, despite the significant\nmodality difference it covers, can be presented by a conceptually easy\nframework. In addition, we provide a meta-analysis to review the major papers\nstudying the various types of image fusion and their applications over the\nyears (from the 1980s to date), covering 5,926 peer-reviewed papers. Finally,\nwe discuss the main benefits and emerging challenges to provide open research\ndirections and potential future works.\n","authors":["Hessah Albanwan","Rongjun Qin","Yang Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08837v1.pdf","comment":"21pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.08825v1","updated":"2024-01-16T20:57:36Z","published":"2024-01-16T20:57:36Z","title":"AiGen-FoodReview: A Multimodal Dataset of Machine-Generated Restaurant\n Reviews and Images on Social Media","summary":" Online reviews in the form of user-generated content (UGC) significantly\nimpact consumer decision-making. However, the pervasive issue of not only human\nfake content but also machine-generated content challenges UGC's reliability.\nRecent advances in Large Language Models (LLMs) may pave the way to fabricate\nindistinguishable fake generated content at a much lower cost. Leveraging\nOpenAI's GPT-4-Turbo and DALL-E-2 models, we craft AiGen-FoodReview, a\nmulti-modal dataset of 20,144 restaurant review-image pairs divided into\nauthentic and machine-generated. We explore unimodal and multimodal detection\nmodels, achieving 99.80% multimodal accuracy with FLAVA. We use attributes from\nreadability and photographic theories to score reviews and images,\nrespectively, demonstrating their utility as hand-crafted features in scalable\nand interpretable detection models, with comparable performance. 
The paper\ncontributes by open-sourcing the dataset and releasing fake review detectors,\nrecommending its use in unimodal and multimodal fake review detection tasks,\nand evaluating linguistic and visual features in synthetic versus authentic\ndata.\n","authors":["Alessandro Gambetti","Qiwei Han"],"pdf_url":"https://arxiv.org/pdf/2401.08825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08815v1","updated":"2024-01-16T20:31:46Z","published":"2024-01-16T20:31:46Z","title":"Adversarial Supervision Makes Layout-to-Image Diffusion Models Thrive","summary":" Despite the recent advances in large-scale diffusion models, little progress\nhas been made on the layout-to-image (L2I) synthesis task. Current L2I models\neither suffer from poor editability via text or weak alignment between the\ngenerated image and the input layout. This limits their usability in practice.\nTo mitigate this, we propose to integrate adversarial supervision into the\nconventional training pipeline of L2I diffusion models (ALDM). Specifically, we\nemploy a segmentation-based discriminator which provides explicit feedback to\nthe diffusion generator on the pixel-level alignment between the denoised image\nand the input layout. To encourage consistent adherence to the input layout\nover the sampling steps, we further introduce the multistep unrolling strategy.\nInstead of looking at a single timestep, we unroll a few steps recursively to\nimitate the inference process, and ask the discriminator to assess the\nalignment of denoised images with the layout over a certain time window. Our\nexperiments show that ALDM enables layout faithfulness of the generated images,\nwhile allowing broad editability via text prompts. Moreover, we showcase its\nusefulness for practical applications: by synthesizing target distribution\nsamples via text control, we improve domain generalization of semantic\nsegmentation models by a large margin (~12 mIoU points).\n","authors":["Yumeng Li","Margret Keuper","Dan Zhang","Anna Khoreva"],"pdf_url":"https://arxiv.org/pdf/2401.08815v1.pdf","comment":"Accepted at ICLR 2024. Project page:\n https://yumengli007.github.io/ALDM/ and code:\n https://github.com/boschresearch/ALDM"},{"id":"http://arxiv.org/abs/2401.08809v1","updated":"2024-01-16T20:22:37Z","published":"2024-01-16T20:22:37Z","title":"Learning Implicit Representation for Reconstructing Articulated Objects","summary":" 3D Reconstruction of moving articulated objects without additional\ninformation about object structure is a challenging problem. Current methods\novercome such challenges by employing category-specific skeletal models.\nConsequently, they do not generalize well to articulated objects in the wild.\nWe treat an articulated object as an unknown, semi-rigid skeletal structure\nsurrounded by nonrigid material (e.g., skin). Our method simultaneously\nestimates the visible (explicit) representation (3D shapes, colors, camera\nparameters) and the implicit skeletal representation, from motion cues in the\nobject video without 3D supervision. Our implicit representation consists of\nfour parts. (1) Skeleton, which specifies how semi-rigid parts are connected.\n(2) \\textcolor{black}{Skinning Weights}, which associates each surface vertex\nwith semi-rigid parts with probability. (3) Rigidity Coefficients, specifying\nthe articulation of the local surface. (4) Time-Varying Transformations, which\nspecify the skeletal motion and surface deformation parameters. 
We introduce an\nalgorithm that uses physical constraints as regularization terms and\niteratively estimates both implicit and explicit representations. Our method is\ncategory-agnostic, thus eliminating the need for category-specific skeletons,\nwe show that our method outperforms state-of-the-art across standard video\ndatasets.\n","authors":["Hao Zhang","Fang Li","Samyak Rawlekar","Narendra Ahuja"],"pdf_url":"https://arxiv.org/pdf/2401.08809v1.pdf","comment":"Accepted by ICLR 2024. Code: https://github.com/haoz19/LIMR"},{"id":"http://arxiv.org/abs/2401.08787v1","updated":"2024-01-16T19:10:09Z","published":"2024-01-16T19:10:09Z","title":"Segment Anything Model Can Not Segment Anything: Assessing AI Foundation\n Model's Generalizability in Permafrost Mapping","summary":" This paper assesses trending AI foundation models, especially emerging\ncomputer vision foundation models and their performance in natural landscape\nfeature segmentation. While the term foundation model has quickly garnered\ninterest from the geospatial domain, its definition remains vague. Hence, this\npaper will first introduce AI foundation models and their defining\ncharacteristics. Built upon the tremendous success achieved by Large Language\nModels (LLMs) as the foundation models for language tasks, this paper discusses\nthe challenges of building foundation models for geospatial artificial\nintelligence (GeoAI) vision tasks. To evaluate the performance of large AI\nvision models, especially Meta's Segment Anything Model (SAM), we implemented\ndifferent instance segmentation pipelines that minimize the changes to SAM to\nleverage its power as a foundation model. A series of prompt strategies was\ndeveloped to test SAM's performance regarding its theoretical upper bound of\npredictive accuracy, zero-shot performance, and domain adaptability through\nfine-tuning. The analysis used two permafrost feature datasets, ice-wedge\npolygons and retrogressive thaw slumps because (1) these landform features are\nmore challenging to segment than manmade features due to their complicated\nformation mechanisms, diverse forms, and vague boundaries; (2) their presence\nand changes are important indicators for Arctic warming and climate change. The\nresults show that although promising, SAM still has room for improvement to\nsupport AI-augmented terrain mapping. The spatial and domain generalizability\nof this finding is further validated using a more general dataset EuroCrop for\nagricultural field mapping. Finally, we discuss future research directions that\nstrengthen SAM's applicability in challenging geospatial domains.\n","authors":["Wenwen Li","Chia-Yu Hsu","Sizhe Wang","Yezhou Yang","Hyunho Lee","Anna Liljedahl","Chandi Witharana","Yili Yang","Brendan M. Rogers","Samantha T. Arundel","Matthew B. Jones","Kenton McHenry","Patricia Solis"],"pdf_url":"https://arxiv.org/pdf/2401.08787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08577v1","updated":"2024-01-16T18:59:45Z","published":"2024-01-16T18:59:45Z","title":"MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in\n 3D World","summary":" Human beings possess the capability to multiply a melange of multisensory\ncues while actively exploring and interacting with the 3D world. Current\nmulti-modal large language models, however, passively absorb sensory data as\ninputs, lacking the capacity to actively interact with the objects in the 3D\nenvironment and dynamically collect their multisensory information. 
To usher in\nthe study of this area, we propose MultiPLY, a multisensory embodied large\nlanguage model that could incorporate multisensory interactive data, including\nvisual, audio, tactile, and thermal information into large language models,\nthereby establishing the correlation among words, actions, and percepts. To\nthis end, we first collect Multisensory Universe, a large-scale multisensory\ninteraction dataset comprising 500k data by deploying an LLM-powered embodied\nagent to engage with the 3D environment. To perform instruction tuning with\npre-trained LLM on such generated data, we first encode the 3D scene as\nabstracted object-centric representations and then introduce action tokens\ndenoting that the embodied agent takes certain actions within the environment,\nas well as state tokens that represent the multisensory state observations of\nthe agent at each time step. In the inference time, MultiPLY could generate\naction tokens, instructing the agent to take the action in the environment and\nobtain the next multisensory state observation. The observation is then\nappended back to the LLM via state tokens to generate subsequent text or action\ntokens. We demonstrate that MultiPLY outperforms baselines by a large margin\nthrough a diverse set of embodied tasks involving object retrieval, tool use,\nmultisensory captioning, and task decomposition.\n","authors":["Yining Hong","Zishuo Zheng","Peihao Chen","Yian Wang","Junyan Li","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2401.08577v1.pdf","comment":"Project page: https://vis-www.cs.umass.edu/multiply"},{"id":"http://arxiv.org/abs/2401.08743v1","updated":"2024-01-16T18:59:24Z","published":"2024-01-16T18:59:24Z","title":"MMToM-QA: Multimodal Theory of Mind Question Answering","summary":" Theory of Mind (ToM), the ability to understand people's minds, is an\nessential ingredient for developing machines with human-level social\nintelligence. Recent machine learning models, particularly large language\nmodels, seem to show some aspects of ToM understanding. However, existing ToM\nbenchmarks use unimodal datasets - either video or text. Human ToM, on the\nother hand, is more than video or text understanding. People can flexibly\nreason about another person's mind based on conceptual representations (e.g.,\ngoals, beliefs, plans) extracted from any available data, which can include\nvisual cues, linguistic narratives, or both. To address this, we introduce a\nmultimodal Theory of Mind question answering (MMToM-QA) benchmark. MMToM-QA\ncomprehensively evaluates machine ToM both on multimodal data and on different\nkinds of unimodal data about a person's activity in a household environment. To\nengineer multimodal ToM capacity, we propose a novel method, BIP-ALM (Bayesian\nInverse Planning Accelerated by Language Models). BIP-ALM extracts unified\nrepresentations from multimodal data and utilizes language models for scalable\nBayesian inverse planning. We conducted a systematic comparison of human\nperformance, BIP-ALM, and state-of-the-art models, including GPT-4. The\nexperiments demonstrate that large language models and large multimodal models\nstill lack robust ToM capacity. BIP-ALM, on the other hand, shows promising\nresults, by leveraging the power of both model-based mental inference and\nlanguage models.\n","authors":["Chuanyang Jin","Yutong Wu","Jing Cao","Jiannan Xiang","Yen-Ling Kuo","Zhiting Hu","Tomer Ullman","Antonio Torralba","Joshua B. 
Tenenbaum","Tianmin Shu"],"pdf_url":"https://arxiv.org/pdf/2401.08743v1.pdf","comment":"27 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.08573v1","updated":"2024-01-16T18:58:36Z","published":"2024-01-16T18:58:36Z","title":"Benchmarking the Robustness of Image Watermarks","summary":" This paper investigates the weaknesses of image watermarking techniques. We\npresent WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel\nbenchmark for assessing watermark robustness, overcoming the limitations of\ncurrent evaluation methods.WAVES integrates detection and identification tasks,\nand establishes a standardized evaluation protocol comprised of a diverse range\nof stress tests. The attacks in WAVES range from traditional image distortions\nto advanced and novel variations of adversarial, diffusive, and embedding-based\nattacks. We introduce a normalized score of attack potency which incorporates\nseveral widely used image quality metrics and allows us to produce of an\nordered ranking of attacks. Our comprehensive evaluation over reveals\npreviously undetected vulnerabilities of several modern watermarking\nalgorithms. WAVES is envisioned as a toolkit for the future development of\nrobust watermarking systems.\n","authors":["Bang An","Mucong Ding","Tahseen Rabbani","Aakriti Agrawal","Yuancheng Xu","Chenghao Deng","Sicheng Zhu","Abdirisak Mohamed","Yuxin Wen","Tom Goldstein","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08742v1","updated":"2024-01-16T18:58:36Z","published":"2024-01-16T18:58:36Z","title":"Fast Dynamic 3D Object Generation from a Single-view Video","summary":" Generating dynamic three-dimensional (3D) object from a single-view video is\nchallenging due to the lack of 4D labeled data. Existing methods extend\ntext-to-3D pipelines by transferring off-the-shelf image generation models such\nas score distillation sampling, but they are slow and expensive to scale (e.g.,\n150 minutes per object) due to the need for back-propagating the\ninformation-limited supervision signals through a large pretrained model. To\naddress this limitation, we propose an efficient video-to-4D object generation\nframework called Efficient4D. It generates high-quality spacetime-consistent\nimages under different camera views, and then uses them as labeled data to\ndirectly train a novel 4D Gaussian splatting model with explicit point cloud\ngeometry, enabling real-time rendering under continuous camera trajectories.\nExtensive experiments on synthetic and real videos show that Efficient4D offers\na remarkable 10-fold increase in speed when compared to prior art alternatives\nwhile preserving the same level of innovative view synthesis quality. For\nexample, Efficient4D takes only 14 minutes to model a dynamic object.\n","authors":["Zijie Pan","Zeyu Yang","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.08742v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.08570v1","updated":"2024-01-16T18:57:50Z","published":"2024-01-16T18:57:50Z","title":"RoHM: Robust Human Motion Reconstruction via Diffusion","summary":" We propose RoHM, an approach for robust 3D human motion reconstruction from\nmonocular RGB(-D) videos in the presence of noise and occlusions. Most previous\napproaches either train neural networks to directly regress motion in 3D or\nlearn data-driven motion priors and combine them with optimization at test\ntime. 
The former do not recover globally coherent motion and fail under\nocclusions; the latter are time-consuming, prone to local minima, and require\nmanual tuning. To overcome these shortcomings, we exploit the iterative,\ndenoising nature of diffusion models. RoHM is a novel diffusion-based motion\nmodel that, conditioned on noisy and occluded input data, reconstructs\ncomplete, plausible motions in consistent global coordinates. Given the\ncomplexity of the problem -- requiring one to address different tasks\n(denoising and infilling) in different solution spaces (local and global\nmotion) -- we decompose it into two sub-tasks and learn two models, one for\nglobal trajectory and one for local motion. To capture the correlations between\nthe two, we then introduce a novel conditioning module, combining it with an\niterative inference scheme. We apply RoHM to a variety of tasks -- from motion\nreconstruction and denoising to spatial and temporal infilling. Extensive\nexperiments on three popular datasets show that our method outperforms\nstate-of-the-art approaches qualitatively and quantitatively, while being\nfaster at test time. The code will be available at\nhttps://sanweiliti.github.io/ROHM/ROHM.html.\n","authors":["Siwei Zhang","Bharat Lal Bhatnagar","Yuanlu Xu","Alexander Winkler","Petr Kadlecek","Siyu Tang","Federica Bogo"],"pdf_url":"https://arxiv.org/pdf/2401.08570v1.pdf","comment":"With the appendix included"},{"id":"http://arxiv.org/abs/2401.08741v1","updated":"2024-01-16T18:55:54Z","published":"2024-01-16T18:55:54Z","title":"Fixed Point Diffusion Models","summary":" We introduce the Fixed Point Diffusion Model (FPDM), a novel approach to\nimage generation that integrates the concept of fixed point solving into the\nframework of diffusion-based generative modeling. Our approach embeds an\nimplicit fixed point solving layer into the denoising network of a diffusion\nmodel, transforming the diffusion process into a sequence of closely-related\nfixed point problems. Combined with a new stochastic training method, this\napproach significantly reduces model size, reduces memory usage, and\naccelerates training. Moreover, it enables the development of two new\ntechniques to improve sampling efficiency: reallocating computation across\ntimesteps and reusing fixed point solutions between timesteps. We conduct\nextensive experiments with state-of-the-art models on ImageNet, FFHQ,\nCelebA-HQ, and LSUN-Church, demonstrating substantial improvements in\nperformance and efficiency. Compared to the state-of-the-art DiT model, FPDM\ncontains 87% fewer parameters, consumes 60% less memory during training, and\nimproves image generation quality in situations where sampling computation or\ntime is limited. Our code and pretrained models are available at\nhttps://lukemelas.github.io/fixed-point-diffusion-models.\n","authors":["Xingjian Bai","Luke Melas-Kyriazi"],"pdf_url":"https://arxiv.org/pdf/2401.08741v1.pdf","comment":"Project page:\n https://lukemelas.github.io/fixed-point-diffusion-models"},{"id":"http://arxiv.org/abs/2401.08740v1","updated":"2024-01-16T18:55:25Z","published":"2024-01-16T18:55:25Z","title":"SiT: Exploring Flow and Diffusion-based Generative Models with Scalable\n Interpolant Transformers","summary":" We present Scalable Interpolant Transformers (SiT), a family of generative\nmodels built on the backbone of Diffusion Transformers (DiT). 
The interpolant\nframework, which allows for connecting two distributions in a more flexible way\nthan standard diffusion models, makes possible a modular study of various\ndesign choices impacting generative models built on dynamical transport: using\ndiscrete vs. continuous time learning, deciding the objective for the model to\nlearn, choosing the interpolant connecting the distributions, and deploying a\ndeterministic or stochastic sampler. By carefully introducing the above\ningredients, SiT surpasses DiT uniformly across model sizes on the conditional\nImageNet 256x256 benchmark using the exact same backbone, number of parameters,\nand GFLOPs. By exploring various diffusion coefficients, which can be tuned\nseparately from learning, SiT achieves an FID-50K score of 2.06.\n","authors":["Nanye Ma","Mark Goldstein","Michael S. Albergo","Nicholas M. Boffi","Eric Vanden-Eijnden","Saining Xie"],"pdf_url":"https://arxiv.org/pdf/2401.08740v1.pdf","comment":"Code available: https://github.com/willisma/SiT"},{"id":"http://arxiv.org/abs/2401.08739v1","updated":"2024-01-16T18:55:22Z","published":"2024-01-16T18:55:22Z","title":"EgoGen: An Egocentric Synthetic Data Generator","summary":" Understanding the world in first-person view is fundamental in Augmented\nReality (AR). This immersive perspective brings dramatic visual changes and\nunique challenges compared to third-person views. Synthetic data has empowered\nthird-person-view vision models, but its application to embodied egocentric\nperception tasks remains largely unexplored. A critical challenge lies in\nsimulating natural human movements and behaviors that effectively steer the\nembodied cameras to capture a faithful egocentric representation of the 3D\nworld. To address this challenge, we introduce EgoGen, a new synthetic data\ngenerator that can produce accurate and rich ground-truth training data for\negocentric perception tasks. At the heart of EgoGen is a novel human motion\nsynthesis model that directly leverages egocentric visual inputs of a virtual\nhuman to sense the 3D environment. Combined with collision-avoiding motion\nprimitives and a two-stage reinforcement learning approach, our motion\nsynthesis model offers a closed-loop solution where the embodied perception and\nmovement of the virtual human are seamlessly coupled. Compared to previous\nworks, our model eliminates the need for a pre-defined global path, and is\ndirectly applicable to dynamic environments. Combined with our easy-to-use and\nscalable data generation pipeline, we demonstrate EgoGen's efficacy in three\ntasks: mapping and localization for head-mounted cameras, egocentric camera\ntracking, and human mesh recovery from egocentric views. EgoGen will be fully\nopen-sourced, offering a practical solution for creating realistic egocentric\ntraining data and aiming to serve as a useful tool for egocentric computer\nvision research. Refer to our project page: https://ego-gen.github.io/.\n","authors":["Gen Li","Kaifeng Zhao","Siwei Zhang","Xiaozhong Lyu","Mihai Dusmanu","Yan Zhang","Marc Pollefeys","Siyu Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08739v1.pdf","comment":"22 pages, 16 figures. Project page: https://ego-gen.github.io/"},{"id":"http://arxiv.org/abs/2108.12056v9","updated":"2024-01-16T18:55:17Z","published":"2021-08-26T22:53:27Z","title":"Continual learning under domain transfer with sparse synaptic bursting","summary":" Existing machines are functionally specific tools that were made for easy\nprediction and control. 
Tomorrow's machines may be closer to biological systems\nin their mutability, resilience, and autonomy. But first they must be capable\nof learning and retaining new information without being exposed to it\narbitrarily often. Past efforts to engineer such systems have sought to build\nor regulate artificial neural networks using disjoint sets of weights that are\nuniquely sensitive to specific tasks or inputs. This has not yet enabled\ncontinual learning over long sequences of previously unseen data without\ncorrupting existing knowledge: a problem known as catastrophic forgetting. In\nthis paper, we introduce a system that can learn sequentially over previously\nunseen datasets (ImageNet, CIFAR-100) with little forgetting over time. This is\ndone by controlling the activity of weights in a convolutional neural network\non the basis of inputs using top-down regulation generated by a second\nfeed-forward neural network. We find that our method learns continually under\ndomain transfer with sparse bursts of activity in weights that are recycled\nacross tasks, rather than by maintaining task-specific modules. Sparse synaptic\nbursting is found to balance activity and suppression such that new functions\ncan be learned without corrupting extant knowledge, thus mirroring the balance\nof order and disorder in systems at the edge of chaos. This behavior emerges\nduring a prior pre-training (or 'meta-learning') phase in which regulated\nsynapses are selectively disinhibited, or grown, from an initial state of\nuniform suppression through prediction error minimization.\n","authors":["Shawn L. Beaulieu","Jeff Clune","Nick Cheney"],"pdf_url":"https://arxiv.org/pdf/2108.12056v9.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08567v1","updated":"2024-01-16T18:52:27Z","published":"2024-01-16T18:52:27Z","title":"Connect, Collapse, Corrupt: Learning Cross-Modal Tasks with Uni-Modal\n Data","summary":" Building cross-modal applications is challenging due to limited paired\nmulti-modal data. Recent works have shown that leveraging a pre-trained\nmulti-modal contrastive representation space enables cross-modal tasks to be\nlearned from uni-modal data. This is based on the assumption that contrastive\noptimization makes embeddings from different modalities interchangeable.\nHowever, this assumption is under-explored due to the poorly understood\ngeometry of the multi-modal contrastive space, where a modality gap exists. In\nour study, we provide a theoretical explanation of this space's geometry and\nintroduce a three-step method, $C^3$ (Connect, Collapse, Corrupt), to bridge\nthe modality gap, enhancing the interchangeability of embeddings. Our $C^3$\nmethod significantly improves cross-modal learning from uni-modal data,\nachieving state-of-the-art results on zero-shot image / audio / video\ncaptioning and text-to-image generation.\n","authors":["Yuhui Zhang","Elaine Sui","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2401.08567v1.pdf","comment":"Published at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08562v1","updated":"2024-01-16T18:47:38Z","published":"2024-01-16T18:47:38Z","title":"Registration of algebraic varieties using Riemannian optimization","summary":" We consider the point cloud registration problem, the task of finding a\ntransformation between two point clouds that represent the same object but are\nexpressed in different coordinate systems. 
Our approach is not based on a\npoint-to-point correspondence, matching every point in the source point cloud\nto a point in the target point cloud. Instead, we assume and leverage a\nlow-dimensional nonlinear geometric structure of the data. Firstly, we\napproximate each point cloud by an algebraic variety (a set defined by finitely\nmany polynomial equations). This is done by solving an optimization problem on\nthe Grassmann manifold, using a connection between algebraic varieties and\npolynomial bases. Secondly, we solve an optimization problem on the orthogonal\ngroup to find the transformation (rotation $+$ translation) which makes the two\nalgebraic varieties overlap. We use second-order Riemannian optimization\nmethods for the solution of both steps. Numerical experiments on real and\nsynthetic data are provided, with encouraging results. Our approach is\nparticularly useful when the two point clouds describe different parts of an\nobjects (which may not even be overlapping), on the condition that the surface\nof the object may be well approximated by a set of polynomial equations. The\nfirst procedure -- the approximation -- is of independent interest, as it can\nbe used for denoising data that belongs to an algebraic variety. We provide\nstatistical guarantees for the estimation error of the denoising using Stein's\nunbiased estimator.\n","authors":["Florentin Goyens","Coralia Cartis","Stéphane Chrétien"],"pdf_url":"https://arxiv.org/pdf/2401.08562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08559v1","updated":"2024-01-16T18:39:15Z","published":"2024-01-16T18:39:15Z","title":"Multi-Track Timeline Control for Text-Driven 3D Human Motion Generation","summary":" Recent advances in generative modeling have led to promising progress on\nsynthesizing 3D human motion from text, with methods that can generate\ncharacter animations from short prompts and specified durations. However, using\na single text prompt as input lacks the fine-grained control needed by\nanimators, such as composing multiple actions and defining precise durations\nfor parts of the motion. To address this, we introduce the new problem of\ntimeline control for text-driven motion synthesis, which provides an intuitive,\nyet fine-grained, input interface for users. Instead of a single prompt, users\ncan specify a multi-track timeline of multiple prompts organized in temporal\nintervals that may overlap. This enables specifying the exact timings of each\naction and composing multiple actions in sequence or at overlapping intervals.\nTo generate composite animations from a multi-track timeline, we propose a new\ntest-time denoising method. This method can be integrated with any pre-trained\nmotion diffusion model to synthesize realistic motions that accurately reflect\nthe timeline. At every step of denoising, our method processes each timeline\ninterval (text prompt) individually, subsequently aggregating the predictions\nwith consideration for the specific body parts engaged in each action.\nExperimental comparisons and ablations validate that our method produces\nrealistic motions that respect the semantics and timing of given text prompts.\nOur code and models are publicly available at https://mathis.petrovich.fr/stmc.\n","authors":["Mathis Petrovich","Or Litany","Umar Iqbal","Michael J. 
Black","Gül Varol","Xue Bin Peng","Davis Rempe"],"pdf_url":"https://arxiv.org/pdf/2401.08559v1.pdf","comment":"Project page: https://mathis.petrovich.fr/stmc"},{"id":"http://arxiv.org/abs/2204.10993v2","updated":"2024-01-16T18:37:11Z","published":"2022-04-23T05:18:39Z","title":"TerrainMesh: Metric-Semantic Terrain Reconstruction from Aerial Images\n Using Joint 2D-3D Learning","summary":" This paper considers outdoor terrain mapping using RGB images obtained from\nan aerial vehicle. While feature-based localization and mapping techniques\ndeliver real-time vehicle odometry and sparse keypoint depth reconstruction, a\ndense model of the environment geometry and semantics (vegetation, buildings,\netc.) is usually recovered offline with significant computation and storage.\nThis paper develops a joint 2D-3D learning approach to reconstruct a local\nmetric-semantic mesh at each camera keyframe maintained by a visual odometry\nalgorithm. Given the estimated camera trajectory, the local meshes can be\nassembled into a global environment model to capture the terrain topology and\nsemantics during online operation. A local mesh is reconstructed using an\ninitialization and refinement stage. In the initialization stage, we estimate\nthe mesh vertex elevation by solving a least squares problem relating the\nvertex barycentric coordinates to the sparse keypoint depth measurements. In\nthe refinement stage, we associate 2D image and semantic features with the 3D\nmesh vertices using camera projection and apply graph convolution to refine the\nmesh vertex spatial coordinates and semantic features based on joint 2D and 3D\nsupervision. Quantitative and qualitative evaluation using real aerial images\nshow the potential of our method to support environmental monitoring and\nsurveillance applications.\n","authors":["Qiaojun Feng","Nikolay Atanasov"],"pdf_url":"https://arxiv.org/pdf/2204.10993v2.pdf","comment":"19 pages, 17 figures. arXiv admin note: text overlap with\n arXiv:2101.01844"},{"id":"http://arxiv.org/abs/2310.19224v2","updated":"2024-01-16T18:26:50Z","published":"2023-10-30T02:03:28Z","title":"CHAMMI: A benchmark for channel-adaptive models in microscopy imaging","summary":" Most neural networks assume that input images have a fixed number of channels\n(three for RGB images). However, there are many settings where the number of\nchannels may vary, such as microscopy images where the number of channels\nchanges depending on instruments and experimental goals. Yet, there has not\nbeen a systemic attempt to create and evaluate neural networks that are\ninvariant to the number and type of channels. As a result, trained models\nremain specific to individual studies and are hardly reusable for other\nmicroscopy settings. In this paper, we present a benchmark for investigating\nchannel-adaptive models in microscopy imaging, which consists of 1) a dataset\nof varied-channel single-cell images, and 2) a biologically relevant evaluation\nframework. In addition, we adapted several existing techniques to create\nchannel-adaptive models and compared their performance on this benchmark to\nfixed-channel, baseline models. 
We find that channel-adaptive models can\ngeneralize better to out-of-domain tasks and can be computationally efficient.\nWe contribute a curated dataset (https://doi.org/10.5281/zenodo.7988357) and an\nevaluation API (https://github.com/broadinstitute/MorphEm.git) to facilitate\nobjective comparisons in future research and applications.\n","authors":["Zitong Chen","Chau Pham","Siqi Wang","Michael Doron","Nikita Moshkov","Bryan A. Plummer","Juan C. Caicedo"],"pdf_url":"https://arxiv.org/pdf/2310.19224v2.pdf","comment":"Accepted at NeurIPS Track on Datasets and Benchmarks, 2023"},{"id":"http://arxiv.org/abs/2311.14656v3","updated":"2024-01-16T18:20:08Z","published":"2023-11-24T18:46:02Z","title":"Charting New Territories: Exploring the Geographic and Geospatial\n Capabilities of Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown remarkable capabilities\nacross a broad range of tasks but their knowledge and abilities in the\ngeographic and geospatial domains are yet to be explored, despite potential\nwide-ranging benefits to navigation, environmental research, urban development,\nand disaster response. We conduct a series of experiments exploring various\nvision capabilities of MLLMs within these domains, particularly focusing on the\nfrontier model GPT-4V, and benchmark its performance against open-source\ncounterparts. Our methodology involves challenging these models with a\nsmall-scale geographic benchmark consisting of a suite of visual tasks, testing\ntheir abilities across a spectrum of complexity. The analysis uncovers not only\nwhere such models excel, including instances where they outperform humans, but\nalso where they falter, providing a balanced view of their capabilities in the\ngeographic domain. To enable the comparison and evaluation of future models,\nour benchmark will be publicly released.\n","authors":["Jonathan Roberts","Timo Lüddecke","Rehan Sheikh","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2311.14656v3.pdf","comment":"V3: Fixed typo in Fig.1; V2: Minor formatting changes and added\n missing subfigure captions"},{"id":"http://arxiv.org/abs/2401.08541v1","updated":"2024-01-16T18:03:37Z","published":"2024-01-16T18:03:37Z","title":"Scalable Pre-training of Large Autoregressive Image Models","summary":" This paper introduces AIM, a collection of vision models pre-trained with an\nautoregressive objective. These models are inspired by their textual\ncounterparts, i.e., Large Language Models (LLMs), and exhibit similar scaling\nproperties. Specifically, we highlight two key findings: (1) the performance of\nthe visual features scale with both the model capacity and the quantity of\ndata, (2) the value of the objective function correlates with the performance\nof the model on downstream tasks. We illustrate the practical implication of\nthese findings by pre-training a 7 billion parameter AIM on 2 billion images,\nthat achieves 84.0% on ImageNet-1k with a frozen trunk. Interestingly, even at\nthis scale, we observe no sign of saturation in performance, suggesting that\nAIM potentially represents a new frontier for training large-scale vision\nmodels. 
The pre-training of AIM is similar to the pre-training of LLMs, and\ndoes not require any image-specific strategy to stabilize the training at\nscale.\n","authors":["Alaaeldin El-Nouby","Michal Klein","Shuangfei Zhai","Miguel Angel Bautista","Alexander Toshev","Vaishaal Shankar","Joshua M Susskind","Armand Joulin"],"pdf_url":"https://arxiv.org/pdf/2401.08541v1.pdf","comment":"https://github.com/apple/ml-aim"},{"id":"http://arxiv.org/abs/2311.01017v3","updated":"2024-01-16T18:02:27Z","published":"2023-11-02T06:21:56Z","title":"Learning Unsupervised World Models for Autonomous Driving via Discrete\n Diffusion","summary":" Learning world models can teach an agent how the world works in an\nunsupervised manner. Even though it can be viewed as a special case of sequence\nmodeling, progress for scaling world models on robotic applications such as\nautonomous driving has been somewhat less rapid than scaling language models\nwith Generative Pre-trained Transformers (GPT). We identify two reasons as\nmajor bottlenecks: dealing with complex and unstructured observation space, and\nhaving a scalable generative model. Consequently, we propose a novel world\nmodeling approach that first tokenizes sensor observations with VQVAE, then\npredicts the future via discrete diffusion. To efficiently decode and denoise\ntokens in parallel, we recast Masked Generative Image Transformer into the\ndiscrete diffusion framework with a few simple changes, resulting in notable\nimprovement. When applied to learning world models on point cloud observations,\nour model reduces prior SOTA Chamfer distance by more than 65% for 1s\nprediction, and more than 50% for 3s prediction, across NuScenes, KITTI\nOdometry, and Argoverse2 datasets. Our results demonstrate that discrete\ndiffusion on tokenized agent experience can unlock the power of GPT-like\nunsupervised learning for robotic agents.\n","authors":["Lunjun Zhang","Yuwen Xiong","Ze Yang","Sergio Casas","Rui Hu","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2311.01017v3.pdf","comment":"Published as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08527v1","updated":"2024-01-16T17:45:01Z","published":"2024-01-16T17:45:01Z","title":"MICA: Towards Explainable Skin Lesion Diagnosis via Multi-Level\n Image-Concept Alignment","summary":" Black-box deep learning approaches have showcased significant potential in\nthe realm of medical image analysis. However, the stringent trustworthiness\nrequirements intrinsic to the medical field have catalyzed research into the\nutilization of Explainable Artificial Intelligence (XAI), with a particular\nfocus on concept-based methods. Existing concept-based methods predominantly\napply concept annotations from a single perspective (e.g., global level),\nneglecting the nuanced semantic relationships between sub-regions and concepts\nembedded within medical images. This leads to underutilization of the valuable\nmedical information and may cause models to fall short in harmoniously\nbalancing interpretability and performance when employing inherently\ninterpretable architectures such as Concept Bottlenecks. To mitigate these\nshortcomings, we propose a multi-modal explainable disease diagnosis framework\nthat meticulously aligns medical images and clinical-related concepts\nsemantically at multiple strata, encompassing the image level, token level, and\nconcept level. 
Moreover, our method allows for model intervention and offers\nboth textual and visual explanations in terms of human-interpretable concepts.\nExperimental results on three skin image datasets demonstrate that our method,\nwhile preserving model interpretability, attains high performance and label\nefficiency for concept detection and disease diagnosis.\n","authors":["Yequan Bie","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08525v1","updated":"2024-01-16T17:43:42Z","published":"2024-01-16T17:43:42Z","title":"GATS: Gather-Attend-Scatter","summary":" As the AI community increasingly adopts large-scale models, it is crucial to\ndevelop general and flexible tools to integrate them. We introduce\nGather-Attend-Scatter (GATS), a novel module that enables seamless combination\nof pretrained foundation models, both trainable and frozen, into larger\nmultimodal networks. GATS empowers AI systems to process and generate\ninformation across multiple modalities at different rates. In contrast to\ntraditional fine-tuning, GATS allows for the original component models to\nremain frozen, avoiding the risk of them losing important knowledge acquired\nduring the pretraining phase. We demonstrate the utility and versatility of\nGATS with a few experiments across games, robotics, and multimodal input-output\nsystems.\n","authors":["Konrad Zolna","Serkan Cabi","Yutian Chen","Eric Lau","Claudio Fantacci","Jurgis Pasukonis","Jost Tobias Springenberg","Sergio Gomez Colmenarejo"],"pdf_url":"https://arxiv.org/pdf/2401.08525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08734v1","updated":"2024-01-16T17:42:36Z","published":"2024-01-16T17:42:36Z","title":"Bag of Tricks to Boost Adversarial Transferability","summary":" Deep neural networks are widely known to be vulnerable to adversarial\nexamples. However, vanilla adversarial examples generated under the white-box\nsetting often exhibit low transferability across different models. Since\nadversarial transferability poses more severe threats to practical\napplications, various approaches have been proposed for better transferability,\nincluding gradient-based, input transformation-based, and model-related\nattacks, \\etc. In this work, we find that several tiny changes in the existing\nadversarial attacks can significantly affect the attack performance, \\eg, the\nnumber of iterations and step size. Based on careful studies of existing\nadversarial attacks, we propose a bag of tricks to enhance adversarial\ntransferability, including momentum initialization, scheduled step size, dual\nexample, spectral-based input transformation, and several ensemble strategies.\nExtensive experiments on the ImageNet dataset validate the high effectiveness\nof our proposed tricks and show that combining them can further boost\nadversarial transferability. 
Our work provides practical insights and\ntechniques to enhance adversarial transferability, and offers guidance to\nimprove the attack performance on the real-world application through simple\nadjustments.\n","authors":["Zeliang Zhang","Rongyi Zhu","Wei Yao","Xiaosen Wang","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2401.08734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08522v1","updated":"2024-01-16T17:33:54Z","published":"2024-01-16T17:33:54Z","title":"Video Quality Assessment Based on Swin TransformerV2 and Coarse to Fine\n Strategy","summary":" The objective of non-reference video quality assessment is to evaluate the\nquality of distorted video without access to reference high-definition\nreferences. In this study, we introduce an enhanced spatial perception module,\npre-trained on multiple image quality assessment datasets, and a lightweight\ntemporal fusion module to address the no-reference visual quality assessment\n(NR-VQA) task. This model implements Swin Transformer V2 as a local-level\nspatial feature extractor and fuses these multi-stage representations through a\nseries of transformer layers. Furthermore, a temporal transformer is utilized\nfor spatiotemporal feature fusion across the video. To accommodate compressed\nvideos of varying bitrates, we incorporate a coarse-to-fine contrastive\nstrategy to enrich the model's capability to discriminate features from videos\nof different bitrates. This is an expanded version of the one-page abstract.\n","authors":["Zihao Yu","Fengbin Guan","Yiting Lu","Xin Li","Zhibo Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08518v1","updated":"2024-01-16T17:31:43Z","published":"2024-01-16T17:31:43Z","title":"PPSURF: Combining Patches and Point Convolutions for Detailed Surface\n Reconstruction","summary":" 3D surface reconstruction from point clouds is a key step in areas such as\ncontent creation, archaeology, digital cultural heritage, and engineering.\nCurrent approaches either try to optimize a non-data-driven surface\nrepresentation to fit the points, or learn a data-driven prior over the\ndistribution of commonly occurring surfaces and how they correlate with\npotentially noisy point clouds. Data-driven methods enable robust handling of\nnoise and typically either focus on a global or a local prior, which trade-off\nbetween robustness to noise on the global end and surface detail preservation\non the local end. We propose PPSurf as a method that combines a global prior\nbased on point convolutions and a local prior based on processing local point\ncloud patches. 
We show that this approach is robust to noise while recovering\nsurface details more accurately than the current state-of-the-art.\n Our source code, pre-trained model and dataset are available at:\nhttps://github.com/cg-tuwien/ppsurf\n","authors":["Philipp Erler","Lizeth Fuentes","Pedro Hermosilla","Paul Guerrero","Renato Pajarola","Michael Wimmer"],"pdf_url":"https://arxiv.org/pdf/2401.08518v1.pdf","comment":"Published in Computer Graphics Forum (Jan 2024):\n https://onlinelibrary.wiley.com/doi/10.1111/cgf.15000"},{"id":"http://arxiv.org/abs/2310.13349v2","updated":"2024-01-16T17:17:26Z","published":"2023-10-20T08:27:13Z","title":"DeepFDR: A Deep Learning-based False Discovery Rate Control Method for\n Neuroimaging Data","summary":" Voxel-based multiple testing is widely used in neuroimaging data analysis.\nTraditional false discovery rate (FDR) control methods often ignore the spatial\ndependence among the voxel-based tests and thus suffer from substantial loss of\ntesting power. While recent spatial FDR control methods have emerged, their\nvalidity and optimality remain questionable when handling the complex spatial\ndependencies of the brain. Concurrently, deep learning methods have\nrevolutionized image segmentation, a task closely related to voxel-based\nmultiple testing. In this paper, we propose DeepFDR, a novel spatial FDR\ncontrol method that leverages unsupervised deep learning-based image\nsegmentation to address the voxel-based multiple testing problem. Numerical\nstudies, including comprehensive simulations and Alzheimer's disease FDG-PET\nimage analysis, demonstrate DeepFDR's superiority over existing methods.\nDeepFDR not only excels in FDR control and effectively diminishes the false\nnondiscovery rate, but also boasts exceptional computational efficiency highly\nsuited for tackling large-scale neuroimaging data.\n","authors":["Taehyo Kim","Hai Shu","Qiran Jia","Mony de Leon"],"pdf_url":"https://arxiv.org/pdf/2310.13349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08503v1","updated":"2024-01-16T17:04:30Z","published":"2024-01-16T17:04:30Z","title":"Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis","summary":" One-shot 3D talking portrait generation aims to reconstruct a 3D avatar from\nan unseen image, and then animate it with a reference video or audio to\ngenerate a talking portrait video. The existing methods fail to simultaneously\nachieve the goals of accurate 3D avatar reconstruction and stable talking face\nanimation. Besides, while the existing works mainly focus on synthesizing the\nhead part, it is also vital to generate natural torso and background segments\nto obtain a realistic talking portrait video. To address these limitations, we\npresent Real3D-Portrait, a framework that (1) improves the one-shot 3D\nreconstruction power with a large image-to-plane model that distills 3D prior\nknowledge from a 3D face generative model; (2) facilitates accurate\nmotion-conditioned animation with an efficient motion adapter; (3) synthesizes\nrealistic video with natural torso movement and switchable background using a\nhead-torso-background super-resolution model; and (4) supports one-shot\naudio-driven talking face generation with a generalizable audio-to-motion\nmodel. 
Extensive experiments show that Real3D-Portrait generalizes well to\nunseen identities and generates more realistic talking portrait videos compared\nto previous methods.\n","authors":["Zhenhui Ye","Tianyun Zhong","Yi Ren","Jiaqi Yang","Weichuang Li","Jiawei Huang","Ziyue Jiang","Jinzheng He","Rongjie Huang","Jinglin Liu","Chen Zhang","Xiang Yin","Zejun Ma","Zhou Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.08503v1.pdf","comment":"ICLR 2024 (Spotlight). Project page: https://real3dportrait.github.io"},{"id":"http://arxiv.org/abs/2401.08501v1","updated":"2024-01-16T17:02:21Z","published":"2024-01-16T17:02:21Z","title":"ValUES: A Framework for Systematic Validation of Uncertainty Estimation\n in Semantic Segmentation","summary":" Uncertainty estimation is an essential and heavily-studied component for the\nreliable application of semantic segmentation methods. While various studies\nexist claiming methodological advances on the one hand, and successful\napplication on the other hand, the field is currently hampered by a gap between\ntheory and practice leaving fundamental questions unanswered: Can data-related\nand model-related uncertainty really be separated in practice? Which components\nof an uncertainty method are essential for real-world performance? Which\nuncertainty method works well for which application? In this work, we link this\nresearch gap to a lack of systematic and comprehensive evaluation of\nuncertainty methods. Specifically, we identify three key pitfalls in current\nliterature and present an evaluation framework that bridges the research gap by\nproviding 1) a controlled environment for studying data ambiguities as well as\ndistribution shifts, 2) systematic ablations of relevant method components, and\n3) test-beds for the five predominant uncertainty applications: OoD-detection,\nactive learning, failure detection, calibration, and ambiguity modeling.\nEmpirical results on simulated as well as real-world data demonstrate how the\nproposed framework is able to answer the predominant questions in the field\nrevealing for instance that 1) separation of uncertainty types works on\nsimulated data but does not necessarily translate to real-world data, 2)\naggregation of scores is a crucial but currently neglected component of\nuncertainty methods, 3) While ensembles are performing most robustly across the\ndifferent downstream tasks and settings, test-time augmentation often\nconstitutes a light-weight alternative. Code is at:\nhttps://github.com/IML-DKFZ/values\n","authors":["Kim-Celine Kahl","Carsten T. Lüth","Maximilian Zenk","Klaus Maier-Hein","Paul F. Jaeger"],"pdf_url":"https://arxiv.org/pdf/2401.08501v1.pdf","comment":"ICLR 2024 (oral)"},{"id":"http://arxiv.org/abs/2401.03201v2","updated":"2024-01-16T16:39:57Z","published":"2024-01-06T12:20:18Z","title":"3DMIT: 3D Multi-modal Instruction Tuning for Scene Understanding","summary":" The remarkable potential of multi-modal large language models (MLLMs) in\ncomprehending both vision and language information has been widely\nacknowledged. However, the scarcity of 3D scenes-language pairs in comparison\nto their 2D counterparts, coupled with the inadequacy of existing approaches in\nunderstanding of 3D scenes by LLMs, poses a significant challenge. In response,\nwe collect and construct an extensive dataset comprising 75K\ninstruction-response pairs tailored for 3D scenes. This dataset addresses tasks\nrelated to 3D VQA, 3D grounding, and 3D conversation. 
To further enhance the\nintegration of 3D spatial information into LLMs, we introduce a novel and\nefficient prompt tuning paradigm, 3DMIT. This paradigm eliminates the alignment\nstage between 3D scenes and language and extends the instruction prompt with\nthe 3D modality information including the entire scene and segmented objects.\nWe evaluate the effectiveness of our method across diverse tasks in the 3D\nscene domain and find that our approach serves as a strategic means to enrich\nLLMs' comprehension of the 3D world. Our code is available at\nhttps://github.com/staymylove/3DMIT.\n","authors":["Zeju Li","Chao Zhang","Xiaoyan Wang","Ruilong Ren","Yifan Xu","Ruifei Ma","Xiangde Liu"],"pdf_url":"https://arxiv.org/pdf/2401.03201v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.12474v3","updated":"2024-01-16T16:31:48Z","published":"2023-10-19T05:15:17Z","title":"Enhancing High-Resolution 3D Generation through Pixel-wise Gradient\n Clipping","summary":" High-resolution 3D object generation remains a challenging task primarily due\nto the limited availability of comprehensive annotated training data. Recent\nadvancements have aimed to overcome this constraint by harnessing image\ngenerative models, pretrained on extensive curated web datasets, using\nknowledge transfer techniques like Score Distillation Sampling (SDS).\nEfficiently addressing the requirements of high-resolution rendering often\nnecessitates the adoption of latent representation-based models, such as the\nLatent Diffusion Model (LDM). In this framework, a significant challenge\narises: To compute gradients for individual image pixels, it is necessary to\nbackpropagate gradients from the designated latent space through the frozen\ncomponents of the image model, such as the VAE encoder used within LDM.\nHowever, this gradient propagation pathway has never been optimized, remaining\nuncontrolled during training. We find that the unregulated gradients adversely\naffect the 3D model's capacity in acquiring texture-related information from\nthe image generative model, leading to poor quality appearance synthesis. To\naddress this overarching challenge, we propose an innovative operation termed\nPixel-wise Gradient Clipping (PGC) designed for seamless integration into\nexisting 3D generative models, thereby enhancing their synthesis quality.\nSpecifically, we control the magnitude of stochastic gradients by clipping the\npixel-wise gradients efficiently, while preserving crucial texture-related\ngradient directions. Despite this simplicity and minimal extra cost, extensive\nexperiments demonstrate the efficacy of our PGC in enhancing the performance of\nexisting 3D generative models for high-resolution object rendering.\n","authors":["Zijie Pan","Jiachen Lu","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.12474v3.pdf","comment":"Technical report. Project page https://fudan-zvg.github.io/PGC-3D"},{"id":"http://arxiv.org/abs/2401.00910v2","updated":"2024-01-16T16:28:58Z","published":"2023-12-31T23:53:50Z","title":"WoodScape Motion Segmentation for Autonomous Driving -- CVPR 2023 OmniCV\n Workshop Challenge","summary":" Motion segmentation is a complex yet indispensable task in autonomous\ndriving. The challenges introduced by the ego-motion of the cameras, radial\ndistortion in fisheye lenses, and the need for temporal consistency make the\ntask more complicated, rendering traditional and standard Convolutional Neural\nNetwork (CNN) approaches less effective. 
The consequent laborious data\nlabeling, representation of diverse and uncommon scenarios, and extensive data\ncapture requirements underscore the imperative of synthetic data for improving\nmachine learning model performance. To this end, we employ the PD-WoodScape\nsynthetic dataset developed by Parallel Domain, alongside the WoodScape fisheye\ndataset. Thus, we present the WoodScape fisheye motion segmentation challenge\nfor autonomous driving, held as part of the CVPR 2023 Workshop on\nOmnidirectional Computer Vision (OmniCV). As one of the first competitions\nfocused on fisheye motion segmentation, we aim to explore and evaluate the\npotential and impact of utilizing synthetic data in this domain. In this paper,\nwe provide a detailed analysis on the competition which attracted the\nparticipation of 112 global teams and a total of 234 submissions. This study\ndelineates the complexities inherent in the task of motion segmentation,\nemphasizes the significance of fisheye datasets, articulate the necessity for\nsynthetic datasets and the resultant domain gap they engender, outlining the\nfoundational blueprint for devising successful solutions. Subsequently, we\ndelve into the details of the baseline experiments and winning methods\nevaluating their qualitative and quantitative results, providing with useful\ninsights.\n","authors":["Saravanabalagi Ramachandran","Nathaniel Cibik","Ganesh Sistu","John McDonald"],"pdf_url":"https://arxiv.org/pdf/2401.00910v2.pdf","comment":"CVPR 2023 OmniCV Workshop Challenge"},{"id":"http://arxiv.org/abs/2401.08474v1","updated":"2024-01-16T16:25:37Z","published":"2024-01-16T16:25:37Z","title":"TUMTraf Event: Calibration and Fusion Resulting in a Dataset for\n Roadside Event-Based and RGB Cameras","summary":" Event-based cameras are predestined for Intelligent Transportation Systems\n(ITS). They provide very high temporal resolution and dynamic range, which can\neliminate motion blur and make objects easier to recognize at night. However,\nevent-based images lack color and texture compared to images from a\nconventional rgb camera. Considering that, data fusion between event-based and\nconventional cameras can combine the strengths of both modalities. For this\npurpose, extrinsic calibration is necessary. To the best of our knowledge, no\ntargetless calibration between event-based and rgb cameras can handle multiple\nmoving objects, nor data fusion optimized for the domain of roadside ITS\nexists, nor synchronized event-based and rgb camera datasets in the field of\nITS are known. To fill these research gaps, based on our previous work, we\nextend our targetless calibration approach with clustering methods to handle\nmultiple moving objects. Furthermore, we develop an early fusion, simple late\nfusion, and a novel spatiotemporal late fusion method. Lastly, we publish the\nTUMTraf Event Dataset, which contains more than 4k synchronized event-based and\nrgb images with 21.9k labeled 2D boxes. During our extensive experiments, we\nverified the effectiveness of our calibration method with multiple moving\nobjects. Furthermore, compared to a single rgb camera, we increased the\ndetection performance of up to +16% mAP in the day and up to +12% mAP in the\nchallenging night with our presented event-based sensor fusion methods. The\nTUMTraf Event Dataset is available at\nhttps://innovation-mobility.com/tumtraf-dataset.\n","authors":["Christian Creß","Walter Zimmer","Nils Purschke","Bach Ngoc Doan","Venkatnarayanan Lakshminarasimhan","Leah Strand","Alois C. 
Knoll"],"pdf_url":"https://arxiv.org/pdf/2401.08474v1.pdf","comment":"14 pages, 8 figures, 4 tables. This work has been submitted to the\n IEEE for possible publication. Copyright may be transferred without notice,\n after which this version may no longer be accessible"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.06355v2","updated":"2024-01-16T13:35:20Z","published":"2023-12-11T13:03:39Z","title":"Linguistic and Structural Basis of Engineering Design Knowledge","summary":" Artefact descriptions are the primary carriers of engineering design\nknowledge that is both an outcome and a driver of the design process. While an\nartefact could be described in different connotations, the design process\nrequires a description to embody engineering design knowledge, which is\nexpressed in the text through intricate placement of entities and\nrelationships. As large-language models learn from all kinds of text merely as\na sequence of characters/tokens, these are yet to generate text that embodies\nexplicit engineering design facts. Existing ontological design theories are\nless likely to guide the large-language models whose applications are currently\nlimited to ideation and learning purposes. In this article, we explicate\nengineering design knowledge as knowledge graphs from a large sample of 33,881\npatent documents. We examine the constituents of these knowledge graphs to\nunderstand the linguistic and structural basis of engineering design knowledge.\nIn terms of linguistic basis, we observe that entities and relationships could\nbe generalised to 64 and 24 linguistic syntaxes. While relationships mainly\ncapture attributes ('of'), structure ('in', 'with'), purpose ('to', 'for'),\nhierarchy ('include'), exemplification ('such as'), and behaviour ('to',\n'from'), the hierarchical relationships could specifically be identified using\n75 unique syntaxes. To understand the structural basis, we draw inspiration\nfrom various studies on biological/ecological networks and discover motifs from\npatent knowledge graphs. We identify four 3-node and four 4-node patterns that\ncould further be converged and simplified into sequence [->...->], aggregation\n[->...<-], and hierarchy [<-...->]. Expected to guide large-language model\nbased design tools, we propose few regulatory precepts for concretising\nabstract entities and relationships within subgraphs, while explicating\nhierarchical structures.\n","authors":["L. Siddharth","Jianxi Luo"],"pdf_url":"https://arxiv.org/pdf/2312.06355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02116v2","updated":"2024-01-16T12:53:42Z","published":"2024-01-04T07:52:00Z","title":"Starling: An I/O-Efficient Disk-Resident Graph Index Framework for\n High-Dimensional Vector Similarity Search on Data Segment","summary":" High-dimensional vector similarity search (HVSS) is gaining prominence as a\npowerful tool for various data science and AI applications. As vector data\nscales up, in-memory indexes pose a significant challenge due to the\nsubstantial increase in main memory requirements. A potential solution involves\nleveraging disk-based implementation, which stores and searches vector data on\nhigh-performance devices like NVMe SSDs. However, implementing HVSS for data\nsegments proves to be intricate in vector databases where a single machine\ncomprises multiple segments for system scalability. In this context, each\nsegment operates with limited memory and disk space, necessitating a delicate\nbalance between accuracy, efficiency, and space cost. 
Existing disk-based\nmethods fall short as they do not holistically address all these requirements\nsimultaneously. In this paper, we present Starling, an I/O-efficient\ndisk-resident graph index framework that optimizes data layout and search\nstrategy within the segment. It has two primary components: (1) a data layout\nincorporating an in-memory navigation graph and a reordered disk-based graph\nwith enhanced locality, reducing the search path length and minimizing disk\nbandwidth wastage; and (2) a block search strategy designed to minimize costly\ndisk I/O operations during vector query execution. Through extensive\nexperiments, we validate the effectiveness, efficiency, and scalability of\nStarling. On a data segment with 2GB memory and 10GB disk capacity, Starling\ncan accommodate up to 33 million vectors in 128 dimensions, offering HVSS with\nover 0.9 average precision and top-10 recall rate, and latency under 1\nmillisecond. The results showcase Starling's superior performance, exhibiting\n43.9$\\times$ higher throughput with 98% lower query latency compared to\nstate-of-the-art methods while maintaining the same level of accuracy.\n","authors":["Mengzhao Wang","Weizhi Xu","Xiaomeng Yi","Songlin Wu","Zhangyang Peng","Xiangyu Ke","Yunjun Gao","Xiaoliang Xu","Rentong Guo","Charles Xie"],"pdf_url":"https://arxiv.org/pdf/2401.02116v2.pdf","comment":"This paper has been accepted by SIGMOD 2024"},{"id":"http://arxiv.org/abs/2401.08272v1","updated":"2024-01-16T10:51:55Z","published":"2024-01-16T10:51:55Z","title":"Siamese Content-based Search Engine for a More Transparent Skin and\n Breast Cancer Diagnosis through Histological Imaging","summary":" Computer Aid Diagnosis (CAD) has developed digital pathology with Deep\nLearning (DL)-based tools to assist pathologists in decision-making.\nContent-Based Histopathological Image Retrieval (CBHIR) is a novel tool to seek\nhighly correlated patches in terms of similarity in histopathological features.\nIn this work, we proposed two CBHIR approaches on breast (Breast-twins) and\nskin cancer (Skin-twins) data sets for robust and accurate patch-level\nretrieval, integrating a custom-built Siamese network as a feature extractor.\nThe proposed Siamese network is able to generalize for unseen images by\nfocusing on the similar histopathological features of the input pairs. The\nproposed CBHIR approaches are evaluated on the Breast (public) and Skin\n(private) data sets with top K accuracy. Finding the optimum amount of K is\nchallenging, but also, as much as K increases, the dissimilarity between the\nquery and the returned images increases which might mislead the pathologists.\nTo the best of the author's belief, this paper is tackling this issue for the\nfirst time on histopathological images by evaluating the top first retrieved\nimages. The Breast-twins model achieves 70% of the F1score at the top first,\nwhich exceeds the other state-of-the-art methods at a higher amount of K such\nas 5 and 400. Skin-twins overpasses the recently proposed Convolutional Auto\nEncoder (CAE) by 67%, increasing the precision. Besides, the Skin-twins model\ntackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential\n(STUMP) to assist pathologists with retrieving top K images and their\ncorresponding labels. 
So, this approach can offer a more explainable CAD tool\nto pathologists in terms of transparency, trustworthiness, or reliability among\nother characteristics.\n","authors":["Zahra Tabatabaei","Adrián Colomer","Javier Oliver Moll","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2401.08272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08267v1","updated":"2024-01-16T10:41:09Z","published":"2024-01-16T10:41:09Z","title":"Ranking Heterogeneous Search Result Pages using the Interactive\n Probability Ranking Principle","summary":" The Probability Ranking Principle (PRP) ranks search results based on their\nexpected utility derived solely from document contents, often overlooking the\nnuances of presentation and user interaction. However, with the evolution of\nSearch Engine Result Pages (SERPs), now comprising a variety of result cards,\nthe manner in which these results are presented is pivotal in influencing user\nengagement and satisfaction. This shift prompts the question: How does the PRP\nand its user-centric counterpart, the Interactive Probability Ranking Principle\n(iPRP), compare in the context of these heterogeneous SERPs? Our study draws a\ncomparison between the PRP and the iPRP, revealing significant differences in\ntheir output. The iPRP, accounting for item-specific costs and interaction\nprobabilities to determine the ``Expected Perceived Utility\" (EPU), yields\ndifferent result orderings compared to the PRP. We evaluate the effect of the\nEPU on the ordering of results by observing changes in the ranking within a\nheterogeneous SERP compared to the traditional ``ten blue links''. We find that\nchanging the presentation affects the ranking of items according to the (iPRP)\nby up to 48\\% (with respect to DCG, TBG and RBO) in ad-hoc search tasks on the\nTREC WaPo Collection. This work suggests that the iPRP should be employed when\nranking heterogeneous SERPs to provide a user-centric ranking that adapts the\nordering based on the presentation and user engagement.\n","authors":["Kanaad Pathak","Leif Azzopardi","Martin Halvey"],"pdf_url":"https://arxiv.org/pdf/2401.08267v1.pdf","comment":"To be presented as a full paper at ECIR 2024 in Glasgow, UK"},{"id":"http://arxiv.org/abs/2401.08228v1","updated":"2024-01-16T09:27:28Z","published":"2024-01-16T09:27:28Z","title":"MCRPL: A Pretrain, Prompt & Fine-tune Paradigm for Non-overlapping\n Many-to-one Cross-domain Recommendation","summary":" Cross-domain Recommendation (CR) is the task that tends to improve the\nrecommendations in the sparse target domain by leveraging the information from\nother rich domains. Existing methods of cross-domain recommendation mainly\nfocus on overlapping scenarios by assuming users are totally or partially\noverlapped, which are taken as bridges to connect different domains. However,\nthis assumption does not always hold since it is illegal to leak users'\nidentity information to other domains. Conducting Non-overlapping MCR (NMCR) is\nchallenging since 1) The absence of overlapping information prevents us from\ndirectly aligning different domains, and this situation may get worse in the\nMCR scenario. 2) The distribution between source and target domains makes it\ndifficult for us to learn common information across domains. To overcome the\nabove challenges, we focus on NMCR, and devise MCRPL as our solution. To\naddress Challenge 1, we first learn shared domain-agnostic and domain-dependent\nprompts, and pre-train them in the pre-training stage. 
To address Challenge 2,\nwe further update the domain-dependent prompts with other parameters kept fixed\nto transfer the domain knowledge to the target domain. We conduct experiments\non five real-world domains, and the results show the advance of our MCRPL\nmethod compared with several recent SOTA baselines.\n","authors":["Hao Liu","Lei Guo","Lei Zhu","Yongqiang Jiang","Min Gao","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2401.08228v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08217v1","updated":"2024-01-16T09:04:17Z","published":"2024-01-16T09:04:17Z","title":"LLM-Guided Multi-View Hypergraph Learning for Human-Centric Explainable\n Recommendation","summary":" As personalized recommendation systems become vital in the age of information\noverload, traditional methods relying solely on historical user interactions\noften fail to fully capture the multifaceted nature of human interests. To\nenable more human-centric modeling of user preferences, this work proposes a\nnovel explainable recommendation framework, i.e., LLMHG, synergizing the\nreasoning capabilities of large language models (LLMs) and the structural\nadvantages of hypergraph neural networks. By effectively profiling and\ninterpreting the nuances of individual user interests, our framework pioneers\nenhancements to recommendation systems with increased explainability. We\nvalidate that explicitly accounting for the intricacies of human preferences\nallows our human-centric and explainable LLMHG approach to consistently\noutperform conventional models across diverse real-world datasets. The proposed\nplug-and-play enhancement framework delivers immediate gains in recommendation\nperformance while offering a pathway to apply advanced LLMs for better\ncapturing the complexity of human interests across machine learning\napplications.\n","authors":["Zhixuan Chu","Yan Wang","Qing Cui","Longfei Li","Wenqing Chen","Sheng Li","Zhan Qin","Kui Ren"],"pdf_url":"https://arxiv.org/pdf/2401.08217v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.08206v1","updated":"2024-01-16T08:44:29Z","published":"2024-01-16T08:44:29Z","title":"Generative Multi-Modal Knowledge Retrieval with Large Language Models","summary":" Knowledge retrieval with multi-modal queries plays a crucial role in\nsupporting knowledge-intensive multi-modal applications. However, existing\nmethods face challenges in terms of their effectiveness and training\nefficiency, especially when it comes to training and integrating multiple\nretrievers to handle multi-modal queries. In this paper, we propose an\ninnovative end-to-end generative framework for multi-modal knowledge retrieval.\nOur framework takes advantage of the fact that large language models (LLMs) can\neffectively serve as virtual knowledge bases, even when trained with limited\ndata. We retrieve knowledge via a two-step process: 1) generating knowledge\nclues related to the queries, and 2) obtaining the relevant document by\nsearching databases using the knowledge clue. In particular, we first introduce\nan object-aware prefix-tuning technique to guide multi-grained visual learning.\nThen, we align multi-grained visual features into the textual feature space of\nthe LLM, employing the LLM to capture cross-modal interactions. 
Subsequently,\nwe construct instruction data with a unified format for model training.\nFinally, we propose the knowledge-guided generation strategy to impose prior\nconstraints in the decoding steps, thereby promoting the generation of\ndistinctive knowledge clues. Through experiments conducted on three benchmarks,\nwe demonstrate significant improvements ranging from 3.0% to 14.6% across all\nevaluation metrics when compared to strong baselines.\n","authors":["Xinwei Long","Jiali Zeng","Fandong Meng","Zhiyuan Ma","Kaiyan Zhang","Bowen Zhou","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.08206v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.03648v2","updated":"2024-01-16T08:03:02Z","published":"2024-01-08T03:37:43Z","title":"Reproducibility Analysis and Enhancements for Multi-Aspect Dense\n Retriever with Aspect Learning","summary":" Multi-aspect dense retrieval aims to incorporate aspect information (e.g.,\nbrand and category) into dual encoders to facilitate relevance matching. As an\nearly and representative multi-aspect dense retriever, MADRAL learns several\nextra aspect embeddings and fuses the explicit aspects with an implicit aspect\n\"OTHER\" for final representation. MADRAL was evaluated on proprietary data and\nits code was not released, making it challenging to validate its effectiveness\non other datasets. We failed to reproduce its effectiveness on the public\nMA-Amazon data, motivating us to probe the reasons and re-examine its\ncomponents. We propose several component alternatives for comparisons,\nincluding replacing \"OTHER\" with \"CLS\" and representing aspects with the first\nseveral content tokens. Through extensive experiments, we confirm that learning\n\"OTHER\" from scratch in aspect fusion is harmful. In contrast, our proposed\nvariants can greatly enhance the retrieval performance. Our research not only\nsheds light on the limitations of MADRAL but also provides valuable insights\nfor future studies on more powerful multi-aspect dense retrieval models. Code\nwill be released at:\nhttps://github.com/sunxiaojie99/Reproducibility-for-MADRAL.\n","authors":["Keping Bi","Xiaojie Sun","Jiafeng Guo","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.03648v2.pdf","comment":"accepted by ecir2024 as a reproducibility paper"},{"id":"http://arxiv.org/abs/2312.02538v2","updated":"2024-01-16T07:57:34Z","published":"2023-12-05T07:08:08Z","title":"A Multi-Granularity-Aware Aspect Learning Model for Multi-Aspect Dense\n Retrieval","summary":" Dense retrieval methods have been mostly focused on unstructured text and\nless attention has been drawn to structured data with various aspects, e.g.,\nproducts with aspects such as category and brand. Recent work has proposed two\napproaches to incorporate the aspect information into item representations for\neffective retrieval by predicting the values associated with the item aspects.\nDespite their efficacy, they treat the values as isolated classes (e.g., \"Smart\nHomes\", \"Home, Garden & Tools\", and \"Beauty & Health\") and ignore their\nfine-grained semantic relation. Furthermore, they either enforce the learning\nof aspects into the CLS token, which could confuse it from its designated use\nfor representing the entire content semantics, or learn extra aspect embeddings\nonly with the value prediction objective, which could be insufficient\nespecially when there are no annotated values for an item aspect. 
Aware of\nthese limitations, we propose a MUlti-granulaRity-aware Aspect Learning model\n(MURAL) for multi-aspect dense retrieval. It leverages aspect information\nacross various granularities to capture both coarse and fine-grained semantic\nrelations between values. Moreover, MURAL incorporates separate aspect\nembeddings as input to transformer encoders so that the masked language model\nobjective can assist implicit aspect learning even without aspect-value\nannotations. Extensive experiments on two real-world datasets of products and\nmini-programs show that MURAL outperforms state-of-the-art baselines\nsignificantly.\n","authors":["Xiaojie Sun","Keping Bi","Jiafeng Guo","Sihui Yang","Qishen Zhang","Zhongyi Liu","Guannan Zhang","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.02538v2.pdf","comment":"Accepted by WSDM2024, update"},{"id":"http://arxiv.org/abs/2401.08104v1","updated":"2024-01-16T04:14:44Z","published":"2024-01-16T04:14:44Z","title":"A Reproducibility Study of Goldilocks: Just-Right Tuning of BERT for TAR","summary":" Screening documents is a tedious and time-consuming aspect of high-recall\nretrieval tasks, such as compiling a systematic literature review, where the\ngoal is to identify all relevant documents for a topic. To help streamline this\nprocess, many Technology-Assisted Review (TAR) methods leverage active learning\ntechniques to reduce the number of documents requiring review. BERT-based\nmodels have shown high effectiveness in text classification, leading to\ninterest in their potential use in TAR workflows. In this paper, we investigate\nrecent work that examined the impact of further pre-training epochs on the\neffectiveness and efficiency of a BERT-based active learning pipeline. We first\nreport that we could replicate the original experiments on two specific TAR\ndatasets, confirming some of the findings: importantly, that further\npre-training is critical to high effectiveness, but requires attention in terms\nof selecting the correct training epoch. We then investigate the\ngeneralisability of the pipeline on a different TAR task, that of medical\nsystematic reviews. In this context, we show that there is no need for further\npre-training if a domain-specific BERT backbone is used within the active\nlearning pipeline. This finding provides practical implications for using the\nstudied active learning pipeline within domain-specific TAR tasks.\n","authors":["Xinyu Mao","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2401.08104v1.pdf","comment":"Accepted at ECIR 2024 (reproducibility)"},{"id":"http://arxiv.org/abs/2401.08841v1","updated":"2024-01-16T21:33:06Z","published":"2024-01-16T21:33:06Z","title":"Exploring Content-Based and Meta-Data Analysis for Detecting Fake News\n Infodemic: A case study on COVID-19","summary":" The coronavirus pandemic (COVID-19) is probably the most disruptive global\nhealth disaster in recent history. It negatively impacted the whole world and\nvirtually brought the global economy to a standstill. However, as the virus was\nspreading, infecting people and claiming thousands of lives so was the spread\nand propagation of fake news, misinformation and disinformation about the\nevent. These included the spread of unconfirmed health advice and remedies on\nsocial media. In this paper, false information about the pandemic is identified\nusing a content-based approach and metadata curated from messages posted to\nonline social networks. 
A content-based approach combined with metadata as well\nas an initial feature analysis is used and then several supervised learning\nmodels are tested for identifying and predicting misleading posts. Our approach\nshows up to 93% accuracy in the detection of fake news related posts about the\nCOVID-19 pandemic\n","authors":["Oluwaseun Ajao","Ashish Garg","Marjory Da Costa-Abreu"],"pdf_url":"https://arxiv.org/pdf/2401.08841v1.pdf","comment":"8 pages, 5 figures, 3 tables, International Conference for Pattern\n Recognition Systems (ICPRS 2022)"},{"id":"http://arxiv.org/abs/2401.08818v1","updated":"2024-01-16T20:41:11Z","published":"2024-01-16T20:41:11Z","title":"Link Me Baby One More Time: Social Music Discovery on Spotify","summary":" We explore the social and contextual factors that influence the outcome of\nperson-to-person music recommendations and discovery. Specifically, we use data\nfrom Spotify to investigate how a link sent from one user to another results in\nthe receiver engaging with the music of the shared artist. We consider several\nfactors that may influence this process, such as the strength of the\nsender-receiver relationship, the user's role in the Spotify social network,\ntheir music social cohesion, and how similar the new artist is to the\nreceiver's taste. We find that the receiver of a link is more likely to engage\nwith a new artist when (1) they have similar music taste to the sender and the\nshared track is a good fit for their taste, (2) they have a stronger and more\nintimate tie with the sender, and (3) the shared artist is popular with the\nreceiver's connections. Finally, we use these findings to build a Random Forest\nclassifier to predict whether a shared music track will result in the\nreceiver's engagement with the shared artist. This model elucidates which type\nof social and contextual features are most predictive, although peak\nperformance is achieved when a diverse set of features are included. These\nfindings provide new insights into the multifaceted mechanisms underpinning the\ninterplay between music discovery and social processes.\n","authors":["Shazia'Ayn Babul","Desislava Hristova","Antonio Lima","Renaud Lambiotte","Mariano Beguerisse-Díaz"],"pdf_url":"https://arxiv.org/pdf/2401.08818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05492v2","updated":"2024-01-16T18:37:59Z","published":"2023-04-11T20:55:02Z","title":"Towards More Robust and Accurate Sequential Recommendation with\n Cascade-guided Adversarial Training","summary":" Sequential recommendation models, models that learn from chronological\nuser-item interactions, outperform traditional recommendation models in many\nsettings. Despite the success of sequential recommendation models, their\nrobustness has recently come into question. Two properties unique to the nature\nof sequential recommendation models may impair their robustness - the cascade\neffects induced during training and the model's tendency to rely too heavily on\ntemporal information. To address these vulnerabilities, we propose\nCascade-guided Adversarial training, a new adversarial training procedure that\nis specifically designed for sequential recommendation models. 
Our approach\nharnesses the intrinsic cascade effects present in sequential modeling to\nproduce strategic adversarial perturbations to item embeddings during training.\nExperiments on training state-of-the-art sequential models on four public\ndatasets from different domains show that our training approach produces\nsuperior model ranking accuracy and superior model robustness to real item\nreplacement perturbations when compared to both standard model training and\ngeneric adversarial training.\n","authors":["Juntao Tan","Shelby Heinecke","Zhiwei Liu","Yongjun Chen","Yongfeng Zhang","Huan Wang"],"pdf_url":"https://arxiv.org/pdf/2304.05492v2.pdf","comment":"Accepted to present at SIAM International Conference on Data Mining\n (SDM24)"},{"id":"http://arxiv.org/abs/2401.08519v1","updated":"2024-01-16T17:31:54Z","published":"2024-01-16T17:31:54Z","title":"From Graphs to Hypergraphs: Hypergraph Projection and its Remediation","summary":" We study the implications of the modeling choice to use a graph, instead of a\nhypergraph, to represent real-world interconnected systems whose constituent\nrelationships are of higher order by nature. Such a modeling choice typically\ninvolves an underlying projection process that maps the original hypergraph\nonto a graph, and is common in graph-based analysis. While hypergraph\nprojection can potentially lead to loss of higher-order relations, there exists\nvery limited studies on the consequences of doing so, as well as its\nremediation. This work fills this gap by doing two things: (1) we develop\nanalysis based on graph and set theory, showing two ubiquitous patterns of\nhyperedges that are root to structural information loss in all hypergraph\nprojections; we also quantify the combinatorial impossibility of recovering the\nlost higher-order structures if no extra help is provided; (2) we still seek to\nrecover the lost higher-order structures in hypergraph projection, and in light\nof (1)'s findings we propose to relax the problem into a learning-based\nsetting. Under this setting, we develop a learning-based hypergraph\nreconstruction method based on an important statistic of hyperedge\ndistributions that we find. Our reconstruction method is evaluated on 8\nreal-world datasets under different settings, and exhibits consistently good\nperformance. We also demonstrate benefits of the reconstructed hypergraphs via\nuse cases of protein rankings and link predictions.\n","authors":["Yanbang Wang","Jon Kleinberg"],"pdf_url":"https://arxiv.org/pdf/2401.08519v1.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08506v1","updated":"2024-01-16T17:07:24Z","published":"2024-01-16T17:07:24Z","title":"Content-Aware Tweet Location Inference using Quadtree Spatial\n Partitioning and Jaccard-Cosine Word Embedding","summary":" Inferring locations from user texts on social media platforms is a\nnon-trivial and challenging problem relating to public safety. We propose a\nnovel non-uniform grid-based approach for location inference from Twitter\nmessages using Quadtree spatial partitions. The proposed algorithm uses natural\nlanguage processing (NLP) for semantic understanding and incorporates Cosine\nsimilarity and Jaccard similarity measures for feature vector extraction and\ndimensionality reduction. We chose Twitter as our experimental social media\nplatform due to its popularity and effectiveness for the dissemination of news\nand stories about recent events happening around the world. 
Our approach is the\nfirst of its kind to make location inference from tweets using Quadtree spatial\npartitions and NLP, in hybrid word-vector representations. The proposed\nalgorithm achieved significant classification accuracy and outperformed\nstate-of-the-art grid-based content-only location inference methods by up to\n24% in correctly predicting tweet locations within a 161km radius and by 300km\nin median error distance on benchmark datasets.\n","authors":["Oluwaseun Ajao","Deepayan Bhowmik","Shahrzad Zargari"],"pdf_url":"https://arxiv.org/pdf/2401.08506v1.pdf","comment":"8 pages, 7 figures, 5 tables, International Conference on Advances in\n Social Networks Analysis and Mining (ASONAM 2018)"},{"id":"http://arxiv.org/abs/2401.08444v1","updated":"2024-01-16T15:44:32Z","published":"2024-01-16T15:44:32Z","title":"Revealing the Hidden Impact of Top-N Metrics on Optimization in\n Recommender Systems","summary":" The hyperparameters of recommender systems for top-n predictions are\ntypically optimized to enhance the predictive performance of algorithms.\nThereby, the optimization algorithm, e.g., grid search or random search,\nsearches for the best hyperparameter configuration according to an\noptimization-target metric, like nDCG or Precision. In contrast, the optimized\nalgorithm, internally optimizes a different loss function during training, like\nsquared error or cross-entropy. To tackle this discrepancy, recent work focused\non generating loss functions better suited for recommender systems. Yet, when\nevaluating an algorithm using a top-n metric during optimization, another\ndiscrepancy between the optimization-target metric and the training loss has so\nfar been ignored. During optimization, the top-n items are selected for\ncomputing a top-n metric; ignoring that the top-n items are selected from the\nrecommendations of a model trained with an entirely different loss function.\nItem recommendations suitable for optimization-target metrics could be outside\nthe top-n recommended items; hiddenly impacting the optimization performance.\nTherefore, we were motivated to analyze whether the top-n items are optimal for\noptimization-target top-n metrics. In pursuit of an answer, we exhaustively\nevaluate the predictive performance of 250 selection strategies besides\nselecting the top-n. We extensively evaluate each selection strategy over\ntwelve implicit feedback and eight explicit feedback data sets with eleven\nrecommender systems algorithms. Our results show that there exist selection\nstrategies other than top-n that increase predictive performance for various\nalgorithms and recommendation domains. However, the performance of the top ~43%\nof selection strategies is not significantly different. 
We discuss the impact\nof our findings on optimization and re-ranking in recommender systems and\nfeasible solutions.\n","authors":["Lukas Wegmeth","Tobias Vente","Lennart Purucker"],"pdf_url":"https://arxiv.org/pdf/2401.08444v1.pdf","comment":"Accepted in the Full Paper Track for ECIR 2024"},{"id":"http://arxiv.org/abs/2309.08420v7","updated":"2024-01-16T13:44:35Z","published":"2023-09-15T14:23:20Z","title":"FedDCSR: Federated Cross-domain Sequential Recommendation via\n Disentangled Representation Learning","summary":" Cross-domain Sequential Recommendation (CSR) which leverages user sequence\ndata from multiple domains has received extensive attention in recent years.\nHowever, the existing CSR methods require sharing origin user data across\ndomains, which violates the General Data Protection Regulation (GDPR). Thus, it\nis necessary to combine federated learning (FL) and CSR to fully utilize\nknowledge from different domains while preserving data privacy. Nonetheless,\nthe sequence feature heterogeneity across different domains significantly\nimpacts the overall performance of FL. In this paper, we propose FedDCSR, a\nnovel federated cross-domain sequential recommendation framework via\ndisentangled representation learning. Specifically, to address the sequence\nfeature heterogeneity across domains, we introduce an approach called\ninter-intra domain sequence representation disentanglement (SRD) to disentangle\nthe user sequence features into domain-shared and domain-exclusive features. In\naddition, we design an intra domain contrastive infomax (CIM) strategy to learn\nricher domain-exclusive features of users by performing data augmentation on\nuser sequences. Extensive experiments on three real-world scenarios demonstrate\nthat FedDCSR achieves significant improvements over existing baselines.\n","authors":["Hongyu Zhang","Dongyi Zheng","Xu Yang","Jiyuan Feng","Qing Liao"],"pdf_url":"https://arxiv.org/pdf/2309.08420v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08267v1","updated":"2024-01-16T10:41:09Z","published":"2024-01-16T10:41:09Z","title":"Ranking Heterogeneous Search Result Pages using the Interactive\n Probability Ranking Principle","summary":" The Probability Ranking Principle (PRP) ranks search results based on their\nexpected utility derived solely from document contents, often overlooking the\nnuances of presentation and user interaction. However, with the evolution of\nSearch Engine Result Pages (SERPs), now comprising a variety of result cards,\nthe manner in which these results are presented is pivotal in influencing user\nengagement and satisfaction. This shift prompts the question: How does the PRP\nand its user-centric counterpart, the Interactive Probability Ranking Principle\n(iPRP), compare in the context of these heterogeneous SERPs? Our study draws a\ncomparison between the PRP and the iPRP, revealing significant differences in\ntheir output. The iPRP, accounting for item-specific costs and interaction\nprobabilities to determine the ``Expected Perceived Utility\" (EPU), yields\ndifferent result orderings compared to the PRP. We evaluate the effect of the\nEPU on the ordering of results by observing changes in the ranking within a\nheterogeneous SERP compared to the traditional ``ten blue links''. We find that\nchanging the presentation affects the ranking of items according to the (iPRP)\nby up to 48\\% (with respect to DCG, TBG and RBO) in ad-hoc search tasks on the\nTREC WaPo Collection. 
This work suggests that the iPRP should be employed when\nranking heterogeneous SERPs to provide a user-centric ranking that adapts the\nordering based on the presentation and user engagement.\n","authors":["Kanaad Pathak","Leif Azzopardi","Martin Halvey"],"pdf_url":"https://arxiv.org/pdf/2401.08267v1.pdf","comment":"To be presented as a full paper at ECIR 2024 in Glasgow, UK"},{"id":"http://arxiv.org/abs/2401.09490v1","updated":"2024-01-16T21:03:10Z","published":"2024-01-16T21:03:10Z","title":"Gene-associated Disease Discovery Powered by Large Language Models","summary":" The intricate relationship between genetic variation and human diseases has\nbeen a focal point of medical research, evidenced by the identification of risk\ngenes regarding specific diseases. The advent of advanced genome sequencing\ntechniques has significantly improved the efficiency and cost-effectiveness of\ndetecting these genetic markers, playing a crucial role in disease diagnosis\nand forming the basis for clinical decision-making and early risk assessment.\nTo overcome the limitations of existing databases that record disease-gene\nassociations from existing literature, which often lack real-time updates, we\npropose a novel framework employing Large Language Models (LLMs) for the\ndiscovery of diseases associated with specific genes. This framework aims to\nautomate the labor-intensive process of sifting through medical literature for\nevidence linking genetic variations to diseases, thereby enhancing the\nefficiency of disease identification. Our approach involves using LLMs to\nconduct literature searches, summarize relevant findings, and pinpoint diseases\nrelated to specific genes. This paper details the development and application\nof our LLM-powered framework, demonstrating its potential in streamlining the\ncomplex process of literature retrieval and summarization to identify diseases\nassociated with specific genetic variations.\n","authors":["Jiayu Chang","Shiyu Wang","Chen Ling","Zhaohui Qin","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.09490v1.pdf","comment":"This is the official paper accepted by AAAI 2024 Workshop on Large\n Language Models for Biological Discoveries"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.08381v1","updated":"2024-01-16T14:11:54Z","published":"2024-01-16T14:11:54Z","title":"Robotic Imitation of Human Actions","summary":" Imitation can allow us to quickly gain an understanding of a new task.\nThrough a demonstration, we can gain direct knowledge about which actions need\nto be performed and which goals they have. In this paper, we introduce a new\napproach to imitation learning that tackles the challenges of a robot imitating\na human, such as the change in perspective and body schema. Our approach can\nuse a single human demonstration to abstract information about the demonstrated\ntask, and use that information to generalise and replicate it. 
We facilitate\nthis ability by a new integration of two state-of-the-art methods: a diffusion\naction segmentation model to abstract temporal information from the\ndemonstration and an open vocabulary object detector for spatial information.\nFurthermore, we refine the abstracted information and use symbolic reasoning to\ncreate an action plan utilising inverse kinematics, to allow the robot to\nimitate the demonstrated action.\n","authors":["Josua Spisak","Matthias Kerzel","Stefan Wermter"],"pdf_url":"https://arxiv.org/pdf/2401.08381v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08375v1","updated":"2024-01-16T14:07:36Z","published":"2024-01-16T14:07:36Z","title":"Sparse PCA with False Discovery Rate Controlled Variable Selection","summary":" Sparse principal component analysis (PCA) aims at mapping large dimensional\ndata to a linear subspace of lower dimension. By imposing loading vectors to be\nsparse, it performs the double duty of dimension reduction and variable\nselection. Sparse PCA algorithms are usually expressed as a trade-off between\nexplained variance and sparsity of the loading vectors (i.e., number of\nselected variables). As a high explained variance is not necessarily synonymous\nwith relevant information, these methods are prone to select irrelevant\nvariables. To overcome this issue, we propose an alternative formulation of\nsparse PCA driven by the false discovery rate (FDR). We then leverage the\nTerminating-Random Experiments (T-Rex) selector to automatically determine an\nFDR-controlled support of the loading vectors. A major advantage of the\nresulting T-Rex PCA is that no sparsity parameter tuning is required. Numerical\nexperiments and a stock market data example demonstrate a significant\nperformance improvement.\n","authors":["Jasin Machkour","Arnaud Breloy","Michael Muma","Daniel P. Palomar","Frédéric Pascal"],"pdf_url":"https://arxiv.org/pdf/2401.08375v1.pdf","comment":"Published in ICASSP 2024 - 2024 IEEE International Conference on\n Acoustics, Speech and Signal Processing (ICASSP), scheduled for 14-19 April\n 2024 in Seoul, Korea"},{"id":"http://arxiv.org/abs/2401.08364v1","updated":"2024-01-16T13:46:10Z","published":"2024-01-16T13:46:10Z","title":"Weighted Spectral Filters for Kernel Interpolation on Spheres: Estimates\n of Prediction Accuracy for Noisy Data","summary":" Spherical radial-basis-based kernel interpolation abounds in image sciences\nincluding geophysical image reconstruction, climate trends description and\nimage rendering due to its excellent spatial localization property and perfect\napproximation performance. However, in dealing with noisy data, kernel\ninterpolation frequently behaves not so well due to the large condition number\nof the kernel matrix and instability of the interpolation process. In this\npaper, we introduce a weighted spectral filter approach to reduce the condition\nnumber of the kernel matrix and then stabilize kernel interpolation. The main\nbuilding blocks of the proposed method are the well developed spherical\npositive quadrature rules and high-pass spectral filters. Using a recently\ndeveloped integral operator approach for spherical data analysis, we\ntheoretically demonstrate that the proposed weighted spectral filter approach\nsucceeds in breaking through the bottleneck of kernel interpolation, especially\nin fitting noisy data. 
We provide optimal approximation rates of the new method\nto show that our approach does not compromise the predicting accuracy.\nFurthermore, we conduct both toy simulations and two real-world data\nexperiments with synthetically added noise in geophysical image reconstruction\nand climate image processing to verify our theoretical assertions and show the\nfeasibility of the weighted spectral filter approach.\n","authors":["Xiaotong Liu","Jinxin Wang","Di Wang","Shao-Bo Lin"],"pdf_url":"https://arxiv.org/pdf/2401.08364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08351v1","updated":"2024-01-16T13:30:37Z","published":"2024-01-16T13:30:37Z","title":"Personalized Federated Learning of Probabilistic Models: A PAC-Bayesian\n Approach","summary":" Federated learning aims to infer a shared model from private and\ndecentralized data stored locally by multiple clients. Personalized federated\nlearning (PFL) goes one step further by adapting the global model to each\nclient, enhancing the model's fit for different clients. A significant level of\npersonalization is required for highly heterogeneous clients, but can be\nchallenging to achieve especially when they have small datasets. To address\nthis problem, we propose a PFL algorithm named PAC-PFL for learning\nprobabilistic models within a PAC-Bayesian framework that utilizes differential\nprivacy to handle data-dependent priors. Our algorithm collaboratively learns a\nshared hyper-posterior and regards each client's posterior inference as the\npersonalization step. By establishing and minimizing a generalization bound on\nthe average true risk of clients, PAC-PFL effectively combats over-fitting.\nPACPFL achieves accurate and well-calibrated predictions, supported by\nexperiments on a dataset of photovoltaic panel power generation, FEMNIST\ndataset (Caldas et al., 2019), and Dirichlet-partitioned EMNIST dataset (Cohen\net al., 2017).\n","authors":["Mahrokh Ghoddousi Boroujeni","Andreas Krause","Giancarlo Ferrari Trecate"],"pdf_url":"https://arxiv.org/pdf/2401.08351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08348v1","updated":"2024-01-16T13:29:30Z","published":"2024-01-16T13:29:30Z","title":"We don't need no labels: Estimating post-deployment model performance\n under covariate shift without ground truth","summary":" The performance of machine learning models often degrades after deployment\ndue to data distribution shifts. In many use cases, it is impossible to\ncalculate the post-deployment performance because labels are unavailable or\nsignificantly delayed. Proxy methods for evaluating model performance\nstability, like drift detection techniques, do not properly quantify data\ndistribution shift impact. As a solution, we propose a robust and accurate\nperformance estimation method for evaluating ML classification models on\nunlabeled data that accurately quantifies the impact of covariate shift on\nmodel performance. We call it multi-calibrated confidence-based performance\nestimation (M-CBPE). It is model and data-type agnostic and works for any\nperformance metric. It does not require access to the monitored model - it uses\nthe model predictions and probability estimates. M-CBPE does not need user\ninput on the nature of the covariate shift as it fully learns from the data. We\nevaluate it with over 600 dataset-model pairs from US census data and compare\nit with multiple benchmarks using several evaluation metrics. 
Results show that\nM-CBPE is the best method to estimate the performance of classification models\nin any evaluation context.\n","authors":["Jakub Białek","Wojtek Kuberski","Nikolaos Perrakis"],"pdf_url":"https://arxiv.org/pdf/2401.08348v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08330v1","updated":"2024-01-16T12:49:10Z","published":"2024-01-16T12:49:10Z","title":"Boosting Gradient Ascent for Continuous DR-submodular Maximization","summary":" Projected Gradient Ascent (PGA) is the most commonly used optimization scheme\nin machine learning and operations research areas. Nevertheless, numerous\nstudies and examples have shown that the PGA methods may fail to achieve the\ntight approximation ratio for continuous DR-submodular maximization problems.\nTo address this challenge, we present a boosting technique in this paper, which\ncan efficiently improve the approximation guarantee of the standard PGA to\n\\emph{optimal} with only small modifications on the objective function. The\nfundamental idea of our boosting technique is to exploit non-oblivious search\nto derive a novel auxiliary function $F$, whose stationary points are excellent\napproximations to the global maximum of the original DR-submodular objective\n$f$. Specifically, when $f$ is monotone and $\\gamma$-weakly DR-submodular, we\npropose an auxiliary function $F$ whose stationary points can provide a better\n$(1-e^{-\\gamma})$-approximation than the\n$(\\gamma^2/(1+\\gamma^2))$-approximation guaranteed by the stationary points of\n$f$ itself. Similarly, for the non-monotone case, we devise another auxiliary\nfunction $F$ whose stationary points can achieve an optimal\n$\\frac{1-\\min_{\\boldsymbol{x}\\in\\mathcal{C}}\\|\\boldsymbol{x}\\|_{\\infty}}{4}$-approximation\nguarantee where $\\mathcal{C}$ is a convex constraint set. In contrast, the\nstationary points of the original non-monotone DR-submodular function can be\narbitrarily bad~\\citep{chen2023continuous}. Furthermore, we demonstrate the\nscalability of our boosting technique on four problems. In all of these four\nproblems, our resulting variants of boosting PGA algorithm beat the previous\nstandard PGA in several aspects such as approximation ratio and efficiency.\nFinally, we corroborate our theoretical findings with numerical experiments,\nwhich demonstrate the effectiveness of our boosting PGA methods.\n","authors":["Qixin Zhang","Zongqi Wan","Zengde Deng","Zaiyi Chen","Xiaoming Sun","Jialin Zhang","Yu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08330v1.pdf","comment":"74 pages, 6 figures and 9 tables. An extended version of Stochastic\n Continuous Submodular Maximization: Boosting via Non-oblivious Function (ICML\n 2022)"},{"id":"http://arxiv.org/abs/2401.08327v1","updated":"2024-01-16T12:45:15Z","published":"2024-01-16T12:45:15Z","title":"Learn What You Need in Personalized Federated Learning","summary":" Personalized federated learning aims to address data heterogeneity across\nlocal clients in federated learning. However, current methods blindly\nincorporate either full model parameters or predefined partial parameters in\npersonalized federated learning. They fail to customize the collaboration\nmanner according to each local client's data characteristics, causing\nunpleasant aggregation results. 
To address this essential issue, we propose\n$\\textit{Learn2pFed}$, a novel algorithm-unrolling-based personalized federated\nlearning framework, enabling each client to adaptively select which part of its\nlocal model parameters should participate in collaborative training. The key\nnovelty of the proposed $\\textit{Learn2pFed}$ is to optimize each local model\nparameter's degree of participant in collaboration as learnable parameters via\nalgorithm unrolling methods. This approach brings two benefits: 1)\nmathmatically determining the participation degree of local model parameters in\nthe federated collaboration, and 2) obtaining more stable and improved\nsolutions. Extensive experiments on various tasks, including regression,\nforecasting, and image classification, demonstrate that $\\textit{Learn2pFed}$\nsignificantly outperforms previous personalized federated learning methods.\n","authors":["Kexin Lv","Rui Ye","Xiaolin Huang","Jie Yang","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19273v2","updated":"2024-01-16T12:38:15Z","published":"2023-10-30T05:12:24Z","title":"The Memory Perturbation Equation: Understanding Model's Sensitivity to\n Data","summary":" Understanding model's sensitivity to its training data is crucial but can\nalso be challenging and costly, especially during training. To simplify such\nissues, we present the Memory-Perturbation Equation (MPE) which relates model's\nsensitivity to perturbation in its training data. Derived using Bayesian\nprinciples, the MPE unifies existing sensitivity measures, generalizes them to\na wide-variety of models and algorithms, and unravels useful properties\nregarding sensitivities. Our empirical results show that sensitivity estimates\nobtained during training can be used to faithfully predict generalization on\nunseen test data. The proposed equation is expected to be useful for future\nresearch on robust and adaptive learning.\n","authors":["Peter Nickl","Lu Xu","Dharmesh Tailor","Thomas Möllenhoff","Mohammad Emtiyaz Khan"],"pdf_url":"https://arxiv.org/pdf/2310.19273v2.pdf","comment":"37th Conference on Neural Information Processing Systems (NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2401.08318v1","updated":"2024-01-16T12:36:17Z","published":"2024-01-16T12:36:17Z","title":"OpenDPD: An Open-Source End-to-End Learning & Benchmarking Framework for\n Wideband Power Amplifier Modeling and Digital Pre-Distortion","summary":" With the rise in communication capacity, deep neural networks (DNN) for\ndigital pre-distortion (DPD) to correct non-linearity in wideband power\namplifiers (PAs) have become prominent. Yet, there is a void in open-source and\nmeasurement-setup-independent platforms for fast DPD exploration and objective\nDPD model comparison. This paper presents an open-source framework, OpenDPD,\ncrafted in PyTorch, with an associated dataset for PA modeling and DPD\nlearning. We introduce a Dense Gated Recurrent Unit (DGRU)-DPD, trained via a\nnovel end-to-end learning architecture, outperforming previous DPD models on a\ndigital PA DPA in the new digital transmitter (DTX) architecture with\nunconventional transfer characteristics compared to analog PAs. Measurements\nshow our DGRU-DPD achieves an ACPR of -44.69/-44.47 dBc and an EVM of -35.22 dB\nfor 200 MHz OFDM signals. 
OpenDPD code, datasets, and documentation are\npublicly available at https://github.com/lab-emi/OpenDPD.\n","authors":["Yizhuo Wu","Gagan Deep Singh","Mohammadreza Beikmirza","Leo de Vreede","Morteza Alavi","Chang Gao"],"pdf_url":"https://arxiv.org/pdf/2401.08318v1.pdf","comment":"To be published at the 2024 IEEE International Symposium on Circuits\n and Systems (ISCAS), Singapore"},{"id":"http://arxiv.org/abs/2311.09620v2","updated":"2024-01-16T12:26:08Z","published":"2023-11-16T07:05:12Z","title":"GAIA: Delving into Gradient-based Attribution Abnormality for\n Out-of-distribution Detection","summary":" Detecting out-of-distribution (OOD) examples is crucial to guarantee the\nreliability and safety of deep neural networks in real-world settings. In this\npaper, we offer an innovative perspective on quantifying the disparities\nbetween in-distribution (ID) and OOD data -- analyzing the uncertainty that\narises when models attempt to explain their predictive decisions. This\nperspective is motivated by our observation that gradient-based attribution\nmethods encounter challenges in assigning feature importance to OOD data,\nthereby yielding divergent explanation patterns. Consequently, we investigate\nhow attribution gradients lead to uncertain explanation outcomes and introduce\ntwo forms of abnormalities for OOD detection: the zero-deflation abnormality\nand the channel-wise average abnormality. We then propose GAIA, a simple and\neffective approach that incorporates Gradient Abnormality Inspection and\nAggregation. The effectiveness of GAIA is validated on both commonly utilized\n(CIFAR) and large-scale (ImageNet-1k) benchmarks. Specifically, GAIA reduces\nthe average FPR95 by 23.10% on CIFAR10 and by 45.41% on CIFAR100 compared to\nadvanced post-hoc methods.\n","authors":["Jinggang Chen","Junjie Li","Xiaoyang Qu","Jianzong Wang","Jiguang Wan","Jing Xiao"],"pdf_url":"https://arxiv.org/pdf/2311.09620v2.pdf","comment":"Accepted by NeurIPS2023"},{"id":"http://arxiv.org/abs/2401.08309v1","updated":"2024-01-16T12:10:49Z","published":"2024-01-16T12:10:49Z","title":"Anchor function: a type of benchmark functions for studying language\n models","summary":" Understanding transformer-based language models is becoming increasingly\ncrucial, particularly as they play pivotal roles in advancing towards\nartificial general intelligence. However, language model research faces\nsignificant challenges, especially for academic research groups with\nconstrained resources. These challenges include complex data structures,\nunknown target functions, high computational costs and memory requirements, and\na lack of interpretability in the inference process, etc. Drawing a parallel to\nthe use of simple models in scientific research, we propose the concept of an\nanchor function. This is a type of benchmark function designed for studying\nlanguage models in learning tasks that follow an \"anchor-key\" pattern. By\nutilizing the concept of an anchor function, we can construct a series of\nfunctions to simulate various language tasks. The anchor function plays a role\nanalogous to that of mice in diabetes research, particularly suitable for\nacademic research. We demonstrate the utility of the anchor function with an\nexample, revealing two basic operations by attention structures in language\nmodels: shifting tokens and broadcasting one token from one position to many\npositions. These operations are also commonly observed in large language\nmodels. 
The anchor function framework, therefore, opens up a series of valuable\nand accessible research questions for further exploration, especially for\ntheoretical study.\n","authors":["Zhongwang Zhang","Zhiwei Wang","Junjie Yao","Zhangchen Zhou","Xiaolong Li","Weinan E","Zhi-Qin John Xu"],"pdf_url":"https://arxiv.org/pdf/2401.08309v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08307v1","updated":"2024-01-16T12:08:31Z","published":"2024-01-16T12:08:31Z","title":"On Quantum Natural Policy Gradients","summary":" This research delves into the role of the quantum Fisher Information Matrix\n(FIM) in enhancing the performance of Parameterized Quantum Circuit (PQC)-based\nreinforcement learning agents. While previous studies have highlighted the\neffectiveness of PQC-based policies preconditioned with the quantum FIM in\ncontextual bandits, its impact in broader reinforcement learning contexts, such\nas Markov Decision Processes, is less clear. Through a detailed analysis of\nL\\\"owner inequalities between quantum and classical FIMs, this study uncovers\nthe nuanced distinctions and implications of using each type of FIM. Our\nresults indicate that a PQC-based agent using the quantum FIM without\nadditional insights typically incurs a larger approximation error and does not\nguarantee improved performance compared to the classical FIM. Empirical\nevaluations in classic control benchmarks suggest even though quantum FIM\npreconditioning outperforms standard gradient ascent, in general it is not\nsuperior to classical FIM preconditioning.\n","authors":["André Sequeira","Luis Paulo Santos","Luis Soares Barbosa"],"pdf_url":"https://arxiv.org/pdf/2401.08307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04350v2","updated":"2024-01-16T12:07:27Z","published":"2023-12-07T15:12:12Z","title":"CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language\n Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsights into the causal reasoning abilities of LLMs. 
Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v2.pdf","comment":"NeurIPS 2023; updated with CLadder dataset v1.5"},{"id":"http://arxiv.org/abs/2401.08301v1","updated":"2024-01-16T11:54:32Z","published":"2024-01-16T11:54:32Z","title":"Sum Throughput Maximization in Multi-BD Symbiotic Radio NOMA Network\n Assisted by Active-STAR-RIS","summary":" In this paper, we employ active simultaneously transmitting and reflecting\nreconfigurable intelligent surface (ASRIS) to aid in establishing and enhancing\ncommunication within a commensal symbiotic radio (CSR) network. Unlike\ntraditional RIS, ASRIS not only ensures coverage in an omni directional manner\nbut also amplifies received signals, consequently elevating overall network\nperformance. in the first phase, base station (BS) with active massive MIMO\nantennas, send ambient signal to SBDs. In the first phase, the BS transmits\nambient signals to the symbiotic backscatter devices (SBDs), and after\nharvesting the energy and modulating their information onto the signal carrier,\nthe SBDs send Backscatter signals back to the BS. In this scheme, we employ the\nBackscatter Relay system to facilitate the transmission of information from the\nSBDs to the symbiotic User Equipments (SUEs) with the assistance of the BS. In\nthe second phase, the BS transmits information signals to the SUEs after\neliminating interference using the Successive Interference Cancellation (SIC)\nmethod. ASRIS is employed to establish communication among SUEs lacking a line\nof sight (LoS) and to amplify power signals for SUEs with a LoS connection to\nthe BS. It is worth noting that we use NOMA for multiple access in all network.\n The main goal of this paper is to maximize the sum throughput between all\nusers. To achieve this, we formulate an optimization problem with variables\nincluding active beamforming coefficients at the BS and ASRIS, as well as the\nphase adjustments of ASRIS and scheduling parameters between the first and\nsecond phases. To model this optimization problem, we employ three deep\nreinforcement learning (DRL) methods, namely PPO, TD3, and A3C. Finally, the\nmentioned methods are simulated and compared with each other.\n","authors":["Rahman Saadat Yeganeh","Mohammad Javad Omidi","Farshad Zeinali","Mohammad Robatmili","Mohammad Ghavami"],"pdf_url":"https://arxiv.org/pdf/2401.08301v1.pdf","comment":"This article will be submitted to the Transactions journal"},{"id":"http://arxiv.org/abs/2312.16895v2","updated":"2024-01-16T11:33:28Z","published":"2023-12-28T08:36:19Z","title":"RLPlanner: Reinforcement Learning based Floorplanning for Chiplets with\n Fast Thermal Analysis","summary":" Chiplet-based systems have gained significant attention in recent years due\nto their low cost and competitive performance. As the complexity and\ncompactness of a chiplet-based system increase, careful consideration must be\ngiven to microbump assignments, interconnect delays, and thermal limitations\nduring the floorplanning stage. This paper introduces RLPlanner, an efficient\nearly-stage floorplanning tool for chiplet-based systems with a novel fast\nthermal evaluation method. 
RLPlanner employs advanced reinforcement learning to\njointly minimize total wirelength and temperature. To alleviate the\ntime-consuming thermal calculations, RLPlanner incorporates the developed fast\nthermal evaluation method to expedite the iterations and optimizations.\nComprehensive experiments demonstrate that our proposed fast thermal evaluation\nmethod achieves a mean absolute error (MAE) of 0.25 K and delivers over 120x\nspeed-up compared to the open-source thermal solver HotSpot. When integrated\nwith our fast thermal evaluation method, RLPlanner achieves an average\nimprovement of 20.28\\% in minimizing the target objective (a combination of\nwirelength and temperature), within a similar running time, compared to the\nclassic simulated annealing method with HotSpot.\n","authors":["Yuanyuan Duan","Xingchen Liu","Zhiping Yu","Hanming Wu","Leilai Shao","Xiaolei Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.16895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01352v2","updated":"2024-01-16T11:32:48Z","published":"2023-11-02T16:00:32Z","title":"Deep learning based Image Compression for Microscopy Images: An\n Empirical Study","summary":" With the fast development of modern microscopes and bioimaging techniques, an\nunprecedentedly large amount of imaging data are being generated, stored,\nanalyzed, and even shared through networks. The size of the data poses great\nchallenges for current data infrastructure. One common way to reduce the data\nsize is by image compression. This present study analyzes classic and deep\nlearning based image compression methods, and their impact on deep learning\nbased image processing models. Deep learning based label-free prediction models\n(i.e., predicting fluorescent images from bright field images) are used as an\nexample application for comparison and analysis. Effective image compression\nmethods could help reduce the data size significantly without losing necessary\ninformation, and therefore reduce the burden on data management infrastructure\nand permit fast transmission through the network for data sharing or cloud\ncomputing. To compress images in such a wanted way, multiple classical lossy\nimage compression techniques are compared to several AI-based compression\nmodels provided by and trained with the CompressAI toolbox using python. These\ndifferent compression techniques are compared in compression ratio, multiple\nimage similarity measures and, most importantly, the prediction accuracy from\nlabel-free models on compressed images. We found that AI-based compression\ntechniques largely outperform the classic ones and will minimally affect the\ndownstream label-free task in 2D cases. 
In the end, we hope the present study\ncould shed light on the potential of deep learning based image compression and\nthe impact of image compression on downstream deep learning based image\nanalysis models.\n","authors":["Yu Zhou","Jan Sollmann","Jianxu Chen"],"pdf_url":"https://arxiv.org/pdf/2311.01352v2.pdf","comment":"- Update github link; - correct the author name; - update the table\n (correct some errors during calculation); - update the implementation detail\n section and the discussion section"},{"id":"http://arxiv.org/abs/2307.00673v2","updated":"2024-01-16T11:15:43Z","published":"2023-07-02T21:46:30Z","title":"ENN: A Neural Network with DCT Adaptive Activation Functions","summary":" The expressiveness of neural networks highly depends on the nature of the\nactivation function, although these are usually assumed predefined and fixed\nduring the training stage. Under a signal processing perspective, in this paper\nwe present Expressive Neural Network (ENN), a novel model in which the\nnon-linear activation functions are modeled using the Discrete Cosine Transform\n(DCT) and adapted using backpropagation during training. This parametrization\nkeeps the number of trainable parameters low, is appropriate for gradient-based\nschemes, and adapts to different learning tasks. This is the first non-linear\nmodel for activation functions that relies on a signal processing perspective,\nproviding high flexibility and expressiveness to the network. We contribute\nwith insights in the explainability of the network at convergence by recovering\nthe concept of bump, this is, the response of each activation function in the\noutput space. Finally, through exhaustive experiments we show that the model\ncan adapt to classification and regression tasks. The performance of ENN\noutperforms state of the art benchmarks, providing above a 40% gap in accuracy\nin some scenarios.\n","authors":["Marc Martinez-Gost","Ana Pérez-Neira","Miguel Ángel Lagunas"],"pdf_url":"https://arxiv.org/pdf/2307.00673v2.pdf","comment":"Paper accepted in IEEE Journal of Selected Topics in Signal\n Processing (JSTSP) Special Series on AI in Signal & Data Science - Toward\n Explainable, Reliable, and Sustainable Machine Learning"},{"id":"http://arxiv.org/abs/2401.08281v1","updated":"2024-01-16T11:12:36Z","published":"2024-01-16T11:12:36Z","title":"The Faiss library","summary":" Vector databases manage large collections of embedding vectors. As AI\napplications are growing rapidly, so are the number of embeddings that need to\nbe stored and indexed. The Faiss library is dedicated to vector similarity\nsearch, a core functionality of vector databases. Faiss is a toolkit of\nindexing methods and related primitives used to search, cluster, compress and\ntransform vectors. This paper first describes the tradeoff space of vector\nsearch, then the design principles of Faiss in terms of structure, approach to\noptimization and interfacing. 
We benchmark key features of the library and\ndiscuss a few selected applications to highlight its broad applicability.\n","authors":["Matthijs Douze","Alexandr Guzhva","Chengqi Deng","Jeff Johnson","Gergely Szilvasy","Pierre-Emmanuel Mazaré","Maria Lomeli","Lucas Hosseini","Hervé Jégou"],"pdf_url":"https://arxiv.org/pdf/2401.08281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09467v2","updated":"2024-01-16T11:10:28Z","published":"2023-06-15T19:42:11Z","title":"AQuA: A Benchmarking Tool for Label Quality Assessment","summary":" Machine learning (ML) models are only as good as the data they are trained\non. But recent studies have found datasets widely used to train and evaluate ML\nmodels, e.g. ImageNet, to have pervasive labeling errors. Erroneous labels on\nthe train set hurt ML models' ability to generalize, and they impact evaluation\nand model selection using the test set. Consequently, learning in the presence\nof labeling errors is an active area of research, yet this field lacks a\ncomprehensive benchmark to evaluate these methods. Most of these methods are\nevaluated on a few computer vision datasets with significant variance in the\nexperimental protocols. With such a large pool of methods and inconsistent\nevaluation, it is also unclear how ML practitioners can choose the right models\nto assess label quality in their data. To this end, we propose a benchmarking\nenvironment AQuA to rigorously evaluate methods that enable machine learning in\nthe presence of label noise. We also introduce a design space to delineate\nconcrete design choices of label error detection models. We hope that our\nproposed design space and benchmark enable practitioners to choose the right\ntools to improve their label quality and that our benchmark enables objective\nand rigorous evaluation of machine learning tools facing mislabeled data.\n","authors":["Mononito Goswami","Vedant Sanil","Arjun Choudhry","Arvind Srinivasan","Chalisa Udompanyawit","Artur Dubrawski"],"pdf_url":"https://arxiv.org/pdf/2306.09467v2.pdf","comment":"Accepted at the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023) Track on Datasets and Benchmarks. Source code can be\n found at www.github.com/autonlab/aqua/"},{"id":"http://arxiv.org/abs/2401.08273v1","updated":"2024-01-16T10:53:11Z","published":"2024-01-16T10:53:11Z","title":"Large Language Models are Null-Shot Learners","summary":" This paper presents null-shot prompting. Null-shot prompting exploits\nhallucination in large language models (LLMs) by instructing LLMs to utilize\ninformation from the \"Examples\" section that never exists within the provided\ncontext to perform a task. While reducing hallucination is crucial and\nnon-negligible for daily and critical uses of LLMs, we propose that in the\ncurrent landscape in which these LLMs still hallucinate, it is possible, in\nfact, to exploit hallucination to increase performance in performing tasks\ncompared to standard zero-shot prompting. Experiments with six LLMs show\nimprovements in performance across the majority of eight datasets, including\nreading comprehension, arithmetic reasoning, and closed-book question\nanswering. The observed inconsistency in increased relative performance across\nLLMs also potentially indicates a different degree of inherent hallucination in\neach model. These differences show that it is possible to utilize null-shot\nprompting as a way to detect degrees of hallucination in LLMs using existing\nbenchmarking datasets. 
We also perform ablation studies, including\nexperimenting with a modified version of null-shot prompting that incorporates\nideas from zero-shot chain-of-thought prompting, which shows different trends\nof results.\n","authors":["Pittawat Taveekitworachai","Febri Abdullah","Ruck Thawonmas"],"pdf_url":"https://arxiv.org/pdf/2401.08273v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2305.15944v3","updated":"2024-01-16T10:53:05Z","published":"2023-05-25T11:30:27Z","title":"How to Turn Your Knowledge Graph Embeddings into Generative Models","summary":" Some of the most successful knowledge graph embedding (KGE) models for link\nprediction -- CP, RESCAL, TuckER, ComplEx -- can be interpreted as energy-based\nmodels. Under this perspective they are not amenable for exact\nmaximum-likelihood estimation (MLE), sampling and struggle to integrate logical\nconstraints. This work re-interprets the score functions of these KGEs as\ncircuits -- constrained computational graphs allowing efficient\nmarginalisation. Then, we design two recipes to obtain efficient generative\ncircuit models by either restricting their activations to be non-negative or\nsquaring their outputs. Our interpretation comes with little or no loss of\nperformance for link prediction, while the circuits framework unlocks exact\nlearning by MLE, efficient sampling of new triples, and guarantee that logical\nconstraints are satisfied by design. Furthermore, our models scale more\ngracefully than the original KGEs on graphs with millions of entities.\n","authors":["Lorenzo Loconte","Nicola Di Mauro","Robert Peharz","Antonio Vergari"],"pdf_url":"https://arxiv.org/pdf/2305.15944v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08272v1","updated":"2024-01-16T10:51:55Z","published":"2024-01-16T10:51:55Z","title":"Siamese Content-based Search Engine for a More Transparent Skin and\n Breast Cancer Diagnosis through Histological Imaging","summary":" Computer Aid Diagnosis (CAD) has developed digital pathology with Deep\nLearning (DL)-based tools to assist pathologists in decision-making.\nContent-Based Histopathological Image Retrieval (CBHIR) is a novel tool to seek\nhighly correlated patches in terms of similarity in histopathological features.\nIn this work, we proposed two CBHIR approaches on breast (Breast-twins) and\nskin cancer (Skin-twins) data sets for robust and accurate patch-level\nretrieval, integrating a custom-built Siamese network as a feature extractor.\nThe proposed Siamese network is able to generalize for unseen images by\nfocusing on the similar histopathological features of the input pairs. The\nproposed CBHIR approaches are evaluated on the Breast (public) and Skin\n(private) data sets with top K accuracy. Finding the optimum amount of K is\nchallenging, but also, as much as K increases, the dissimilarity between the\nquery and the returned images increases which might mislead the pathologists.\nTo the best of the author's belief, this paper is tackling this issue for the\nfirst time on histopathological images by evaluating the top first retrieved\nimages. The Breast-twins model achieves 70% of the F1score at the top first,\nwhich exceeds the other state-of-the-art methods at a higher amount of K such\nas 5 and 400. Skin-twins overpasses the recently proposed Convolutional Auto\nEncoder (CAE) by 67%, increasing the precision. 
Besides, the Skin-twins model\ntackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential\n(STUMP) to assist pathologists with retrieving top K images and their\ncorresponding labels. So, this approach can offer a more explainable CAD tool\nto pathologists in terms of transparency, trustworthiness, or reliability among\nother characteristics.\n","authors":["Zahra Tabatabaei","Adrián Colomer","JAvier Oliver Moll","Valery Naranjo"],"pdf_url":"https://arxiv.org/pdf/2401.08272v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08268v1","updated":"2024-01-16T10:41:33Z","published":"2024-01-16T10:41:33Z","title":"An Explainable Proxy Model for Multiabel Audio Segmentation","summary":" Audio signal segmentation is a key task for automatic audio indexing. It\nconsists of detecting the boundaries of class-homogeneous segments in the\nsignal. In many applications, explainable AI is a vital process for\ntransparency of decision-making with machine learning. In this paper, we\npropose an explainable multilabel segmentation model that solves speech\nactivity (SAD), music (MD), noise (ND), and overlapped speech detection (OSD)\nsimultaneously. This proxy uses the non-negative matrix factorization (NMF) to\nmap the embedding used for the segmentation to the frequency domain.\nExperiments conducted on two datasets show similar performances as the\npre-trained black box model while showing strong explainability features.\nSpecifically, the frequency bins used for the decision can be easily identified\nat both the segment level (local explanations) and global level (class\nprototypes).\n","authors":["Théo Mariotte","Antonio Almudévar","Marie Tahon","Alsonfo Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.08268v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08260v1","updated":"2024-01-16T10:31:27Z","published":"2024-01-16T10:31:27Z","title":"Fast Kernel Summation in High Dimensions via Slicing and Fourier\n Transforms","summary":" Kernel-based methods are heavily used in machine learning. However, they\nsuffer from $O(N^2)$ complexity in the number $N$ of considered data points. In\nthis paper, we propose an approximation procedure, which reduces this\ncomplexity to $O(N)$. Our approach is based on two ideas. First, we prove that\nany radial kernel with analytic basis function can be represented as sliced\nversion of some one-dimensional kernel and derive an analytic formula for the\none-dimensional counterpart. It turns out that the relation between one- and\n$d$-dimensional kernels is given by a generalized Riemann-Liouville fractional\nintegral. Hence, we can reduce the $d$-dimensional kernel summation to a\none-dimensional setting. Second, for solving these one-dimensional problems\nefficiently, we apply fast Fourier summations on non-equispaced data, a sorting\nalgorithm or a combination of both. Due to its practical importance we pay\nspecial attention to the Gaussian kernel, where we show a dimension-independent\nerror bound and represent its one-dimensional counterpart via a closed-form\nFourier transform. 
We provide a run time comparison and error estimate of our\nfast kernel summations.\n","authors":["Johannes Hertrich"],"pdf_url":"https://arxiv.org/pdf/2401.08260v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10818v2","updated":"2024-01-16T10:03:54Z","published":"2023-05-18T08:56:05Z","title":"Diffusion Language Models Generation Can Be Halted Early","summary":" Diffusion Language models (DLMs) are a promising avenue for text generation\ndue to their practical properties on tractable controllable generation. They\nalso have the advantage of not having to predict text autoregressively.\nHowever, despite these notable features, DLMs have not yet reached the\nperformance levels of their Autoregressive counterparts. One of the ways to\nreduce the performance gap between these two types of language models is to\nspeed up the generation of DLMs. Therefore, we propose a pioneering methodology\nto address this issue in this work. It enables the execution of more generation\nsteps within a given time frame, potentially leading to higher-quality outputs.\nSpecifically, our methods estimate DLMs completeness of text generation and\nallow adaptive halting of the generation process. We test and refine our\nmethods on Plaid, SSD, and CDCD DLMs and create a cohesive perspective on their\ngeneration workflows. Finally, we confirm that our methods allow halting Plaid,\nSSD, and CDCD models and decrease the generation time by $10$-$40$% without a\ndrop in the quality of model samples.\n","authors":["Sofia Maria Lo Cicero Vaina","Nikita Balagansky","Daniil Gavrilov"],"pdf_url":"https://arxiv.org/pdf/2305.10818v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08245v1","updated":"2024-01-16T09:59:36Z","published":"2024-01-16T09:59:36Z","title":"Optimizing $k$ in $k$NN Graphs with Graph Learning Perspective","summary":" In this paper, we propose a method, based on graph signal processing, to\noptimize the choice of $k$ in $k$-nearest neighbor graphs ($k$NNGs). $k$NN is\none of the most popular approaches and is widely used in machine learning and\nsignal processing. The parameter $k$ represents the number of neighbors that\nare connected to the target node; however, its appropriate selection is still a\nchallenging problem. Therefore, most $k$NNGs use ad hoc selection methods for\n$k$. In the proposed method, we assume that a different $k$ can be chosen for\neach node. We formulate a discrete optimization problem to seek the best $k$\nwith a constraint on the sum of distances of the connected nodes. The optimal\n$k$ values are efficiently obtained without solving a complex optimization.\nFurthermore, we reveal that the proposed method is closely related to existing\ngraph learning methods. In experiments on real datasets, we demonstrate that\nthe $k$NNGs obtained with our method are sparse and can determine an\nappropriate variable number of edges per node. 
We validate the effectiveness of\nthe proposed method for point cloud denoising, comparing our denoising\nperformance with achievable graph construction methods that can be scaled to\ntypical point cloud sizes (e.g., thousands of nodes).\n","authors":["Asuka Tamaru","Junya Hara","Hiroshi Higashi","Yuichi Tanaka","Antonio Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.08245v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05394v2","updated":"2024-01-16T09:57:58Z","published":"2023-12-19T09:21:08Z","title":"Iterative Regularization with k-support Norm: An Important Complement to\n Sparse Recovery","summary":" Sparse recovery is ubiquitous in machine learning and signal processing. Due\nto the NP-hard nature of sparse recovery, existing methods are known to suffer\neither from restrictive (or even unknown) applicability conditions, or high\ncomputational cost. Recently, iterative regularization methods have emerged as\na promising fast approach because they can achieve sparse recovery in one pass\nthrough early stopping, rather than the tedious grid-search used in the\ntraditional methods. However, most of those iterative methods are based on the\n$\\ell_1$ norm which requires restrictive applicability conditions and could\nfail in many cases. Therefore, achieving sparse recovery with iterative\nregularization methods under a wider range of conditions has yet to be further\nexplored. To address this issue, we propose a novel iterative regularization\nalgorithm, IRKSN, based on the $k$-support norm regularizer rather than the\n$\\ell_1$ norm. We provide conditions for sparse recovery with IRKSN, and\ncompare them with traditional conditions for recovery with $\\ell_1$ norm\nregularizers. Additionally, we give an early stopping bound on the model error\nof IRKSN with explicit constants, achieving the standard linear rate for sparse\nrecovery. Finally, we illustrate the applicability of our algorithm on several\nexperiments, including a support recovery experiment with a correlated design\nmatrix.\n","authors":["William de Vazelhes","Bhaskar Mukhoty","Xiao-Tong Yuan","Bin Gu"],"pdf_url":"https://arxiv.org/pdf/2401.05394v2.pdf","comment":"Accepted at AAAI 2024. Code at\n https://github.com/wdevazelhes/IRKSN_AAAI2024"},{"id":"http://arxiv.org/abs/2308.13983v2","updated":"2024-01-16T09:50:37Z","published":"2023-08-27T01:32:23Z","title":"Interpolation of mountain weather forecasts by machine learning","summary":" Recent advances in numerical simulation methods based on physical models and\ntheir combination with machine learning have improved the accuracy of weather\nforecasts. However, the accuracy decreases in complex terrains such as\nmountainous regions because these methods usually use grids of several\nkilometers square and simple machine learning models. While deep learning has\nalso made significant progress in recent years, its direct application is\ndifficult to utilize the physical knowledge used in the simulation. This paper\nproposes a method that uses machine learning to interpolate future weather in\nmountainous regions using forecast data from surrounding plains and past\nobserved data to improve weather forecasts in mountainous regions. We focus on\nmountainous regions in Japan and predict temperature and precipitation mainly\nusing LightGBM as a machine learning model. 
Despite the use of a small dataset,\nthrough feature engineering and model tuning, our method partially achieves\nimprovements in the RMSE with significantly less training time.\n","authors":["Kazuma Iwase","Tomoyuki Takenawa"],"pdf_url":"https://arxiv.org/pdf/2308.13983v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2401.08233v1","updated":"2024-01-16T09:34:17Z","published":"2024-01-16T09:34:17Z","title":"Enhancing Wind Speed and Wind Power Forecasting Using Shape-Wise Feature\n Engineering: A Novel Approach for Improved Accuracy and Robustness","summary":" Accurate prediction of wind speed and power is vital for enhancing the\nefficiency of wind energy systems. Numerous solutions have been implemented to\ndate, demonstrating their potential to improve forecasting. Among these, deep\nlearning is perceived as a revolutionary approach in the field. However,\ndespite their effectiveness, the noise present in the collected data remains a\nsignificant challenge. This noise has the potential to diminish the performance\nof these algorithms, leading to inaccurate predictions. In response to this,\nthis study explores a novel feature engineering approach. This approach\ninvolves altering the data input shape in both Convolutional Neural\nNetwork-Long Short-Term Memory (CNN-LSTM) and Autoregressive models for various\nforecasting horizons. The results reveal substantial enhancements in model\nresilience against noise resulting from step increases in data. The approach\ncould achieve an impressive 83% accuracy in predicting unseen data up to the\n24th steps. Furthermore, this method consistently provides high accuracy for\nshort, mid, and long-term forecasts, outperforming the performance of\nindividual models. These findings pave the way for further research on noise\nreduction strategies at different forecasting horizons through shape-wise\nfeature engineering.\n","authors":["Mulomba Mukendi Christian","Yun Seon Kim","Hyebong Choi","Jaeyoung Lee","SongHee You"],"pdf_url":"https://arxiv.org/pdf/2401.08233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06362v2","updated":"2024-01-16T09:29:39Z","published":"2023-12-23T05:46:05Z","title":"Attention, Distillation, and Tabularization: Towards Practical Neural\n Network-Based Prefetching","summary":" Attention-based Neural Networks (NN) have demonstrated their effectiveness in\naccurate memory access prediction, an essential step in data prefetching.\nHowever, the substantial computational overheads associated with these models\nresult in high inference latency, limiting their feasibility as practical\nprefetchers. To close the gap, we propose a new approach based on\ntabularization that significantly reduces model complexity and inference\nlatency without sacrificing prediction accuracy. Our novel tabularization\nmethodology takes as input a distilled, yet highly accurate attention-based\nmodel for memory access prediction and efficiently converts its expensive\nmatrix multiplications into a hierarchy of fast table lookups. As an exemplar\nof the above approach, we develop DART, a prefetcher comprised of a simple\nhierarchy of tables. With a modest 0.09 drop in F1-score, DART reduces 99.99%\nof arithmetic operations from the large attention-based model and 91.83% from\nthe distilled model. DART accelerates the large model inference by 170x and the\ndistilled model by 9.4x. DART has comparable latency and storage costs as\nstate-of-the-art rule-based prefetcher BO but surpasses it by 6.1% in IPC\nimprovement. 
DART outperforms state-of-the-art NN-based prefetchers TransFetch\nby 33.1% and Voyager by 37.2% in terms of IPC improvement, primarily due to its\nlow prefetching latency.\n","authors":["Pengmiao Zhang","Neelesh Gupta","Rajgopal Kannan","Viktor K. Prasanna"],"pdf_url":"https://arxiv.org/pdf/2401.06362v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08225v1","updated":"2024-01-16T09:22:38Z","published":"2024-01-16T09:22:38Z","title":"Efficient and Mathematically Robust Operations for Certified Neural\n Networks Inference","summary":" In recent years, machine learning (ML) and neural networks (NNs) have gained\nwidespread use and attention across various domains, particularly in\ntransportation for achieving autonomy, including the emergence of flying taxis\nfor urban air mobility (UAM). However, concerns about certification have come\nup, compelling the development of standardized processes encompassing the\nentire ML and NN pipeline. This paper delves into the inference stage and the\nrequisite hardware, highlighting the challenges associated with IEEE 754\nfloating-point arithmetic and proposing alternative number representations. By\nevaluating diverse summation and dot product algorithms, we aim to mitigate\nissues related to non-associativity. Additionally, our exploration of\nfixed-point arithmetic reveals its advantages over floating-point methods,\ndemonstrating significant hardware efficiencies. Employing an empirical\napproach, we ascertain the optimal bit-width necessary to attain an acceptable\nlevel of accuracy, considering the inherent complexity of bit-width\noptimization.\n","authors":["Fabien Geyer","Johannes Freitag","Tobias Schulz","Sascha Uhrig"],"pdf_url":"https://arxiv.org/pdf/2401.08225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08224v1","updated":"2024-01-16T09:22:12Z","published":"2024-01-16T09:22:12Z","title":"Differentially Private Estimation of CATE in Adaptive Experiment","summary":" Adaptive experiment is widely adopted to estimate conditional average\ntreatment effect (CATE) in clinical trials and many other scenarios. While the\nprimary goal in experiment is to maximize estimation accuracy, due to the\nimperative of social welfare, it's also crucial to provide treatment with\nsuperior outcomes to patients, which is measured by regret in contextual bandit\nframework. These two objectives often lead to contrast optimal allocation\nmechanism. Furthermore, privacy concerns arise in clinical scenarios containing\nsensitive data like patients health records. Therefore, it's essential for the\ntreatment allocation mechanism to incorporate robust privacy protection\nmeasures. In this paper, we investigate the tradeoff between loss of social\nwelfare and statistical power in contextual bandit experiment. We propose a\nmatched upper and lower bound for the multi-objective optimization problem, and\nthen adopt the concept of Pareto optimality to mathematically characterize the\noptimality condition. 
Furthermore, we propose differentially private algorithms\nwhich still matches the lower bound, showing that privacy is \"almost free\".\nAdditionally, we derive the asymptotic normality of the estimator, which is\nessential in statistical inference and hypothesis testing.\n","authors":["Jiachun Li","David Simchi-Levi","Kaining Shi"],"pdf_url":"https://arxiv.org/pdf/2401.08224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08221v1","updated":"2024-01-16T09:15:43Z","published":"2024-01-16T09:15:43Z","title":"Towards Causal Relationship in Indefinite Data: Baseline Model and New\n Datasets","summary":" Integrating deep learning and causal discovery has encouraged us to spot that\nlearning causal structures and representations in dialogue and video is full of\nchallenges. We defined These data forms as \"Indefinite Data\", characterized by\nmulti-structure data and multi-value representations. Unlike existing adaptable\ndata forms, Indefinite Data still faces gaps in datasets and methods. To\naddress the dataset gap, we release two high-quality datasets - Causalogue and\nCausaction, containing text dialogue samples and video action samples with\ncausal annotations respectively. Moreover, the method gap arises from the\ncoexistence of multi-structure data and multi-value representations, breaking\nthe assumptions of all current methods and rendering them infeasible on\nIndefinite Data. To this end, we propose a probabilistic framework as a\nbaseline, incorporating three designed highlights for this gap: 1) establishing\nCausation Condition of representations using the independence of noise terms\nunder non-fixed causal structures, 2) treating causal strength as a latent\nvariable and measuring the reconstruction loss in the correlation space, and 3)\nestimating the effects of latent confounders. These highpoints make the\nprobabilistic model capable of overcoming challenges brought by the coexistence\nof multi-structure data and multi-value representations and pave the way for\nthe extension of latent confounders. Comprehensive experiments have evaluated\nbaseline results of causal structures, causal representations, and confounding\ndisentanglement.\n","authors":["Hang Chen","Xinyu Yang","Keqing Du"],"pdf_url":"https://arxiv.org/pdf/2401.08221v1.pdf","comment":"If you are interested in the two new datasets, pls contact us by\n email"},{"id":"http://arxiv.org/abs/2312.08723v2","updated":"2024-01-16T09:15:05Z","published":"2023-12-14T08:09:20Z","title":"StemGen: A music generation model that listens","summary":" End-to-end generation of musical audio using deep learning techniques has\nseen an explosion of activity recently. However, most models concentrate on\ngenerating fully mixed music in response to abstract conditioning information.\nIn this work, we present an alternative paradigm for producing music generation\nmodels that can listen and respond to musical context. We describe how such a\nmodel can be constructed using a non-autoregressive, transformer-based model\narchitecture and present a number of novel architectural and sampling\nimprovements. We train the described architecture on both an open-source and a\nproprietary dataset. We evaluate the produced models using standard quality\nmetrics and a new approach based on music information retrieval descriptors.\nThe resulting model reaches the audio quality of state-of-the-art\ntext-conditioned models, as well as exhibiting strong musical coherence with\nits context.\n","authors":["Julian D. 
Parker","Janne Spijkervet","Katerina Kosta","Furkan Yesiler","Boris Kuznetsov","Ju-Chiang Wang","Matt Avent","Jitong Chen","Duc Le"],"pdf_url":"https://arxiv.org/pdf/2312.08723v2.pdf","comment":"Accepted for publication at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08216v1","updated":"2024-01-16T09:02:34Z","published":"2024-01-16T09:02:34Z","title":"Towards Efficient and Certified Recovery from Poisoning Attacks in\n Federated Learning","summary":" Federated learning (FL) is vulnerable to poisoning attacks, where malicious\nclients manipulate their updates to affect the global model. Although various\nmethods exist for detecting those clients in FL, identifying malicious clients\nrequires sufficient model updates, and hence by the time malicious clients are\ndetected, FL models have been already poisoned. Thus, a method is needed to\nrecover an accurate global model after malicious clients are identified.\nCurrent recovery methods rely on (i) all historical information from\nparticipating FL clients and (ii) the initial model unaffected by the malicious\nclients, leading to a high demand for storage and computational resources. In\nthis paper, we show that highly effective recovery can still be achieved based\non (i) selective historical information rather than all historical information\nand (ii) a historical model that has not been significantly affected by\nmalicious clients rather than the initial model. In this scenario, while\nmaintaining comparable recovery performance, we can accelerate the recovery\nspeed and decrease memory consumption. Following this concept, we introduce\nCrab, an efficient and certified recovery method, which relies on selective\ninformation storage and adaptive model rollback. Theoretically, we demonstrate\nthat the difference between the global model recovered by Crab and the one\nrecovered by train-from-scratch can be bounded under certain assumptions. Our\nempirical evaluation, conducted across three datasets over multiple machine\nlearning models, and a variety of untargeted and targeted poisoning attacks\nreveals that Crab is both accurate and efficient, and consistently outperforms\nprevious approaches in terms of both recovery speed and memory consumption.\n","authors":["Yu Jiang","Jiyuan Shen","Ziyao Liu","Chee Wei Tan","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2401.08216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04432v2","updated":"2024-01-16T08:40:12Z","published":"2023-12-07T16:56:24Z","title":"FreqFed: A Frequency Analysis-Based Approach for Mitigating Poisoning\n Attacks in Federated Learning","summary":" Federated learning (FL) is a collaborative learning paradigm allowing\nmultiple clients to jointly train a model without sharing their training data.\nHowever, FL is susceptible to poisoning attacks, in which the adversary injects\nmanipulated model updates into the federated model aggregation process to\ncorrupt or destroy predictions (untargeted poisoning) or implant hidden\nfunctionalities (targeted poisoning or backdoors). Existing defenses against\npoisoning attacks in FL have several limitations, such as relying on specific\nassumptions about attack types and strategies or data distributions or not\nsufficiently robust against advanced injection techniques and strategies and\nsimultaneously maintaining the utility of the aggregated model. To address the\ndeficiencies of existing defenses, we take a generic and completely different\napproach to detect poisoning (targeted and untargeted) attacks. 
We present\nFreqFed, a novel aggregation mechanism that transforms the model updates (i.e.,\nweights) into the frequency domain, where we can identify the core frequency\ncomponents that inherit sufficient information about weights. This allows us to\neffectively filter out malicious updates during local training on the clients,\nregardless of attack types, strategies, and clients' data distributions. We\nextensively evaluate the efficiency and effectiveness of FreqFed in different\napplication domains, including image classification, word prediction, IoT\nintrusion detection, and speech recognition. We demonstrate that FreqFed can\nmitigate poisoning attacks effectively with a negligible impact on the utility\nof the aggregated model.\n","authors":["Hossein Fereidooni","Alessandro Pegoraro","Phillip Rieger","Alexandra Dmitrienko","Ahmad-Reza Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2312.04432v2.pdf","comment":"To appear in the Network and Distributed System Security (NDSS)\n Symposium 2024. 16 pages, 8 figures, 12 tables, 1 algorithm, 3 equations"},{"id":"http://arxiv.org/abs/2311.06748v2","updated":"2024-01-16T08:35:30Z","published":"2023-11-12T06:20:21Z","title":"How do Minimum-Norm Shallow Denoisers Look in Function Space?","summary":" Neural network (NN) denoisers are an essential building block in many common\ntasks, ranging from image reconstruction to image generation. However, the\nsuccess of these models is not well understood from a theoretical perspective.\nIn this paper, we aim to characterize the functions realized by shallow ReLU NN\ndenoisers -- in the common theoretical setting of interpolation (i.e., zero\ntraining loss) with a minimal representation cost (i.e., minimal $\\ell^2$ norm\nweights). First, for univariate data, we derive a closed form for the NN\ndenoiser function, find it is contractive toward the clean data points, and\nprove it generalizes better than the empirical MMSE estimator at a low noise\nlevel. Next, for multivariate data, we find the NN denoiser functions in a\nclosed form under various geometric assumptions on the training data: data\ncontained in a low-dimensional subspace, data contained in a union of one-sided\nrays, or several types of simplexes. These functions decompose into a sum of\nsimple rank-one piecewise linear interpolations aligned with edges and/or faces\nconnecting training samples. We empirically verify this alignment phenomenon on\nsynthetic data and real images.\n","authors":["Chen Zeno","Greg Ongie","Yaniv Blumenfeld","Nir Weinberger","Daniel Soudry"],"pdf_url":"https://arxiv.org/pdf/2311.06748v2.pdf","comment":"Thirty-seventh Conference on Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2311.12082v2","updated":"2024-01-16T08:34:28Z","published":"2023-11-20T10:47:52Z","title":"Tiny-VBF: Resource-Efficient Vision Transformer based Lightweight\n Beamformer for Ultrasound Single-Angle Plane Wave Imaging","summary":" Accelerating compute intensive non-real-time beam-forming algorithms in\nultrasound imaging using deep learning architectures has been gaining momentum\nin the recent past. Nonetheless, the complexity of the state-of-the-art deep\nlearning techniques poses challenges for deployment on resource-constrained\nedge devices. In this work, we propose a novel vision transformer based tiny\nbeamformer (Tiny-VBF), which works on the raw radio-frequency channel data\nacquired through single-angle plane wave insonification. 
The output of our\nTiny-VBF provides fast envelope detection requiring very low frame rate, i.e.\n0.34 GOPs/Frame for a frame size of 368 x 128 in comparison to the\nstate-of-the-art deep learning models. It also exhibited an 8% increase in\ncontrast and gains of 5% and 33% in axial and lateral resolution respectively\nwhen compared to Tiny-CNN on in-vitro dataset. Additionally, our model showed a\n4.2% increase in contrast and gains of 4% and 20% in axial and lateral\nresolution respectively when compared against conventional Delay-and-Sum (DAS)\nbeamformer. We further propose an accelerator architecture and implement our\nTiny-VBF model on a Zynq UltraScale+ MPSoC ZCU104 FPGA using a hybrid\nquantization scheme with 50% less resource consumption compared to the\nfloating-point implementation, while preserving the image quality.\n","authors":["Abdul Rahoof","Vivek Chaturvedi","Mahesh Raveendranatha Panicker","Muhammad Shafique"],"pdf_url":"https://arxiv.org/pdf/2311.12082v2.pdf","comment":"6 pages, DATE 2024"},{"id":"http://arxiv.org/abs/2305.17547v3","updated":"2024-01-16T08:27:38Z","published":"2023-05-27T18:30:54Z","title":"Translatotron 3: Speech to Speech Translation with Monolingual Data","summary":" This paper presents Translatotron 3, a novel approach to unsupervised direct\nspeech-to-speech translation from monolingual speech-text datasets by combining\nmasked autoencoder, unsupervised embedding mapping, and back-translation.\nExperimental results in speech-to-speech translation tasks between Spanish and\nEnglish show that Translatotron 3 outperforms a baseline cascade system,\nreporting $18.14$ BLEU points improvement on the synthesized\nUnpaired-Conversational dataset. In contrast to supervised approaches that\nnecessitate real paired data, or specialized modeling to replicate\npara-/non-linguistic information such as pauses, speaking rates, and speaker\nidentity, Translatotron 3 showcases its capability to retain it. Audio samples\ncan be found at http://google-research.github.io/lingvo-lab/translatotron3\n","authors":["Eliya Nachmani","Alon Levkovitch","Yifan Ding","Chulayuth Asawaroengchai","Heiga Zen","Michelle Tadmor Ramanovich"],"pdf_url":"https://arxiv.org/pdf/2305.17547v3.pdf","comment":"To appear in ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08197v1","updated":"2024-01-16T08:25:29Z","published":"2024-01-16T08:25:29Z","title":"Matrix Completion with Hypergraphs:Sharp Thresholds and Efficient\n Algorithms","summary":" This paper considers the problem of completing a rating matrix based on\nsub-sampled matrix entries as well as observed social graphs and hypergraphs.\nWe show that there exists a \\emph{sharp threshold} on the sample probability\nfor the task of exactly completing the rating matrix -- the task is achievable\nwhen the sample probability is above the threshold, and is impossible otherwise\n-- demonstrating a phase transition phenomenon. The threshold can be expressed\nas a function of the ``quality'' of hypergraphs, enabling us to \\emph{quantify}\nthe amount of reduction in sample probability due to the exploitation of\nhypergraphs. This also highlights the usefulness of hypergraphs in the matrix\ncompletion problem. En route to discovering the sharp threshold, we develop a\ncomputationally efficient matrix completion algorithm that effectively exploits\nthe observed graphs and hypergraphs. 
Theoretical analyses show that our\nalgorithm succeeds with high probability as long as the sample probability\nexceeds the aforementioned threshold, and this theoretical result is further\nvalidated by synthetic experiments. Moreover, our experiments on a real social\nnetwork dataset (with both graphs and hypergraphs) show that our algorithm\noutperforms other state-of-the-art matrix completion algorithms.\n","authors":["Zhongtian Ma","Qiaosheng Zhang","Zhen Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08189v1","updated":"2024-01-16T08:04:50Z","published":"2024-01-16T08:04:50Z","title":"PRewrite: Prompt Rewriting with Reinforcement Learning","summary":" Prompt engineering is critical for the development of LLM-based applications.\nHowever, it is usually done manually in a \"trial and error\" fashion. This\nmanual procedure can be time consuming, ineffective, and the generated prompts\nare, in a lot of cases, sub-optimal. Even for the prompts which seemingly work\nwell, there is always a lingering question: can the prompts be made better with\nfurther modifications?\n To address these questions, in this paper, we investigate prompt engineering\nautomation. We consider a specific use case scenario in which developers/users\nhave drafted initial prompts, but lack the time/expertise to optimize them. We\npropose PRewrite, an automated tool to rewrite these drafts and to generate\nhighly effective new prompts. PRewrite is based on the Reinforcement Learning\n(RL) framework which allows for end-to-end optimization and our design allows\nthe RL search to happen in a large action space. The automated tool leverages\nmanually crafted prompts as starting points which makes the rewriting procedure\nmore guided and efficient. The generated prompts are human readable, and\nself-explanatory, unlike some of those in previous works. We conducted\nextensive experiments on diverse datasets and found that the prompts generated\nwith this new method not only outperform professionally crafted prompts, but\nalso prompts generated with other previously proposed methods.\n","authors":["Weize Kong","Spurthi Amba Hombaiah","Mingyang Zhang","Qiaozhu Mei","Michael Bendersky"],"pdf_url":"https://arxiv.org/pdf/2401.08189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08169v1","updated":"2024-01-16T07:18:47Z","published":"2024-01-16T07:18:47Z","title":"Statistical Test for Attention Map in Vision Transformer","summary":" The Vision Transformer (ViT) demonstrates exceptional performance in various\ncomputer vision tasks. Attention is crucial for ViT to capture complex\nwide-ranging relationships among image patches, allowing the model to weigh the\nimportance of image patches and aiding our understanding of the decision-making\nprocess. However, when utilizing the attention of ViT as evidence in\nhigh-stakes decision-making tasks such as medical diagnostics, a challenge\narises due to the potential of attention mechanisms erroneously focusing on\nirrelevant regions. In this study, we propose a statistical test for ViT's\nattentions, enabling us to use the attentions as reliable quantitative evidence\nindicators for ViT's decision-making with a rigorously controlled error rate.\nUsing the framework called selective inference, we quantify the statistical\nsignificance of attentions in the form of p-values, which enables the\ntheoretically grounded quantification of the false positive detection\nprobability of attentions. 
We demonstrate the validity and the effectiveness of\nthe proposed method through numerical experiments and applications to brain\nimage diagnoses.\n","authors":["Tomohiro Shiraishi","Daiki Miwa","Teruyuki Katsuoka","Vo Nguyen Le Duy","Koichi Taji","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2401.08169v1.pdf","comment":"42pages, 17figures"},{"id":"http://arxiv.org/abs/2304.04162v2","updated":"2024-01-16T07:11:42Z","published":"2023-04-09T05:10:05Z","title":"Design of Two-Level Incentive Mechanisms for Hierarchical Federated\n Learning","summary":" Hierarchical Federated Learning (HFL) is a distributed machine learning\nparadigm tailored for multi-tiered computation architectures, which supports\nmassive access of devices' models simultaneously. To enable efficient HFL, it\nis crucial to design suitable incentive mechanisms to ensure that devices\nactively participate in local training. However, there are few studies on\nincentive mechanism design for HFL. In this paper, we design two-level\nincentive mechanisms for the HFL with a two-tiered computing structure to\nencourage the participation of entities in each tier in the HFL training. In\nthe lower-level game, we propose a coalition formation game to joint optimize\nthe edge association and bandwidth allocation problem, and obtain efficient\ncoalition partitions by the proposed preference rule, which can be proven to be\nstable by exact potential game. In the upper-level game, we design the\nStackelberg game algorithm, which not only determines the optimal number of\nedge aggregations for edge servers to maximize their utility, but also optimize\nthe unit reward provided for the edge aggregation performance to ensure the\ninterests of cloud servers. Furthermore, numerical results indicate that the\nproposed algorithms can achieve better performance than the benchmark schemes.\n","authors":["Shunfeng Chu","Jun Li","Kang Wei","Yuwen Qian","Kunlun Wang","Feng Shu","Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2304.04162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08949v3","updated":"2024-01-16T06:59:29Z","published":"2022-12-17T20:45:34Z","title":"Managing Temporal Resolution in Continuous Value Estimation: A\n Fundamental Trade-off","summary":" A default assumption in reinforcement learning (RL) and optimal control is\nthat observations arrive at discrete time points on a fixed clock cycle. Yet,\nmany applications involve continuous-time systems where the time\ndiscretization, in principle, can be managed. The impact of time discretization\non RL methods has not been fully characterized in existing theory, but a more\ndetailed analysis of its effect could reveal opportunities for improving\ndata-efficiency. We address this gap by analyzing Monte-Carlo policy evaluation\nfor LQR systems and uncover a fundamental trade-off between approximation and\nstatistical error in value estimation. Importantly, these two errors behave\ndifferently to time discretization, leading to an optimal choice of temporal\nresolution for a given data budget. These findings show that managing the\ntemporal resolution can provably improve policy evaluation efficiency in LQR\nsystems with finite data. 
Empirically, we demonstrate the trade-off in\nnumerical simulations of LQR instances and standard RL benchmarks for\nnon-linear continuous control.\n","authors":["Zichen Zhang","Johannes Kirschner","Junxi Zhang","Francesco Zanini","Alex Ayoub","Masood Dehghan","Dale Schuurmans"],"pdf_url":"https://arxiv.org/pdf/2212.08949v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.12447v2","updated":"2024-01-16T06:56:51Z","published":"2023-10-19T03:54:31Z","title":"Constrained Reweighting of Distributions: an Optimal Transport Approach","summary":" We commonly encounter the problem of identifying an optimally weight adjusted\nversion of the empirical distribution of observed data, adhering to predefined\nconstraints on the weights. Such constraints often manifest as restrictions on\nthe moments, tail behaviour, shapes, number of modes, etc., of the resulting\nweight adjusted empirical distribution. In this article, we substantially\nenhance the flexibility of such methodology by introducing a nonparametrically\nimbued distributional constraints on the weights, and developing a general\nframework leveraging the maximum entropy principle and tools from optimal\ntransport. The key idea is to ensure that the maximum entropy weight adjusted\nempirical distribution of the observed data is close to a pre-specified\nprobability distribution in terms of the optimal transport metric while\nallowing for subtle departures. The versatility of the framework is\ndemonstrated in the context of three disparate applications where data\nre-weighting is warranted to satisfy side constraints on the optimization\nproblem at the heart of the statistical task: namely, portfolio allocation,\nsemi-parametric inference for complex surveys, and ensuring algorithmic\nfairness in machine learning algorithms.\n","authors":["Abhisek Chakraborty","Anirban Bhattacharya","Debdeep Pati"],"pdf_url":"https://arxiv.org/pdf/2310.12447v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.10085"},{"id":"http://arxiv.org/abs/2401.08150v1","updated":"2024-01-16T06:47:43Z","published":"2024-01-16T06:47:43Z","title":"Differentially Private Sliced Inverse Regression: Minimax Optimality and\n Algorithm","summary":" Privacy preservation has become a critical concern in high-dimensional data\nanalysis due to the growing prevalence of data-driven applications. Proposed by\nLi (1991), sliced inverse regression has emerged as a widely utilized\nstatistical technique for reducing covariate dimensionality while maintaining\nsufficient statistical information. In this paper, we propose optimally\ndifferentially private algorithms specifically designed to address privacy\nconcerns in the context of sufficient dimension reduction. We proceed to\nestablish lower bounds for differentially private sliced inverse regression in\nboth the low and high-dimensional settings. Moreover, we develop differentially\nprivate algorithms that achieve the minimax lower bounds up to logarithmic\nfactors. Through a combination of simulations and real data analysis, we\nillustrate the efficacy of these differentially private algorithms in\nsafeguarding privacy while preserving vital information within the reduced\ndimension space. 
As a natural extension, we can readily offer analogous lower\nand upper bounds for differentially private sparse principal component\nanalysis, a topic that may also be of potential interest to the statistical and\nmachine learning community.\n","authors":["Xintao Xia","Linjun Zhang","Zhanrui Cai"],"pdf_url":"https://arxiv.org/pdf/2401.08150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08147v1","updated":"2024-01-16T06:40:24Z","published":"2024-01-16T06:40:24Z","title":"Machine Learning on Dynamic Graphs: A Survey on Applications","summary":" Dynamic graph learning has gained significant attention as it offers a\npowerful means to model intricate interactions among entities across various\nreal-world and scientific domains. Notably, graphs serve as effective\nrepresentations for diverse networks such as transportation, brain, social, and\ninternet networks. Furthermore, the rapid advancements in machine learning have\nexpanded the scope of dynamic graph applications beyond the aforementioned\ndomains. In this paper, we present a review of lesser-explored applications of\ndynamic graph learning. This study revealed the potential of machine learning\non dynamic graphs in addressing challenges across diverse domains, including\nthose with limited levels of association with the field.\n","authors":["Sanaz Hasanzadeh Fard"],"pdf_url":"https://arxiv.org/pdf/2401.08147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08139v1","updated":"2024-01-16T06:18:11Z","published":"2024-01-16T06:18:11Z","title":"Transferring Core Knowledge via Learngenes","summary":" The pre-training paradigm fine-tunes the models trained on large-scale\ndatasets to downstream tasks with enhanced performance. It transfers all\nknowledge to downstream tasks without discriminating which part is necessary or\nunnecessary, which may lead to negative transfer. In comparison, knowledge\ntransfer in nature is much more efficient. When passing genetic information to\ndescendants, ancestors encode only the essential knowledge into genes, which\nact as the medium. Inspired by that, we adopt a recent concept called\n``learngene'' and refine its structures by mimicking the structures of natural\ngenes. We propose the Genetic Transfer Learning (GTL) -- a framework to copy\nthe evolutionary process of organisms into neural networks. GTL trains a\npopulation of networks, selects superior learngenes by tournaments, performs\nlearngene mutations, and passes the learngenes to next generations. Finally, we\nsuccessfully extract the learngenes of VGG11 and ResNet12. We show that the\nlearngenes bring the descendant networks instincts and strong learning ability:\nwith 20% parameters, the learngenes bring 12% and 16% improvements of accuracy\non CIFAR-FS and miniImageNet. Besides, the learngenes have the scalability and\nadaptability on the downstream structure of networks and datasets. 
Overall, we\noffer a novel insight that transferring core knowledge via learngenes may be\nsufficient and efficient for neural networks.\n","authors":["Fu Feng","Jing Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2401.08139v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08135v1","updated":"2024-01-16T06:01:02Z","published":"2024-01-16T06:01:02Z","title":"Machine Learning-Based Malicious Vehicle Detection for Security Threats\n and Attacks in Vehicle Ad-hoc Network (VANET) Communications","summary":" With the rapid growth of Vehicle Ad-hoc Network (VANET) as a promising\ntechnology for efficient and reliable communication among vehicles and\ninfrastructure, the security and integrity of VANET communications has become a\ncritical concern. One of the significant threats to VANET is the presence of\nblackhole attacks, where malicious nodes disrupt the network's functionality\nand compromise data confidentiality, integrity, and availability. In this\npaper, we propose a machine learning-based approach for blackhole detection in\nVANET. To achieve this task, we first create a comprehensive dataset comprising\nnormal and malicious traffic flows. Afterward, we study and define a promising\nset of features to discriminate the blackhole attacks. Finally, we evaluate\nvarious machine learning algorithms, including Gradient Boosting, Random\nForest, Support Vector Machines, k-Nearest Neighbors, Gaussian Naive Bayes, and\nLogistic Regression. Experimental results demonstrate the effectiveness of\nthese algorithms in distinguishing between normal and malicious nodes. Our\nfindings also highlight the potential of machine learning based approach in\nenhancing the security of VANET by detecting and mitigating blackhole attacks.\n","authors":["Thanh Nguyen Canh","Xiem HoangVan"],"pdf_url":"https://arxiv.org/pdf/2401.08135v1.pdf","comment":"In the 2023 RIVF International Conference on Computing and\n Communication Technologies, Hanoi, Vietnam"},{"id":"http://arxiv.org/abs/2306.00196v3","updated":"2024-01-16T05:42:06Z","published":"2023-05-31T21:26:43Z","title":"Restless Bandits with Average Reward: Breaking the Uniform Global\n Attractor Assumption","summary":" We study the infinite-horizon restless bandit problem with the average reward\ncriterion, in both discrete-time and continuous-time settings. A fundamental\ngoal is to efficiently compute policies that achieve a diminishing optimality\ngap as the number of arms, $N$, grows large. Existing results on asymptotic\noptimality all rely on the uniform global attractor property (UGAP), a complex\nand challenging-to-verify assumption. In this paper, we propose a general,\nsimulation-based framework, Follow-the-Virtual-Advice, that converts any\nsingle-armed policy into a policy for the original $N$-armed problem. This is\ndone by simulating the single-armed policy on each arm and carefully steering\nthe real state towards the simulated state. Our framework can be instantiated\nto produce a policy with an $O(1/\\sqrt{N})$ optimality gap. In the\ndiscrete-time setting, our result holds under a simpler synchronization\nassumption, which covers some problem instances that violate UGAP. More\nnotably, in the continuous-time setting, we do not require \\emph{any}\nadditional assumptions beyond the standard unichain condition. 
In both\nsettings, our work is the first asymptotic optimality result that does not\nrequire UGAP.\n","authors":["Yige Hong","Qiaomin Xie","Yudong Chen","Weina Wang"],"pdf_url":"https://arxiv.org/pdf/2306.00196v3.pdf","comment":"NeurIPS 2023. 35 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.08121v1","updated":"2024-01-16T05:28:12Z","published":"2024-01-16T05:28:12Z","title":"CycLight: learning traffic signal cooperation with a cycle-level\n strategy","summary":" This study introduces CycLight, a novel cycle-level deep reinforcement\nlearning (RL) approach for network-level adaptive traffic signal control\n(NATSC) systems. Unlike most traditional RL-based traffic controllers that\nfocus on step-by-step decision making, CycLight adopts a cycle-level strategy,\noptimizing cycle length and splits simultaneously using Parameterized Deep\nQ-Networks (PDQN) algorithm. This cycle-level approach effectively reduces the\ncomputational burden associated with frequent data communication, meanwhile\nenhancing the practicality and safety of real-world applications. A\ndecentralized framework is formulated for multi-agent cooperation, while\nattention mechanism is integrated to accurately assess the impact of the\nsurroundings on the current intersection. CycLight is tested in a large\nsynthetic traffic grid using the microscopic traffic simulation tool, SUMO.\nExperimental results not only demonstrate the superiority of CycLight over\nother state-of-the-art approaches but also showcase its robustness against\ninformation transmission delays.\n","authors":["Gengyue Han","Xiaohan Liu","Xianyue Peng","Hao Wang","Yu Han"],"pdf_url":"https://arxiv.org/pdf/2401.08121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08119v1","updated":"2024-01-16T05:23:34Z","published":"2024-01-16T05:23:34Z","title":"SpecSTG: A Fast Spectral Diffusion Framework for Probabilistic\n Spatio-Temporal Traffic Forecasting","summary":" Traffic forecasting, a crucial application of spatio-temporal graph (STG)\nlearning, has traditionally relied on deterministic models for accurate point\nestimations. Yet, these models fall short of identifying latent risks of\nunexpected volatility in future observations. To address this gap,\nprobabilistic methods, especially variants of diffusion models, have emerged as\nuncertainty-aware solutions. However, existing diffusion methods typically\nfocus on generating separate future time series for individual sensors in the\ntraffic network, resulting in insufficient involvement of spatial network\ncharacteristics in the probabilistic learning process. To better leverage\nspatial dependencies and systematic patterns inherent in traffic data, we\npropose SpecSTG, a novel spectral diffusion framework. Our method generates the\nFourier representation of future time series, transforming the learning process\ninto the spectral domain enriched with spatial information. 
Additionally, our\napproach incorporates a fast spectral graph convolution designed for Fourier\ninput, alleviating the computational burden associated with existing models.\nNumerical experiments show that SpecSTG achieves outstanding performance with\ntraffic flow and traffic speed datasets compared to state-of-the-art baselines.\nThe source code for SpecSTG is available at\nhttps://anonymous.4open.science/r/SpecSTG.\n","authors":["Lequan Lin","Dai Shi","Andi Han","Junbin Gao"],"pdf_url":"https://arxiv.org/pdf/2401.08119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07586v3","updated":"2024-01-16T04:36:23Z","published":"2023-12-11T02:40:40Z","title":"Characteristic Guidance: Non-linear Correction for Diffusion Model at\n Large Guidance Scale","summary":" Popular guidance for denoising diffusion probabilistic model (DDPM) linearly\ncombines distinct conditional models together to provide enhanced control over\nsamples. However, this approach overlooks nonlinear effects that become\nsignificant when guidance scale is large. To address this issue, we propose\ncharacteristic guidance, a sampling method that provides first-principle\nnon-linear correction for classifier-free guided DDPMs. Such correction forces\nthe guided DDPMs to respect the Fokker-Planck equation of their underlying\ndiffusion process, in a way that is training-free, derivative-free, and\ncompatible with existing sampling methods. Experiments show that characteristic\nguidance enhances control and reduces color and exposure issues in image\ngeneration, proving effective in diverse applications ranging from latent space\nsampling to solving physics problems like magnet phase transitions.\n","authors":["Candi Zheng","Yuan Lan"],"pdf_url":"https://arxiv.org/pdf/2312.07586v3.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.20004v2","updated":"2024-01-16T04:15:25Z","published":"2023-05-31T16:25:07Z","title":"Learning to solve Bayesian inverse problems: An amortized variational\n inference approach using Gaussian and Flow guides","summary":" Inverse problems, i.e., estimating parameters of physical models from\nexperimental data, are ubiquitous in science and engineering. The Bayesian\nformulation is the gold standard because it alleviates ill-posedness issues and\nquantifies epistemic uncertainty. Since analytical posteriors are not typically\navailable, one resorts to Markov chain Monte Carlo sampling or approximate\nvariational inference. However, inference needs to be rerun from scratch for\neach new set of data. This drawback limits the applicability of the Bayesian\nformulation to real-time settings, e.g., health monitoring of engineered\nsystems, and medical diagnosis. The objective of this paper is to develop a\nmethodology that enables real-time inference by learning the Bayesian inverse\nmap, i.e., the map from data to posteriors. Our approach is as follows. We\nparameterize the posterior distribution as a function of data. This work\noutlines two distinct approaches to do this. The first method involves\nparameterizing the posterior using an amortized full-rank Gaussian guide,\nimplemented through neural networks. The second method utilizes a Conditional\nNormalizing Flow guide, employing conditional invertible neural networks for\ncases where the target posterior is arbitrarily complex. 
In both approaches, we\nlearn the network parameters by amortized variational inference which involves\nmaximizing the expectation of evidence lower bound over all possible datasets\ncompatible with the model. We demonstrate our approach by solving a set of\nbenchmark problems from science and engineering. Our results show that the\nposterior estimates of our approach are in agreement with the corresponding\nground truth obtained by Markov chain Monte Carlo. Once trained, our approach\nprovides the posterior distribution for a given observation just at the cost of\na forward pass of the neural network.\n","authors":["Sharmila Karumuri","Ilias Bilionis"],"pdf_url":"https://arxiv.org/pdf/2305.20004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11589v3","updated":"2024-01-16T03:46:39Z","published":"2023-06-20T15:07:37Z","title":"Sampling from Gaussian Process Posteriors using Stochastic Gradient\n Descent","summary":" Gaussian processes are a powerful framework for quantifying uncertainty and\nfor sequential decision-making but are limited by the requirement of solving\nlinear systems. In general, this has a cubic cost in dataset size and is\nsensitive to conditioning. We explore stochastic gradient algorithms as a\ncomputationally efficient method of approximately solving these linear systems:\nwe develop low-variance optimization objectives for sampling from the posterior\nand extend these to inducing points. Counterintuitively, stochastic gradient\ndescent often produces accurate predictions, even in cases where it does not\nconverge quickly to the optimum. We explain this through a spectral\ncharacterization of the implicit bias from non-convergence. We show that\nstochastic gradient descent produces predictive distributions close to the true\nposterior both in regions with sufficient data coverage, and in regions\nsufficiently far away from the data. Experimentally, stochastic gradient\ndescent achieves state-of-the-art performance on sufficiently large-scale or\nill-conditioned regression tasks. Its uncertainty estimates match the\nperformance of significantly more expensive baselines on a large-scale Bayesian\noptimization task.\n","authors":["Jihao Andreas Lin","Javier Antorán","Shreyas Padhy","David Janz","José Miguel Hernández-Lobato","Alexander Terenin"],"pdf_url":"https://arxiv.org/pdf/2306.11589v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02251v3","updated":"2024-01-16T03:38:44Z","published":"2023-07-05T12:49:02Z","title":"RanPAC: Random Projections and Pre-trained Models for Continual Learning","summary":" Continual learning (CL) aims to incrementally learn different tasks (such as\nclassification) in a non-stationary data stream without forgetting old ones.\nMost CL works focus on tackling catastrophic forgetting under a\nlearning-from-scratch paradigm. However, with the increasing prominence of\nfoundation models, pre-trained models equipped with informative representations\nhave become available for various downstream requirements. Several CL methods\nbased on pre-trained models have been explored, either utilizing pre-extracted\nfeatures directly (which makes bridging distribution gaps challenging) or\nincorporating adaptors (which may be subject to forgetting). In this paper, we\npropose a concise and effective approach for CL with pre-trained models. 
Given\nthat forgetting occurs during parameter updating, we contemplate an alternative\napproach that exploits training-free random projectors and class-prototype\naccumulation, which thus bypasses the issue. Specifically, we inject a frozen\nRandom Projection layer with nonlinear activation between the pre-trained\nmodel's feature representations and output head, which captures interactions\nbetween features with expanded dimensionality, providing enhanced linear\nseparability for class-prototype-based CL. We also demonstrate the importance\nof decorrelating the class-prototypes to reduce the distribution disparity when\nusing pre-trained representations. These techniques prove to be effective and\ncircumvent the problem of forgetting for both class- and domain-incremental\ncontinual learning. Compared to previous methods applied to pre-trained\nViT-B/16 models, we reduce final error rates by between 20% and 62% on seven\nclass-incremental benchmarks, despite not using any rehearsal memory. We\nconclude that the full potential of pre-trained models for simple, effective,\nand fast CL has not hitherto been fully tapped. Code is at\ngithub.com/RanPAC/RanPAC.\n","authors":["Mark D. McDonnell","Dong Gong","Amin Parveneh","Ehsan Abbasnejad","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2307.02251v3.pdf","comment":"32 pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.08092v1","updated":"2024-01-16T03:35:26Z","published":"2024-01-16T03:35:26Z","title":"A Survey of Resource-efficient LLM and Multimodal Foundation Models","summary":" Large foundation models, including large language models (LLMs), vision\ntransformers (ViTs), diffusion, and LLM-based multimodal models, are\nrevolutionizing the entire machine learning lifecycle, from training to\ndeployment. However, the substantial advancements in versatility and\nperformance these models offer come at a significant cost in terms of hardware\nresources. To support the growth of these large models in a scalable and\nenvironmentally sustainable way, there has been a considerable focus on\ndeveloping resource-efficient strategies. This survey delves into the critical\nimportance of such research, examining both algorithmic and systemic aspects.\nIt offers a comprehensive analysis and valuable insights gleaned from existing\nliterature, encompassing a broad array of topics from cutting-edge model\narchitectures and training/serving algorithms to practical system designs and\nimplementations. The goal of this survey is to provide an overarching\nunderstanding of how current approaches are tackling the resource challenges\nposed by large foundation models and to potentially inspire future\nbreakthroughs in this field.\n","authors":["Mengwei Xu","Wangsong Yin","Dongqi Cai","Rongjie Yi","Daliang Xu","Qipeng Wang","Bingyang Wu","Yihao Zhao","Chen Yang","Shihe Wang","Qiyang Zhang","Zhenyan Lu","Li Zhang","Shangguang Wang","Yuanchun Li","Yunxin Liu","Xin Jin","Xuanzhe Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03590v2","updated":"2024-01-16T03:27:31Z","published":"2023-07-07T13:34:27Z","title":"Accelerated Optimization Landscape of Linear-Quadratic Regulator","summary":" Linear-quadratic regulator (LQR) is a landmark problem in the field of\noptimal control, which is the concern of this paper. Generally, LQR is\nclassified into state-feedback LQR (SLQR) and output-feedback LQR (OLQR) based\non whether the full state is obtained. 
It has been suggested in existing\nliterature that both SLQR and OLQR could be viewed as \\textit{constrained\nnonconvex matrix optimization} problems in which the only variable to be\noptimized is the feedback gain matrix. In this paper, we introduce a\nfirst-order accelerated optimization framework of handling the LQR problem, and\ngive its convergence analysis for the cases of SLQR and OLQR, respectively.\n Specifically, a Lipschiz Hessian property of LQR performance criterion is\npresented, which turns out to be a crucial property for the application of\nmodern optimization techniques. For the SLQR problem, a continuous-time hybrid\ndynamic system is introduced, whose solution trajectory is shown to converge\nexponentially to the optimal feedback gain with Nesterov-optimal order\n$1-\\frac{1}{\\sqrt{\\kappa}}$ ($\\kappa$ the condition number). Then, the\nsymplectic Euler scheme is utilized to discretize the hybrid dynamic system,\nand a Nesterov-type method with a restarting rule is proposed that preserves\nthe continuous-time convergence rate, i.e., the discretized algorithm admits\nthe Nesterov-optimal convergence order. For the OLQR problem, a Hessian-free\naccelerated framework is proposed, which is a two-procedure method consisting\nof semiconvex function optimization and negative curvature exploitation. In a\ntime $\\mathcal{O}(\\epsilon^{-7/4}\\log(1/\\epsilon))$, the method can find an\n$\\epsilon$-stationary point of the performance criterion; this entails that the\nmethod improves upon the $\\mathcal{O}(\\epsilon^{-2})$ complexity of vanilla\ngradient descent. Moreover, our method provides the second-order guarantee of\nstationary point.\n","authors":["Lechen Feng","Yuan-Hua Ni"],"pdf_url":"https://arxiv.org/pdf/2307.03590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07272v2","updated":"2024-01-16T03:22:15Z","published":"2023-08-14T16:58:50Z","title":"Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt\n Generation for Few-shot Learning","summary":" Prompt-based pre-trained language models (PLMs) paradigm have succeeded\nsubstantially in few-shot natural language processing (NLP) tasks. However,\nprior discrete prompt optimization methods require expert knowledge to design\nthe base prompt set and identify high-quality prompts, which is costly,\ninefficient, and subjective. Meanwhile, existing continuous prompt optimization\nmethods improve the performance by learning the ideal prompts through the\ngradient information of PLMs, whose high computational cost, and low\nreadability and generalizability are often concerning. To address the research\ngap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt\nOptimization ($DP_2O$) method. We first design a multi-round dialogue alignment\nstrategy for readability prompt set generation based on GPT-4. Furthermore, we\npropose an efficient prompt screening metric to identify high-quality prompts\nwith linear complexity. Finally, we construct a reinforcement learning (RL)\nframework based on policy gradients to match the prompts to inputs optimally.\nBy training a policy network with only 0.67% of the PLM parameter size on the\ntasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)\nmethod by 1.52% in accuracy on average on four open-source datasets. 
Moreover,\nsubsequent experiments also demonstrate that $DP_2O$ has good universality,\nrobustness, and generalization ability.\n","authors":["Chengzhengxu Li","Xiaoming Liu","Yichen Wang","Duyi Li","Yu Lan","Chao Shen"],"pdf_url":"https://arxiv.org/pdf/2308.07272v2.pdf","comment":"AAAI 2024 Main Track"},{"id":"http://arxiv.org/abs/2401.08081v1","updated":"2024-01-16T03:15:52Z","published":"2024-01-16T03:15:52Z","title":"Predicting Next Useful Location With Context-Awareness: The\n State-Of-The-Art","summary":" Predicting the future location of mobile objects reinforces location-aware\nservices with proactive intelligence and helps businesses and decision-makers\nwith better planning and near real-time scheduling in different applications\nsuch as traffic congestion control, location-aware advertisements, and\nmonitoring public health and well-being. The recent developments in the\nsmartphone and location sensors technology and the prevalence of using\nlocation-based social networks alongside the improvements in artificial\nintelligence and machine learning techniques provide an excellent opportunity\nto exploit massive amounts of historical and real-time contextual information\nto recognise mobility patterns and achieve more accurate and intelligent\npredictions. This survey provides a comprehensive overview of the next useful\nlocation prediction problem with context-awareness. First, we explain the\nconcepts of context and context-awareness and define the next location\nprediction problem. Then we analyse nearly thirty studies in this field\nconcerning the prediction method, the challenges addressed, the datasets and\nmetrics used for training and evaluating the model, and the types of context\nincorporated. Finally, we discuss the advantages and disadvantages of different\napproaches, focusing on the usefulness of the predicted location and\nidentifying the open challenges and future work on this subject by introducing\ntwo potential use cases of next location prediction in the automotive industry.\n","authors":["Alireza Nezhadettehad","Arkady Zaslavsky","Rakib Abdur","Siraj Ahmed Shaikh","Seng W. Loke","Guang-Li Huang","Alireza Hassani"],"pdf_url":"https://arxiv.org/pdf/2401.08081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08077v1","updated":"2024-01-16T03:03:39Z","published":"2024-01-16T03:03:39Z","title":"Transformer-based approach for Ethereum Price Prediction Using\n Crosscurrency correlation and Sentiment Analysis","summary":" The research delves into the capabilities of a transformer-based neural\nnetwork for Ethereum cryptocurrency price forecasting. The experiment runs\naround the hypothesis that cryptocurrency prices are strongly correlated with\nother cryptocurrencies and the sentiments around the cryptocurrency. The model\nemploys a transformer architecture for several setups from single-feature\nscenarios to complex configurations incorporating volume, sentiment, and\ncorrelated cryptocurrency prices. Despite a smaller dataset and less complex\narchitecture, the transformer model surpasses ANN and MLP counterparts on some\nparameters. 
The conclusion presents a hypothesis on the illusion of causality\nin cryptocurrency price movements driven by sentiments.\n","authors":["Shubham Singh","Mayur Bhat"],"pdf_url":"https://arxiv.org/pdf/2401.08077v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2303.14822v3","updated":"2024-01-16T02:48:05Z","published":"2023-03-26T21:12:36Z","title":"MGTBench: Benchmarking Machine-Generated Text Detection","summary":" Nowadays, powerful large language models (LLMs) such as ChatGPT have\ndemonstrated revolutionary power in a variety of tasks. Consequently, the\ndetection of machine-generated texts (MGTs) is becoming increasingly crucial as\nLLMs become more advanced and prevalent. These models have the ability to\ngenerate human-like language, making it challenging to discern whether a text\nis authored by a human or a machine. This raises concerns regarding\nauthenticity, accountability, and potential bias. However, existing methods for\ndetecting MGTs are evaluated using different model architectures, datasets, and\nexperimental settings, resulting in a lack of a comprehensive evaluation\nframework that encompasses various methodologies. Furthermore, it remains\nunclear how existing detection methods would perform against powerful LLMs. In\nthis paper, we fill this gap by proposing the first benchmark framework for MGT\ndetection against powerful LLMs, named MGTBench. Extensive evaluations on\npublic datasets with curated texts generated by various powerful LLMs such as\nChatGPT-turbo and Claude demonstrate the effectiveness of different detection\nmethods. Our ablation study shows that a larger number of words in general\nleads to better performance and most detection methods can achieve similar\nperformance with much fewer training samples. Moreover, we delve into a more\nchallenging task: text attribution. Our findings indicate that the model-based\ndetection methods still perform well in the text attribution task. To\ninvestigate the robustness of different detection methods, we consider three\nadversarial attacks, namely paraphrasing, random spacing, and adversarial\nperturbations. We discover that these attacks can significantly diminish\ndetection effectiveness, underscoring the critical need for the development of\nmore robust detection methods.\n","authors":["Xinlei He","Xinyue Shen","Zeyuan Chen","Michael Backes","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2303.14822v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08061v1","updated":"2024-01-16T02:42:45Z","published":"2024-01-16T02:42:45Z","title":"Augmenting Ground-Level PM2.5 Prediction via Kriging-Based Pseudo-Label\n Generation","summary":" Fusing abundant satellite data with sparse ground measurements constitutes a\nmajor challenge in climate modeling. To address this, we propose a strategy to\naugment the training dataset by introducing unlabeled satellite images paired\nwith pseudo-labels generated through a spatial interpolation technique known as\nordinary kriging, thereby making full use of the available satellite data\nresources. 
We show that the proposed data augmentation strategy helps enhance\nthe performance of the state-of-the-art convolutional neural network-random\nforest (CNN-RF) model by a reasonable amount, resulting in a noteworthy\nimprovement in spatial correlation and a reduction in prediction error.\n","authors":["Lei Duan","Ziyang Jiang","David Carlson"],"pdf_url":"https://arxiv.org/pdf/2401.08061v1.pdf","comment":"8 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change\n with Machine Learning"},{"id":"http://arxiv.org/abs/2401.08047v1","updated":"2024-01-16T02:00:17Z","published":"2024-01-16T02:00:17Z","title":"Incremental Extractive Opinion Summarization Using Cover Trees","summary":" Extractive opinion summarization involves automatically producing a summary\nof text about an entity (e.g., a product's reviews) by extracting\nrepresentative sentences that capture prevalent opinions in the review set.\nTypically, in online marketplaces user reviews accrue over time, and opinion\nsummaries need to be updated periodically to provide customers with up-to-date\ninformation. In this work, we study the task of extractive opinion\nsummarization in an incremental setting, where the underlying review set\nevolves over time. Many of the state-of-the-art extractive opinion\nsummarization approaches are centrality-based, such as CentroidRank.\nCentroidRank performs extractive summarization by selecting a subset of review\nsentences closest to the centroid in the representation space as the summary.\nHowever, these methods are not capable of operating efficiently in an\nincremental setting, where reviews arrive one at a time. In this paper, we\npresent an efficient algorithm for accurately computing the CentroidRank\nsummaries in an incremental setting. Our approach, CoverSumm, relies on\nindexing review representations in a cover tree and maintaining a reservoir of\ncandidate summary review sentences. CoverSumm's efficacy is supported by a\ntheoretical and empirical analysis of running time. Empirically, on a diverse\ncollection of data (both real and synthetically created to illustrate scaling\nconsiderations), we demonstrate that CoverSumm is up to 25x faster than\nbaseline methods, and capable of adapting to nuanced changes in data\ndistribution. We also conduct human evaluations of the generated summaries and\nfind that CoverSumm is capable of producing informative summaries consistent\nwith the underlying review set.\n","authors":["Somnath Basu Roy Chowdhury","Nicholas Monath","Avinava Dubey","Manzil Zaheer","Andrew McCallum","Amr Ahmed","Snigdha Chaturvedi"],"pdf_url":"https://arxiv.org/pdf/2401.08047v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2205.00932v3","updated":"2024-01-16T02:00:14Z","published":"2022-05-02T14:27:35Z","title":"Understanding CNNs from excitations","summary":" Saliency maps have proven to be a highly efficacious approach for explicating\nthe decisions of Convolutional Neural Networks. However, extant methodologies\npredominantly rely on gradients, which constrain their ability to explicate\ncomplex models. Furthermore, such approaches are not fully adept at leveraging\nnegative gradient information to improve interpretive veracity. In this study,\nwe present a novel concept, termed positive and negative excitation, which\nenables the direct extraction of positive and negative excitation for each\nlayer, thus enabling complete layer-by-layer information utilization sans\ngradients. 
To organize these excitations into final saliency maps, we introduce\na double-chain backpropagation procedure. A comprehensive experimental\nevaluation, encompassing both binary classification and multi-classification\ntasks, was conducted to gauge the effectiveness of our proposed method.\nEncouragingly, the results evince that our approach offers a significant\nimprovement over the state-of-the-art methods in terms of salient pixel\nremoval, minor pixel removal, and inconspicuous adversarial perturbation\ngeneration guidance. Additionally, we verify the correlation between positive\nand negative excitations.\n","authors":["Zijian Ying","Qianmu Li","Zhichao Lian","Jun Hou","Tong Lin","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2205.00932v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.13114v3","updated":"2024-01-16T01:34:20Z","published":"2022-05-26T02:43:29Z","title":"Contextual Pandora's Box","summary":" Pandora's Box is a fundamental stochastic optimization problem, where the\ndecision-maker must find a good alternative while minimizing the search cost of\nexploring the value of each alternative. In the original formulation, it is\nassumed that accurate distributions are given for the values of all the\nalternatives, while recent work studies the online variant of Pandora's Box\nwhere the distributions are originally unknown. In this work, we study\nPandora's Box in the online setting, while incorporating context. At every\nround, we are presented with a number of alternatives each having a context, an\nexploration cost and an unknown value drawn from an unknown distribution that\nmay change at every round. Our main result is a no-regret algorithm that\nperforms comparably well to the optimal algorithm which knows all prior\ndistributions exactly. Our algorithm works even in the bandit setting where the\nalgorithm never learns the values of the alternatives that were not explored.\nThe key technique that enables our result is a novel modification of the\nrealizability condition in contextual bandits that connects a context to a\nsufficient statistic of each alternative's distribution (its \"reservation\nvalue\") rather than its mean.\n","authors":["Alexia Atsidakou","Constantine Caramanis","Evangelia Gergatsouli","Orestis Papadigenopoulos","Christos Tzamos"],"pdf_url":"https://arxiv.org/pdf/2205.13114v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08038v1","updated":"2024-01-16T01:27:26Z","published":"2024-01-16T01:27:26Z","title":"Calpric: Inclusive and Fine-grain Labeling of Privacy Policies with\n Crowdsourcing and Active Learning","summary":" A significant challenge to training accurate deep learning models on privacy\npolicies is the cost and difficulty of obtaining a large and comprehensive set\nof training data. To address these challenges, we present Calpric , which\ncombines automatic text selection and segmentation, active learning and the use\nof crowdsourced annotators to generate a large, balanced training set for\nprivacy policies at low cost. Automated text selection and segmentation\nsimplifies the labeling task, enabling untrained annotators from crowdsourcing\nplatforms, like Amazon's Mechanical Turk, to be competitive with trained\nannotators, such as law students, and also reduces inter-annotator agreement,\nwhich decreases labeling cost. 
Having reliable labels for training enables the\nuse of active learning, which uses fewer training samples to efficiently cover\nthe input space, further reducing cost and improving class and data category\nbalance in the data set. The combination of these techniques allows Calpric to\nproduce models that are accurate over a wider range of data categories, and\nprovide more detailed, fine-grain labels than previous work. Our crowdsourcing\nprocess enables Calpric to attain reliable labeled data at a cost of roughly\n$0.92-$1.71 per labeled text segment. Calpric 's training process also\ngenerates a labeled data set of 16K privacy policy text segments across 9 Data\ncategories with balanced positive and negative samples.\n","authors":["Wenjun Qiu","David Lie","Lisa Austin"],"pdf_url":"https://arxiv.org/pdf/2401.08038v1.pdf","comment":"published at USENIX Security 2023; associated website:\n https://www.usenix.org/conference/usenixsecurity23/presentation/qiu"},{"id":"http://arxiv.org/abs/2311.02794v2","updated":"2024-01-16T01:18:50Z","published":"2023-11-05T23:37:31Z","title":"Modelling Cellular Perturbations with the Sparse Additive Mechanism\n Shift Variational Autoencoder","summary":" Generative models of observations under interventions have been a vibrant\ntopic of interest across machine learning and the sciences in recent years. For\nexample, in drug discovery, there is a need to model the effects of diverse\ninterventions on cells in order to characterize unknown biological mechanisms\nof action. We propose the Sparse Additive Mechanism Shift Variational\nAutoencoder, SAMS-VAE, to combine compositionality, disentanglement, and\ninterpretability for perturbation models. SAMS-VAE models the latent state of a\nperturbed sample as the sum of a local latent variable capturing\nsample-specific variation and sparse global variables of latent intervention\neffects. Crucially, SAMS-VAE sparsifies these global latent variables for\nindividual perturbations to identify disentangled, perturbation-specific latent\nsubspaces that are flexibly composable. We evaluate SAMS-VAE both\nquantitatively and qualitatively on a range of tasks using two popular single\ncell sequencing datasets. In order to measure perturbation-specific\nmodel-properties, we also introduce a framework for evaluation of perturbation\nmodels based on average treatment effects with links to posterior predictive\nchecks. SAMS-VAE outperforms comparable models in terms of generalization\nacross in-distribution and out-of-distribution tasks, including a combinatorial\nreasoning task under resource paucity, and yields interpretable latent\nstructures which correlate strongly to known biological mechanisms. Our results\nsuggest SAMS-VAE is an interesting addition to the modeling toolkit for machine\nlearning-driven scientific discovery.\n","authors":["Michael Bereket","Theofanis Karaletsos"],"pdf_url":"https://arxiv.org/pdf/2311.02794v2.pdf","comment":"Presented at the 37th Conference on Neural Information Processing\n Systems (NeurIPS 2023) (Post-NeurIPS fixes: cosmetic fixes, updated\n references, added simulation to appendix)"},{"id":"http://arxiv.org/abs/2401.08035v1","updated":"2024-01-16T01:08:19Z","published":"2024-01-16T01:08:19Z","title":"BanglaNet: Bangla Handwritten Character Recognition using Ensembling of\n Convolutional Neural Network","summary":" Handwritten character recognition is a crucial task because of its abundant\napplications. 
The recognition task of Bangla handwritten characters is\nespecially challenging because of the cursive nature of Bangla characters and\nthe presence of compound characters with more than one way of writing. In this\npaper, a classification model based on the ensembling of several Convolutional\nNeural Networks (CNN), namely, BanglaNet is proposed to classify Bangla basic\ncharacters, compound characters, numerals, and modifiers. Three different\nmodels based on the idea of state-of-the-art CNN models like Inception, ResNet,\nand DenseNet have been trained with both augmented and non-augmented inputs.\nFinally, all these models are averaged or ensembled to get the finishing model.\nRigorous experimentation on three benchmark Bangla handwritten characters\ndatasets, namely, CMATERdb, BanglaLekha-Isolated, and Ekush has exhibited\nsignificant recognition accuracies compared to some recent CNN-based research.\nThe top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and\nthe top-3 accuracies are 99.79%, 99.74%, and 99.56% for CMATERdb,\nBanglaLekha-Isolated, and Ekush datasets respectively.\n","authors":["Chandrika Saha","Md. Mostafijur Rahman"],"pdf_url":"https://arxiv.org/pdf/2401.08035v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08032v1","updated":"2024-01-16T01:03:39Z","published":"2024-01-16T01:03:39Z","title":"Structure-based out-of-distribution (OOD) materials property prediction:\n a benchmark study","summary":" In real-world material research, machine learning (ML) models are usually\nexpected to predict and discover novel exceptional materials that deviate from\nthe known materials. It is thus a pressing question to provide an objective\nevaluation of ML model performances in property prediction of\nout-of-distribution (OOD) materials that are different from the training set\ndistribution. Traditional performance evaluation of materials property\nprediction models through random splitting of the dataset frequently results in\nartificially high performance assessments due to the inherent redundancy of\ntypical material datasets. Here we present a comprehensive benchmark study of\nstructure-based graph neural networks (GNNs) for extrapolative OOD materials\nproperty prediction. We formulate five different categories of OOD ML problems\nfor three benchmark datasets from the MatBench study. Our extensive experiments\nshow that current state-of-the-art GNN algorithms significantly underperform\nfor the OOD property prediction tasks on average compared to their baselines in\nthe MatBench study, demonstrating a crucial generalization gap in realistic\nmaterial prediction tasks. We further examine the latent physical spaces of\nthese GNN models and identify the sources of CGCNN, ALIGNN, and DeeperGATGNN's\nsignificantly more robust OOD performance than those of the current best models\nin the MatBench study (coGN and coNGN), and provide insights to improve their\nperformance.\n","authors":["Sadman Sadeed Omee","Nihang Fu","Rongzhi Dong","Ming Hu","Jianjun Hu"],"pdf_url":"https://arxiv.org/pdf/2401.08032v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2401.08025v1","updated":"2024-01-16T00:46:29Z","published":"2024-01-16T00:46:29Z","title":"Self-Imagine: Effective Unimodal Reasoning with Multimodal Models using\n Self-Imagination","summary":" The potential of Vision-Language Models (\\textsc{vlm}s) often remains\nunderutilized in handling complex text-based problems, particularly when these\nproblems could benefit from visual representation. 
Resonating with humans'\nability to solve complex text-based problems by (1) creating a visual diagram\nfrom the problem and (2) deducing what steps they need to take to solve it, we\npropose \textsc{Self-Imagine}. We leverage a single Vision-Language Model\n(\textsc{vlm}) to generate a structured representation of the question using\nHTML, then render the HTML as an image, and finally use the same \textsc{vlm}\nto answer the question using both the question and the image. Our approach does\nnot require any additional training data or training. We evaluate our approach\nin three mathematics tasks and nine general-purpose reasoning tasks using a\nstate-of-the-art \textsc{vlm}. Our approach boosts the performance of\n\textsc{vlm} on all math tasks (GSM8K: +4.62\%; ASDiv: +4.49\%; SVAMP:\n+9.30\%) and the majority of the general-purpose reasoning tasks by 0.4\% to\n13.20\% while achieving comparable performance in other tasks.\n Code and data at https://github.com/snat1505027/self-imagine .\n","authors":["Syeda Nahida Akter","Aman Madaan","Sangwu Lee","Yiming Yang","Eric Nyberg"],"pdf_url":"https://arxiv.org/pdf/2401.08025v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.01644v2","updated":"2024-01-16T00:21:43Z","published":"2023-11-03T00:21:36Z","title":"Should Under-parameterized Student Networks Copy or Average Teacher\n Weights?","summary":" Any continuous function $f^*$ can be approximated arbitrarily well by a\nneural network with sufficiently many neurons $k$. We consider the case when\n$f^*$ itself is a neural network with one hidden layer and $k$ neurons.\nApproximating $f^*$ with a neural network with $n< k$ neurons can thus be seen\nas fitting an under-parameterized \"student\" network with $n$ neurons to a\n\"teacher\" network with $k$ neurons. As the student has fewer neurons than the\nteacher, it is unclear whether each of the $n$ student neurons should copy one\nof the teacher neurons or rather average a group of teacher neurons. For\nshallow neural networks with erf activation function and for the standard\nGaussian input distribution, we prove that \"copy-average\" configurations are\ncritical points if the teacher's incoming vectors are orthonormal and its\noutgoing weights are unitary. Moreover, the optimum among such configurations\nis reached when $n-1$ student neurons each copy one teacher neuron and the\n$n$-th student neuron averages the remaining $k-n+1$ teacher neurons.
For the\nstudent network with $n=1$ neuron, we provide additionally a closed-form\nsolution of the non-trivial critical point(s) for commonly used activation\nfunctions through solving an equivalent constrained optimization problem.\nEmpirically, we find for the erf activation function that gradient flow\nconverges either to the optimal copy-average critical point or to another point\nwhere each student neuron approximately copies a different teacher neuron.\nFinally, we find similar results for the ReLU activation function, suggesting\nthat the optimal solution of underparameterized networks has a universal\nstructure.\n","authors":["Berfin Şimşek","Amire Bendjeddou","Wulfram Gerstner","Johanni Brea"],"pdf_url":"https://arxiv.org/pdf/2311.01644v2.pdf","comment":"41 pages, presented at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.06872v2","updated":"2024-01-16T00:20:23Z","published":"2023-10-09T05:34:21Z","title":"On sparse regression, Lp-regularization, and automated model discovery","summary":" Sparse regression and feature extraction are the cornerstones of knowledge\ndiscovery from massive data. Their goal is to discover interpretable and\npredictive models that provide simple relationships among scientific variables.\nWhile the statistical tools for model discovery are well established in the\ncontext of linear regression, their generalization to nonlinear regression in\nmaterial modeling is highly problem-specific and insufficiently understood.\nHere we explore the potential of neural networks for automatic model discovery\nand induce sparsity by a hybrid approach that combines two strategies:\nregularization and physical constraints. We integrate the concept of Lp\nregularization for subset selection with constitutive neural networks that\nleverage our domain knowledge in kinematics and thermodynamics. We train our\nnetworks with both, synthetic and real data, and perform several thousand\ndiscovery runs to infer common guidelines and trends: L2 regularization or\nridge regression is unsuitable for model discovery; L1 regularization or lasso\npromotes sparsity, but induces strong bias; only L0 regularization allows us to\ntransparently fine-tune the trade-off between interpretability and\npredictability, simplicity and accuracy, and bias and variance. With these\ninsights, we demonstrate that Lp regularized constitutive neural networks can\nsimultaneously discover both, interpretable models and physically meaningful\nparameters. We anticipate that our findings will generalize to alternative\ndiscovery techniques such as sparse and symbolic regression, and to other\ndomains such as biology, chemistry, or medicine. Our ability to automatically\ndiscover material models from data could have tremendous applications in\ngenerative material design and open new opportunities to manipulate matter,\nalter properties of existing materials, and discover new materials with\nuser-defined properties.\n","authors":["Jeremy A. McCulloch","Skyler R. St. Pierre","Kevin Linka","Ellen Kuhl"],"pdf_url":"https://arxiv.org/pdf/2310.06872v2.pdf","comment":"35 pages, 15 figures, 2 tables, 62 references"},{"id":"http://arxiv.org/abs/2210.06758v2","updated":"2024-01-16T23:54:43Z","published":"2022-10-13T05:56:20Z","title":"Exploring Contextual Representation and Multi-Modality for End-to-End\n Autonomous Driving","summary":" Learning contextual and spatial environmental representations enhances\nautonomous vehicle's hazard anticipation and decision-making in complex\nscenarios. 
Recent perception systems enhance spatial understanding with sensor\nfusion but often lack full environmental context. Humans, when driving,\nnaturally employ neural maps that integrate various factors such as historical\ndata, situational subtleties, and behavioral predictions of other road users to\nform a rich contextual understanding of their surroundings. This neural\nmap-based comprehension is integral to making informed decisions on the road.\nIn contrast, even with their significant advancements, autonomous systems have\nyet to fully harness this depth of human-like contextual understanding.\nMotivated by this, our work draws inspiration from human driving patterns and\nseeks to formalize the sensor fusion approach within an end-to-end autonomous\ndriving framework. We introduce a framework that integrates three cameras\n(left, right, and center) to emulate the human field of view, coupled with\ntop-down bird's-eye-view semantic data to enhance contextual representation.\nThe sensor data is fused and encoded using a self-attention mechanism, leading\nto an auto-regressive waypoint prediction module. We treat feature\nrepresentation as a sequential problem, employing a vision transformer to\ndistill the contextual interplay between sensor modalities. The efficacy of the\nproposed method is experimentally evaluated in both open- and closed-loop\nsettings. Our method achieves a displacement error of 0.67 m in open-loop\nsettings, surpassing current methods by 6.9% on the nuScenes dataset. In\nclosed-loop evaluations on CARLA's Town05 Long and Longest6 benchmarks, the\nproposed method enhances driving performance and route completion, and reduces\ninfractions.\n","authors":["Shoaib Azam","Farzeen Munir","Ville Kyrki","Moongu Jeon","Witold Pedrycz"],"pdf_url":"https://arxiv.org/pdf/2210.06758v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08886v1","updated":"2024-01-16T23:45:14Z","published":"2024-01-16T23:45:14Z","title":"RiemannONets: Interpretable Neural Operators for Riemann Problems","summary":" Developing the proper representations for simulating high-speed flows with\nstrong shock waves, rarefactions, and contact discontinuities has been a\nlong-standing question in numerical analysis. Herein, we employ neural\noperators to solve Riemann problems encountered in compressible flows for\nextreme pressure jumps (up to $10^{10}$ pressure ratio). In particular, we\nfirst consider the DeepONet that we train in a two-stage process, following the\nrecent work of Lee and Shin, wherein, in the first stage, a basis is extracted\nfrom the trunk net, which is orthonormalized and subsequently is used in the\nsecond stage in training the branch net. This simple modification of DeepONet\nhas a profound effect on its accuracy, efficiency, and robustness and leads to\nvery accurate solutions to Riemann problems compared to the vanilla version. It\nalso enables us to interpret the results physically as the hierarchical\ndata-driven produced basis reflects all the flow features that would otherwise\nbe introduced using ad hoc feature expansion layers. We also compare the\nresults with another neural operator based on the U-Net for low, intermediate,\nand very high pressure ratios, which is very accurate for Riemann problems,\nespecially for large pressure ratios, due to its multiscale nature but is\ncomputationally more expensive.
Overall, our study demonstrates that simple neural network\narchitectures, if properly pre-trained, can achieve very accurate solutions of\nRiemann problems for real-time forecasting.\n","authors":["Ahmad Peyvan","Vivek Oommen","Ameya D. Jagtap","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2401.08886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08876v1","updated":"2024-01-16T23:19:30Z","published":"2024-01-16T23:19:30Z","title":"Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image\n Labeling","summary":" As deep neural networks are more commonly deployed in high-stakes domains,\ntheir lack of interpretability makes uncertainty quantification challenging. We\ninvestigate the effects of presenting conformal prediction sets (a method for\ngenerating valid confidence sets in distribution-free uncertainty\nquantification) to express uncertainty in AI-advised decision-making. Through a\nlarge pre-registered experiment, we compare the utility of conformal prediction\nsets to displays of Top-1 and Top-k predictions for AI-advised image labeling.\nWe find that the utility of prediction sets for accuracy varies with the\ndifficulty of the task: while they result in accuracy on par with or less than\nTop-1 and Top-k displays for easy images, prediction sets excel at assisting\nhumans in labeling out-of-distribution (OOD) images, especially when the set\nsize is small. Our results empirically pinpoint the practical challenges of\nconformal prediction sets and provide implications on how to incorporate them\nfor real-world decision-making.\n","authors":["Dongping Zhang","Angelos Chatzimparmpas","Negar Kamali","Jessica Hullman"],"pdf_url":"https://arxiv.org/pdf/2401.08876v1.pdf","comment":"28 pages, 11 figures, 8 tables"},{"id":"http://arxiv.org/abs/2401.08875v1","updated":"2024-01-16T23:16:18Z","published":"2024-01-16T23:16:18Z","title":"DCRMTA: Unbiased Causal Representation for Multi-touch Attribution","summary":" Multi-touch attribution (MTA) currently plays a pivotal role in achieving a\nfair estimation of the contributions of each advertising touchpoint towards\nconversion behavior, deeply influencing budget allocation and advertising\nrecommendation. Traditional multi-touch attribution methods initially build a\nconversion prediction model, anticipating learning the inherent relationship\nbetween touchpoint sequences and user purchasing behavior through historical\ndata. Based on this, counterfactual touchpoint sequences are constructed from\nthe original sequence subset, and conversions are estimated using the\nprediction model, thus calculating advertising contributions. A covert\nassumption of these methods is the unbiased nature of conversion prediction\nmodels. However, due to confounding factors arising from user preferences and\ninternet recommendation mechanisms, such as homogenization of ad\nrecommendations resulting from past shopping records, bias can easily occur in\nconversion prediction models trained on observational data. This paper\nredefines the causal effect of user features on conversions and proposes a\nnovel end-to-end approach, Deep Causal Representation for MTA (DCRMTA). Our\nmodel, while eliminating confounding variables, extracts features with causal\nrelations to conversions from users.
Furthermore, extensive experiments on\nboth synthetic and real-world Criteo data demonstrate DCRMTA's superior\nperformance in conversion prediction across varying data distributions, while\nalso effectively attributing value across different advertising channels.\n","authors":["Jiaming Tang"],"pdf_url":"https://arxiv.org/pdf/2401.08875v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.03506v2","updated":"2024-01-16T23:12:55Z","published":"2024-01-07T14:54:57Z","title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language\n Models","summary":" In this paper, we introduce DiarizationLM, a framework to leverage large\nlanguage models (LLM) to post-process the outputs from a speaker diarization\nsystem. Various goals can be achieved with the proposed framework, such as\nimproving the readability of the diarized transcript, or reducing the word\ndiarization error rate (WDER). In this framework, the outputs of the automatic\nspeech recognition (ASR) and speaker diarization systems are represented as a\ncompact textual format, which is included in the prompt to an optionally\nfinetuned LLM. The outputs of the LLM can be used as the refined diarization\nresults with the desired enhancement. As a post-processing step, this framework\ncan be easily applied to any off-the-shelf ASR and speaker diarization systems\nwithout retraining existing components. Our experiments show that a finetuned\nPaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone\nconversation dataset, and rel. 44.9% on the Callhome English dataset.\n","authors":["Quan Wang","Yiling Huang","Guanlong Zhao","Evan Clark","Wei Xia","Hank Liao"],"pdf_url":"https://arxiv.org/pdf/2401.03506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08867v1","updated":"2024-01-16T22:44:12Z","published":"2024-01-16T22:44:12Z","title":"MambaTab: A Simple Yet Effective Approach for Handling Tabular Data","summary":" Tabular data remains ubiquitous across domains despite growing use of images\nand texts for machine learning. While deep learning models like convolutional\nneural networks and transformers achieve strong performance on tabular data,\nthey require extensive data preprocessing, tuning, and resources, limiting\naccessibility and scalability. This work develops an innovative approach based\non a structured state-space model (SSM), MambaTab, for tabular data. SSMs have\nstrong capabilities for efficiently extracting effective representations from\ndata with long-range dependencies. MambaTab leverages Mamba, an emerging SSM\nvariant, for end-to-end supervised learning on tables. Compared to\nstate-of-the-art baselines, MambaTab delivers superior performance while\nrequiring significantly fewer parameters and minimal preprocessing, as\nempirically validated on diverse benchmark datasets.
MambaTab's efficiency,\nscalability, generalizability, and predictive gains signify it as a\nlightweight, \"out-of-the-box\" solution for diverse tabular data with promise\nfor enabling wider practical applications.\n","authors":["Md Atik Ahamed","Qiang Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.08867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08865v1","updated":"2024-01-16T22:36:23Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v1.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2401.08864v1","updated":"2024-01-16T22:36:12Z","published":"2024-01-16T22:36:12Z","title":"Binaural Angular Separation Network","summary":" We propose a neural network model that can separate target speech sources\nfrom interfering sources at different angular regions using two microphones.\nThe model is trained with simulated room impulse responses (RIRs) using\nomni-directional microphones without needing to collect real RIRs. By relying\non specific angular regions and multiple room simulations, the model utilizes\nconsistent time difference of arrival (TDOA) cues, or what we call delay\ncontrast, to separate target and interference sources while remaining robust in\nvarious reverberation environments. 
We demonstrate the model is not only\ngeneralizable to a commercially available device with a slightly different\nmicrophone geometry, but also outperforms our previous work which uses one\nadditional microphone on the same device. The model runs in real-time on-device\nand is suitable for low-latency streaming applications such as telephony and\nvideo conferencing.\n","authors":["Yang Yang","George Sung","Shao-Fu Shih","Hakan Erdogan","Chehung Lee","Matthias Grundmann"],"pdf_url":"https://arxiv.org/pdf/2401.08864v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08863v1","updated":"2024-01-16T22:35:14Z","published":"2024-01-16T22:35:14Z","title":"Robust Localization of Key Fob Using Channel Impulse Response of Ultra\n Wide Band Sensors for Keyless Entry Systems","summary":" Using neural networks for localization of key fob within and surrounding a\ncar as a security feature for keyless entry is fast emerging. In this paper we\nstudy: 1) the performance of pre-computed features of neural networks based UWB\n(ultra wide band) localization classification forming the baseline of our\nexperiments. 2) Investigate the inherent robustness of various neural networks;\ntherefore, we include the study of robustness of the adversarial examples\nwithout any adversarial training in this work. 3) Propose a multi-head\nself-supervised neural network architecture which outperforms the baseline\nneural networks without any adversarial training. The model's performance\nimproved by 67% at certain ranges of adversarial magnitude for fast gradient\nsign method and 37% each for basic iterative method and projected gradient\ndescent method.\n","authors":["Abhiram Kolli","Filippo Casamassima","Horst Possegger","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2401.08863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08861v1","updated":"2024-01-16T22:23:27Z","published":"2024-01-16T22:23:27Z","title":"Semi-Supervised Learning Approach for Efficient Resource Allocation with\n Network Slicing in O-RAN","summary":" The Open Radio Access Network (O-RAN) technology has emerged as a promising\nsolution for network operators, providing them with an open and favorable\nenvironment. Ensuring effective coordination of x-applications (xAPPs) is\ncrucial to enhance flexibility and optimize network performance within the\nO-RAN. In this paper, we introduce an innovative approach to the resource\nallocation problem, aiming to coordinate multiple independent xAPPs for network\nslicing and resource allocation in O-RAN. Our proposed method focuses on\nmaximizing the weighted throughput among user equipments (UE), as well as\nallocating physical resource blocks (PRBs). We prioritize two service types,\nnamely enhanced Mobile Broadband and Ultra Reliable Low Latency Communication.\nTo achieve this, we have designed two xAPPs: a power control xAPP for each UE\nand a PRB allocation xAPP. The proposed method consists of a two-part training\nphase, where the first part uses supervised learning with a Variational\nAutoencoder trained to regress the power transmission as well as the user\nassociation and PRB allocation decisions, and the second part uses unsupervised\nlearning with a contrastive loss approach to improve the generalization and\nrobustness of the model. We evaluate the performance of our proposed method by\ncomparing its results to those obtained from an exhaustive search algorithm,\ndeep Q-network algorithm, and by reporting performance metrics for the\nregression task. 
We also evaluate the proposed model's performance in different\nscenarios among the service types. The results show that the proposed method is\na more efficient and effective solution for network slicing problems compared\nto state-of-the-art methods.\n","authors":["Salar Nouri","Mojdeh Karbalaee Motalleb","Vahid Shah-Mansouri","Seyed Pooya Shariatpanahi"],"pdf_url":"https://arxiv.org/pdf/2401.08861v1.pdf","comment":"Submitted to IEEE Transactions on Network and Service Management"},{"id":"http://arxiv.org/abs/2401.08859v1","updated":"2024-01-16T22:20:36Z","published":"2024-01-16T22:20:36Z","title":"Shabari: Delayed Decision-Making for Faster and Efficient Serverless\n Function","summary":" Serverless computing relieves developers from the burden of resource\nmanagement, thus providing ease-of-use to the users and the opportunity to\noptimize resource utilization for the providers. However, today's serverless\nsystems lack performance guarantees for function invocations, thus limiting\nsupport for performance-critical applications: we observed severe performance\nvariability (up to 6x). Providers lack visibility into user functions and hence\nfind it challenging to right-size them: we observed heavy resource\nunderutilization (up to 80%). To understand the causes behind the performance\nvariability and underutilization, we conducted a measurement study of commonly\ndeployed serverless functions and learned that the function performance and\nresource utilization depend crucially on function semantics and inputs. Our key\ninsight is to delay making resource allocation decisions until after the\nfunction inputs are available. We introduce Shabari, a resource management\nframework for serverless systems that makes decisions as late as possible to\nright-size each invocation to meet functions' performance objectives (SLOs) and\nimprove resource utilization. Shabari uses an online learning agent to\nright-size each function invocation based on the features of the function input\nand makes cold-start-aware scheduling decisions. For a range of serverless\nfunctions and inputs, Shabari reduces SLO violations by 11-73% while not\nwasting any vCPUs and reducing wasted memory by 64-94% in the median case,\ncompared to state-of-the-art systems, including Aquatope, Parrotfish, and\nCypress.\n","authors":["Prasoon Sinha","Kostis Kaffes","Neeraja J. Yadwadkar"],"pdf_url":"https://arxiv.org/pdf/2401.08859v1.pdf","comment":"17 pages, 14 figures"},{"id":"http://arxiv.org/abs/2308.12252v2","updated":"2024-01-16T21:58:37Z","published":"2023-08-23T17:01:53Z","title":"How Safe Am I Given What I See? Calibrated Prediction of Safety Chances\n for Image-Controlled Autonomy","summary":" End-to-end learning has emerged as a major paradigm for developing autonomous\nsystems. Unfortunately, with its performance and convenience comes an even\ngreater challenge of safety assurance. A key factor of this challenge is the\nabsence of the notion of a low-dimensional and interpretable dynamical state,\naround which traditional assurance methods revolve. Focusing on the online\nsafety prediction problem, this paper proposes a configurable family of\nlearning pipelines based on generative world models, which do not require\nlow-dimensional states. To implement these pipelines, we overcome the\nchallenges of learning safety-informed latent representations and missing\nsafety labels under prediction-induced distribution shift. 
These pipelines come\nwith statistical calibration guarantees on their safety chance predictions\nbased on conformal prediction. We perform an extensive evaluation of the\nproposed learning pipelines on two case studies of image-controlled systems: a\nracing car and a cartpole.\n","authors":["Zhenjiang Mao","Carson Sobolewski","Ivan Ruchkin"],"pdf_url":"https://arxiv.org/pdf/2308.12252v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08851v1","updated":"2024-01-16T21:56:27Z","published":"2024-01-16T21:56:27Z","title":"Using i-vectors for subject-independent cross-session EEG transfer\n learning","summary":" Cognitive load classification is the task of automatically determining an\nindividual's utilization of working memory resources during performance of a\ntask based on physiologic measures such as electroencephalography (EEG). In\nthis paper, we follow a cross-disciplinary approach, where tools and\nmethodologies from speech processing are used to tackle this problem. The\ncorpus we use was released publicly in 2021 as part of the first passive\nbrain-computer interface competition on cross-session workload estimation. We\npresent our approach which used i-vector-based neural network classifiers to\naccomplish inter-subject cross-session EEG transfer learning, achieving 18%\nrelative improvement over equivalent subject-dependent models. We also report\nexperiments showing how our subject-independent models perform competitively on\nheld-out subjects and improve with additional subject data, suggesting that\nsubject-dependent training is not required for effective cognitive load\ndetermination.\n","authors":["Jonathan Lasko","Jeff Ma","Mike Nicoletti","Jonathan Sussman-Fort","Sooyoung Jeong","William Hartmann"],"pdf_url":"https://arxiv.org/pdf/2401.08851v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2401.08850v1","updated":"2024-01-16T21:47:23Z","published":"2024-01-16T21:47:23Z","title":"REValueD: Regularised Ensemble Value-Decomposition for Factorisable\n Markov Decision Processes","summary":" Discrete-action reinforcement learning algorithms often falter in tasks with\nhigh-dimensional discrete action spaces due to the vast number of possible\nactions. A recent advancement leverages value-decomposition, a concept from\nmulti-agent reinforcement learning, to tackle this challenge. This study delves\ndeep into the effects of this value-decomposition, revealing that whilst it\ncurtails the over-estimation bias inherent to Q-learning algorithms, it\namplifies target variance. To counteract this, we present an ensemble of\ncritics to mitigate target variance. Moreover, we introduce a regularisation\nloss that helps to mitigate the effects that exploratory actions in one\ndimension can have on the value of optimal actions in other dimensions. Our\nnovel algorithm, REValueD, tested on discretised versions of the DeepMind\nControl Suite tasks, showcases superior performance, especially in the\nchallenging humanoid and dog tasks. 
We further dissect the factors influencing\nREValueD's performance, evaluating the significance of the regularisation loss\nand the scalability of REValueD with increasing sub-actions per dimension.\n","authors":["David Ireland","Giovanni Montana"],"pdf_url":"https://arxiv.org/pdf/2401.08850v1.pdf","comment":"To appear in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08847v1","updated":"2024-01-16T21:45:08Z","published":"2024-01-16T21:45:08Z","title":"RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and\n Efficiency Assessment of Medical Image Segmentation Models","summary":" Deep learning techniques, despite their potential, often suffer from a lack\nof reproducibility and generalizability, impeding their clinical adoption.\nImage segmentation is one of the critical tasks in medical image analysis, in\nwhich one or several regions/volumes of interest should be annotated. This\npaper introduces the RIDGE checklist, a framework for assessing the\nReproducibility, Integrity, Dependability, Generalizability, and Efficiency of\ndeep learning-based medical image segmentation models. The checklist serves as\na guide for researchers to enhance the quality and transparency of their work,\nensuring that segmentation models are not only scientifically sound but also\nclinically relevant.\n","authors":["Farhad Maleki","Linda Moy","Reza Forghani","Tapotosh Ghosh","Katie Ovens","Steve Langer","Pouria Rouzrokh","Bardia Khosravi","Ali Ganjizadeh","Daniel Warren","Roxana Daneshjou","Mana Moassefi","Atlas Haddadi Avval","Susan Sotardi","Neil Tenenholtz","Felipe Kitamura","Timothy Kline"],"pdf_url":"https://arxiv.org/pdf/2401.08847v1.pdf","comment":"20 pages, 1 Figure, 1 Table"},{"id":"http://arxiv.org/abs/2401.08830v1","updated":"2024-01-16T21:07:04Z","published":"2024-01-16T21:07:04Z","title":"Stochastic Subnetwork Annealing: A Regularization Technique for Fine\n Tuning Pruned Subnetworks","summary":" Pruning methods have recently grown in popularity as an effective way to\nreduce the size and computational complexity of deep neural networks. Large\nnumbers of parameters can be removed from trained models with little\ndiscernible loss in accuracy after a small number of continued training epochs.\nHowever, pruning too many parameters at once often causes an initial steep drop\nin accuracy which can undermine convergence quality. Iterative pruning\napproaches mitigate this by gradually removing a small number of parameters\nover multiple epochs. However, this can still lead to subnetworks that overfit\nlocal regions of the loss landscape. We introduce a novel and effective\napproach to tuning subnetworks through a regularization technique we call\nStochastic Subnetwork Annealing. Instead of removing parameters in a discrete\nmanner, we instead represent subnetworks with stochastic masks where each\nparameter has a probabilistic chance of being included or excluded on any given\nforward pass. 
We anneal these probabilities over time such that subnetwork\nstructure slowly evolves as mask values become more deterministic, allowing for\na smoother and more robust optimization of subnetworks at high levels of\nsparsity.\n","authors":["Tim Whitaker","Darrell Whitley"],"pdf_url":"https://arxiv.org/pdf/2401.08830v1.pdf","comment":"9 pages, 2 figures; Rejected at ICLR-2024; Revised and updated with\n new experiments; Submitted to WCCI-2024"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.08194v1","updated":"2024-01-16T08:16:10Z","published":"2024-01-16T08:16:10Z","title":"End-to-End Optimized Image Compression with the Frequency-Oriented\n Transform","summary":" Image compression constitutes a significant challenge amidst the era of\ninformation explosion. Recent studies employing deep learning methods have\ndemonstrated the superior performance of learning-based image compression\nmethods over traditional codecs. However, an inherent challenge associated with\nthese methods lies in their lack of interpretability. Following an analysis of\nthe varying degrees of compression degradation across different frequency\nbands, we propose the end-to-end optimized image compression model facilitated\nby the frequency-oriented transform. The proposed end-to-end image compression\nmodel consists of four components: spatial sampling, frequency-oriented\ntransform, entropy estimation, and frequency-aware fusion. The\nfrequency-oriented transform separates the original image signal into distinct\nfrequency bands, aligning with the human-interpretable concept. Leveraging the\nnon-overlapping hypothesis, the model enables scalable coding through the\nselective transmission of arbitrary frequency components. Extensive experiments\nare conducted to demonstrate that our model outperforms all traditional codecs\nincluding next-generation standard H.266/VVC on MS-SSIM metric. Moreover,\nvisual analysis tasks (i.e., object detection and semantic segmentation) are\nconducted to verify the proposed compression method could preserve semantic\nfidelity besides signal-level precision.\n","authors":["Yuefeng Zhang","Kai Lin"],"pdf_url":"https://arxiv.org/pdf/2401.08194v1.pdf","comment":"25 pages, accepted by MVAP"},{"id":"http://arxiv.org/abs/2311.05920v2","updated":"2024-01-16T06:33:46Z","published":"2023-11-10T08:09:42Z","title":"Understanding How People with Binge Eating Disorder and Bulimia Interact\n with Digital Food Content","summary":" A large body of research has focused on understanding how online content and\ndisordered eating behaviors are associated. However, there is a lack of\ncomprehensive studies investigating digital food content's influence on\nindividuals with eating disorders. We conducted two rounds of studies (N=23 and\n22, respectively) with individuals with binge eating disorder (BED) or bulimia\nnervosa (BN) to understand their motivations and practices of consuming digital\nfood content. Our study reveals that individuals with BED and BN anticipate\npositive effects from food media to overcome their condition, but in practice,\nit often exacerbates their disorder. We also discovered that many individuals\nhave experienced a cycle of quitting and returning to digital food content\nconsumption. 
Based on these findings, we articulate design implications for\ndigital food content and multimedia platforms to support vulnerable individuals\nin everyday online platform interactions.\n","authors":["Ryuhaerang Choi","Subin Park","Sujin Han","Sung-Ju Lee"],"pdf_url":"https://arxiv.org/pdf/2311.05920v2.pdf","comment":"28 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.08117v1","updated":"2024-01-16T05:10:50Z","published":"2024-01-16T05:10:50Z","title":"E2HQV: High-Quality Video Generation from Event Camera via\n Theory-Inspired Model-Aided Deep Learning","summary":" The bio-inspired event cameras or dynamic vision sensors are capable of\nasynchronously capturing per-pixel brightness changes (called event-streams) in\nhigh temporal resolution and high dynamic range. However, the non-structural\nspatial-temporal event-streams make it challenging for providing intuitive\nvisualization with rich semantic information for human vision. It calls for\nevents-to-video (E2V) solutions which take event-streams as input and generate\nhigh quality video frames for intuitive visualization. However, current\nsolutions are predominantly data-driven without considering the prior knowledge\nof the underlying statistics relating event-streams and video frames. It highly\nrelies on the non-linearity and generalization capability of the deep neural\nnetworks, thus, is struggling on reconstructing detailed textures when the\nscenes are complex. In this work, we propose \\textbf{E2HQV}, a novel E2V\nparadigm designed to produce high-quality video frames from events. This\napproach leverages a model-aided deep learning framework, underpinned by a\ntheory-inspired E2V model, which is meticulously derived from the fundamental\nimaging principles of event cameras. To deal with the issue of state-reset in\nthe recurrent components of E2HQV, we also design a temporal shift embedding\nmodule to further improve the quality of the video frames. Comprehensive\nevaluations on the real world event camera datasets validate our approach, with\nE2HQV, notably outperforming state-of-the-art approaches, e.g., surpassing the\nsecond best by over 40\\% for some evaluation metrics.\n","authors":["Qiang Qu","Yiran Shen","Xiaoming Chen","Yuk Ying Chung","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08117v1.pdf","comment":"Accepted in AAAI2024"},{"id":"http://arxiv.org/abs/2401.08107v1","updated":"2024-01-16T04:28:09Z","published":"2024-01-16T04:28:09Z","title":"Deep Shape-Texture Statistics for Completely Blind Image Quality\n Evaluation","summary":" Opinion-Unaware Blind Image Quality Assessment (OU-BIQA) models aim to\npredict image quality without training on reference images and subjective\nquality scores. Thereinto, image statistical comparison is a classic paradigm,\nwhile the performance is limited by the representation ability of visual\ndescriptors. Deep features as visual descriptors have advanced IQA in recent\nresearch, but they are discovered to be highly texture-biased and lack of\nshape-bias. On this basis, we find out that image shape and texture cues\nrespond differently towards distortions, and the absence of either one results\nin an incomplete image representation. Therefore, to formulate a well-round\nstatistical description for images, we utilize the shapebiased and\ntexture-biased deep features produced by Deep Neural Networks (DNNs)\nsimultaneously. 
More specifically, we design a Shape-Texture Adaptive Fusion\n(STAF) module to merge shape and texture information, based on which we\nformulate qualityrelevant image statistics. The perceptual quality is\nquantified by the variant Mahalanobis Distance between the inner and outer\nShape-Texture Statistics (DSTS), wherein the inner and outer statistics\nrespectively describe the quality fingerprints of the distorted image and\nnatural images. The proposed DSTS delicately utilizes shape-texture statistical\nrelations between different data scales in the deep domain, and achieves\nstate-of-the-art (SOTA) quality prediction performance on images with\nartificial and authentic distortions.\n","authors":["Yixuan Li","Peilin Chen","Hanwei Zhu","Keyan Ding","Leida Li","Shiqi Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03201v2","updated":"2024-01-16T16:39:57Z","published":"2024-01-06T12:20:18Z","title":"3DMIT: 3D Multi-modal Instruction Tuning for Scene Understanding","summary":" The remarkable potential of multi-modal large language models (MLLMs) in\ncomprehending both vision and language information has been widely\nacknowledged. However, the scarcity of 3D scenes-language pairs in comparison\nto their 2D counterparts, coupled with the inadequacy of existing approaches in\nunderstanding of 3D scenes by LLMs, poses a significant challenge. In response,\nwe collect and construct an extensive dataset comprising 75K\ninstruction-response pairs tailored for 3D scenes. This dataset addresses tasks\nrelated to 3D VQA, 3D grounding, and 3D conversation. To further enhance the\nintegration of 3D spatial information into LLMs, we introduce a novel and\nefficient prompt tuning paradigm, 3DMIT. This paradigm eliminates the alignment\nstage between 3D scenes and language and extends the instruction prompt with\nthe 3D modality information including the entire scene and segmented objects.\nWe evaluate the effectiveness of our method across diverse tasks in the 3D\nscene domain and find that our approach serves as a strategic means to enrich\nLLMs' comprehension of the 3D world. Our code is available at\nhttps://github.com/staymylove/3DMIT.\n","authors":["Zeju Li","Chao Zhang","Xiaoyan Wang","Ruilong Ren","Yifan Xu","Ruifei Ma","Xiangde Liu"],"pdf_url":"https://arxiv.org/pdf/2401.03201v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.08449v1","updated":"2024-01-16T15:51:45Z","published":"2024-01-16T15:51:45Z","title":"CLIPRerank: An Extremely Simple Method for Improving Ad-hoc Video Search","summary":" Ad-hoc Video Search (AVS) enables users to search for unlabeled video content\nusing on-the-fly textual queries. Current deep learning-based models for AVS\nare trained to optimize holistic similarity between short videos and their\nassociated descriptions. However, due to the diversity of ad-hoc queries, even\nfor a short video, its truly relevant part w.r.t. a given query can be of\nshorter duration. In such a scenario, the holistic similarity becomes\nsuboptimal. To remedy the issue, we propose in this paper CLIPRerank, a\nfine-grained re-scoring method. We compute cross-modal similarities between\nquery and video frames using a pre-trained CLIP model, with multi-frame scores\naggregated by max pooling. The fine-grained score is weightedly added to the\ninitial score for search result reranking. 
As such, CLIPRerank is agnostic to\nthe underlying video retrieval models and extremely simple, making it a handy\nplug-in for boosting AVS. Experiments on the challenging TRECVID AVS benchmarks\n(from 2016 to 2021) justify the effectiveness of the proposed strategy.\nCLIPRerank consistently improves the TRECVID top performers and multiple\nexisting models including SEA, W2VV++, Dual Encoding, Dual Task, LAFF,\nCLIP2Video, TS2-Net and X-CLIP. Our method also works when substituting BLIP-2\nfor CLIP.\n","authors":["Aozhu Chen","Fangming Zhou","Ziyuan Wang","Xirong Li"],"pdf_url":"https://arxiv.org/pdf/2401.08449v1.pdf","comment":"Accepted by ICASSP 2024"}]},"2024-01-15T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.11554v4","updated":"2024-01-15T23:52:21Z","published":"2023-05-19T09:54:21Z","title":"ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via\n Tool Embeddings","summary":" Augmenting large language models (LLMs) with external tools has emerged as a\npromising approach to solving complex problems. However, traditional methods,\nwhich finetune LLMs with tool demonstration data, can be both costly and\nrestricted to a predefined set of tools. Recent in-context learning paradigm\nalleviates these issues, but the limited context length only allows for a few\nshots of demonstrations, leading to suboptimal understandings of the tools.\nMoreover, when there are numerous tools to choose from, in-context learning\ncould completely fail to work. In this paper, we propose an alternative\napproach, $\\textbf{ToolkenGPT}$, which combines the benefits of both sides. Our\napproach represents each $\\underline{tool}$ as a to$\\underline{ken}$\n($\\textit{toolken}$) and learns an embedding for it, enabling tool calls in the\nsame way as generating a regular word token. Once a toolken is triggered, the\nLLM is prompted to complete arguments for the tool to execute. ToolkenGPT\noffers the flexibility to plug in an arbitrary number of tools by expanding the\nset of toolkens on the fly. In addition, it improves tool use by allowing\nextensive demonstration data for learning the toolken embeddings. In diverse\ndomains, including numerical reasoning, knowledge-based question answering, and\nembodied plan generation, our approach effectively augments LLMs with tools and\nsubstantially outperforms various latest baselines. ToolkenGPT demonstrates the\npromising ability to use relevant tools from a large tool set in complex\nscenarios.\n","authors":["Shibo Hao","Tianyang Liu","Zhen Wang","Zhiting Hu"],"pdf_url":"https://arxiv.org/pdf/2305.11554v4.pdf","comment":"NeurIPS 2023 (oral). Code: https://github.com/Ber666/ToolkenGPT"},{"id":"http://arxiv.org/abs/2401.07994v1","updated":"2024-01-15T22:36:31Z","published":"2024-01-15T22:36:31Z","title":"A Novel Approach for Automatic Program Repair using Round-Trip\n Translation with Large Language Models","summary":" Research shows that grammatical mistakes in a sentence can be corrected by\ntranslating it to another language and back using neural machine translation\nwith language models. We investigate whether this correction capability of\nLarge Language Models (LLMs) extends to Automatic Program Repair (APR). Current\ngenerative models for APR are pre-trained on source code and fine-tuned for\nrepair. This paper proposes bypassing the fine-tuning step and using Round-Trip\nTranslation (RTT): translation of code from one programming language to another\nprogramming or natural language, and back. 
We hypothesize that RTT with LLMs\nrestores the most commonly seen patterns in code during pre-training, i.e.,\nperforms a regression toward the mean, which removes bugs as they are a form of\nnoise w.r.t. the more frequent, natural, bug-free code in the training data. To\ntest this hypothesis, we employ eight recent LLMs pre-trained on code,\nincluding the latest GPT versions, and four common program repair benchmarks in\nJava. We find that RTT with English as an intermediate language repaired 101 of\n164 bugs with GPT-4 on the HumanEval-Java dataset. Moreover, 46 of these are\nunique bugs that are not repaired by other LLMs fine-tuned for APR. Our\nfindings highlight the viability of round-trip translation with LLMs as a\ntechnique for automated program repair and its potential for research in\nsoftware engineering.\n Keywords: automated program repair, large language model, machine translation\n","authors":["Fernando Vallecillos Ruiz","Anastasiia Grishina","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2401.07994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13304v2","updated":"2024-01-15T21:54:28Z","published":"2023-07-25T07:44:06Z","title":"QuIP: 2-Bit Quantization of Large Language Models With Guarantees","summary":" This work studies post-training parameter quantization in large language\nmodels (LLMs). We introduce quantization with incoherence processing (QuIP), a\nnew method based on the insight that quantization benefits from\n$\\textit{incoherent}$ weight and Hessian matrices, i.e., from the weights being\neven in magnitude and the directions in which it is important to round them\naccurately being unaligned with the coordinate axes. QuIP consists of two\nsteps: (1) an adaptive rounding procedure minimizing a quadratic proxy\nobjective; (2) efficient pre- and post-processing that ensures weight and\nHessian incoherence via multiplication by random orthogonal matrices. We\ncomplement QuIP with the first theoretical analysis for an LLM-scale\nquantization algorithm, and show that our theory also applies to an existing\nmethod, OPTQ. Empirically, we find that our incoherence preprocessing improves\nseveral existing quantization algorithms and yields the first LLM quantization\nmethods that produce viable results using only two bits per weight. Our code\ncan be found at https://github.com/Cornell-RelaxML/QuIP.\n","authors":["Jerry Chee","Yaohui Cai","Volodymyr Kuleshov","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2307.13304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07977v1","updated":"2024-01-15T21:43:46Z","published":"2024-01-15T21:43:46Z","title":"Leveraging External Knowledge Resources to Enable Domain-Specific\n Comprehension","summary":" Machine Reading Comprehension (MRC) has been a long-standing problem in NLP\nand, with the recent introduction of the BERT family of transformer based\nlanguage models, it has come a long way to getting solved. Unfortunately,\nhowever, when BERT variants trained on general text corpora are applied to\ndomain-specific text, their performance inevitably degrades on account of the\ndomain shift i.e. genre/subject matter discrepancy between the training and\ndownstream application data. Knowledge graphs act as reservoirs for either open\nor closed domain information and prior studies have shown that they can be used\nto improve the performance of general-purpose transformers in domain-specific\napplications. 
Building on existing work, we introduce a method using\nMulti-Layer Perceptrons (MLPs) for aligning and integrating embeddings\nextracted from knowledge graphs with the embeddings spaces of pre-trained\nlanguage models (LMs). We fuse the aligned embeddings with open-domain LMs BERT\nand RoBERTa, and fine-tune them for two MRC tasks namely span detection\n(COVID-QA) and multiple-choice questions (PubMedQA). On the COVID-QA dataset,\nwe see that our approach allows these models to perform similar to their\ndomain-specific counterparts, Bio/Sci-BERT, as evidenced by the Exact Match\n(EM) metric. With regards to PubMedQA, we observe an overall improvement in\naccuracy while the F1 stays relatively the same over the domain-specific\nmodels.\n","authors":["Saptarshi Sengupta","Connor Heaton","Prasenjit Mitra","Soumalya Sarkar"],"pdf_url":"https://arxiv.org/pdf/2401.07977v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07967v1","updated":"2024-01-15T21:10:19Z","published":"2024-01-15T21:10:19Z","title":"MCMChaos: Improvising Rap Music with MCMC Methods and Chaos Theory","summary":" A novel freestyle rap software, MCMChaos 0.0.1, based on rap music\ntranscriptions created in previous research is presented. The software has\nthree different versions, each making use of different mathematical simulation\nmethods: collapsed gibbs sampler and lorenz attractor simulation. As far as we\nknow, these simulation methods have never been used in rap music generation\nbefore. The software implements Python Text-to-Speech processing (pyttxs) to\nconvert text wrangled from the MCFlow corpus into English speech. In each\nversion, values simulated from each respective mathematical model alter the\nrate of speech, volume, and (in the multiple voice case) the voice of the\ntext-to-speech engine on a line-by-line basis. The user of the software is\npresented with a real-time graphical user interface (GUI) which instantaneously\nchanges the initial values read into the mathematical simulation methods.\nFuture research might attempt to allow for more user control and autonomy.\n","authors":["Robert G. Kimelman"],"pdf_url":"https://arxiv.org/pdf/2401.07967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07964v1","updated":"2024-01-15T21:06:20Z","published":"2024-01-15T21:06:20Z","title":"AI-as-exploration: Navigating intelligence space","summary":" Artificial Intelligence is a field that lives many lives, and the term has\ncome to encompass a motley collection of scientific and commercial endeavours.\nIn this paper, I articulate the contours of a rather neglected but central\nscientific role that AI has to play, which I dub `AI-as-exploration'.The basic\nthrust of AI-as-exploration is that of creating and studying systems that can\nreveal candidate building blocks of intelligence that may differ from the forms\nof human and animal intelligence we are familiar with. In other words, I\nsuggest that AI is one of the best tools we have for exploring intelligence\nspace, namely the space of possible intelligent systems. I illustrate the value\nof AI-as-exploration by focusing on a specific case study, i.e., recent work on\nthe capacity to combine novel and invented concepts in humans and Large\nLanguage Models. 
I show that the latter, despite showing human-level accuracy\nin such a task, most probably solve it in ways radically different, but no less\nrelevant to intelligence research, to those hypothesised for humans.\n","authors":["Dimitri Coelho Mollo"],"pdf_url":"https://arxiv.org/pdf/2401.07964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07955v1","updated":"2024-01-15T20:42:16Z","published":"2024-01-15T20:42:16Z","title":"A Study on Large Language Models' Limitations in Multiple-Choice\n Question Answering","summary":" The widespread adoption of Large Language Models (LLMs) has become\ncommonplace, particularly with the emergence of open-source models. More\nimportantly, smaller models are well-suited for integration into consumer\ndevices and are frequently employed either as standalone solutions or as\nsubroutines in various AI tasks. Despite their ubiquitous use, there is no\nsystematic analysis of their specific capabilities and limitations. In this\nstudy, we tackle one of the most widely used tasks - answering Multiple Choice\nQuestion (MCQ). We analyze 26 small open-source models and find that 65% of the\nmodels do not understand the task, only 4 models properly select an answer from\nthe given choices, and only 5 of these models are choice order independent.\nThese results are rather alarming given the extensive use of MCQ tests with\nthese models. We recommend exercising caution and testing task understanding\nbefore using MCQ to evaluate LLMs in any field whatsoever.\n","authors":["Aisha Khatun","Daniel G. Brown"],"pdf_url":"https://arxiv.org/pdf/2401.07955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07950v1","updated":"2024-01-15T20:22:21Z","published":"2024-01-15T20:22:21Z","title":"SciGLM: Training Scientific Language Models with Self-Reflective\n Instruction Annotation and Tuning","summary":" \\label{sec:abstract} Large Language Models (LLMs) have shown promise in\nassisting scientific discovery. However, such applications are currently\nlimited by LLMs' deficiencies in understanding intricate scientific concepts,\nderiving symbolic equations, and solving advanced numerical calculations. To\nbridge these gaps, we introduce SciGLM, a suite of scientific language models\nable to conduct college-level scientific reasoning. Central to our approach is\na novel self-reflective instruction annotation framework to address the data\nscarcity challenge in the science domain. This framework leverages existing\nLLMs to generate step-by-step reasoning for unlabelled scientific questions,\nfollowed by a process of self-reflective critic-and-revise. Applying this\nframework, we curated SciInstruct, a diverse and high-quality dataset\nencompassing mathematics, physics, chemistry, and formal proofs. We fine-tuned\nthe ChatGLM family of language models with SciInstruct, enhancing their\ncapabilities in scientific and mathematical reasoning. Remarkably, SciGLM\nconsistently improves both the base model (ChatGLM3-6B-Base) and larger-scale\nmodels (12B and 32B), without sacrificing the language understanding\ncapabilities of the base model. This makes SciGLM a suitable foundational model\nto facilitate diverse scientific discovery tasks. 
For the benefit of the wider\nresearch community, we release SciInstruct, SciGLM, alongside a self-reflective\nframework and fine-tuning code at \\url{https://github.com/THUDM/SciGLM}.\n","authors":["Dan Zhang","Ziniu Hu","Sining Zhoubian","Zhengxiao Du","Kaiyu Yang","Zihan Wang","Yisong Yue","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2401.07950v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.07944v1","updated":"2024-01-15T20:17:31Z","published":"2024-01-15T20:17:31Z","title":"SemEval-2017 Task 4: Sentiment Analysis in Twitter using BERT","summary":" This paper uses the BERT model, which is a transformer-based architecture, to\nsolve task 4A, English Language, Sentiment Analysis in Twitter of SemEval2017.\nBERT is a very powerful large language model for classification tasks when the\namount of training data is small. For this experiment, we have used the\nBERT{\\textsubscript{\\tiny BASE}} model, which has 12 hidden layers. This model\nprovides better accuracy, precision, recall, and f1 score than the Naive Bayes\nbaseline model. It performs better in binary classification subtasks than the\nmulti-class classification subtasks. We also considered all kinds of ethical\nissues during this experiment, as Twitter data contains personal and sensible\ninformation. The dataset and code used in our experiment can be found in this\nGitHub repository.\n","authors":["Rupak Kumar Das","Dr. Ted Pedersen"],"pdf_url":"https://arxiv.org/pdf/2401.07944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07928v1","updated":"2024-01-15T19:39:29Z","published":"2024-01-15T19:39:29Z","title":"A Lexicon for Studying Radicalization in Incel Communities","summary":" Incels are an extremist online community of men who believe in an ideology\nrooted in misogyny, racism, the glorification of violence, and dehumanization.\nIn their online forums, they use an extensive, evolving cryptolect - a set of\ningroup terms that have meaning within the group, reflect the ideology,\ndemonstrate membership in the community, and are difficult for outsiders to\nunderstand. This paper presents a lexicon with terms and definitions for common\nincel root words, prefixes, and affixes. The lexicon is text-based for use in\nautomated analysis and is derived via a Qualitative Content Analysis of the\nmost frequent incel words, their structure, and their meaning on five of the\nmost active incel communities from 2016 to 2023. This lexicon will support\nfuture work examining radicalization and deradicalization/disengagement within\nthe community.\n","authors":["Emily Klein","Jennifer Golbeck"],"pdf_url":"https://arxiv.org/pdf/2401.07928v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.07927v1","updated":"2024-01-15T19:39:15Z","published":"2024-01-15T19:39:15Z","title":"Can Large Language Models Explain Themselves?","summary":" Instruction-tuned large language models (LLMs) excel at many tasks, and will\neven provide explanations for their behavior. Since these models are directly\naccessible to the public, there is a risk that convincing and wrong\nexplanations can lead to unsupported confidence in LLMs. Therefore,\ninterpretability-faithfulness of self-explanations is an important\nconsideration for AI Safety. Assessing the interpretability-faithfulness of\nthese explanations, termed self-explanations, is challenging as the models are\ntoo complex for humans to annotate what is a correct explanation. 
To address\nthis, we propose employing self-consistency checks as a measure of\nfaithfulness. For example, if an LLM says a set of words is important for\nmaking a prediction, then it should not be able to make the same prediction\nwithout these words. While self-consistency checks are a common approach to\nfaithfulness, they have not previously been applied to LLM's self-explanations.\nWe apply self-consistency checks to three types of self-explanations:\ncounterfactuals, importance measures, and redactions. Our work demonstrate that\nfaithfulness is both task and model dependent, e.g., for sentiment\nclassification, counterfactual explanations are more faithful for Llama2,\nimportance measures for Mistral, and redaction for Falcon 40B. Finally, our\nfindings are robust to prompt-variations.\n","authors":["Andreas Madsen","Sarath Chandar","Siva Reddy"],"pdf_url":"https://arxiv.org/pdf/2401.07927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07923v1","updated":"2024-01-15T19:21:08Z","published":"2024-01-15T19:21:08Z","title":"Word Boundary Information Isn't Useful for Encoder Language Models","summary":" All existing transformer-based approaches to NLP using subword tokenisation\nalgorithms encode whitespace (word boundary information) through the use of\nspecial space symbols (such as \\#\\# or \\_) forming part of tokens. These\nsymbols have been shown to a) lead to reduced morphological validity of\ntokenisations, and b) give substantial vocabulary redundancy. As such, removing\nthese symbols has been shown to have a beneficial effect on the processing of\nmorphologically complex words for transformer encoders in the pretrain-finetune\nparadigm. In this work, we explore whether word boundary information is at all\nuseful to such models. In particular, we train transformer encoders across four\ndifferent training scales, and investigate several alternative approaches to\nincluding word boundary information, evaluating on a range of tasks across\ndifferent domains and problem set-ups: GLUE (for sentence-level\nclassification), NER (for token-level classification), and two classification\ndatasets involving complex words (Superbizarre and FLOTA). Overall, through an\nextensive experimental setup that includes the pre-training of 29 models, we\nfind no substantial improvements from our alternative approaches, suggesting\nthat modifying tokenisers to remove word boundary information isn't leading to\na loss of useful information.\n","authors":["Edward Gow-Smith","Dylan Phelps","Harish Tayyar Madabushi","Carolina Scarton","Aline Villavicencio"],"pdf_url":"https://arxiv.org/pdf/2401.07923v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.07897v1","updated":"2024-01-15T18:53:15Z","published":"2024-01-15T18:53:15Z","title":"The Pitfalls of Defining Hallucination","summary":" Despite impressive advances in Natural Language Generation (NLG) and Large\nLanguage Models (LLMs), researchers are still unclear about important aspects\nof NLG evaluation. To substantiate this claim, I examine current\nclassifications of hallucination and omission in Data-text NLG, and I propose a\nlogic-based synthesis of these classfications. I conclude by highlighting some\nremaining limitations of all current thinking about hallucination and by\ndiscussing implications for LLMs.\n","authors":["Kees van Deemter"],"pdf_url":"https://arxiv.org/pdf/2401.07897v1.pdf","comment":"Accepted for publication in Computational Linguistics on 30 Dec.\n 2023. 
(9 Pages.)"},{"id":"http://arxiv.org/abs/2401.02987v3","updated":"2024-01-15T18:50:17Z","published":"2024-01-02T17:08:26Z","title":"Has Your Pretrained Model Improved? A Multi-head Posterior Based\n Approach","summary":" The emergence of pretrained models has significantly impacted Natural\nLanguage Processing (NLP) and Computer Vision to relational datasets.\nTraditionally, these models are assessed through fine-tuned downstream tasks.\nHowever, this raises the question of how to evaluate these models more\nefficiently and more effectively. In this study, we explore a novel approach\nwhere we leverage the meta features associated with each entity as a source of\nworldly knowledge and employ entity representations from the models. We propose\nusing the consistency between these representations and the meta features as a\nmetric for evaluating pretrained models. Our method's effectiveness is\ndemonstrated across various domains, including models with relational datasets,\nlarge language models and image models.\n","authors":["Prince Aboagye","Yan Zheng","Junpeng Wang","Uday Singh Saini","Xin Dai","Michael Yeh","Yujie Fan","Zhongfang Zhuang","Shubham Jain","Liang Wang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.02987v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07886v1","updated":"2024-01-15T18:28:17Z","published":"2024-01-15T18:28:17Z","title":"Learned Best-Effort LLM Serving","summary":" Many applications must provide low-latency LLM service to users or risk\nunacceptable user experience. However, over-provisioning resources to serve\nfluctuating request patterns is often prohibitively expensive. In this work, we\npresent a best-effort serving system that employs deep reinforcement learning\nto adjust service quality based on the task distribution and system load. Our\nbest-effort system can maintain availability with over 10x higher client\nrequest rates, serves above 96% of peak performance 4.1x more often, and serves\nabove 98% of peak performance 2.3x more often than static serving on\nunpredictable workloads. Our learned router is robust to shifts in both the\narrival and task distribution. Compared to static serving, learned best-effort\nserving allows for cost-efficient serving through increased hardware utility.\nAdditionally, we argue that learned best-effort LLM serving is applicable in\nwide variety of settings and provides application developers great flexibility\nto meet their specific needs.\n","authors":["Siddharth Jha","Coleman Hooper","Xiaoxuan Liu","Sehoon Kim","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2401.07886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07883v1","updated":"2024-01-15T18:25:18Z","published":"2024-01-15T18:25:18Z","title":"The Chronicles of RAG: The Retriever, the Chunk and the Generator","summary":" Retrieval Augmented Generation (RAG) has become one of the most popular\nparadigms for enabling LLMs to access external data, and also as a mechanism\nfor grounding to mitigate against hallucinations. When implementing RAG you can\nface several challenges like effective integration of retrieval models,\nefficient representation learning, data diversity, computational efficiency\noptimization, evaluation, and quality of text generation. Given all these\nchallenges, every day a new technique to improve RAG appears, making it\nunfeasible to experiment with all combinations for your problem. 
In this\ncontext, this paper presents good practices to implement, optimize, and\nevaluate RAG for the Brazilian Portuguese language, focusing on the\nestablishment of a simple pipeline for inference and experiments. We explored a\ndiverse set of methods to answer questions about the first Harry Potter book.\nTo generate the answers we used the OpenAI's gpt-4, gpt-4-1106-preview,\ngpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the\nretriever, our approach achieved an improvement of MRR@10 by 35.4% compared to\nthe baseline. When optimizing the input size in the application, we observed\nthat it is possible to further enhance it by 2.4%. Finally, we present the\ncomplete architecture of the RAG with our recommendations. As result, we moved\nfrom a baseline of 57.88% to a maximum relative score of 98.61%.\n","authors":["Paulo Finardi","Leonardo Avila","Rodrigo Castaldoni","Pedro Gengo","Celio Larcher","Marcos Piau","Pablo Costa","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2401.07883v1.pdf","comment":"16 pages, 15 figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.07877v1","updated":"2024-01-15T18:12:01Z","published":"2024-01-15T18:12:01Z","title":"EMBRE: Entity-aware Masking for Biomedical Relation Extraction","summary":" Information extraction techniques, including named entity recognition (NER)\nand relation extraction (RE), are crucial in many domains to support making\nsense of vast amounts of unstructured text data by identifying and connecting\nrelevant information. Such techniques can assist researchers in extracting\nvaluable insights. In this paper, we introduce the Entity-aware Masking for\nBiomedical Relation Extraction (EMBRE) method for biomedical relation\nextraction, as applied in the context of the BioRED challenge Task 1, in which\nhuman-annotated entities are provided as input. Specifically, we integrate\nentity knowledge into a deep neural network by pretraining the backbone model\nwith an entity masking objective. We randomly mask named entities for each\ninstance and let the model identify the masked entity along with its type. In\nthis way, the model is capable of learning more specific knowledge and more\nrobust representations. Then, we utilize the pre-trained model as our backbone\nto encode language representations and feed these representations into two\nmultilayer perceptron (MLPs) to predict the logits for relation and novelty,\nrespectively. The experimental results demonstrate that our proposed method can\nimprove the performances of entity pair, relation and novelty extraction over\nour baseline.\n","authors":["Mingjie Li","Karin Verspoor"],"pdf_url":"https://arxiv.org/pdf/2401.07877v1.pdf","comment":"5 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.07872v1","updated":"2024-01-15T18:07:21Z","published":"2024-01-15T18:07:21Z","title":"The What, Why, and How of Context Length Extension Techniques in Large\n Language Models -- A Detailed Survey","summary":" The advent of Large Language Models (LLMs) represents a notable breakthrough\nin Natural Language Processing (NLP), contributing to substantial progress in\nboth text comprehension and generation. However, amidst these advancements, it\nis noteworthy that LLMs often face a limitation in terms of context length\nextrapolation. Understanding and extending the context length for LLMs is\ncrucial in enhancing their performance across various NLP applications. 
In this\nsurvey paper, we delve into the multifaceted aspects of exploring why it is\nessential, and the potential transformations that superior techniques could\nbring to NLP applications. We study the inherent challenges associated with\nextending context length and present an organized overview of the existing\nstrategies employed by researchers. Additionally, we discuss the intricacies of\nevaluating context extension techniques and highlight the open challenges that\nresearchers face in this domain. Furthermore, we explore whether there is a\nconsensus within the research community regarding evaluation standards and\nidentify areas where further agreement is needed. This comprehensive survey\naims to serve as a valuable resource for researchers, guiding them through the\nnuances of context length extension techniques and fostering discussions on\nfuture advancements in this evolving field.\n","authors":["Saurav Pawar","S. M Towhidul Islam Tonmoy","S M Mehedi Zaman","Vinija Jain","Aman Chadha","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2401.07872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07870v1","updated":"2024-01-15T18:04:29Z","published":"2024-01-15T18:04:29Z","title":"JumpCoder: Go Beyond Autoregressive Coder via Online Modification","summary":" While existing code large language models (code LLMs) exhibit impressive\ncapabilities in code generation, their autoregressive sequential generation\ninherently lacks reversibility. This limitation hinders them from timely\ncorrecting previous missing statements during coding as humans do, often\nleading to error propagation and suboptimal performance. We introduce\nJumpCoder, a novel modelagnostic framework that enables online modification and\nnon-sequential generation to augment the code LLMs. The key idea behind\nJumpCoder is to insert new code into the currently generated code when\nnecessary during generation, which is achieved through an auxiliary infilling\nmodel that works in tandem with the code LLM. Since identifying the best infill\nposition beforehand is intractable, we adopt an infill-first, judge-later\nstrategy, which experiments with filling at the $k$ most critical positions\nfollowing the generation of each line, and uses an Abstract Syntax Tree (AST)\nparser alongside the Generation Model Scoring to effectively judge the validity\nof each potential infill. Extensive experiments using six state-of-the-art code\nLLMs across multiple benchmarks consistently indicate significant improvements\nover all baselines. Notably, JumpCoder assists code LLMs in achieving up to a\n3.6% increase in Pass@1 for Python, 6.3% for Java, and 3.7% for C++ in the\nmultilingual HumanEval benchmarks. Our code is public at\nhttps://github.com/Keytoyze/JumpCoder.\n","authors":["Mouxiang Chen","Hao Tian","Zhongxin Liu","Xiaoxue Ren","Jianling Sun"],"pdf_url":"https://arxiv.org/pdf/2401.07870v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07868v1","updated":"2024-01-15T18:01:59Z","published":"2024-01-15T18:01:59Z","title":"Consolidating Trees of Robotic Plans Generated Using Large Language\n Models to Improve Reliability","summary":" The inherent probabilistic nature of Large Language Models (LLMs) introduces\nan element of unpredictability, raising concerns about potential discrepancies\nin their output. This paper introduces an innovative approach aims to generate\ncorrect and optimal robotic task plans for diverse real-world demands and\nscenarios. 
LLMs have been used to generate task plans, but they are unreliable\nand may contain wrong, questionable, or high-cost steps. The proposed approach\nuses LLM to generate a number of task plans as trees and amalgamates them into\na graph by removing questionable paths. Then an optimal task tree can be\nretrieved to circumvent questionable and high-cost nodes, thereby improving\nplanning accuracy and execution efficiency. The approach is further improved by\nincorporating a large knowledge network. Leveraging GPT-4 further, the\nhigh-level task plan is converted into a low-level Planning Domain Definition\nLanguage (PDDL) plan executable by a robot. Evaluation results highlight the\nsuperior accuracy and efficiency of our approach compared to previous\nmethodologies in the field of task planning.\n","authors":["Md Sadman Sakib","Yu Sun"],"pdf_url":"https://arxiv.org/pdf/2401.07868v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07867v1","updated":"2024-01-15T17:57:41Z","published":"2024-01-15T17:57:41Z","title":"Authorship Obfuscation in Multilingual Machine-Generated Text Detection","summary":" High-quality text generation capability of latest Large Language Models\n(LLMs) causes concerns about their misuse (e.g., in massive generation/spread\nof disinformation). Machine-generated text (MGT) detection is important to cope\nwith such threats. However, it is susceptible to authorship obfuscation (AO)\nmethods, such as paraphrasing, which can cause MGTs to evade detection. So far,\nthis was evaluated only in monolingual settings. Thus, the susceptibility of\nrecently proposed multilingual detectors is still unknown. We fill this gap by\ncomprehensively benchmarking the performance of 10 well-known AO methods,\nattacking 37 MGT detection methods against MGTs in 11 languages (i.e., 10\n$\\times$ 37 $\\times$ 11 = 4,070 combinations). We also evaluate the effect of\ndata augmentation on adversarial robustness using obfuscated texts. The results\nindicate that all tested AO methods can cause detection evasion in all tested\nlanguages, where homoglyph attacks are especially successful.\n","authors":["Dominik Macko","Robert Moro","Adaku Uchendu","Ivan Srba","Jason Samuel Lucas","Michiharu Yamashita","Nafis Irtiza Tripto","Dongwon Lee","Jakub Simko","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2401.07867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04076v3","updated":"2024-01-15T17:52:31Z","published":"2023-11-07T15:40:43Z","title":"Do LLMs exhibit human-like response biases? A case study in survey\n design","summary":" As large language models (LLMs) become more capable, there is growing\nexcitement about the possibility of using LLMs as proxies for humans in\nreal-world tasks where subjective labels are desired, such as in surveys and\nopinion polling. One widely-cited barrier to the adoption of LLMs is their\nsensitivity to prompt wording - but interestingly, humans also display\nsensitivities to instruction changes in the form of response biases. As such,\nwe argue that if LLMs are going to be used to approximate human opinions, it is\nnecessary to investigate the extent to which LLMs also reflect human response\nbiases, if at all. In this work, we use survey design as a case study, where\nhuman response biases caused by permutations in wordings of \"prompts\" have been\nextensively studied. Drawing from prior work in social psychology, we design a\ndataset and propose a framework to evaluate whether LLMs exhibit human-like\nresponse biases in survey questionnaires. 
Our comprehensive evaluation of nine\nmodels shows that popular open and commercial LLMs generally fail to reflect\nhuman-like behavior. These inconsistencies tend to be more prominent in models\nthat have been instruction fine-tuned. Furthermore, even if a model shows a\nsignificant change in the same direction as humans, we find that perturbations\nthat are not meant to elicit significant changes in humans may also result in a\nsimilar change. These results highlight the potential pitfalls of using LLMs to\nsubstitute humans in parts of the annotation pipeline, and further underscore\nthe importance of finer-grained characterizations of model behavior. Our code,\ndataset, and collected samples are available at\nhttps://github.com/lindiatjuatja/BiasMonkey\n","authors":["Lindia Tjuatja","Valerie Chen","Sherry Tongshuang Wu","Ameet Talwalkar","Graham Neubig"],"pdf_url":"https://arxiv.org/pdf/2311.04076v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07851v1","updated":"2024-01-15T17:26:50Z","published":"2024-01-15T17:26:50Z","title":"Unlocking Efficiency in Large Language Model Inference: A Comprehensive\n Survey of Speculative Decoding","summary":" To mitigate the high inference latency stemming from autoregressive decoding\nin Large Language Models (LLMs), Speculative Decoding has emerged as a novel\ndecoding paradigm for LLM inference. In each decoding step, this method first\nefficiently drafts several future tokens and then verifies them in parallel.\nUnlike autoregressive decoding, Speculative Decoding facilitates the\nsimultaneous decoding of multiple tokens per step, thereby accelerating\ninference. This paper presents a comprehensive overview and analysis of this\npromising decoding paradigm. We begin by providing a formal definition and\nformulation of Speculative Decoding. Then, we organize in-depth discussions on\nits key facets, including current leading techniques, the challenges faced, and\npotential future directions in this field. We aim for this work to serve as a\ncatalyst for further research on Speculative Decoding, ultimately contributing\nto more efficient LLM inference.\n","authors":["Heming Xia","Zhe Yang","Qingxiu Dong","Peiyi Wang","Yongqi Li","Tao Ge","Tianyu Liu","Wenjie Li","Zhifang Sui"],"pdf_url":"https://arxiv.org/pdf/2401.07851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07847v1","updated":"2024-01-15T17:23:02Z","published":"2024-01-15T17:23:02Z","title":"Milestones in Bengali Sentiment Analysis leveraging Transformer-models:\n Fundamentals, Challenges and Future Directions","summary":" Sentiment Analysis (SA) refers to the task of associating a view polarity\n(usually, positive, negative, or neutral; or even fine-grained such as slightly\nangry, sad, etc.) to a given text, essentially breaking it down to a supervised\n(since we have the view labels apriori) classification task. Although heavily\nstudied in resource-rich languages such as English thus pushing the SOTA by\nleaps and bounds, owing to the arrival of the Transformer architecture, the\nsame cannot be said for resource-poor languages such as Bengali (BN). For a\nlanguage spoken by roughly 300 million people, the technology enabling them to\nrun trials on their favored tongue is severely lacking. In this paper, we\nanalyze the SOTA for SA in Bengali, particularly, Transformer-based models. We\ndiscuss available datasets, their drawbacks, the nuances associated with\nBengali i.e. 
what makes this a challenging language to apply SA on, and finally\nprovide insights for future direction to mitigate the limitations in the field.\n","authors":["Saptarshi Sengupta","Shreya Ghosh","Prasenjit Mitra","Tarikul Islam Tamiti"],"pdf_url":"https://arxiv.org/pdf/2401.07847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07817v1","updated":"2024-01-15T16:39:10Z","published":"2024-01-15T16:39:10Z","title":"Question Translation Training for Better Multilingual Reasoning","summary":" Large language models show compelling performance on reasoning tasks but they\ntend to perform much worse in languages other than English. This is\nunsurprising given that their training data largely consists of English text\nand instructions. A typical solution is to translate instruction data into all\nlanguages of interest, and then train on the resulting multilingual data, which\nis called translate-training. This approach not only incurs high cost, but also\nresults in poorly translated data due to the non-standard formatting of\nchain-of-thought and mathematical reasoning instructions. In this paper, we\nexplore the benefits of question alignment, where we train the model to\ntranslate reasoning questions into English by finetuning on X-English question\ndata. In this way we perform targetted, in-domain language alignment which\nmakes best use of English instruction data to unlock the LLMs' multilingual\nreasoning abilities. Experimental results on LLaMA2-13B show that question\nalignment leads to consistent improvements over the translate-training\napproach: an average improvement of 11.3\\% and 16.1\\% accuracy across ten\nlanguages on the MGSM and MSVAMP maths reasoning benchmarks (The project will\nbe available at: https://github.com/NJUNLP/QAlign).\n","authors":["Wenhao Zhu","Shujian Huang","Fei Yuan","Shuaijie She","Jiajun Chen","Alexandra Birch"],"pdf_url":"https://arxiv.org/pdf/2401.07817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07812v1","updated":"2024-01-15T16:35:52Z","published":"2024-01-15T16:35:52Z","title":"Wikidata as a seed for Web Extraction","summary":" Wikidata has grown to a knowledge graph with an impressive size. To date, it\ncontains more than 17 billion triples collecting information about people,\nplaces, films, stars, publications, proteins, and many more. On the other side,\nmost of the information on the Web is not published in highly structured data\nrepositories like Wikidata, but rather as unstructured and semi-structured\ncontent, more concretely in HTML pages containing text and tables. Finding,\nmonitoring, and organizing this data in a knowledge graph is requiring\nconsiderable work from human editors. The volume and complexity of the data\nmake this task difficult and time-consuming. In this work, we present a\nframework that is able to identify and extract new facts that are published\nunder multiple Web domains so that they can be proposed for validation by\nWikidata editors. The framework is relying on question-answering technologies.\nWe take inspiration from ideas that are used to extract facts from textual\ncollections and adapt them to extract facts from Web pages. For achieving this,\nwe demonstrate that language models can be adapted to extract facts not only\nfrom textual collections but also from Web pages. By exploiting the information\nalready contained in Wikidata the proposed framework can be trained without the\nneed for any additional learning signals and can extract new facts for a wide\nrange of properties and domains. 
Following this path, Wikidata can be used as a\nseed to extract facts on the Web. Our experiments show that we can achieve a\nmean performance of 84.07 at F1-score. Moreover, our estimations show that we\ncan potentially extract millions of facts that can be proposed for human\nvalidation. The goal is to help editors in their daily tasks and contribute to\nthe completion of the Wikidata knowledge graph.\n","authors":["Kunpeng Guo","Dennis Diefenbach","Antoine Gourru","Christophe Gravier"],"pdf_url":"https://arxiv.org/pdf/2401.07812v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07810v1","updated":"2024-01-15T16:31:18Z","published":"2024-01-15T16:31:18Z","title":"Consolidating Strategies for Countering Hate Speech Using Persuasive\n Dialogues","summary":" Hateful comments are prevalent on social media platforms. Although tools for\nautomatically detecting, flagging, and blocking such false, offensive, and\nharmful content online have lately matured, such reactive and brute force\nmethods alone provide short-term and superficial remedies while the\nperpetrators persist. With the public availability of large language models\nwhich can generate articulate synthetic and engaging content at scale, there\nare concerns about the rapid growth of dissemination of such malicious content\non the web. There is now a need to focus on deeper, long-term solutions that\ninvolve engaging with the human perpetrator behind the source of the content to\nchange their viewpoint or at least bring down the rhetoric using persuasive\nmeans. To do that, we propose defining and experimenting with controllable\nstrategies for generating counter-arguments to hateful comments in online\nconversations. We experiment with controlling response generation using\nfeatures based on (i) argument structure and reasoning-based Walton argument\nschemes, (ii) counter-argument speech acts, and (iii) human\ncharacteristics-based qualities such as Big-5 personality traits and human\nvalues. Using automatic and human evaluations, we determine the best\ncombination of features that generate fluent, argumentative, and logically\nsound arguments for countering hate. We further share the developed\ncomputational models for automatically annotating text with such features, and\na silver-standard annotated version of an existing hate speech dialog corpora.\n","authors":["Sougata Saha","Rohini Srihari"],"pdf_url":"https://arxiv.org/pdf/2401.07810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07793v1","updated":"2024-01-15T16:00:50Z","published":"2024-01-15T16:00:50Z","title":"Flexibly Scaling Large Language Models Contexts Through Extensible\n Tokenization","summary":" Large language models (LLMs) are in need of sufficient contexts to handle\nmany critical applications, such as retrieval augmented generation and few-shot\nlearning. However, due to the constrained window size, the LLMs can only access\nto the information within a limited context. Although the size of context\nwindow can be extended by fine-tuning, it will result in a substantial cost in\nboth training and inference stage. In this paper, we present Extensible\nTokenization as an alternative method which realizes the flexible scaling of\nLLMs' context. Extensible Tokenization stands as a midware in between of the\ntokenized context and the LLM, which transforms the raw token embeddings into\nthe extensible embeddings. 
Such embeddings provide a more compact\nrepresentation for the long context, on top of which the LLM is able to\nperceive more information with the same context window. Extensible Tokenization\nis also featured by its flexibility: the scaling factor can be flexibly\ndetermined within a feasible scope, leading to the extension of an arbitrary\ncontext length at the inference time. Besides, Extensible Tokenization is\nintroduced as a drop-in component, which can be seamlessly plugged into not\nonly the LLM itself and but also its fine-tuned derivatives, bringing in the\nextended contextual information while fully preserving the LLM's existing\ncapabilities. We perform comprehensive experiments on long-context language\nmodeling and understanding tasks, which verify Extensible Tokenization as an\neffective, efficient, flexible, and compatible method to extend LLM's context.\nOur model and source code will be made publicly available.\n","authors":["Ninglu Shao","Shitao Xiao","Zheng Liu","Peitian Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07777v1","updated":"2024-01-15T15:40:16Z","published":"2024-01-15T15:40:16Z","title":"Quantum Transfer Learning for Acceptability Judgements","summary":" Hybrid quantum-classical classifiers promise to positively impact critical\naspects of natural language processing tasks, particularly\nclassification-related ones. Among the possibilities currently investigated,\nquantum transfer learning, i.e., using a quantum circuit for fine-tuning\npre-trained classical models for a specific task, is attracting significant\nattention as a potential platform for proving quantum advantage.\n This work shows potential advantages, both in terms of performance and\nexpressiveness, of quantum transfer learning algorithms trained on embedding\nvectors extracted from a large language model to perform classification on a\nclassical Linguistics task: acceptability judgments. Acceptability judgment is\nthe ability to determine whether a sentence is considered natural and\nwell-formed by a native speaker. The approach has been tested on sentences\nextracted from ItaCoLa, a corpus that collects Italian sentences labeled with\ntheir acceptability judgment. The evaluation phase shows results for the\nquantum transfer learning pipeline comparable to state-of-the-art classical\ntransfer learning algorithms, proving current quantum computers' capabilities\nto tackle NLP tasks for ready-to-use applications. Furthermore, a qualitative\nlinguistic analysis, aided by explainable AI methods, reveals the capabilities\nof quantum transfer learning algorithms to correctly classify complex and more\nstructured sentences, compared to their classical counterpart. This finding\nsets the ground for a quantifiable quantum advantage in NLP in the near future.\n","authors":["Giuseppe Buonaiuto","Raffaele Guarasci","Aniello Minutolo","Giuseppe De Pietro","Massimo Esposito"],"pdf_url":"https://arxiv.org/pdf/2401.07777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07760v1","updated":"2024-01-15T15:11:15Z","published":"2024-01-15T15:11:15Z","title":"On the importance of Data Scale in Pretraining Arabic Language Models","summary":" Pretraining monolingual language models have been proven to be vital for\nperformance in Arabic Natural Language Processing (NLP) tasks. In this paper,\nwe conduct a comprehensive study on the role of data in Arabic Pretrained\nLanguage Models (PLMs). 
More precisely, we reassess the performance of a suite\nof state-of-the-art Arabic PLMs by retraining them on massive-scale,\nhigh-quality Arabic corpora. We have significantly improved the performance of\nthe leading Arabic encoder-only BERT-base and encoder-decoder T5-base models on\nthe ALUE and ORCA leaderboards, thereby reporting state-of-the-art results in\ntheir respective model categories. In addition, our analysis strongly suggests\nthat pretraining data by far is the primary contributor to performance,\nsurpassing other factors. Our models and source code are publicly available at\nhttps://github.com/huawei-noah/Pretrained-Language-Model/tree/master/JABER-PyTorch.\n","authors":["Abbas Ghaddar","Philippe Langlais","Mehdi Rezagholizadeh","Boxing Chen"],"pdf_url":"https://arxiv.org/pdf/2401.07760v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04658v2","updated":"2024-01-15T14:57:29Z","published":"2024-01-09T16:27:28Z","title":"Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence\n Lengths in Large Language Models","summary":" Linear attention is an efficient attention mechanism that has recently\nemerged as a promising alternative to conventional softmax attention. With its\nability to process tokens in linear computational complexities, linear\nattention, in theory, can handle sequences of unlimited length without\nsacrificing speed, i.e., maintaining a constant training speed for various\nsequence lengths with a fixed memory consumption. However, due to the issue\nwith cumulative summation (cumsum), current linear attention algorithms cannot\ndemonstrate their theoretical advantage in a causal setting. In this paper, we\npresent Lightning Attention-2, the first linear attention implementation that\nenables linear attention to realize its theoretical computational benefits. To\nachieve this, we leverage the thought of tiling, separately handling the\nintra-block and inter-block components in linear attention calculation.\nSpecifically, we utilize the conventional attention computation mechanism for\nthe intra-blocks and apply linear attention kernel tricks for the inter-blocks.\nA tiling technique is adopted through both forward and backward procedures to\ntake full advantage of the GPU hardware. We implement our algorithm in Triton\nto make it IO-aware and hardware-friendly. Various experiments are conducted on\ndifferent model sizes and sequence lengths. Lightning Attention-2 retains\nconsistent training and inference speed regardless of input sequence length and\nis significantly faster than other attention mechanisms. The source code is\navailable at https://github.com/OpenNLPLab/lightning-attention.\n","authors":["Zhen Qin","Weigao Sun","Dong Li","Xuyang Shen","Weixuan Sun","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.04658v2.pdf","comment":"Technical Report. Yiran Zhong is the corresponding author. The source\n code is available at https://github.com/OpenNLPLab/lightning-attention"},{"id":"http://arxiv.org/abs/2401.07702v1","updated":"2024-01-15T14:19:47Z","published":"2024-01-15T14:19:47Z","title":"Prompting open-source and commercial language models for grammatical\n error correction of English learner text","summary":" Thanks to recent advances in generative AI, we are able to prompt large\nlanguage models (LLMs) to produce texts which are fluent and grammatical. In\naddition, it has been shown that we can elicit attempts at grammatical error\ncorrection (GEC) from LLMs when prompted with ungrammatical input sentences. 
We\nevaluate how well LLMs can perform at GEC by measuring their performance on\nestablished benchmark datasets. We go beyond previous studies, which only\nexamined GPT* models on a selection of English GEC datasets, by evaluating\nseven open-source and three commercial LLMs on four established GEC benchmarks.\nWe investigate model performance and report results against individual error\ntypes. Our results indicate that LLMs do not always outperform supervised\nEnglish GEC models except in specific contexts -- namely commercial LLMs on\nbenchmarks annotated with fluency corrections as opposed to minimal edits. We\nfind that several open-source models outperform commercial ones on minimal edit\nbenchmarks, and that in some settings zero-shot prompting is just as\ncompetitive as few-shot prompting.\n","authors":["Christopher Davis","Andrew Caines","Øistein Andersen","Shiva Taslimipoor","Helen Yannakoudakis","Zheng Yuan","Christopher Bryant","Marek Rei","Paula Buttery"],"pdf_url":"https://arxiv.org/pdf/2401.07702v1.pdf","comment":"8 pages with appendices"},{"id":"http://arxiv.org/abs/2401.07683v1","updated":"2024-01-15T13:51:00Z","published":"2024-01-15T13:51:00Z","title":"Assisted Knowledge Graph Authoring: Human-Supervised Knowledge Graph\n Construction from Natural Language","summary":" Encyclopedic knowledge graphs, such as Wikidata, host an extensive repository\nof millions of knowledge statements. However, domain-specific knowledge from\nfields such as history, physics, or medicine is significantly underrepresented\nin those graphs. Although few domain-specific knowledge graphs exist (e.g.,\nPubmed for medicine), developing specialized retrieval applications for many\ndomains still requires constructing knowledge graphs from scratch. To\nfacilitate knowledge graph construction, we introduce WAKA: a Web application\nthat allows domain experts to create knowledge graphs through the medium with\nwhich they are most familiar: natural language.\n","authors":["Marcel Gohsen","Benno Stein"],"pdf_url":"https://arxiv.org/pdf/2401.07683v1.pdf","comment":"accepted at CHIIR 2024"},{"id":"http://arxiv.org/abs/2401.01326v2","updated":"2024-01-15T13:39:38Z","published":"2024-01-02T18:32:14Z","title":"An Autoregressive Text-to-Graph Framework for Joint Entity and Relation\n Extraction","summary":" In this paper, we propose a novel method for joint entity and relation\nextraction from unstructured text by framing it as a conditional sequence\ngeneration problem. In contrast to conventional generative information\nextraction models that are left-to-right token-level generators, our approach\nis \\textit{span-based}. It generates a linearized graph where nodes represent\ntext spans and edges represent relation triplets. Our method employs a\ntransformer encoder-decoder architecture with pointing mechanism on a dynamic\nvocabulary of spans and relation types. Our model can capture the structural\ncharacteristics and boundaries of entities and relations through span\nrepresentations while simultaneously grounding the generated output in the\noriginal text thanks to the pointing mechanism. Evaluation on benchmark\ndatasets validates the effectiveness of our approach, demonstrating competitive\nresults. 
Code is available at https://github.com/urchade/ATG.\n","authors":["Urchade Zaratiana","Nadi Tomeh","Pierre Holat","Thierry Charnois"],"pdf_url":"https://arxiv.org/pdf/2401.01326v2.pdf","comment":"AAAI 2024 (camera ready version)"},{"id":"http://arxiv.org/abs/2401.07598v1","updated":"2024-01-15T11:06:43Z","published":"2024-01-15T11:06:43Z","title":"MAPLE: Multilingual Evaluation of Parameter Efficient Finetuning of\n Large Language Models","summary":" Parameter efficient finetuning has emerged as a viable solution for improving\nthe performance of Large Language Models without requiring massive resources\nand compute. Prior work on multilingual evaluation has shown that there is a\nlarge gap between the performance of LLMs on English and other languages.\nFurther, there is also a large gap between the performance of smaller\nopen-source models and larger LLMs. Finetuning can be an effective way to\nbridge this gap and make language models more equitable. In this work, we\nfinetune the LLaMA-7B and Mistral-7B models on synthetic multilingual\ninstruction tuning data to determine its effect on model performance on five\ndownstream tasks covering twenty three languages in all. Additionally, we\nexperiment with various parameters, such as rank for low-rank adaptation and\nvalues of quantisation to determine their effects on downstream performance and\nfind that higher rank and higher quantisation values benefit low-resource\nlanguages. We find that parameter efficient finetuning of smaller open source\nmodels sometimes bridges the gap between the performance of these models and\nthe larger ones, however, English performance can take a hit. We also find that\nfinetuning sometimes improves performance on low-resource languages, while\ndegrading performance on high-resource languages.\n","authors":["Divyanshu Aggarwal","Ashutosh Sathe","Sunayana Sitaram"],"pdf_url":"https://arxiv.org/pdf/2401.07598v1.pdf","comment":"23 pages, 23 figures, 14 tables"},{"id":"http://arxiv.org/abs/2306.05323v2","updated":"2024-01-15T11:05:23Z","published":"2023-06-08T16:15:46Z","title":"Advancing Italian Biomedical Information Extraction with\n Transformers-based Models: Methodological Insights and Multicenter Practical\n Application","summary":" The introduction of computerized medical records in hospitals has reduced\nburdensome activities like manual writing and information fetching. However,\nthe data contained in medical records are still far underutilized, primarily\nbecause extracting data from unstructured textual medical records takes time\nand effort. Information Extraction, a subfield of Natural Language Processing,\ncan help clinical practitioners overcome this limitation by using automated\ntext-mining pipelines. In this work, we created the first Italian\nneuropsychiatric Named Entity Recognition dataset, PsyNIT, and used it to\ndevelop a Transformers-based model. Moreover, we collected and leveraged three\nexternal independent datasets to implement an effective multicenter model, with\noverall F1-score 84.77%, Precision 83.16%, Recall 86.44%. The lessons learned\nare: (i) the crucial role of a consistent annotation process and (ii) a\nfine-tuning strategy that combines classical methods with a \"low-resource\"\napproach. 
This allowed us to establish methodological guidelines that pave the\nway for Natural Language Processing studies in less-resourced languages.\n","authors":["Claudio Crema","Tommaso Mario Buonocore","Silvia Fostinelli","Enea Parimbelli","Federico Verde","Cira Fundarò","Marina Manera","Matteo Cotta Ramusino","Marco Capelli","Alfredo Costa","Giuliano Binetti","Riccardo Bellazzi","Alberto Redolfi"],"pdf_url":"https://arxiv.org/pdf/2306.05323v2.pdf","comment":"2 figures, 6 tables, Supplementary Notes included"},{"id":"http://arxiv.org/abs/2401.07575v1","updated":"2024-01-15T10:18:08Z","published":"2024-01-15T10:18:08Z","title":"Cascaded Cross-Modal Transformer for Audio-Textual Classification","summary":" Speech classification tasks often require powerful language understanding\nmodels to grasp useful features, which becomes problematic when limited\ntraining data is available. To attain superior classification performance, we\npropose to harness the inherent value of multimodal representations by\ntranscribing speech using automatic speech recognition (ASR) models and\ntranslating the transcripts into different languages via pretrained translation\nmodels. We thus obtain an audio-textual (multimodal) representation for each\ndata sample. Subsequently, we combine language-specific Bidirectional Encoder\nRepresentations from Transformers (BERT) with Wav2Vec2.0 audio features via a\nnovel cascaded cross-modal transformer (CCMT). Our model is based on two\ncascaded transformer blocks. The first one combines text-specific features from\ndistinct languages, while the second one combines acoustic features with\nmultilingual features previously learned by the first transformer block. We\nemployed our system in the Requests Sub-Challenge of the ACM Multimedia 2023\nComputational Paralinguistics Challenge. CCMT was declared the winning\nsolution, obtaining an unweighted average recall (UAR) of 65.41% and 85.87% for\ncomplaint and request detection, respectively. Moreover, we applied our\nframework on the Speech Commands v2 and HarperValleyBank dialog data sets,\nsurpassing previous studies reporting results on these benchmarks. Our code is\nfreely available for download at: https://github.com/ristea/ccmt.\n","authors":["Nicolae-Catalin Ristea","Andrei Anghel","Radu Tudor Ionescu"],"pdf_url":"https://arxiv.org/pdf/2401.07575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07572v1","updated":"2024-01-15T10:16:44Z","published":"2024-01-15T10:16:44Z","title":"Exploiting GPT-4 Vision for Zero-shot Point Cloud Understanding","summary":" In this study, we tackle the challenge of classifying the object category in\npoint clouds, which previous works like PointCLIP struggle to address due to\nthe inherent limitations of the CLIP architecture. Our approach leverages GPT-4\nVision (GPT-4V) to overcome these challenges by employing its advanced\ngenerative abilities, enabling a more adaptive and robust classification\nprocess. We adapt the application of GPT-4V to process complex 3D data,\nenabling it to achieve zero-shot recognition capabilities without altering the\nunderlying model architecture. Our methodology also includes a systematic\nstrategy for point cloud image visualization, mitigating domain gap and\nenhancing GPT-4V's efficiency. 
Experimental validation demonstrates our\napproach's superiority in diverse scenarios, setting a new benchmark in\nzero-shot point cloud classification.\n","authors":["Qi Sun","Xiao Cui","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2401.07572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16989v4","updated":"2024-01-15T09:55:05Z","published":"2023-11-28T17:44:51Z","title":"ChatGPT's One-year Anniversary: Are Open-Source Large Language Models\n Catching up?","summary":" Upon its release in late 2022, ChatGPT has brought a seismic shift in the\nentire landscape of AI, both in research and commerce. Through\ninstruction-tuning a large language model (LLM) with supervised fine-tuning and\nreinforcement learning from human feedback, it showed that a model could answer\nhuman questions and follow instructions on a broad panel of tasks. Following\nthis success, interests in LLMs have intensified, with new LLMs flourishing at\nfrequent interval across academia and industry, including many start-ups\nfocused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's\nClaude) generally outperform their open-source counterparts, the progress on\nthe latter has been rapid with claims of achieving parity or even better on\ncertain tasks. This has crucial implications not only on research but also on\nbusiness. In this work, on the first anniversary of ChatGPT, we provide an\nexhaustive overview of this success, surveying all tasks where an open-source\nLLM has claimed to be on par or better than ChatGPT.\n","authors":["Hailin Chen","Fangkai Jiao","Xingxuan Li","Chengwei Qin","Mathieu Ravaut","Ruochen Zhao","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2311.16989v4.pdf","comment":"version v4, included latest top-performing open-sourced LLMs"},{"id":"http://arxiv.org/abs/2309.12689v2","updated":"2024-01-15T09:42:52Z","published":"2023-09-22T08:02:45Z","title":"AMPLIFY:Attention-based Mixup for Performance Improvement and Label\n Smoothing in Transformer","summary":" Mixup is an effective data augmentation method that generates new augmented\nsamples by aggregating linear combinations of different original samples.\nHowever, if there are noises or aberrant features in the original samples,\nMixup may propagate them to the augmented samples, leading to over-sensitivity\nof the model to these outliers . To solve this problem, this paper proposes a\nnew Mixup method called AMPLIFY. This method uses the Attention mechanism of\nTransformer itself to reduce the influence of noises and aberrant values in the\noriginal samples on the prediction results, without increasing additional\ntrainable parameters, and the computational cost is very low, thereby avoiding\nthe problem of high resource consumption in common Mixup methods such as\nSentence Mixup . The experimental results show that, under a smaller\ncomputational resource cost, AMPLIFY outperforms other Mixup methods in text\nclassification tasks on 7 benchmark datasets, providing new ideas and new ways\nto further improve the performance of pre-trained models based on the Attention\nmechanism, such as BERT, ALBERT, RoBERTa, and GPT. 
Our code can be obtained at\nhttps://github.com/kiwi-lilo/AMPLIFY.\n","authors":["Leixin Yang","Yu Xiang"],"pdf_url":"https://arxiv.org/pdf/2309.12689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07553v1","updated":"2024-01-15T09:37:03Z","published":"2024-01-15T09:37:03Z","title":"Safe Reinforcement Learning with Free-form Natural Language Constraints\n and Pre-Trained Language Models","summary":" Safe reinforcement learning (RL) agents accomplish given tasks while adhering\nto specific constraints. Employing constraints expressed via\neasily-understandable human language offers considerable potential for\nreal-world applications due to its accessibility and non-reliance on domain\nexpertise. Previous safe RL methods with natural language constraints typically\nadopt a recurrent neural network, which leads to limited capabilities when\ndealing with various forms of human language input. Furthermore, these methods\noften require a ground-truth cost function, necessitating domain expertise for\nthe conversion of language constraints into a well-defined cost function that\ndetermines constraint violation. To address these issues, we proposes to use\npre-trained language models (LM) to facilitate RL agents' comprehension of\nnatural language constraints and allow them to infer costs for safe policy\nlearning. Through the use of pre-trained LMs and the elimination of the need\nfor a ground-truth cost, our method enhances safe policy learning under a\ndiverse set of human-derived free-form natural language constraints.\nExperiments on grid-world navigation and robot control show that the proposed\nmethod can achieve strong performance while adhering to given constraints. The\nusage of pre-trained LMs allows our method to comprehend complicated\nconstraints and learn safe policies without the need for ground-truth cost at\nany stage of training or evaluation. Extensive ablation studies are conducted\nto demonstrate the efficacy of each part of our method.\n","authors":["Xingzhou Lou","Junge Zhang","Ziyan Wang","Kaiqi Huang","Yali Du"],"pdf_url":"https://arxiv.org/pdf/2401.07553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07544v1","updated":"2024-01-15T09:09:14Z","published":"2024-01-15T09:09:14Z","title":"See the Unseen: Better Context-Consistent Knowledge-Editing by Noises","summary":" Knowledge-editing updates knowledge of large language models (LLMs) and\ncontributes to the interpretability and application of LLMs. However, knowledge\napplying is context-consistent: LLMs can recall the same knowledge in different\ncontexts. Existing works ignore this property and the editing lacks\ngeneralization. In this paper, we empirically find that the effects of\ndifferent contexts upon LLMs in recalling the same knowledge follow a\nGaussian-like distribution. We then sample Gaussian noises to simulate the\neffects of different contexts when updating LLMs. By such, we can make LLMs see\nthe unseen contexts where the edited knowledge will be applied, therefore\nimproving the editing generalization. 
Experimental results on three LLMs\ndemonstrate the effectiveness of our methods and also distinguish our methods\nfrom the others of fine-tuning LLMs by noises.\n","authors":["Youcheng Huang","Wenqiang Lei","Zheng Zhang","Jiancheng Lv","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2401.07544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07529v1","updated":"2024-01-15T08:19:22Z","published":"2024-01-15T08:19:22Z","title":"MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of\n Multimodal Large Language Models in Perception","summary":" Multimodal Large Language Models (MLLMs) have shown their remarkable\nabilities in visual perception and understanding recently. However, how to\ncomprehensively evaluate the capabilities of MLLMs remains a challenge. Most of\nthe existing benchmarks predominantly focus on assessing perception, cognition,\nand reasoning, neglecting the abilities of self-awareness, referring to the\nmodel's recognition of its own capability boundary. In our study, we focus on\nself-awareness in image perception and introduce the knowledge quadrant for\nMLLMs, which clearly defines the knowns and unknowns in perception. Based on\nthis, we propose a novel benchmark specifically designed to evaluate the\nSelf-Aware capabilities in Perception for MLLMs(MM-SAP). MM-SAP encompasses\nthree distinct sub-datasets, each focusing on different aspects of\nself-awareness. We evaluated eight well-known MLLMs using MM-SAP, analyzing\ntheir self-awareness and providing detailed insights. Code and data are\navailable at https://github.com/YHWmz/MM-SAP\n","authors":["Yuhao Wang","Yusheng Liao","Heyang Liu","Hongcheng Liu","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17267v2","updated":"2024-01-15T08:09:00Z","published":"2023-12-26T14:16:16Z","title":"Improving Low-resource Prompt-based Relation Representation with\n Multi-view Decoupling Learning","summary":" Recently, prompt-tuning with pre-trained language models (PLMs) has\ndemonstrated the significantly enhancing ability of relation extraction (RE)\ntasks. However, in low-resource scenarios, where the available training data is\nscarce, previous prompt-based methods may still perform poorly for prompt-based\nrepresentation learning due to a superficial understanding of the relation. To\nthis end, we highlight the importance of learning high-quality relation\nrepresentation in low-resource scenarios for RE, and propose a novel\nprompt-based relation representation method, named MVRE\n(\\underline{M}ulti-\\underline{V}iew \\underline{R}elation\n\\underline{E}xtraction), to better leverage the capacity of PLMs to improve the\nperformance of RE within the low-resource prompt-tuning paradigm. Specifically,\nMVRE decouples each relation into different perspectives to encompass\nmulti-view relation representations for maximizing the likelihood during\nrelation inference. Furthermore, we also design a Global-Local loss and a\nDynamic-Initialization method for better alignment of the multi-view\nrelation-representing virtual words, containing the semantics of relation\nlabels during the optimization learning process and initialization. 
Extensive\nexperiments on three benchmark datasets show that our method can achieve\nstate-of-the-art in low-resource settings.\n","authors":["Chenghao Fan","Wei Wei","Xiaoye Qu","Zhenyi Lu","Wenfeng Xie","Yu Cheng","Dangyang Chen"],"pdf_url":"https://arxiv.org/pdf/2312.17267v2.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.07526v1","updated":"2024-01-15T08:08:24Z","published":"2024-01-15T08:08:24Z","title":"Editing Arbitrary Propositions in LLMs without Subject Labels","summary":" Large Language Model (LLM) editing modifies factual information in LLMs.\nLocate-and-Edit (L\\&E) methods accomplish this by finding where relevant\ninformation is stored within the neural network, and editing the weights at\nthat location. The goal of editing is to modify the response of an LLM to a\nproposition independently of its phrasing, while not modifying its response to\nother related propositions. Existing methods are limited to binary\npropositions, which represent straightforward binary relations between a\nsubject and an object. Furthermore, existing methods rely on semantic subject\nlabels, which may not be available or even be well-defined in practice. In this\npaper, we show that both of these issues can be effectively skirted with a\nsimple and fast localization method called Gradient Tracing (GT). This\nlocalization method allows editing arbitrary propositions instead of just\nbinary ones, and does so without the need for subject labels. As propositions\nalways have a truth value, our experiments prompt an LLM as a boolean\nclassifier, and edit its T/F response to propositions. Our method applies GT\nfor location tracing, and then edit the model at that location using a mild\nvariant of Rank-One Model Editing (ROME). On datasets of binary propositions\nderived from the CounterFact dataset, we show that our method -- without access\nto subject labels -- performs close to state-of-the-art L\\&E methods which has\naccess subject labels. We then introduce a new dataset, Factual Accuracy\nClassification Test (FACT), which includes non-binary propositions and for\nwhich subject labels are not generally applicable, and therefore is beyond the\nscope of existing L\\&E methods. Nevertheless, we show that with our method\nediting is possible on FACT.\n","authors":["Itai Feigenbaum","Devansh Arpit","Huan Wang","Shelby Heinecke","Juan Carlos Niebles","Weiran Yao","Caiming Xiong","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2401.07526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07525v1","updated":"2024-01-15T07:57:58Z","published":"2024-01-15T07:57:58Z","title":"TAROT: A Hierarchical Framework with Multitask Co-Pretraining on\n Semi-Structured Data towards Effective Person-Job Fit","summary":" Person-job fit is an essential part of online recruitment platforms in\nserving various downstream applications like Job Search and Candidate\nRecommendation. Recently, pretrained large language models have further\nenhanced the effectiveness by leveraging richer textual information in user\nprofiles and job descriptions apart from user behavior features and job\nmetadata. However, the general domain-oriented design struggles to capture the\nunique structural information within user profiles and job descriptions,\nleading to a loss of latent semantic correlations. We propose TAROT, a\nhierarchical multitask co-pretraining framework, to better utilize structural\nand semantic information for informative text embeddings. 
TAROT targets\nsemi-structured text in profiles and jobs, and it is co-pretained with\nmulti-grained pretraining tasks to constrain the acquired semantic information\nat each level. Experiments on a real-world LinkedIn dataset show significant\nperformance improvements, proving its effectiveness in person-job fit tasks.\n","authors":["Yihan Cao","Xu Chen","Lun Du","Hao Chen","Qiang Fu","Shi Han","Yushu Du","Yanbin Kang","Guangming Lu","Zi Li"],"pdf_url":"https://arxiv.org/pdf/2401.07525v1.pdf","comment":"ICASSP 2024 camera ready. 5 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2401.07518v1","updated":"2024-01-15T07:48:42Z","published":"2024-01-15T07:48:42Z","title":"Survey of Natural Language Processing for Education: Taxonomy,\n Systematic Review, and Future Trends","summary":" Natural Language Processing (NLP) aims to analyze the text via techniques in\nthe computer science field. It serves the applications in healthcare, commerce,\nand education domains. Particularly, NLP has been applied to the education\ndomain to help teaching and learning. In this survey, we review recent advances\nin NLP with a focus on solving problems related to the education domain. In\ndetail, we begin with introducing the relevant background. Then, we present the\ntaxonomy of NLP in the education domain. Next, we illustrate the task\ndefinition, challenges, and corresponding techniques based on the above\ntaxonomy. After that, we showcase some off-the-shelf demonstrations in this\ndomain and conclude with future directions.\n","authors":["Yunshi Lan","Xinyuan Li","Hanyue Du","Xuesong Lu","Ming Gao","Weining Qian","Aoying Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.07518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07510v1","updated":"2024-01-15T07:21:16Z","published":"2024-01-15T07:21:16Z","title":"Developing ChatGPT for Biology and Medicine: A Complete Review of\n Biomedical Question Answering","summary":" ChatGPT explores a strategic blueprint of question answering (QA) in\ndelivering medical diagnosis, treatment recommendations, and other healthcare\nsupport. This is achieved through the increasing incorporation of medical\ndomain data via natural language processing (NLP) and multimodal paradigms. By\ntransitioning the distribution of text, images, videos, and other modalities\nfrom the general domain to the medical domain, these techniques have expedited\nthe progress of medical domain question answering (MDQA). They bridge the gap\nbetween human natural language and sophisticated medical domain knowledge or\nexpert manual annotations, handling large-scale, diverse, unbalanced, or even\nunlabeled data analysis scenarios in medical contexts. Central to our focus is\nthe utilizing of language models and multimodal paradigms for medical question\nanswering, aiming to guide the research community in selecting appropriate\nmechanisms for their specific medical research requirements. Specialized tasks\nsuch as unimodal-related question answering, reading comprehension, reasoning,\ndiagnosis, relation extraction, probability modeling, and others, as well as\nmultimodal-related tasks like vision question answering, image caption,\ncross-modal retrieval, report summarization, and generation, are discussed in\ndetail. Each section delves into the intricate specifics of the respective\nmethod under consideration. 
This paper highlights the structures and\nadvancements of medical domain explorations against general domain methods,\nemphasizing their applications across different tasks and datasets. It also\noutlines current challenges and opportunities for future medical domain\nresearch, paving the way for continued innovation and application in this\nrapidly evolving field.\n","authors":["Qing Li","Lei Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2401.07510v1.pdf","comment":"50 pages, 3 figures, Biophysics Reports"},{"id":"http://arxiv.org/abs/2401.06071v3","updated":"2024-01-15T06:29:17Z","published":"2024-01-11T17:41:57Z","title":"LEGO:Language Enhanced Multi-modal Grounding Model","summary":" Multi-modal large language models have demonstrated impressive performance\nacross various tasks in different modalities. However, existing multi-modal\nmodels primarily emphasize capturing global information within each modality\nwhile neglecting the importance of perceiving local information across\nmodalities. Consequently, these models lack the ability to effectively\nunderstand the fine-grained details of input data, limiting their performance\nin tasks that require a more nuanced understanding. To address this limitation,\nthere is a compelling need to develop models that enable fine-grained\nunderstanding across multiple modalities, thereby enhancing their applicability\nto a wide range of tasks. In this paper, we propose LEGO, a language enhanced\nmulti-modal grounding model. Beyond capturing global information like other\nmulti-modal models, our proposed model excels at tasks demanding a detailed\nunderstanding of local information within the input. It demonstrates precise\nidentification and localization of specific regions in images or moments in\nvideos. To achieve this objective, we design a diversified dataset construction\npipeline, resulting in a multi-modal, multi-granularity dataset for model\ntraining. The code, dataset, and demo of our model can be found at https:\n//github.com/lzw-lzw/LEGO.\n","authors":["Zhaowei Li","Qi Xu","Dong Zhang","Hang Song","Yiqing Cai","Qi Qi","Ran Zhou","Junting Pan","Zefeng Li","Van Tu Vu","Zhida Huang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.06071v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02330v2","updated":"2024-01-15T05:24:20Z","published":"2024-01-04T16:07:43Z","title":"LLaVA-Phi: Efficient Multi-Modal Assistant with Small Language Model","summary":" In this paper, we introduce LLaVA-$\\phi$ (LLaVA-Phi), an efficient\nmulti-modal assistant that harnesses the power of the recently advanced small\nlanguage model, Phi-2, to facilitate multi-modal dialogues. LLaVA-Phi marks a\nnotable advancement in the realm of compact multi-modal models. It demonstrates\nthat even smaller language models, with as few as 2.7B parameters, can\neffectively engage in intricate dialogues that integrate both textual and\nvisual elements, provided they are trained with high-quality corpora. Our model\ndelivers commendable performance on publicly available benchmarks that\nencompass visual comprehension, reasoning, and knowledge-based perception.\nBeyond its remarkable performance in multi-modal dialogue tasks, our model\nopens new avenues for applications in time-sensitive environments and systems\nthat require real-time interaction, such as embodied agents. 
It highlights the\npotential of smaller language models to achieve sophisticated levels of\nunderstanding and interaction, while maintaining greater resource\nefficiency.The project is available at {https://github.com/zhuyiche/llava-phi}.\n","authors":["Yichen Zhu","Minjie Zhu","Ning Liu","Zhicai Ou","Xiaofeng Mou","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.02330v2.pdf","comment":"technique report"},{"id":"http://arxiv.org/abs/2401.07475v1","updated":"2024-01-15T05:06:17Z","published":"2024-01-15T05:06:17Z","title":"GWPT: A Green Word-Embedding-based POS Tagger","summary":" As a fundamental tool for natural language processing (NLP), the\npart-of-speech (POS) tagger assigns the POS label to each word in a sentence. A\nnovel lightweight POS tagger based on word embeddings is proposed and named\nGWPT (green word-embedding-based POS tagger) in this work. Following the green\nlearning (GL) methodology, GWPT contains three modules in cascade: 1)\nrepresentation learning, 2) feature learning, and 3) decision learning modules.\nThe main novelty of GWPT lies in representation learning. It uses\nnon-contextual or contextual word embeddings, partitions embedding dimension\nindices into low-, medium-, and high-frequency sets, and represents them with\ndifferent N-grams. It is shown by experimental results that GWPT offers\nstate-of-the-art accuracies with fewer model parameters and significantly lower\ncomputational complexity in both training and inference as compared with\ndeep-learning-based methods.\n","authors":["Chengwei Wei","Runqi Pang","C. -C. Jay Kuo"],"pdf_url":"https://arxiv.org/pdf/2401.07475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07470v1","updated":"2024-01-15T04:58:50Z","published":"2024-01-15T04:58:50Z","title":"Utilizing deep learning models for the identification of enhancers and\n super-enhancers based on genomic and epigenomic features","summary":" This paper provides an extensive examination of a sizable dataset of English\ntweets focusing on nine widely recognized cryptocurrencies, specifically\nCardano, Binance, Bitcoin, Dogecoin, Ethereum, Fantom, Matic, Shiba, and\nRipple. Our primary objective was to conduct a psycholinguistic and emotion\nanalysis of social media content associated with these cryptocurrencies. To\nenable investigators to make more informed decisions. The study involved\ncomparing linguistic characteristics across the diverse digital coins, shedding\nlight on the distinctive linguistic patterns that emerge within each coin's\ncommunity. To achieve this, we utilized advanced text analysis techniques.\nAdditionally, our work unveiled an intriguing Understanding of the interplay\nbetween these digital assets within the cryptocurrency community. By examining\nwhich coin pairs are mentioned together most frequently in the dataset, we\nestablished correlations between different cryptocurrencies. To ensure the\nreliability of our findings, we initially gathered a total of 832,559 tweets\nfrom Twitter. 
These tweets underwent a rigorous preprocessing stage, resulting\nin a refined dataset of 115,899 tweets that were used for our analysis.\nOverall, our research offers valuable Perception into the linguistic nuances of\nvarious digital coins' online communities and provides a deeper understanding\nof their interactions in the cryptocurrency space.\n","authors":["Zahra Ahani","Moein Shahiki Tash","Yoel Ledo Mezquita","Jason Angel"],"pdf_url":"https://arxiv.org/pdf/2401.07470v1.pdf","comment":"13 pages, 7 figures, 6 Tables"},{"id":"http://arxiv.org/abs/2311.16522v2","updated":"2024-01-15T04:50:47Z","published":"2023-11-28T05:00:27Z","title":"Dynamic Fault Characteristics Evaluation in Power Grid","summary":" To enhance the intelligence degree in operation and maintenance, a novel\nmethod for fault detection in power grids is proposed. The proposed GNN-based\napproach first identifies fault nodes through a specialized feature extraction\nmethod coupled with a knowledge graph. By incorporating temporal data, the\nmethod leverages the status of nodes from preceding and subsequent time periods\nto help current fault detection. To validate the effectiveness of the node\nfeatures, a correlation analysis of the output features from each node was\nconducted. The results from experiments show that this method can accurately\nlocate fault nodes in simulation scenarios with a remarkable accuracy.\nAdditionally, the graph neural network based feature modeling allows for a\nqualitative examination of how faults spread across nodes, which provides\nvaluable insights for analyzing fault nodes.\n","authors":["Hao Pei","Si Lin","Chuanfu Li","Che Wang","Haoming Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.16522v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08279v2","updated":"2024-01-15T04:22:59Z","published":"2023-10-12T12:31:23Z","title":"Can Text-based Knowledge Graph Completion Benefit From Zero-Shot Large\n Language Models?","summary":" Text-based knowledge graph completion (KGC) methods, leveraging textual\nentity descriptions are at the research forefront. The efficacy of these models\nhinges on the quality of the textual data. This study explores whether enriched\nor more efficient textual descriptions can amplify model performance. Recently,\nLarge Language Models (LLMs) have shown remarkable improvements in NLP tasks,\nattributed to their sophisticated text generation and conversational\ncapabilities. LLMs assimilate linguistic patterns and integrate knowledge from\ntheir training data. Compared to traditional databases like Wikipedia, LLMs\nprovide several advantages, facilitating broader information querying and\ncontent augmentation. We hypothesize that LLMs, without fine-tuning, can refine\nentity descriptions, serving as an auxiliary knowledge source. An in-depth\nanalysis was conducted to verify this hypothesis. We found that (1) without\nfine-tuning, LLMs have the capability to further improve the quality of entity\ntext descriptions. We validated this through experiments on the FB15K-237 and\nWN18RR datasets. (2) LLMs exhibit text generation hallucination issues and\nselectively output words with multiple meanings. This was mitigated by\ncontextualizing prompts to constrain LLM outputs. (3) Larger model sizes do not\nnecessarily guarantee better performance; even the 7B model can achieve\noptimized results in this comparative task. 
These findings underscore the\nuntapped potential of large models in text-based KGC, which is a promising\ndirection for further research in KGC. The code and datasets are accessible at\n\\href{https://github.com/sjlmg/CP-KGC}.\n","authors":["Rui Yang","Li Fang","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.08279v2.pdf","comment":"new versionv"},{"id":"http://arxiv.org/abs/2401.07456v1","updated":"2024-01-15T04:04:26Z","published":"2024-01-15T04:04:26Z","title":"Only Send What You Need: Learning to Communicate Efficiently in\n Federated Multilingual Machine Translation","summary":" Federated learning (FL) is a promising approach for solving multilingual\ntasks, potentially enabling clients with their own language-specific data to\ncollaboratively construct a high-quality neural machine translation (NMT)\nmodel. However, communication constraints in practical network systems present\nchallenges for exchanging large-scale NMT engines between FL parties. In this\npaper, we propose a meta-learning-based adaptive parameter selection\nmethodology, MetaSend, that improves the communication efficiency of model\ntransmissions from clients during FL-based multilingual NMT training. Our\napproach learns a dynamic threshold for filtering parameters prior to\ntransmission without compromising the NMT model quality, based on the tensor\ndeviations of clients between different FL rounds. Through experiments on two\nNMT datasets with different language distributions, we demonstrate that\nMetaSend obtains substantial improvements over baselines in translation quality\nin the presence of a limited communication budget.\n","authors":["Yun-Wei Chu","Dong-Jun Han","Christopher G. Brinton"],"pdf_url":"https://arxiv.org/pdf/2401.07456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07453v1","updated":"2024-01-15T03:57:15Z","published":"2024-01-15T03:57:15Z","title":"Model Editing at Scale leads to Gradual and Catastrophic Forgetting","summary":" Editing knowledge in large language models is an attractive capability to\nhave which allows us to correct incorrectly learnt facts during pre-training,\nas well as update the model with an ever-growing list of new facts. While\nexisting model editing techniques have shown promise, they are usually\nevaluated using metrics for reliability, specificity and generalization over\none or few edits. We argue that for model editing to have practical utility, we\nmust be able to make multiple edits to the same model. With this in mind, we\nevaluate the current model editing methods at scale, focusing on two state of\nthe art methods: ROME and MEMIT. We find that as the model is edited\nsequentially with multiple facts, it continually forgets previously edited\nfacts and the ability to perform downstream tasks. This forgetting happens in\ntwo phases -- an initial gradual but progressive forgetting phase followed by\nabrupt or catastrophic forgetting phase. Both gradual and catastrophic\nforgetting limit the usefulness of model editing methods at scale -- the former\nmaking model editing less effective as multiple edits are made to the model\nwhile the latter caps the scalability of such model editing methods. Our\nanalysis also highlights other key limitations of ROME and MEMIT at scale. 
With\nour work, we push for the development and evaluation of model editing methods\nkeeping scalability in mind.\n","authors":["Akshat Gupta","Anurag Rao","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2401.07453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07447v1","updated":"2024-01-15T03:23:24Z","published":"2024-01-15T03:23:24Z","title":"Taec: a Manually annotated text dataset for trait and phenotype\n extraction and entity linking in wheat breeding literature","summary":" Wheat varieties show a large diversity of traits and phenotypes. Linking them\nto genetic variability is essential for shorter and more efficient wheat\nbreeding programs. Newly desirable wheat variety traits include disease\nresistance to reduce pesticide use, adaptation to climate change, resistance to\nheat and drought stresses, or low gluten content of grains. Wheat breeding\nexperiments are documented by a large body of scientific literature and\nobservational data obtained in-field and under controlled conditions. The\ncross-referencing of complementary information from the literature and\nobservational data is essential to the study of the genotype-phenotype\nrelationship and to the improvement of wheat selection. The scientific\nliterature on genetic marker-assisted selection describes much information\nabout the genotype-phenotype relationship. However, the variety of expressions\nused to refer to traits and phenotype values in scientific articles is a hinder\nto finding information and cross-referencing it. When trained adequately by\nannotated examples, recent text mining methods perform highly in named entity\nrecognition and linking in the scientific domain. While several corpora contain\nannotations of human and animal phenotypes, currently, no corpus is available\nfor training and evaluating named entity recognition and entity-linking methods\nin plant phenotype literature. The Triticum aestivum trait Corpus is a new gold\nstandard for traits and phenotypes of wheat. It consists of 540 PubMed\nreferences fully annotated for trait, phenotype, and species named entities\nusing the Wheat Trait and Phenotype Ontology and the species taxonomy of the\nNational Center for Biotechnology Information. A study of the performance of\ntools trained on the Triticum aestivum trait Corpus shows that the corpus is\nsuitable for the training and evaluation of named entity recognition and\nlinking.\n","authors":["Claire Nédellec","Clara Sauvion","Robert Bossy","Mariya Borovikova","Louise Deléger"],"pdf_url":"https://arxiv.org/pdf/2401.07447v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2312.14033v3","updated":"2024-01-15T03:18:25Z","published":"2023-12-21T17:02:06Z","title":"T-Eval: Evaluating the Tool Utilization Capability of Large Language\n Models Step by Step","summary":" Large language models (LLM) have achieved remarkable performance on various\nNLP tasks and are augmented by tools for broader applications. Yet, how to\nevaluate and analyze the tool-utilization capability of LLMs is still\nunder-explored. In contrast to previous works that evaluate models\nholistically, we comprehensively decompose the tool utilization into multiple\nsub-processes, including instruction following, planning, reasoning, retrieval,\nunderstanding, and review. Based on that, we further introduce T-Eval to\nevaluate the tool utilization capability step by step. 
T-Eval disentangles the\ntool utilization evaluation into several sub-domains along model capabilities,\nfacilitating the inner understanding of both holistic and isolated competency\nof LLMs. We conduct extensive experiments on T-Eval and in-depth analysis of\nvarious LLMs. T-Eval not only exhibits consistency with the outcome-oriented\nevaluation but also provides a more fine-grained analysis of the capabilities\nof LLMs, providing a new perspective in LLM evaluation on tool-utilization\nability. The benchmark will be available at\nhttps://github.com/open-compass/T-Eval.\n","authors":["Zehui Chen","Weihua Du","Wenwei Zhang","Kuikun Liu","Jiangning Liu","Miao Zheng","Jingming Zhuo","Songyang Zhang","Dahua Lin","Kai Chen","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.14033v3.pdf","comment":"Project: https://open-compass.github.io/T-Eval"},{"id":"http://arxiv.org/abs/2310.12798v3","updated":"2024-01-15T03:02:01Z","published":"2023-10-19T14:52:58Z","title":"MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and\n Uni-Modal Adapter","summary":" Language Models (LMs) have demonstrated impressive molecule understanding\nability on various 1D text-related tasks. However, they inherently lack 2D\ngraph perception - a critical ability of human professionals in comprehending\nmolecules' topological structures. To bridge this gap, we propose MolCA:\nMolecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal\nAdapter. MolCA enables an LM (e.g., Galactica) to understand both text- and\ngraph-based molecular contents via the cross-modal projector. Specifically, the\ncross-modal projector is implemented as a Q-Former to connect a graph encoder's\nrepresentation space and an LM's text space. Further, MolCA employs a uni-modal\nadapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks.\nUnlike previous studies that couple an LM with a graph encoder via cross-modal\ncontrastive learning, MolCA retains the LM's ability of open-ended text\ngeneration and augments it with 2D graph information. To showcase its\neffectiveness, we extensively benchmark MolCA on tasks of molecule captioning,\nIUPAC name prediction, and molecule-text retrieval, on which MolCA\nsignificantly outperforms the baselines. Our codes and checkpoints can be found\nat https://github.com/acharkq/MolCA.\n","authors":["Zhiyuan Liu","Sihang Li","Yanchen Luo","Hao Fei","Yixin Cao","Kenji Kawaguchi","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.12798v3.pdf","comment":"EMNLP main conference. 9 pages"},{"id":"http://arxiv.org/abs/2401.07441v1","updated":"2024-01-15T03:00:39Z","published":"2024-01-15T03:00:39Z","title":"Stability Analysis of ChatGPT-based Sentiment Analysis in AI Quality\n Assurance","summary":" In the era of large AI models, the complex architecture and vast parameters\npresent substantial challenges for effective AI quality management (AIQM), e.g.\nlarge language model (LLM). This paper focuses on investigating the quality\nassurance of a specific LLM-based AI product--a ChatGPT-based sentiment\nanalysis system. The study delves into stability issues related to both the\noperation and robustness of the expansive AI model on which ChatGPT is based.\nExperimental analysis is conducted using benchmark datasets for sentiment\nanalysis. The results reveal that the constructed ChatGPT-based sentiment\nanalysis system exhibits uncertainty, which is attributed to various\noperational factors. 
It demonstrated that the system also exhibits stability\nissues in handling conventional small text attacks involving robustness.\n","authors":["Tinghui Ouyang","AprilPyone MaungMaung","Koichi Konishi","Yoshiki Seo","Isao Echizen"],"pdf_url":"https://arxiv.org/pdf/2401.07441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07414v1","updated":"2024-01-15T01:40:39Z","published":"2024-01-15T01:40:39Z","title":"Leveraging the power of transformers for guilt detection in text","summary":" In recent years, language models and deep learning techniques have\nrevolutionized natural language processing tasks, including emotion detection.\nHowever, the specific emotion of guilt has received limited attention in this\nfield. In this research, we explore the applicability of three\ntransformer-based language models for detecting guilt in text and compare their\nperformance for general emotion detection and guilt detection. Our proposed\nmodel outformed BERT and RoBERTa models by two and one points respectively.\nAdditionally, we analyze the challenges in developing accurate guilt-detection\nmodels and evaluate our model's effectiveness in detecting related emotions\nlike \"shame\" through qualitative analysis of results.\n","authors":["Abdul Gafar Manuel Meque","Jason Angel","Grigori Sidorov","Alexander Gelbukh"],"pdf_url":"https://arxiv.org/pdf/2401.07414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13708v2","updated":"2024-01-15T00:14:58Z","published":"2023-11-22T21:59:46Z","title":"Dynamic Fault Analysis in Substations Based on Knowledge Graphs","summary":" To address the challenge of identifying hidden danger in substations from\nunstructured text, a novel dynamic analysis method is proposed. We first\nextract relevant information from the unstructured text, and then leverages a\nflexible distributed search engine built on Elastic-Search to handle the data.\nFollowing this, the hidden Markov model is employed to train the data within\nthe engine. The Viterbi algorithm is integrated to decipher the hidden state\nsequences, facilitating the segmentation and labeling of entities related to\nhidden dangers. The final step involves using the Neo4j graph database to\ndynamically create a knowledge graph that visualizes hidden dangers in the\nsubstation. The effectiveness of the proposed method is demonstrated through a\ncase analysis from a specific substation with hidden dangers revealed in the\ntext records.\n","authors":["Weiwei Li","Xing Liu","Wei Wang","Lu Chen","Sizhe Li","Hui Fan"],"pdf_url":"https://arxiv.org/pdf/2311.13708v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.08014v1","updated":"2024-01-15T23:52:35Z","published":"2024-01-15T23:52:35Z","title":"Convolutional Neural Network Compression via Dynamic Parameter Rank\n Pruning","summary":" While Convolutional Neural Networks (CNNs) excel at learning complex\nlatent-space representations, their over-parameterization can lead to\noverfitting and reduced performance, particularly with limited data. This,\nalongside their high computational and memory demands, limits the applicability\nof CNNs for edge deployment. Low-rank matrix approximation has emerged as a\npromising approach to reduce CNN parameters, but its application presents\nchallenges including rank selection and performance loss. To address these\nissues, we propose an efficient training method for CNN compression via dynamic\nparameter rank pruning. 
Our approach integrates efficient matrix factorization\nand novel regularization techniques, forming a robust framework for dynamic\nrank reduction and model compression. We use Singular Value Decomposition (SVD)\nto model low-rank convolutional filters and dense weight matrices and we\nachieve model compression by training the SVD factors with back-propagation in\nan end-to-end way. We evaluate our method on an array of modern CNNs, including\nResNet-18, ResNet-20, and ResNet-32, and datasets like CIFAR-10, CIFAR-100, and\nImageNet (2012), showcasing its applicability in computer vision. Our\nexperiments show that the proposed method can yield substantial storage savings\nwhile maintaining or even enhancing classification performance.\n","authors":["Manish Sharma","Jamison Heard","Eli Saber","Panos P. Markopoulos"],"pdf_url":"https://arxiv.org/pdf/2401.08014v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2301.05246v3","updated":"2024-01-15T23:44:36Z","published":"2023-01-12T19:00:27Z","title":"Online Class-Incremental Learning For Real-World Food Image\n Classification","summary":" Food image classification is essential for monitoring health and tracking\ndietary in image-based dietary assessment methods. However, conventional\nsystems often rely on static datasets with fixed classes and uniform\ndistribution. In contrast, real-world food consumption patterns, shaped by\ncultural, economic, and personal influences, involve dynamic and evolving data.\nThus, require the classification system to cope with continuously evolving\ndata. Online Class Incremental Learning (OCIL) addresses the challenge of\nlearning continuously from a single-pass data stream while adapting to the new\nknowledge and reducing catastrophic forgetting. Experience Replay (ER) based\nOCIL methods store a small portion of previous data and have shown encouraging\nperformance. However, most existing OCIL works assume that the distribution of\nencountered data is perfectly balanced, which rarely happens in real-world\nscenarios. In this work, we explore OCIL for real-world food image\nclassification by first introducing a probabilistic framework to simulate\nrealistic food consumption scenarios. Subsequently, we present an attachable\nDynamic Model Update (DMU) module designed for existing ER methods, which\nenables the selection of relevant images for model training, addressing\nchallenges arising from data repetition and imbalanced sample occurrences\ninherent in realistic food consumption patterns within the OCIL framework. Our\nperformance evaluation demonstrates significant enhancements compared to\nestablished ER methods, showing great potential for lifelong learning in\nreal-world food image classification scenarios. The code of our method is\npublicly accessible at\nhttps://gitlab.com/viper-purdue/OCIL-real-world-food-image-classification\n","authors":["Siddeshwar Raghavan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.05246v3.pdf","comment":"Accepted at IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV 2024)"},{"id":"http://arxiv.org/abs/2401.08003v1","updated":"2024-01-15T23:10:50Z","published":"2024-01-15T23:10:50Z","title":"Jewelry Recognition via Encoder-Decoder Models","summary":" Jewelry recognition is a complex task due to the different styles and designs\nof accessories. Precise descriptions of the various accessories is something\nthat today can only be achieved by experts in the field of jewelry. 
In this\nwork, we propose an approach for jewelry recognition using computer vision\ntechniques and image captioning, trying to simulate this expert human behavior\nof analyzing accessories. The proposed methodology consist on using different\nimage captioning models to detect the jewels from an image and generate a\nnatural language description of the accessory. Then, this description is also\nutilized to classify the accessories at different levels of detail. The\ngenerated caption includes details such as the type of jewel, color, material,\nand design. To demonstrate the effectiveness of the proposed method in\naccurately recognizing different types of jewels, a dataset consisting of\nimages of accessories belonging to jewelry stores in C\\'ordoba (Spain) has been\ncreated. After testing the different image captioning architectures designed,\nthe final model achieves a captioning accuracy of 95\\%. The proposed\nmethodology has the potential to be used in various applications such as\njewelry e-commerce, inventory management or automatic jewels recognition to\nanalyze people's tastes and social status.\n","authors":["José M. Alcalde-Llergo","Enrique Yeguas-Bolívar","Andrea Zingoni","Alejandro Fuerte-Jurado"],"pdf_url":"https://arxiv.org/pdf/2401.08003v1.pdf","comment":"6 pages, 5 figures, MetroXRAINE 2023 Conference"},{"id":"http://arxiv.org/abs/2401.07990v1","updated":"2024-01-15T22:29:23Z","published":"2024-01-15T22:29:23Z","title":"How does self-supervised pretraining improve robustness against noisy\n labels across various medical image classification datasets?","summary":" Noisy labels can significantly impact medical image classification,\nparticularly in deep learning, by corrupting learned features. Self-supervised\npretraining, which doesn't rely on labeled data, can enhance robustness against\nnoisy labels. However, this robustness varies based on factors like the number\nof classes, dataset complexity, and training size. In medical images, subtle\ninter-class differences and modality-specific characteristics add complexity.\nPrevious research hasn't comprehensively explored the interplay between\nself-supervised learning and robustness against noisy labels in medical image\nclassification, considering all these factors. In this study, we address three\nkey questions: i) How does label noise impact various medical image\nclassification datasets? ii) Which types of medical image datasets are more\nchallenging to learn and more affected by label noise? iii) How do different\nself-supervised pretraining methods enhance robustness across various medical\nimage datasets? Our results show that DermNet, among five datasets (Fetal\nplane, DermNet, COVID-DU-Ex, MURA, NCT-CRC-HE-100K), is the most challenging\nbut exhibits greater robustness against noisy labels. Additionally, contrastive\nlearning stands out among the eight self-supervised methods as the most\neffective approach to enhance robustness against noisy labels.\n","authors":["Bidur Khanal","Binod Bhattarai","Bishesh Khanal","Cristian Linte"],"pdf_url":"https://arxiv.org/pdf/2401.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07899v2","updated":"2024-01-15T21:22:46Z","published":"2023-12-13T05:08:32Z","title":"Morphological Profiling for Drug Discovery in the Era of Deep Learning","summary":" Morphological profiling is a valuable tool in phenotypic drug discovery. 
The\nadvent of high-throughput automated imaging has enabled the capturing of a wide\nrange of morphological features of cells or organisms in response to\nperturbations at the single-cell resolution. Concurrently, significant advances\nin machine learning and deep learning, especially in computer vision, have led\nto substantial improvements in analyzing large-scale high-content images at\nhigh-throughput. These efforts have facilitated understanding of compound\nmechanism-of-action (MOA), drug repurposing, characterization of cell\nmorphodynamics under perturbation, and ultimately contributing to the\ndevelopment of novel therapeutics. In this review, we provide a comprehensive\noverview of the recent advances in the field of morphological profiling. We\nsummarize the image profiling analysis workflow, survey a broad spectrum of\nanalysis strategies encompassing feature engineering- and deep learning-based\napproaches, and introduce publicly available benchmark datasets. We place a\nparticular emphasis on the application of deep learning in this pipeline,\ncovering cell segmentation, image representation learning, and multimodal\nlearning. Additionally, we illuminate the application of morphological\nprofiling in phenotypic drug discovery and highlight potential challenges and\nopportunities in this field.\n","authors":["Qiaosi Tang","Ranjala Ratnayake","Gustavo Seabra","Zhe Jiang","Ruogu Fang","Lina Cui","Yousong Ding","Tamer Kahveci","Jiang Bian","Chenglong Li","Hendrik Luesch","Yanjun Li"],"pdf_url":"https://arxiv.org/pdf/2312.07899v2.pdf","comment":"44 pages, 5 figure, 5 tables"},{"id":"http://arxiv.org/abs/2309.16672v2","updated":"2024-01-15T21:13:58Z","published":"2023-09-28T17:59:58Z","title":"Learning to Transform for Generalizable Instance-wise Invariance","summary":" Computer vision research has long aimed to build systems that are robust to\nspatial transformations found in natural data. Traditionally, this is done\nusing data augmentation or hard-coding invariances into the architecture.\nHowever, too much or too little invariance can hurt, and the correct amount is\nunknown a priori and dependent on the instance. Ideally, the appropriate\ninvariance would be learned from data and inferred at test-time.\n We treat invariance as a prediction problem. Given any image, we use a\nnormalizing flow to predict a distribution over transformations and average the\npredictions over them. Since this distribution only depends on the instance, we\ncan align instances before classifying them and generalize invariance across\nclasses. The same distribution can also be used to adapt to out-of-distribution\nposes. This normalizing flow is trained end-to-end and can learn a much larger\nrange of transformations than Augerino and InstaAug. When used as data\naugmentation, our method shows accuracy and robustness gains on CIFAR 10,\nCIFAR10-LT, and TinyImageNet.\n","authors":["Utkarsh Singhal","Carlos Esteves","Ameesh Makadia","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16672v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2401.07962v1","updated":"2024-01-15T20:57:59Z","published":"2024-01-15T20:57:59Z","title":"Cesium Tiles for High-realism Simulation and Comparing SLAM Results in\n Corresponding Virtual and Real-world Environments","summary":" This article discusses the use of a simulated environment to predict\nalgorithm results in the real world. 
Simulators are crucial in allowing\nresearchers to test algorithms, sensor integration, and navigation systems\nwithout deploying expensive hardware. This article examines how the AirSim\nsimulator, Unreal Engine, and Cesium plugin can be used to generate simulated\ndigital twin models of real-world locations. Several technical challenges in\ncompleting the analysis are discussed and the technical solutions are detailed\nin this article. Work investigates how to assess mapping results for a\nreal-life experiment using Cesium Tiles provided by digital twins of the\nexperimental location. This is accompanied by a description of a process for\nduplicating real-world flights in simulation. The performance of these methods\nis evaluated by analyzing real-life and experimental image telemetry with the\nDirect Sparse Odometry (DSO) mapping algorithm. Results indicate that Cesium\nTiles environments can provide highly accurate models of ground truth geometry\nafter careful alignment. Further, results from real-life and simulated\ntelemetry analysis indicate that the virtual simulation results accurately\npredict real-life results. Findings indicate that the algorithm results in real\nlife and in the simulated duplicate exhibited a high degree of similarity. This\nindicates that the use of Cesium Tiles environments as a virtual digital twin\nfor real-life experiments will provide representative results for such\nalgorithms. The impact of this can be significant, potentially allowing\nexpansive virtual testing of robotic systems at specific deployment locations\nto develop solutions that are tailored to the environment and potentially\noutperforming solutions meant to work in completely generic environments.\n","authors":["Chris Beam","Jincheng Zhang","Nicholas Kakavitsas","Collin Hague","Artur Wolek","Andrew Willis"],"pdf_url":"https://arxiv.org/pdf/2401.07962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07958v1","updated":"2024-01-15T20:54:20Z","published":"2024-01-15T20:54:20Z","title":"GD-CAF: Graph Dual-stream Convolutional Attention Fusion for\n Precipitation Nowcasting","summary":" Accurate precipitation nowcasting is essential for various purposes,\nincluding flood prediction, disaster management, optimizing agricultural\nactivities, managing transportation routes and renewable energy. While several\nstudies have addressed this challenging task from a sequence-to-sequence\nperspective, most of them have focused on a single area without considering the\nexisting correlation between multiple disjoint regions. In this paper, we\nformulate precipitation nowcasting as a spatiotemporal graph sequence\nnowcasting problem. In particular, we introduce Graph Dual-stream Convolutional\nAttention Fusion (GD-CAF), a novel approach designed to learn from historical\nspatiotemporal graph of precipitation maps and nowcast future time step ahead\nprecipitation at different spatial locations. GD-CAF consists of\nspatio-temporal convolutional attention as well as gated fusion modules which\nare equipped with depthwise-separable convolutional operations. This\nenhancement enables the model to directly process the high-dimensional\nspatiotemporal graph of precipitation maps and exploits higher-order\ncorrelations between the data dimensions. We evaluate our model on seven years\nof precipitation maps across Europe and its neighboring areas collected from\nthe ERA5 dataset, provided by Copernicus. 
The model receives a fully connected\ngraph in which each node represents historical observations from a specific\nregion on the map. Consequently, each node contains a 3D tensor with time,\nheight, and width dimensions. Experimental results demonstrate that the\nproposed GD-CAF model outperforms the other examined models. Furthermore, the\naveraged seasonal spatial and temporal attention scores over the test set are\nvisualized to provide additional insights about the strongest connections\nbetween different regions or time steps. These visualizations shed light on the\ndecision-making process of our model.\n","authors":["Lorand Vatamany","Siamak Mehrkanoon"],"pdf_url":"https://arxiv.org/pdf/2401.07958v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.07957v1","updated":"2024-01-15T20:47:24Z","published":"2024-01-15T20:47:24Z","title":"Machine Perceptual Quality: Evaluating the Impact of Severe Lossy\n Compression on Audio and Image Models","summary":" In the field of neural data compression, the prevailing focus has been on\noptimizing algorithms for either classical distortion metrics, such as PSNR or\nSSIM, or human perceptual quality. With increasing amounts of data consumed by\nmachines rather than humans, a new paradigm of machine-oriented\ncompression$\\unicode{x2013}$which prioritizes the retention of features salient\nfor machine perception over traditional human-centric\ncriteria$\\unicode{x2013}$has emerged, creating several new challenges to the\ndevelopment, evaluation, and deployment of systems utilizing lossy compression.\nIn particular, it is unclear how different approaches to lossy compression will\naffect the performance of downstream machine perception tasks. To address this\nunder-explored area, we evaluate various perception\nmodels$\\unicode{x2013}$including image classification, image segmentation,\nspeech recognition, and music source separation$\\unicode{x2013}$under severe\nlossy compression. We utilize several popular codecs spanning conventional,\nneural, and generative compression architectures. Our results indicate three\nkey findings: (1) using generative compression, it is feasible to leverage\nhighly compressed data while incurring a negligible impact on machine\nperceptual quality; (2) machine perceptual quality correlates strongly with\ndeep similarity metrics, indicating a crucial role of these metrics in the\ndevelopment of machine-oriented codecs; and (3) using lossy compressed\ndatasets, (e.g. ImageNet) for pre-training can lead to counter-intuitive\nscenarios where lossy compression increases machine perceptual quality rather\nthan degrading it. To encourage engagement on this growing area of research,\nour code and experiments are available at:\nhttps://github.com/danjacobellis/MPQ.\n","authors":["Dan Jacobellis","Daniel Cummings","Neeraja J. Yadwadkar"],"pdf_url":"https://arxiv.org/pdf/2401.07957v1.pdf","comment":"10 pages; abridged version published in IEEE Data Compression\n Conference 2024"},{"id":"http://arxiv.org/abs/2401.07951v1","updated":"2024-01-15T20:23:05Z","published":"2024-01-15T20:23:05Z","title":"Image Similarity using An Ensemble of Context-Sensitive Models","summary":" Image similarity has been extensively studied in computer vision. In recently\nyears, machine-learned models have shown their ability to encode more semantics\nthan traditional multivariate metrics. 
However, in labelling similarity,\nassigning a numerical score to a pair of images is less intuitive than\ndetermining if an image A is closer to a reference image R than another image\nB. In this work, we present a novel approach for building an image similarity\nmodel based on labelled data in the form of A:R vs B:R. We address the\nchallenges of sparse sampling in the image space (R, A, B) and biases in the\nmodels trained with context-based data by using an ensemble model. In\nparticular, we employed two ML techniques to construct such an ensemble model,\nnamely dimensionality reduction and MLP regressors. Our testing results show\nthat the ensemble model constructed performs ~5% better than the best\nindividual context-sensitive models. They also performed better than the model\ntrained with mixed imagery data as well as existing similarity models, e.g.,\nCLIP and DINO. This work demonstrate that context-based labelling and model\ntraining can be effective when an appropriate ensemble approach is used to\nalleviate the limitation due to sparse sampling.\n","authors":["Zukang Liao","Min Chen"],"pdf_url":"https://arxiv.org/pdf/2401.07951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07942v1","updated":"2024-01-15T20:09:56Z","published":"2024-01-15T20:09:56Z","title":"Transformer-based Video Saliency Prediction with High Temporal Dimension\n Decoding","summary":" In recent years, finding an effective and efficient strategy for exploiting\nspatial and temporal information has been a hot research topic in video\nsaliency prediction (VSP). With the emergence of spatio-temporal transformers,\nthe weakness of the prior strategies, e.g., 3D convolutional networks and\nLSTM-based networks, for capturing long-range dependencies has been effectively\ncompensated. While VSP has drawn benefits from spatio-temporal transformers,\nfinding the most effective way for aggregating temporal features is still\nchallenging. To address this concern, we propose a transformer-based video\nsaliency prediction approach with high temporal dimension decoding network\n(THTD-Net). This strategy accounts for the lack of complex hierarchical\ninteractions between features that are extracted from the transformer-based\nspatio-temporal encoder: in particular, it does not require multiple decoders\nand aims at gradually reducing temporal features' dimensions in the decoder.\nThis decoder-based architecture yields comparable performance to multi-branch\nand over-complicated models on common benchmarks such as DHF1K, UCF-sports and\nHollywood-2.\n","authors":["Morteza Moradi","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2401.07942v1.pdf","comment":"8 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2401.07931v1","updated":"2024-01-15T19:47:14Z","published":"2024-01-15T19:47:14Z","title":"Vertical Federated Image Segmentation","summary":" With the popularization of AI solutions for image based problems, there has\nbeen a growing concern for both data privacy and acquisition. In a large number\nof cases, information is located on separate data silos and it can be difficult\nfor a developer to consolidate all of it in a fashion that is appropriate for\nmachine learning model development. Alongside this, a portion of these\nlocalized data regions may not have access to a labelled ground truth. This\nindicates that they have the capacity to reach conclusions numerically, but are\nnot able to assign classifications amid a lack of pertinent information. 
Such a\ndetermination is often negligible, especially when attempting to develop image\nbased solutions that often necessitate this capability. With this being the\ncase, we propose an innovative vertical federated learning (VFL) model\narchitecture that can operate under this common set of conditions. This is the\nfirst (and currently the only) implementation of a system that can work under\nthe constraints of a VFL environment and perform image segmentation while\nmaintaining nominal accuracies. We achieved this by utilizing an FCN that\nboasts the ability to operate on federates that lack labelled data and\nprivately share the respective weights with a central server, that of which\nhosts the necessary features for classification. Tests were conducted on the\nCamVid dataset in order to determine the impact of heavy feature compression\nrequired for the transfer of information between federates, as well as to reach\nnominal conclusions about the overall performance metrics when working under\nsuch constraints.\n","authors":["Paul K. Mandal","Cole Leo"],"pdf_url":"https://arxiv.org/pdf/2401.07931v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.07929v1","updated":"2024-01-15T19:46:05Z","published":"2024-01-15T19:46:05Z","title":"Machine Learning Based Object Tracking","summary":" Machine learning based object detection as well as tracking that object have\nbeen performed in this paper. The authors were able to set a range of interest\n(ROI) around an object using Open Computer Vision, better known as OpenCV. Next\na tracking algorithm has been used to maintain tracking on an object while\nsimultaneously operating two servo motors to keep the object centered in the\nframe. Detailed procedure and code are included in this paper.\n","authors":["Md Rakibul Karim Akanda","Joshua Reynolds","Treylin Jackson","Milijah Gray"],"pdf_url":"https://arxiv.org/pdf/2401.07929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07856v1","updated":"2024-01-15T17:37:27Z","published":"2024-01-15T17:37:27Z","title":"Information hiding cameras: optical concealment of object information\n into ordinary images","summary":" Data protection methods like cryptography, despite being effective,\ninadvertently signal the presence of secret communication, thereby drawing\nundue attention. Here, we introduce an optical information hiding camera\nintegrated with an electronic decoder, optimized jointly through deep learning.\nThis information hiding-decoding system employs a diffractive optical processor\nas its front-end, which transforms and hides input images in the form of\nordinary-looking patterns that deceive/mislead human observers. This\ninformation hiding transformation is valid for infinitely many combinations of\nsecret messages, all of which are transformed into ordinary-looking output\npatterns, achieved all-optically through passive light-matter interactions\nwithin the optical processor. By processing these ordinary-looking output\nimages, a jointly-trained electronic decoder neural network accurately\nreconstructs the original information hidden within the deceptive output\npattern. We numerically demonstrated our approach by designing an information\nhiding diffractive camera along with a jointly-optimized convolutional decoder\nneural network. The efficacy of this system was demonstrated under various\nlighting conditions and noise levels, showing its robustness. 
We further\nextended this information hiding camera to multi-spectral operation, allowing\nthe concealment and decoding of multiple images at different wavelengths, all\nperformed simultaneously in a single feed-forward operation. The feasibility of\nour framework was also demonstrated experimentally using THz radiation. This\noptical encoder-electronic decoder-based co-design provides a novel information\nhiding camera interface that is both high-speed and energy-efficient, offering\nan intriguing solution for visual information security.\n","authors":["Bijie Bai","Ryan Lee","Yuhang Li","Tianyi Gan","Yuntian Wang","Mona Jarrahi","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2401.07856v1.pdf","comment":"26 Pages, 8 Figures"},{"id":"http://arxiv.org/abs/2401.07854v1","updated":"2024-01-15T17:28:58Z","published":"2024-01-15T17:28:58Z","title":"$M^{2}$Fusion: Bayesian-based Multimodal Multi-level Fusion on\n Colorectal Cancer Microsatellite Instability Prediction","summary":" Colorectal cancer (CRC) micro-satellite instability (MSI) prediction on\nhistopathology images is a challenging weakly supervised learning task that\ninvolves multi-instance learning on gigapixel images. To date, radiology images\nhave proven to have CRC MSI information and efficient patient imaging\ntechniques. Different data modalities integration offers the opportunity to\nincrease the accuracy and robustness of MSI prediction. Despite the progress in\nrepresentation learning from the whole slide images (WSI) and exploring the\npotential of making use of radiology data, CRC MSI prediction remains a\nchallenge to fuse the information from multiple data modalities (e.g.,\npathology WSI and radiology CT image). In this paper, we propose $M^{2}$Fusion:\na Bayesian-based multimodal multi-level fusion pipeline for CRC MSI. The\nproposed fusion model $M^{2}$Fusion is capable of discovering more novel\npatterns within and across modalities that are beneficial for predicting MSI\nthan using a single modality alone, as well as other fusion methods. The\ncontribution of the paper is three-fold: (1) $M^{2}$Fusion is the first\npipeline of multi-level fusion on pathology WSI and 3D radiology CT image for\nMSI prediction; (2) CT images are the first time integrated into multimodal\nfusion for CRC MSI prediction; (3) feature-level fusion strategy is evaluated\non both Transformer-based and CNN-based method. Our approach is validated on\ncross-validation of 352 cases and outperforms either feature-level (0.8177 vs.\n0.7908) or decision-level fusion strategy (0.8177 vs. 0.7289) on AUC score.\n","authors":["Quan Liu","Jiawen Yao","Lisha Yao","Xin Chen","Jingren Zhou","Le Lu","Ling Zhang","Zaiyi Liu","Yuankai Huo"],"pdf_url":"https://arxiv.org/pdf/2401.07854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07853v1","updated":"2024-01-15T17:28:37Z","published":"2024-01-15T17:28:37Z","title":"VeCAF: VLM-empowered Collaborative Active Finetuning with Training\n Objective Awareness","summary":" Finetuning a pretrained vision model (PVM) is a common technique for learning\ndownstream vision tasks. The conventional finetuning process with the randomly\nsampled data points results in diminished training efficiency. To address this\ndrawback, we propose a novel approach, VLM-empowered Collaborative Active\nFinetuning (VeCAF). VeCAF optimizes a parametric data selection model by\nincorporating the training objective of the model being tuned. 
Effectively,\nthis guides the PVM towards the performance goal with improved data and\ncomputational efficiency. As vision-language models (VLMs) have achieved\nsignificant advancements by establishing a robust connection between image and\nlanguage domains, we exploit the inherent semantic richness of the text\nembedding space and utilize text embedding of pretrained VLM models to augment\nPVM image features for better data selection and finetuning. Furthermore, the\nflexibility of text-domain augmentation gives VeCAF a unique ability to handle\nout-of-distribution scenarios without external augmented data. Extensive\nexperiments show the leading performance and high efficiency of VeCAF that is\nsuperior to baselines in both in-distribution and out-of-distribution image\nclassification tasks. On ImageNet, VeCAF needs up to 3.3x less training batches\nto reach the target performance compared to full finetuning and achieves 2.8%\naccuracy improvement over SOTA methods with the same number of batches.\n","authors":["Rongyu Zhang","Zefan Cai","Huanrui Yang","Zidong Liu","Denis Gudovskiy","Tomoyuki Okuno","Yohei Nakata","Kurt Keutzer","Baobao Chang","Yuan Du","Li Du","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07853v1.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2401.07825v1","updated":"2024-01-15T16:53:20Z","published":"2024-01-15T16:53:20Z","title":"Phenotyping calcification in vascular tissues using artificial\n intelligence","summary":" Vascular calcification is implicated as an important factor in major adverse\ncardiovascular events (MACE), including heart attack and stroke. A controversy\nremains over how to integrate the diverse forms of vascular calcification into\nclinical risk assessment tools. Even the commonly used calcium score for\ncoronary arteries, which assumes risk scales positively with total\ncalcification, has important inconsistencies. Fundamental studies are needed to\ndetermine how risk is influenced by the diverse calcification phenotypes.\nHowever, studies of these kinds are hindered by the lack of high-throughput,\nobjective, and non-destructive tools for classifying calcification in imaging\ndata sets. Here, we introduce a new classification system for phenotyping\ncalcification along with a semi-automated, non-destructive pipeline that can\ndistinguish these phenotypes in even atherosclerotic tissues. The pipeline\nincludes a deep-learning-based framework for segmenting lipid pools in noisy\nmicro-CT images and an unsupervised clustering framework for categorizing\ncalcification based on size, clustering, and topology. This approach is\nillustrated for five vascular specimens, providing phenotyping for thousands of\ncalcification particles across as many as 3200 images in less than seven hours.\nAverage Dice Similarity Coefficients of 0.96 and 0.87 could be achieved for\ntissue and lipid pool, respectively, with training and validation needed on\nonly 13 images despite the high heterogeneity in these tissues. By introducing\nan efficient and comprehensive approach to phenotyping calcification, this work\nenables large-scale studies to identify a more reliable indicator of the risk\nof cardiovascular events, a leading cause of global mortality and morbidity.\n","authors":["Mehdi Ramezanpour","Anne M. Robertson","Yasutaka Tobe","Xiaowei Jia","Juan R. 
Cebral"],"pdf_url":"https://arxiv.org/pdf/2401.07825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17071v2","updated":"2024-01-15T16:43:32Z","published":"2023-12-28T15:33:16Z","title":"SCTNet: Single-Branch CNN with Transformer Semantic Information for\n Real-Time Segmentation","summary":" Recent real-time semantic segmentation methods usually adopt an additional\nsemantic branch to pursue rich long-range context. However, the additional\nbranch incurs undesirable computational overhead and slows inference speed. To\neliminate this dilemma, we propose SCTNet, a single branch CNN with transformer\nsemantic information for real-time segmentation. SCTNet enjoys the rich\nsemantic representations of an inference-free semantic branch while retaining\nthe high efficiency of lightweight single branch CNN. SCTNet utilizes a\ntransformer as the training-only semantic branch considering its superb ability\nto extract long-range context. With the help of the proposed transformer-like\nCNN block CFBlock and the semantic information alignment module, SCTNet could\ncapture the rich semantic information from the transformer branch in training.\nDuring the inference, only the single branch CNN needs to be deployed. We\nconduct extensive experiments on Cityscapes, ADE20K, and COCO-Stuff-10K, and\nthe results show that our method achieves the new state-of-the-art performance.\nThe code and model is available at https://github.com/xzz777/SCTNet\n","authors":["Zhengze Xu","Dongyue Wu","Changqian Yu","Xiangxiang Chu","Nong Sang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.17071v2.pdf","comment":"Accepted by AAAI 2024; typos corrected; code and models have been\n released at https://github.com/xzz777/SCTNet"},{"id":"http://arxiv.org/abs/2307.14288v3","updated":"2024-01-15T16:35:52Z","published":"2023-07-26T16:43:22Z","title":"US \\& MRI Image Fusion Based on Markerless Skin Registration","summary":" This paper presents an innovative automatic fusion imaging system that\ncombines 3D CT/MR images with real-time ultrasound (US) acquisition. The system\neliminates the need for external physical markers and complex training, making\nimage fusion feasible for physicians with different experience levels. The\nintegrated system involves a portable 3D camera for patient-specific surface\nacquisition, an electromagnetic tracking system, and US components. The fusion\nalgorithm comprises two main parts: skin segmentation and rigid\nco-registration, both integrated into the US machine. The co-registration\nsoftware aligns the surface extracted from CT/MR images with patient-specific\ncoordinates, facilitating rapid and effective fusion. Experimental testing in\ndifferent settings, including the clinical environment, validates the system's\naccuracy, computational efficiency, noise robustness, and operator\nindependence. The co-registration error remains under the acceptable range\nof~$1$ cm.\n","authors":["Martina Paccini","Giacomo Paschina","Stefano De Beni","Giuseppe Patanè"],"pdf_url":"https://arxiv.org/pdf/2307.14288v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07803v1","updated":"2024-01-15T16:21:19Z","published":"2024-01-15T16:21:19Z","title":"Uncovering the Full Potential of Visual Grounding Methods in VQA","summary":" Visual Grounding (VG) methods in Visual Question Answering (VQA) attempt to\nimprove VQA performance by strengthening a model's reliance on\nquestion-relevant visual information. 
The presence of such relevant information\nin the visual input is typically assumed in training and testing. This\nassumption, however, is inherently flawed when dealing with imperfect image\nrepresentations common in large-scale VQA, where the information carried by\nvisual features frequently deviates from expected ground-truth contents. As a\nresult, training and testing of VG-methods is performed with largely inaccurate\ndata, which obstructs proper assessment of their potential benefits.\n In this work, we demonstrate that current evaluation schemes for VG-methods\nare problematic due to the flawed assumption of availability of relevant visual\ninformation. Our experiments show that the potential benefits of these methods\nare severely underestimated as a result.\n","authors":["Daniel Reich","Tanja Schultz"],"pdf_url":"https://arxiv.org/pdf/2401.07803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12542v2","updated":"2024-01-15T16:18:13Z","published":"2023-07-24T06:12:37Z","title":"Client-Level Differential Privacy via Adaptive Intermediary in Federated\n Medical Imaging","summary":" Despite recent progress in enhancing the privacy of federated learning (FL)\nvia differential privacy (DP), the trade-off of DP between privacy protection\nand performance is still underexplored for real-world medical scenario. In this\npaper, we propose to optimize the trade-off under the context of client-level\nDP, which focuses on privacy during communications. However, FL for medical\nimaging involves typically much fewer participants (hospitals) than other\ndomains (e.g., mobile devices), thus ensuring clients be differentially private\nis much more challenging. To tackle this problem, we propose an adaptive\nintermediary strategy to improve performance without harming privacy.\nSpecifically, we theoretically find splitting clients into sub-clients, which\nserve as intermediaries between hospitals and the server, can mitigate the\nnoises introduced by DP without harming privacy. Our proposed approach is\nempirically evaluated on both classification and segmentation tasks using two\npublic datasets, and its effectiveness is demonstrated with significant\nperformance improvements and comprehensive analytical studies. Code is\navailable at: https://github.com/med-air/Client-DP-FL.\n","authors":["Meirui Jiang","Yuan Zhong","Anjie Le","Xiaoxiao Li","Qi Dou"],"pdf_url":"https://arxiv.org/pdf/2307.12542v2.pdf","comment":"Accepted by 26th International Conference on Medical Image Computing\n and Computer Assisted Intervention (MICCAI'23)"},{"id":"http://arxiv.org/abs/2401.07801v1","updated":"2024-01-15T16:13:17Z","published":"2024-01-15T16:13:17Z","title":"Pedestrian Detection in Low-Light Conditions: A Comprehensive Survey","summary":" Pedestrian detection remains a critical problem in various domains, such as\ncomputer vision, surveillance, and autonomous driving. In particular, accurate\nand instant detection of pedestrians in low-light conditions and reduced\nvisibility is of utmost importance for autonomous vehicles to prevent accidents\nand save lives. This paper aims to comprehensively survey various pedestrian\ndetection approaches, baselines, and datasets that specifically target\nlow-light conditions. The survey discusses the challenges faced in detecting\npedestrians at night and explores state-of-the-art methodologies proposed in\nrecent years to address this issue. 
These methodologies encompass a diverse\nrange, including deep learning-based, feature-based, and hybrid approaches,\nwhich have shown promising results in enhancing pedestrian detection\nperformance under challenging lighting conditions. Furthermore, the paper\nhighlights current research directions in the field and identifies potential\nsolutions that merit further investigation by researchers. By thoroughly\nexamining pedestrian detection techniques in low-light conditions, this survey\nseeks to contribute to the advancement of safer and more reliable autonomous\ndriving systems and other applications related to pedestrian safety.\nAccordingly, most of the current approaches in the field use deep\nlearning-based image fusion methodologies (i.e., early, halfway, and late\nfusion) for accurate and reliable pedestrian detection. Moreover, the majority\nof the works in the field (approximately 48%) have been evaluated on the KAIST\ndataset, while the real-world video feeds recorded by authors have been used in\nless than six percent of the works.\n","authors":["Bahareh Ghari","Ali Tourani","Asadollah Shahbahrami","Georgi Gaydadjiev"],"pdf_url":"https://arxiv.org/pdf/2401.07801v1.pdf","comment":"23 pages, 3 tables, 10 figures"},{"id":"http://arxiv.org/abs/2401.04464v2","updated":"2024-01-15T16:12:45Z","published":"2024-01-09T09:58:42Z","title":"PhilEO Bench: Evaluating Geo-Spatial Foundation Models","summary":" Massive amounts of unlabelled data are captured by Earth Observation (EO)\nsatellites, with the Sentinel-2 constellation generating 1.6 TB of data daily.\nThis makes Remote Sensing a data-rich domain well suited to Machine Learning\n(ML) solutions. However, a bottleneck in applying ML models to EO is the lack\nof annotated data as annotation is a labour-intensive and costly process. As a\nresult, research in this domain has focused on Self-Supervised Learning and\nFoundation Model approaches. This paper addresses the need to evaluate\ndifferent Foundation Models on a fair and uniform benchmark by introducing the\nPhilEO Bench, a novel evaluation framework for EO Foundation Models. The\nframework comprises of a testbed and a novel 400 GB Sentinel-2 dataset\ncontaining labels for three downstream tasks, building density estimation, road\nsegmentation, and land cover classification. We present experiments using our\nframework evaluating different Foundation Models, including Prithvi and SatMAE,\nat multiple n-shots and convergence rates.\n","authors":["Casper Fibaek","Luke Camilleri","Andreas Luyts","Nikolaos Dionelis","Bertrand Le Saux"],"pdf_url":"https://arxiv.org/pdf/2401.04464v2.pdf","comment":"6 pages, 5 figures, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2401.07796v1","updated":"2024-01-15T16:04:46Z","published":"2024-01-15T16:04:46Z","title":"Fusing Echocardiography Images and Medical Records for Continuous\n Patient Stratification","summary":" Deep learning now enables automatic and robust extraction of cardiac function\ndescriptors from echocardiographic sequences, such as ejection fraction or\nstrain. These descriptors provide fine-grained information that physicians\nconsider, in conjunction with more global variables from the clinical record,\nto assess patients' condition. 
Drawing on novel transformer models applied to\ntabular data (e.g., variables from electronic health records), we propose a\nmethod that considers all descriptors extracted from medical records and\nechocardiograms to learn the representation of a difficult-to-characterize\ncardiovascular pathology, namely hypertension. Our method first projects each\nvariable into its own representation space using modality-specific approaches.\nThese standardized representations of multimodal data are then fed to a\ntransformer encoder, which learns to merge them into a comprehensive\nrepresentation of the patient through a pretext task of predicting a clinical\nrating. This pretext task is formulated as an ordinal classification to enforce\na pathological continuum in the representation space. We observe the major\ntrends along this continuum for a cohort of 239 hypertensive patients to\ndescribe, with unprecedented gradation, the effect of hypertension on a number\nof cardiac function descriptors. Our analysis shows that i) pretrained weights\nfrom a foundation model allow to reach good performance (83% accuracy) even\nwith limited data (less than 200 training samples), ii) trends across the\npopulation are reproducible between trainings, and iii) for descriptors whose\ninteractions with hypertension are well documented, patterns are consistent\nwith prior physiological knowledge.\n","authors":["Nathan Painchaud","Pierre-Yves Courand","Pierre-Marc Jodoin","Nicolas Duchateau","Olivier Bernard"],"pdf_url":"https://arxiv.org/pdf/2401.07796v1.pdf","comment":"10 pages, submitted to IEEE TMI"},{"id":"http://arxiv.org/abs/2401.03836v4","updated":"2024-01-15T15:54:56Z","published":"2024-01-08T11:50:23Z","title":"WidthFormer: Toward Efficient Transformer-based BEV View Transformation","summary":" In this work, we present WidthFormer, a novel transformer-based\nBird's-Eye-View (BEV) 3D detection method tailored for real-time\nautonomous-driving applications. WidthFormer is computationally efficient,\nrobust and does not require any special engineering effort to deploy. In this\nwork, we propose a novel 3D positional encoding mechanism capable of accurately\nencapsulating 3D geometric information, which enables our model to generate\nhigh-quality BEV representations with only a single transformer decoder layer.\nThis mechanism is also beneficial for existing sparse 3D object detectors.\nInspired by the recently-proposed works, we further improve our model's\nefficiency by vertically compressing the image features when serving as\nattention keys and values. We also introduce two modules to compensate for\npotential information loss due to feature compression. Experimental evaluation\non the widely-used nuScenes 3D object detection benchmark demonstrates that our\nmethod outperforms previous approaches across different 3D detection\narchitectures. More importantly, our model is highly efficient. For example,\nwhen using $256\\times 704$ input images, it achieves 1.5 ms and 2.8 ms latency\non NVIDIA 3090 GPU and Horizon Journey-5 computation solutions, respectively.\nFurthermore, WidthFormer also exhibits strong robustness to different degrees\nof camera perturbations. Our study offers valuable insights into the deployment\nof BEV transformation methods in real-world, complex road environments. Code is\navailable at https://github.com/ChenhongyiYang/WidthFormer .\n","authors":["Chenhongyi Yang","Tianwei Lin","Lichao Huang","Elliot J. 
Crowley"],"pdf_url":"https://arxiv.org/pdf/2401.03836v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07787v1","updated":"2024-01-15T15:53:13Z","published":"2024-01-15T15:53:13Z","title":"Improving OCR Quality in 19th Century Historical Documents Using a\n Combined Machine Learning Based Approach","summary":" This paper addresses a major challenge to historical research on the 19th\ncentury. Large quantities of sources have become digitally available for the\nfirst time, while extraction techniques are lagging behind. Therefore, we\nresearched machine learning (ML) models to recognise and extract complex data\nstructures in a high-value historical primary source, the Schematismus. It\nrecords every single person in the Habsburg civil service above a certain\nhierarchical level between 1702 and 1918 and documents the genesis of the\ncentral administration over two centuries. Its complex and intricate structure\nas well as its enormous size have so far made any more comprehensive analysis\nof the administrative and social structure of the later Habsburg Empire on the\nbasis of this source impossible. We pursued two central objectives: Primarily,\nthe improvement of the OCR quality, for which we considered an improved\nstructure recognition to be essential; in the further course, it turned out\nthat this also made the extraction of the data structure possible. We chose\nFaster R-CNN as base for the ML architecture for structure recognition. In\norder to obtain the required amount of training data quickly and economically,\nwe synthesised Hof- und Staatsschematismus-style data, which we used to train\nour model. The model was then fine-tuned with a smaller set of manually\nannotated historical source data. We then used Tesseract-OCR, which was further\noptimised for the style of our documents, to complete the combined structure\nextraction and OCR process. Results show a significant decrease in the two\nstandard parameters of OCR-performance, WER and CER (where lower values are\nbetter). Combined structure detection and fine-tuned OCR improved CER and WER\nvalues by remarkable 71.98 percent (CER) respectively 52.49 percent (WER).\n","authors":["David Fleischhacker","Wolfgang Goederle","Roman Kern"],"pdf_url":"https://arxiv.org/pdf/2401.07787v1.pdf","comment":"29 pages, 23 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.07782v1","updated":"2024-01-15T15:43:56Z","published":"2024-01-15T15:43:56Z","title":"Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in\n Remote Sensing","summary":" Self-supervised learning through masked autoencoders (MAEs) has recently\nattracted great attention for remote sensing (RS) image representation\nlearning, and thus embodies a significant potential for content-based image\nretrieval (CBIR) from ever-growing RS image archives. However, the existing\nstudies on MAEs in RS assume that the considered RS images are acquired by a\nsingle image sensor, and thus are only suitable for uni-modal CBIR problems.\nThe effectiveness of MAEs for cross-sensor CBIR, which aims to search\nsemantically similar images across different image modalities, has not been\nexplored yet. In this paper, we take the first step to explore the\neffectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a\nsystematic overview on the possible adaptations of the vanilla MAE to exploit\nmasked image modeling on multi-sensor RS image archives (denoted as\ncross-sensor masked autoencoders [CSMAEs]). 
Based on different adjustments\napplied to the vanilla MAE, we introduce different CSMAE models. We also\nprovide an extensive experimental analysis of these CSMAE models. We finally\nderive a guideline to exploit masked image modeling for uni-modal and\ncross-modal CBIR problems in RS. The code of this work is publicly available at\nhttps://github.com/jakhac/CSMAE.\n","authors":["Jakob Hackstein","Gencer Sumbul","Kai Norman Clasen","Begüm Demir"],"pdf_url":"https://arxiv.org/pdf/2401.07782v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Our code is available at https://github.com/jakhac/CSMAE"},{"id":"http://arxiv.org/abs/2401.07781v1","updated":"2024-01-15T15:42:39Z","published":"2024-01-15T15:42:39Z","title":"Towards A Better Metric for Text-to-Video Generation","summary":" Generative models have demonstrated remarkable capability in synthesizing\nhigh-quality text, images, and videos. For video generation, contemporary\ntext-to-video models exhibit impressive capabilities, crafting visually\nstunning videos. Nonetheless, evaluating such videos poses significant\nchallenges. Current research predominantly employs automated metrics such as\nFVD, IS, and CLIP Score. However, these metrics provide an incomplete analysis,\nparticularly in the temporal assessment of video content, thus rendering them\nunreliable indicators of true video quality. Furthermore, while user studies\nhave the potential to reflect human perception accurately, they are hampered by\ntheir time-intensive and laborious nature, with outcomes that are often tainted\nby subjective bias. In this paper, we investigate the limitations inherent in\nexisting metrics and introduce a novel evaluation pipeline, the Text-to-Video\nScore (T2VScore). This metric integrates two pivotal criteria: (1) Text-Video\nAlignment, which scrutinizes the fidelity of the video in representing the\ngiven text description, and (2) Video Quality, which evaluates the video's\noverall production caliber with a mixture of experts. Moreover, to evaluate the\nproposed metrics and facilitate future improvements on them, we present the\nTVGE dataset, collecting human judgements of 2,543 text-to-video generated\nvideos on the two criteria. Experiments on the TVGE dataset demonstrate the\nsuperiority of the proposed T2VScore on offering a better metric for\ntext-to-video generation.\n","authors":["Jay Zhangjie Wu","Guian Fang","Haoning Wu","Xintao Wang","Yixiao Ge","Xiaodong Cun","David Junhao Zhang","Jia-Wei Liu","Yuchao Gu","Rui Zhao","Weisi Lin","Wynne Hsu","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2401.07781v1.pdf","comment":"Project page: https://showlab.github.io/T2VScore/"},{"id":"http://arxiv.org/abs/2401.07770v1","updated":"2024-01-15T15:28:30Z","published":"2024-01-15T15:28:30Z","title":"Seeing the Unseen: Visual Common Sense for Semantic Placement","summary":" Computer vision tasks typically involve describing what is present in an\nimage (e.g. classification, detection, segmentation, and captioning). We study\na visual common sense task that requires understanding what is not present.\nSpecifically, given an image (e.g. of a living room) and name of an object\n(\"cushion\"), a vision system is asked to predict semantically-meaningful\nregions (masks or bounding boxes) in the image where that object could be\nplaced or is likely be placed by humans (e.g. on the sofa). 
We call this task:\nSemantic Placement (SP) and believe that such common-sense visual understanding\nis critical for assitive robots (tidying a house), and AR devices\n(automatically rendering an object in the user's space). Studying the invisible\nis hard. Datasets for image description are typically constructed by curating\nrelevant images and asking humans to annotate the contents of the image;\nneither of those two steps are straightforward for objects not present in the\nimage. We overcome this challenge by operating in the opposite direction: we\nstart with an image of an object in context from web, and then remove that\nobject from the image via inpainting. This automated pipeline converts\nunstructured web data into a dataset comprising pairs of images with/without\nthe object. Using this, we collect a novel dataset, with ${\\sim}1.3$M images\nacross $9$ object categories, and train a SP prediction model called CLIP-UNet.\nCLIP-UNet outperforms existing VLMs and baselines that combine semantic priors\nwith object detectors on real-world and simulated images. In our user studies,\nwe find that the SP masks predicted by CLIP-UNet are favored $43.7\\%$ and\n$31.3\\%$ times when comparing against the $4$ SP baselines on real and\nsimulated images. In addition, we demonstrate leveraging SP mask predictions\nfrom CLIP-UNet enables downstream applications like building tidying robots in\nindoor environments.\n","authors":["Ram Ramrakhya","Aniruddha Kembhavi","Dhruv Batra","Zsolt Kira","Kuo-Hao Zeng","Luca Weihs"],"pdf_url":"https://arxiv.org/pdf/2401.07770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14685v4","updated":"2024-01-15T15:20:29Z","published":"2023-06-26T13:30:38Z","title":"DiffSketcher: Text Guided Vector Sketch Synthesis through Latent\n Diffusion Models","summary":" Even though trained mainly on images, we discover that pretrained diffusion\nmodels show impressive power in guiding sketch synthesis. In this paper, we\npresent DiffSketcher, an innovative algorithm that creates \\textit{vectorized}\nfree-hand sketches using natural language input. DiffSketcher is developed\nbased on a pre-trained text-to-image diffusion model. It performs the task by\ndirectly optimizing a set of B\\'ezier curves with an extended version of the\nscore distillation sampling (SDS) loss, which allows us to use a raster-level\ndiffusion model as a prior for optimizing a parametric vectorized sketch\ngenerator. Furthermore, we explore attention maps embedded in the diffusion\nmodel for effective stroke initialization to speed up the generation process.\nThe generated sketches demonstrate multiple levels of abstraction while\nmaintaining recognizability, underlying structure, and essential visual details\nof the subject drawn. Our experiments show that DiffSketcher achieves greater\nquality than prior work. The code and demo of DiffSketcher can be found at\nhttps://ximinng.github.io/DiffSketcher-project/.\n","authors":["Ximing Xing","Chuang Wang","Haitao Zhou","Jing Zhang","Qian Yu","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2306.14685v4.pdf","comment":"Accepted by NIPS 2023. 
Project page:\n https://ximinng.github.io/DiffSketcher-project/"},{"id":"http://arxiv.org/abs/2401.07753v1","updated":"2024-01-15T15:03:32Z","published":"2024-01-15T15:03:32Z","title":"Low-light Stereo Image Enhancement and De-noising in the Low-frequency\n Information Enhanced Image Space","summary":" Unlike single image task, stereo image enhancement can use another view\ninformation, and its key stage is how to perform cross-view feature interaction\nto extract useful information from another view. However, complex noise in\nlow-light image and its impact on subsequent feature encoding and interaction\nare ignored by the existing methods. In this paper, a method is proposed to\nperform enhancement and de-noising simultaneously. First, to reduce unwanted\nnoise interference, a low-frequency information enhanced module (IEM) is\nproposed to suppress noise and produce a new image space. Additionally, a\ncross-channel and spatial context information mining module (CSM) is proposed\nto encode long-range spatial dependencies and to enhance inter-channel feature\ninteraction. Relying on CSM, an encoder-decoder structure is constructed,\nincorporating cross-view and cross-scale feature interactions to perform\nenhancement in the new image space. Finally, the network is trained with the\nconstraints of both spatial and frequency domain losses. Extensive experiments\non both synthesized and real datasets show that our method obtains better\ndetail recovery and noise removal compared with state-of-the-art methods. In\naddition, a real stereo image enhancement dataset is captured with stereo\ncamera ZED2. The code and dataset are publicly available at:\nhttps://www.github.com/noportraits/LFENet.\n","authors":["Minghua Zhao","Xiangdong Qin","Shuangli Du","Xuefei Bai","Jiahao Lyu","Yiguang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.07753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07751v1","updated":"2024-01-15T14:59:56Z","published":"2024-01-15T14:59:56Z","title":"DeepThalamus: A novel deep learning method for automatic segmentation of\n brain thalamic nuclei from multimodal ultra-high resolution MRI","summary":" The implication of the thalamus in multiple neurological pathologies makes it\na structure of interest for volumetric analysis. In the present work, we have\ndesigned and implemented a multimodal volumetric deep neural network for the\nsegmentation of thalamic nuclei at ultra-high resolution (0.125 mm3). Current\ntools either operate at standard resolution (1 mm3) or use monomodal data. To\nachieve the proposed objective, first, a database of semiautomatically\nsegmented thalamic nuclei was created using ultra-high resolution T1, T2 and\nWhite Matter nulled (WMn) images. Then, a novel Deep learning based strategy\nwas designed to obtain the automatic segmentations and trained to improve its\nrobustness and accuaracy using a semisupervised approach. The proposed method\nwas compared with a related state-of-the-art method showing competitive results\nboth in terms of segmentation quality and efficiency. To make the proposed\nmethod fully available to the scientific community, a full pipeline able to\nwork with monomodal standard resolution T1 images is also proposed.\n","authors":["Marina Ruiz-Perez","Sergio Morell-Ortega","Marien Gadea","Roberto Vivo-Hernando","Gregorio Rubio","Fernando Aparici","Mariam de la Iglesia-Vaya","Thomas Tourdias","Pierrick Coupé","José V. 
Manjón"],"pdf_url":"https://arxiv.org/pdf/2401.07751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07746v1","updated":"2024-01-15T14:56:25Z","published":"2024-01-15T14:56:25Z","title":"Sparsity-based background removal for STORM super-resolution images","summary":" Single-molecule localization microscopy techniques, like stochastic optical\nreconstruction microscopy (STORM), visualize biological specimens by\nstochastically exciting sparse blinking emitters. The raw images suffer from\nunwanted background fluorescence, which must be removed to achieve\nsuper-resolution. We introduce a sparsity-based background removal method by\nadapting a neural network (SLNet) from a different microscopy domain. The SLNet\ncomputes a low-rank representation of the images, and then, by subtracting it\nfrom the raw images, the sparse component is computed, representing the frames\nwithout the background. We compared our approach with widely used background\nremoval methods, such as the median background removal or the rolling ball\nalgorithm, on two commonly used STORM datasets, one glial cell, and one\nmicrotubule dataset. The SLNet delivers STORM frames with less background,\nleading to higher emitters' localization precision and higher-resolution\nreconstructed images than commonly used methods. Notably, the SLNet is\nlightweight and easily trainable (<5 min). Since it is trained in an\nunsupervised manner, no prior information is required and can be applied to any\nSTORM dataset. We uploaded a pre-trained SLNet to the Bioimage model zoo,\neasily accessible through ImageJ. Our results show that our sparse\ndecomposition method could be an essential and efficient STORM pre-processing\ntool.\n","authors":["Patris Valera","Josué Page Vizcaíno","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2401.07746v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07745v1","updated":"2024-01-15T14:56:15Z","published":"2024-01-15T14:56:15Z","title":"MaskClustering: View Consensus based Mask Graph Clustering for\n Open-Vocabulary 3D Instance Segmentation","summary":" Open-vocabulary 3D instance segmentation has emerged as a frontier topic due\nto its capability to segment 3D instances beyond a predefined set of\ncategories. However, compared to significant progress in the 2D domain, methods\nfor 3D open-vocabulary instance segmentation are hindered by the limited scale\nof high-quality annotated 3D data. To harness the capabilities of 2D models,\nrecent efforts have focused on merging 2D masks based on metrics such as\ngeometric and semantic similarity to form 3D instances. In contrast to these\nlocal metrics, we propose a novel metric called view consensus to better\nexploit multi-view observation. The key insight is that two 2D masks should be\nconsidered as belonging to the same instance if a considerable number of other\n2D masks from other views contain both these two masks. Based on this metric,\nwe build a global mask graph and iteratively cluster masks, prioritizing mask\npairs with solid view consensus. The corresponding 3D points cluster of these\n2D mask clusters can be regarded as 3D instances, along with the fused\nopen-vocabulary features from clustered 2D masks. Through this multi-view\nverification and fusion mechanism, our method effectively leverages the prior\ninstance knowledge from massive 2D masks predicted by visual foundation models,\neliminating the need for training on 3D data. 
Experiments on publicly available\ndatasets, including ScanNet200 and MatterPort3D, demonstrate that our method\nachieves state-of-the-art performance in both open-vocabulary instance\nsegmentation and class-agnostic mask generation. Our project page is at\nhttps://pku-epic.github.io/MaskClustering.\n","authors":["Mi Yan","Jiazhao Zhang","Yan Zhu","He Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09373v2","updated":"2024-01-15T14:52:42Z","published":"2023-05-16T11:56:02Z","title":"Multi-task convolutional neural network for image aesthetic assessment","summary":" As people's aesthetic preferences for images are far from understood, image\naesthetic assessment is a challenging artificial intelligence task. The range\nof factors underlying this task is almost unlimited, but we know that some\naesthetic attributes affect those preferences. In this study, we present a\nmulti-task convolutional neural network that takes into account these\nattributes. The proposed neural network jointly learns the attributes along\nwith the overall aesthetic scores of images. This multi-task learning framework\nallows for effective generalization through the utilization of shared\nrepresentations. Our experiments demonstrate that the proposed method\noutperforms the state-of-the-art approaches in predicting overall aesthetic\nscores for images in one benchmark of image aesthetics. We achieve near-human\nperformance in terms of overall aesthetic scores when considering the\nSpearman's rank correlations. Moreover, our model pioneers the application of\nmulti-tasking in another benchmark, serving as a new baseline for future\nresearch. Notably, our approach achieves this performance while using fewer\nparameters compared to existing multi-task neural networks in the literature,\nand consequently makes our method more efficient in terms of computational\ncomplexity.\n","authors":["Derya Soydaner","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2305.09373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07729v1","updated":"2024-01-15T14:43:40Z","published":"2024-01-15T14:43:40Z","title":"SSL-Interactions: Pretext Tasks for Interactive Trajectory Prediction","summary":" This paper addresses motion forecasting in multi-agent environments, pivotal\nfor ensuring safety of autonomous vehicles. Traditional as well as recent\ndata-driven marginal trajectory prediction methods struggle to properly learn\nnon-linear agent-to-agent interactions. We present SSL-Interactions that\nproposes pretext tasks to enhance interaction modeling for trajectory\nprediction. We introduce four interaction-aware pretext tasks to encapsulate\nvarious aspects of agent interactions: range gap prediction, closest distance\nprediction, direction of movement prediction, and type of interaction\nprediction. We further propose an approach to curate interaction-heavy\nscenarios from datasets. This curated data has two advantages: it provides a\nstronger learning signal to the interaction model, and facilitates generation\nof pseudo-labels for interaction-centric pretext tasks. We also propose three\nnew metrics specifically designed to evaluate predictions in interactive\nscenes. 
Our empirical evaluations indicate SSL-Interactions outperforms\nstate-of-the-art motion forecasting methods quantitatively with up to 8%\nimprovement, and qualitatively, for interaction-heavy scenarios.\n","authors":["Prarthana Bhattacharyya","Chengjie Huang","Krzysztof Czarnecki"],"pdf_url":"https://arxiv.org/pdf/2401.07729v1.pdf","comment":"13 pages, 5 figures, submitted to IV-2024"},{"id":"http://arxiv.org/abs/2401.07727v1","updated":"2024-01-15T14:41:15Z","published":"2024-01-15T14:41:15Z","title":"HexaGen3D: StableDiffusion is just one step away from Fast and Diverse\n Text-to-3D Generation","summary":" Despite the latest remarkable advances in generative modeling, efficient\ngeneration of high-quality 3D assets from textual prompts remains a difficult\ntask. A key challenge lies in data scarcity: the most extensive 3D datasets\nencompass merely millions of assets, while their 2D counterparts contain\nbillions of text-image pairs. To address this, we propose a novel approach\nwhich harnesses the power of large, pretrained 2D diffusion models. More\nspecifically, our approach, HexaGen3D, fine-tunes a pretrained text-to-image\nmodel to jointly predict 6 orthographic projections and the corresponding\nlatent triplane. We then decode these latents to generate a textured mesh.\nHexaGen3D does not require per-sample optimization, and can infer high-quality\nand diverse objects from textual prompts in 7 seconds, offering significantly\nbetter quality-to-latency trade-offs when comparing to existing approaches.\nFurthermore, HexaGen3D demonstrates strong generalization to new objects or\ncompositions.\n","authors":["Antoine Mercier","Ramin Nakhli","Mahesh Reddy","Rajeev Yasarla","Hong Cai","Fatih Porikli","Guillaume Berger"],"pdf_url":"https://arxiv.org/pdf/2401.07727v1.pdf","comment":"9 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.07721v1","updated":"2024-01-15T14:36:38Z","published":"2024-01-15T14:36:38Z","title":"Graph Transformer GANs with Graph Masked Modeling for Architectural\n Layout Generation","summary":" We present a novel graph Transformer generative adversarial network (GTGAN)\nto learn effective graph node relations in an end-to-end fashion for\nchallenging graph-constrained architectural layout generation tasks. The\nproposed graph-Transformer-based generator includes a novel graph Transformer\nencoder that combines graph convolutions and self-attentions in a Transformer\nto model both local and global interactions across connected and non-connected\ngraph nodes. Specifically, the proposed connected node attention (CNA) and\nnon-connected node attention (NNA) aim to capture the global relations across\nconnected nodes and non-connected nodes in the input graph, respectively. The\nproposed graph modeling block (GMB) aims to exploit local vertex interactions\nbased on a house layout topology. Moreover, we propose a new node\nclassification-based discriminator to preserve the high-level semantic and\ndiscriminative node features for different house components. To maintain the\nrelative spatial relationships between ground truth and predicted graphs, we\nalso propose a novel graph-based cycle-consistency loss. Finally, we propose a\nnovel self-guided pre-training method for graph representation learning. This\napproach involves simultaneous masking of nodes and edges at an elevated mask\nratio (i.e., 40%) and their subsequent reconstruction using an asymmetric\ngraph-centric autoencoder architecture. 
This method markedly improves the\nmodel's learning proficiency and expediency. Experiments on three challenging\ngraph-constrained architectural layout generation tasks (i.e., house layout\ngeneration, house roof generation, and building layout generation) with three\npublic datasets demonstrate the effectiveness of the proposed method in terms\nof objective quantitative scores and subjective visual realism. New\nstate-of-the-art results are established by large margins on these three tasks.\n","authors":["Hao Tang","Ling Shao","Nicu Sebe","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2401.07721v1.pdf","comment":"Accepted to TPAMI, an extended version of a paper published in\n CVPR2023. arXiv admin note: substantial text overlap with arXiv:2303.08225"},{"id":"http://arxiv.org/abs/2401.07709v1","updated":"2024-01-15T14:25:54Z","published":"2024-01-15T14:25:54Z","title":"Towards Efficient Diffusion-Based Image Editing with Instant Attention\n Masks","summary":" Diffusion-based Image Editing (DIE) is an emerging research hot-spot, which\noften applies a semantic mask to control the target area for diffusion-based\nediting. However, most existing solutions obtain these masks via manual\noperations or off-line processing, greatly reducing their efficiency. In this\npaper, we propose a novel and efficient image editing method for Text-to-Image\n(T2I) diffusion models, termed Instant Diffusion Editing(InstDiffEdit). In\nparticular, InstDiffEdit aims to employ the cross-modal attention ability of\nexisting diffusion models to achieve instant mask guidance during the diffusion\nsteps. To reduce the noise of attention maps and realize the full automatics,\nwe equip InstDiffEdit with a training-free refinement scheme to adaptively\naggregate the attention distributions for the automatic yet accurate mask\ngeneration. Meanwhile, to supplement the existing evaluations of DIE, we\npropose a new benchmark called Editing-Mask to examine the mask accuracy and\nlocal editing ability of existing methods. To validate InstDiffEdit, we also\nconduct extensive experiments on ImageNet and Imagen, and compare it with a\nbunch of the SOTA methods. The experimental results show that InstDiffEdit not\nonly outperforms the SOTA methods in both image quality and editing results,\nbut also has a much faster inference speed, i.e., +5 to +6 times. Our code\navailable at https://anonymous.4open.science/r/InstDiffEdit-C306/\n","authors":["Siyu Zou","Jiji Tang","Yiyi Zhou","Jing He","Chaoyi Zhao","Rongsheng Zhang","Zhipeng Hu","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2401.07709v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.07669v1","updated":"2024-01-15T13:27:34Z","published":"2024-01-15T13:27:34Z","title":"FiGCLIP: Fine-Grained CLIP Adaptation via Densely Annotated Videos","summary":" While contrastive language image pretraining (CLIP) have exhibited impressive\nperformance by learning highly semantic and generalized representations, recent\nworks have exposed a fundamental drawback in its syntactic properties, that\nincludes interpreting fine-grained attributes, actions, spatial relations,\nstates, and details that require compositional reasoning. One reason for this\nis that natural captions often do not capture all the visual details of a\nscene. This leads to unaddressed visual concepts being misattributed to the\nwrong words. And the pooled image and text features, ends up acting as a bag of\nwords, hence losing the syntactic information. 
In this work, we ask: Is it\npossible to enhance CLIP's fine-grained and syntactic abilities without\ncompromising its semantic properties? We show that this is possible by adapting\nCLIP efficiently on a high-quality, comprehensive, and relatively small\ndataset. We demonstrate our adaptation strategy on VidSitu, a video situation\nrecognition dataset annotated with verbs and rich semantic role labels (SRL).\nWe use the SRL and verb information to create rule-based detailed captions,\nmaking sure they capture most of the visual concepts. Combined with hard\nnegatives and hierarchical losses, these annotations allow us to learn a\npowerful visual representation, dubbed Fine-Grained CLIP (FiGCLIP), that\npreserves semantic understanding while being detail-oriented. We evaluate on\nfive diverse vision-language tasks in both fine-tuning and zero-shot settings,\nachieving consistent improvements over the base CLIP model.\n","authors":["Darshan Singh S","Zeeshan Khan","Makarand Tapaswi"],"pdf_url":"https://arxiv.org/pdf/2401.07669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13629v2","updated":"2024-01-15T13:18:19Z","published":"2023-11-22T18:59:51Z","title":"Diffusion models meet image counter-forensics","summary":" From its acquisition in the camera sensors to its storage, different\noperations are performed to generate the final image. This pipeline imprints\nspecific traces into the image to form a natural watermark. Tampering with an\nimage disturbs these traces; these disruptions are clues that are used by most\nmethods to detect and locate forgeries. In this article, we assess the\ncapabilities of diffusion models to erase the traces left by forgers and,\ntherefore, deceive forensics methods. Such an approach has been recently\nintroduced for adversarial purification, achieving significant performance. We\nshow that diffusion purification methods are well suited for counter-forensics\ntasks. Such approaches outperform already existing counter-forensics techniques\nboth in deceiving forensics methods and in preserving the natural look of the\npurified images. The source code is publicly available at\nhttps://github.com/mtailanian/diff-cf.\n","authors":["Matías Tailanian","Marina Gardella","Álvaro Pardo","Pablo Musé"],"pdf_url":"https://arxiv.org/pdf/2311.13629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.00650v5","updated":"2024-01-15T13:17:58Z","published":"2020-12-01T17:23:53Z","title":"Decomposition, Compression, and Synthesis (DCS)-based Video Coding: A\n Neural Exploration via Resolution-Adaptive Learning","summary":" Inspired by the facts that retinal cells actually segregate the visual scene\ninto different attributes (e.g., spatial details, temporal motion) for\nrespective neuronal processing, we propose to first decompose the input video\ninto respective spatial texture frames (STF) at its native spatial resolution\nthat preserve the rich spatial details, and the other temporal motion frames\n(TMF) at a lower spatial resolution that retain the motion smoothness; then\ncompress them together using any popular video coder; and finally synthesize\ndecoded STFs and TMFs for high-fidelity video reconstruction at the same\nresolution as its native input. This work simply applies the bicubic resampling\nin decomposition and HEVC compliant codec in compression, and puts the focus on\nthe synthesis part. 
For resolution-adaptive synthesis, a motion compensation\nnetwork (MCN) is devised on TMFs to efficiently align and aggregate temporal\nmotion features that will be jointly processed with corresponding STFs using a\nnon-local texture transfer network (NL-TTN) to better augment spatial details,\nby which the compression and resolution resampling noises can be effectively\nalleviated with better rate-distortion efficiency. Such \"Decomposition,\nCompression, Synthesis (DCS)\" based scheme is codec agnostic, currently\nexemplifying averaged $\\approx$1 dB PSNR gain or $\\approx$25% BD-rate saving,\nagainst the HEVC anchor using reference software. In addition, experimental\ncomparisons to the state-of-the-art methods and ablation studies are conducted\nto further report the efficiency and generalization of DCS algorithm, promising\nan encouraging direction for future video coding.\n","authors":["Ming Lu","Tong Chen","Dandan Ding","Fengqing Zhu","Zhan Ma"],"pdf_url":"https://arxiv.org/pdf/2012.00650v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04978v2","updated":"2024-01-15T13:03:31Z","published":"2023-04-11T04:50:13Z","title":"StageInteractor: Query-based Object Detector with Cross-stage\n Interaction","summary":" Previous object detectors make predictions based on dense grid points or\nnumerous preset anchors. Most of these detectors are trained with one-to-many\nlabel assignment strategies. On the contrary, recent query-based object\ndetectors depend on a sparse set of learnable queries and a series of decoder\nlayers. The one-to-one label assignment is independently applied on each layer\nfor the deep supervision during training. Despite the great success of\nquery-based object detection, however, this one-to-one label assignment\nstrategy demands the detectors to have strong fine-grained discrimination and\nmodeling capacity. To solve the above problems, in this paper, we propose a new\nquery-based object detector with cross-stage interaction, coined as\nStageInteractor. During the forward propagation, we come up with an efficient\nway to improve this modeling ability by reusing dynamic operators with\nlightweight adapters. As for the label assignment, a cross-stage label assigner\nis applied subsequent to the one-to-one label assignment. With this assigner,\nthe training target class labels are gathered across stages and then\nreallocated to proper predictions at each decoder layer. On MS COCO benchmark,\nour model improves the baseline by 2.2 AP, and achieves 44.8 AP with ResNet-50\nas backbone, 100 queries and 12 training epochs. With longer training time and\n300 queries, StageInteractor achieves 51.1 AP and 52.2 AP with ResNeXt-101-DCN\nand Swin-S, respectively.\n","authors":["Yao Teng","Haisong Liu","Sheng Guo","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2304.04978v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12825v2","updated":"2024-01-15T12:51:09Z","published":"2023-05-22T08:36:35Z","title":"Uncertainty-based Detection of Adversarial Attacks in Semantic\n Segmentation","summary":" State-of-the-art deep neural networks have proven to be highly powerful in a\nbroad range of tasks, including semantic image segmentation. However, these\nnetworks are vulnerable against adversarial attacks, i.e., non-perceptible\nperturbations added to the input image causing incorrect predictions, which is\nhazardous in safety-critical applications like automated driving. 
Adversarial\nexamples and defense strategies are well studied for the image classification\ntask, while there has been limited research in the context of semantic\nsegmentation. First works however show that the segmentation outcome can be\nseverely distorted by adversarial attacks. In this work, we introduce an\nuncertainty-based approach for the detection of adversarial attacks in semantic\nsegmentation. We observe that uncertainty as for example captured by the\nentropy of the output distribution behaves differently on clean and perturbed\nimages and leverage this property to distinguish between the two cases. Our\nmethod works in a light-weight and post-processing manner, i.e., we do not\nmodify the model or need knowledge of the process used for generating\nadversarial examples. In a thorough empirical analysis, we demonstrate the\nability of our approach to detect perturbed images across multiple types of\nadversarial attacks.\n","authors":["Kira Maag","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2305.12825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07654v1","updated":"2024-01-15T12:49:51Z","published":"2024-01-15T12:49:51Z","title":"Foundation Models for Biomedical Image Segmentation: A Survey","summary":" Recent advancements in biomedical image analysis have been significantly\ndriven by the Segment Anything Model (SAM). This transformative technology,\noriginally developed for general-purpose computer vision, has found rapid\napplication in medical image processing. Within the last year, marked by over\n100 publications, SAM has demonstrated its prowess in zero-shot learning\nadaptations for medical imaging. The fundamental premise of SAM lies in its\ncapability to segment or identify objects in images without prior knowledge of\nthe object type or imaging modality. This approach aligns well with tasks\nachievable by the human visual system, though its application in non-biological\nvision contexts remains more theoretically challenging. A notable feature of\nSAM is its ability to adjust segmentation according to a specified resolution\nscale or area of interest, akin to semantic priming. This adaptability has\nspurred a wave of creativity and innovation in applying SAM to medical imaging.\nOur review focuses on the period from April 1, 2023, to September 30, 2023, a\ncritical first six months post-initial publication. We examine the adaptations\nand integrations of SAM necessary to address longstanding clinical challenges,\nparticularly in the context of 33 open datasets covered in our analysis. While\nSAM approaches or achieves state-of-the-art performance in numerous\napplications, it falls short in certain areas, such as segmentation of the\ncarotid artery, adrenal glands, optic nerve, and mandible bone. Our survey\ndelves into the innovative techniques where SAM's foundational approach excels\nand explores the core concepts in translating and applying these models\neffectively in diverse medical imaging scenarios.\n","authors":["Ho Hin Lee","Yu Gu","Theodore Zhao","Yanbo Xu","Jianwei Yang","Naoto Usuyama","Cliff Wong","Mu Wei","Bennett A. 
Landman","Yuankai Huo","Alberto Santamaria-Pang","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2401.07654v1.pdf","comment":"22 pages, 4 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.07641v1","updated":"2024-01-15T12:33:00Z","published":"2024-01-15T12:33:00Z","title":"SwinTextSpotter v2: Towards Better Synergy for Scene Text Spotting","summary":" End-to-end scene text spotting, which aims to read the text in natural\nimages, has garnered significant attention in recent years. However, recent\nstate-of-the-art methods usually incorporate detection and recognition simply\nby sharing the backbone, which does not directly take advantage of the feature\ninteraction between the two tasks. In this paper, we propose a new end-to-end\nscene text spotting framework termed SwinTextSpotter v2, which seeks to find a\nbetter synergy between text detection and recognition. Specifically, we enhance\nthe relationship between two tasks using novel Recognition Conversion and\nRecognition Alignment modules. Recognition Conversion explicitly guides text\nlocalization through recognition loss, while Recognition Alignment dynamically\nextracts text features for recognition through the detection predictions. This\nsimple yet effective design results in a concise framework that requires\nneither an additional rectification module nor character-level annotations for\nthe arbitrarily-shaped text. Furthermore, the parameters of the detector are\ngreatly reduced without performance degradation by introducing a Box Selection\nSchedule. Qualitative and quantitative experiments demonstrate that\nSwinTextSpotter v2 achieved state-of-the-art performance on various\nmultilingual (English, Chinese, and Vietnamese) benchmarks. The code will be\navailable at\n\\href{https://github.com/mxin262/SwinTextSpotterv2}{SwinTextSpotter v2}.\n","authors":["Mingxin Huang","Dezhi Peng","Hongliang Li","Zhenghao Peng","Chongyu Liu","Dahua Lin","Yuliang Liu","Xiang Bai","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2401.07641v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2203.10209"},{"id":"http://arxiv.org/abs/2401.07629v1","updated":"2024-01-15T12:12:48Z","published":"2024-01-15T12:12:48Z","title":"Fine-Grained Prototypes Distillation for Few-Shot Object Detection","summary":" Few-shot object detection (FSOD) aims at extending a generic detector for\nnovel object detection with only a few training examples. It attracts great\nconcerns recently due to the practical meanings. Meta-learning has been\ndemonstrated to be an effective paradigm for this task. In general, methods\nbased on meta-learning employ an additional support branch to encode novel\nexamples (a.k.a. support images) into class prototypes, which are then fused\nwith query branch to facilitate the model prediction. However, the class-level\nprototypes are difficult to precisely generate, and they also lack detailed\ninformation, leading to instability in performance.New methods are required to\ncapture the distinctive local context for more robust novel object detection.\nTo this end, we propose to distill the most representative support features\ninto fine-grained prototypes. These prototypes are then assigned into query\nfeature maps based on the matching results, modeling the detailed feature\nrelations between two branches. This process is realized by our Fine-Grained\nFeature Aggregation (FFA) module. 
Moreover, in terms of high-level feature\nfusion, we propose Balanced Class-Agnostic Sampling (B-CAS) strategy and\nNon-Linear Fusion (NLF) module from differenct perspectives. They are\ncomplementary to each other and depict the high-level feature relations more\neffectively. Extensive experiments on PASCAL VOC and MS COCO benchmarks show\nthat our method sets a new state-of-the-art performance in most settings. Our\ncode is available at https://github.com/wangchen1801/FPD.\n","authors":["Zichen Wang","Bo Yang","Haonan Yue","Zhenghao Ma"],"pdf_url":"https://arxiv.org/pdf/2401.07629v1.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2306.11238v3","updated":"2024-01-15T11:45:49Z","published":"2023-06-20T02:21:45Z","title":"CAMP-Net: Consistency-Aware Multi-Prior Network for Accelerated MRI\n Reconstruction","summary":" Undersampling k-space data in MRI reduces scan time but pose challenges in\nimage reconstruction. Considerable progress has been made in reconstructing\naccelerated MRI. However, restoration of high-frequency image details in highly\nundersampled data remains challenging. To address this issue, we propose\nCAMP-Net, an unrolling-based Consistency-Aware Multi-Prior Network for\naccelerated MRI reconstruction. CAMP-Net leverages complementary multi-prior\nknowledge and multi-slice information from various domains to enhance\nreconstruction quality. Specifically, CAMP-Net comprises three interleaved\nmodules for image enhancement, k-space restoration, and calibration\nconsistency, respectively. These modules jointly learn priors from data in\nimage domain, k-domain, and calibration region, respectively, in data-driven\nmanner during each unrolled iteration. Notably, the encoded calibration prior\nknowledge extracted from auto-calibrating signals implicitly guides the\nlearning of consistency-aware k-space correlation for reliable interpolation of\nmissing k-space data. To maximize the benefits of image domain and k-domain\nprior knowledge, the reconstructions are aggregated in a frequency fusion\nmodule, exploiting their complementary properties to optimize the trade-off\nbetween artifact removal and fine detail preservation. Additionally, we\nincorporate a surface data fidelity layer during the learning of k-domain and\ncalibration domain priors to prevent degradation of the reconstruction caused\nby padding-induced data imperfections. We evaluate the generalizability and\nrobustness of our method on three large public datasets with varying\nacceleration factors and sampling patterns. The experimental results\ndemonstrate that our method outperforms state-of-the-art approaches in terms of\nboth reconstruction quality and $T_2$ mapping estimation, particularly in\nscenarios with high acceleration factors.\n","authors":["Liping Zhang","Xiaobo Li","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2306.11238v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02700v2","updated":"2024-01-15T11:41:11Z","published":"2023-11-05T16:12:48Z","title":"A Generative Multi-Resolution Pyramid and Normal-Conditioning 3D Cloth\n Draping","summary":" RGB cloth generation has been deeply studied in the related literature,\nhowever, 3D garment generation remains an open problem. In this paper, we build\na conditional variational autoencoder for 3D garment generation and draping. We\npropose a pyramid network to add garment details progressively in a canonical\nspace, i.e. unposing and unshaping the garments w.r.t. the body. 
We study\nconditioning the network on surface normal UV maps, as an intermediate\nrepresentation, which is an easier problem to optimize than 3D coordinates. Our\nresults on two public datasets, CLOTH3D and CAPE, show that our model is\nrobust, controllable in terms of detail generation by the use of\nmulti-resolution pyramids, and achieves state-of-the-art results that can\nhighly generalize to unseen garments, poses, and shapes even when training with\nsmall amounts of data.\n","authors":["Hunor Laczkó","Meysam Madadi","Sergio Escalera","Jordi Gonzalez"],"pdf_url":"https://arxiv.org/pdf/2311.02700v2.pdf","comment":"WACV24, IEEE copyright"},{"id":"http://arxiv.org/abs/2204.07636v2","updated":"2024-01-15T11:11:23Z","published":"2022-04-15T20:24:11Z","title":"Lagrangian Motion Magnification with Double Sparse Optical Flow\n Decomposition","summary":" Microexpressions are fast and spatially small facial expressions that are\ndifficult to detect. Therefore motion magnification techniques, which aim at\namplifying and hence revealing subtle motion in videos, appear useful for\nhandling such expressions. There are basically two main approaches, namely via\nEulerian or Lagrangian techniques. While the first one magnifies motion\nimplicitly by operating directly on image pixels, the Lagrangian approach uses\noptical flow (OF) techniques to extract and magnify pixel trajectories. In this\npaper, we propose a novel approach for local Lagrangian motion magnification of\nfacial micro-motions. Our contribution is three-fold: first, we fine tune the\nrecurrent all-pairs field transforms (RAFT) for OFs deep learning approach for\nfaces by adding ground truth obtained from the variational dense inverse search\n(DIS) for OF algorithm applied to the CASME II video set of facial micro\nexpressions. This enables us to produce OFs of facial videos in an efficient\nand sufficiently accurate way. Second, since facial micro-motions are both\nlocal in space and time, we propose to approximate the OF field by sparse\ncomponents both in space and time leading to a double sparse decomposition.\nThird, we use this decomposition to magnify micro-motions in specific areas of\nthe face, where we introduce a new forward warping strategy using a triangular\nsplitting of the image grid and barycentric interpolation of the RGB vectors at\nthe corners of the transformed triangles. We demonstrate the feasibility of our\napproach by various examples.\n","authors":["Philipp Flotho","Cosmas Heiss","Gabriele Steidl","Daniel J. Strauss"],"pdf_url":"https://arxiv.org/pdf/2204.07636v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07591v1","updated":"2024-01-15T10:54:35Z","published":"2024-01-15T10:54:35Z","title":"Multimodal Crowd Counting with Pix2Pix GANs","summary":" Most state-of-the-art crowd counting methods use color (RGB) images to learn\nthe density map of the crowd. However, these methods often struggle to achieve\nhigher accuracy in densely crowded scenes with poor illumination. Recently,\nsome studies have reported improvement in the accuracy of crowd counting models\nusing a combination of RGB and thermal images. Although multimodal data can\nlead to better predictions, multimodal data might not be always available\nbeforehand. In this paper, we propose the use of generative adversarial\nnetworks (GANs) to automatically generate thermal infrared (TIR) images from\ncolor (RGB) images and use both to train crowd counting models to achieve\nhigher accuracy. 
We use a Pix2Pix GAN network first to translate RGB images to\nTIR images. Our experiments on several state-of-the-art crowd counting models\nand benchmark crowd datasets report significant improvement in accuracy.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2401.07591v1.pdf","comment":"Accepted version of the paper in 19th International Conference on\n Computer Vision Theory and Applications (VISAPP), Rome, Italy, 27-29 Feb,\n 2024,"},{"id":"http://arxiv.org/abs/2401.07586v1","updated":"2024-01-15T10:46:01Z","published":"2024-01-15T10:46:01Z","title":"Curriculum for Crowd Counting -- Is it Worthy?","summary":" Recent advances in deep learning techniques have achieved remarkable\nperformance in several computer vision problems. A notably intuitive technique\ncalled Curriculum Learning (CL) has been introduced recently for training deep\nlearning models. Surprisingly, curriculum learning achieves significantly\nimproved results in some tasks but marginal or no improvement in others. Hence,\nthere is still a debate about its adoption as a standard method to train\nsupervised learning models. In this work, we investigate the impact of\ncurriculum learning in crowd counting using the density estimation method. We\nperformed detailed investigations by conducting 112 experiments using six\ndifferent CL settings using eight different crowd models. Our experiments show\nthat curriculum learning improves the model learning performance and shortens\nthe convergence time.\n","authors":["Muhammad Asif Khan","Hamid Menouar","Ridha Hamila"],"pdf_url":"https://arxiv.org/pdf/2401.07586v1.pdf","comment":"Accepted version of the paper in 19th International Conference on\n Computer Vision Theory and Applications (VISAPP), Rome, Italy, 27-19 February\n 2024"},{"id":"http://arxiv.org/abs/2203.04049v2","updated":"2024-01-15T10:44:57Z","published":"2022-03-08T12:39:05Z","title":"Graph Attention Transformer Network for Multi-Label Image Classification","summary":" Multi-label classification aims to recognize multiple objects or attributes\nfrom images. However, it is challenging to learn from proper label graphs to\neffectively characterize such inter-label correlations or dependencies. Current\nmethods often use the co-occurrence probability of labels based on the training\nset as the adjacency matrix to model this correlation, which is greatly limited\nby the dataset and affects the model's generalization ability. In this paper,\nwe propose a Graph Attention Transformer Network (GATN), a general framework\nfor multi-label image classification that can effectively mine complex\ninter-label relationships. First, we use the cosine similarity based on the\nlabel word embedding as the initial correlation matrix, which can represent\nrich semantic information. Subsequently, we design the graph attention\ntransformer layer to transfer this adjacency matrix to adapt to the current\ndomain. 
Our extensive experiments have demonstrated that our proposed methods\ncan achieve state-of-the-art performance on three datasets.\n","authors":["Jin Yuan","Shikai Chen","Yao Zhang","Zhongchao Shi","Xin Geng","Jianping Fan","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2203.04049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07584v1","updated":"2024-01-15T10:42:04Z","published":"2024-01-15T10:42:04Z","title":"Collaboratively Self-supervised Video Representation Learning for Action\n Recognition","summary":" Considering the close connection between action recognition and human pose\nestimation, we design a Collaboratively Self-supervised Video Representation\n(CSVR) learning framework specific to action recognition by jointly considering\ngenerative pose prediction and discriminative context matching as pretext\ntasks. Specifically, our CSVR consists of three branches: a generative pose\nprediction branch, a discriminative context matching branch, and a video\ngenerating branch. Among them, the first one encodes dynamic motion feature by\nutilizing Conditional-GAN to predict the human poses of future frames, and the\nsecond branch extracts static context features by pulling the representations\nof clips and compressed key frames from the same video together while pushing\napart the pairs from different videos. The third branch is designed to recover\nthe current video frames and predict the future ones, for the purpose of\ncollaboratively improving dynamic motion features and static context features.\nExtensive experiments demonstrate that our method achieves state-of-the-art\nperformance on the UCF101 and HMDB51 datasets.\n","authors":["Jie Zhang","Zhifan Wan","Lanqing Hu","Stephen Lin","Shuzhe Wu","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2401.07584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07582v1","updated":"2024-01-15T10:38:07Z","published":"2024-01-15T10:38:07Z","title":"Geo-locating Road Objects using Inverse Haversine Formula with NVIDIA\n Driveworks","summary":" Geolocation is integral to the seamless functioning of autonomous vehicles\nand advanced traffic monitoring infrastructures. This paper introduces a\nmethodology to geolocate road objects using a monocular camera, leveraging the\nNVIDIA DriveWorks platform. We use the Centimeter Positioning Service (CPOS)\nand the inverse Haversine formula to geo-locate road objects accurately. The\nreal-time algorithm processing capability of the NVIDIA DriveWorks platform\nenables instantaneous object recognition and spatial localization for Advanced\nDriver Assistance Systems (ADAS) and autonomous driving platforms. We present a\nmeasurement pipeline suitable for autonomous driving (AD) platforms and provide\ndetailed guidelines for calibrating cameras using NVIDIA DriveWorks.\nExperiments were carried out to validate the accuracy of the proposed method\nfor geolocating targets in both controlled and dynamic settings. We show that\nour approach can locate targets with less than 1m error when the AD platform is\nstationary and less than 4m error at higher speeds (i.e. 
up to 60km/h) within a\n15m radius.\n","authors":["Mamoona Birkhez Shami","Gabriel Kiss","Trond Arve Haakonsen","Frank Lindseth"],"pdf_url":"https://arxiv.org/pdf/2401.07582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16221v4","updated":"2024-01-15T10:34:18Z","published":"2023-10-24T22:24:44Z","title":"Hierarchical Randomized Smoothing","summary":" Real-world data is complex and often consists of objects that can be\ndecomposed into multiple entities (e.g. images into pixels, graphs into\ninterconnected nodes). Randomized smoothing is a powerful framework for making\nmodels provably robust against small changes to their inputs - by guaranteeing\nrobustness of the majority vote when randomly adding noise before\nclassification. Yet, certifying robustness on such complex data via randomized\nsmoothing is challenging when adversaries do not arbitrarily perturb entire\nobjects (e.g. images) but only a subset of their entities (e.g. pixels). As a\nsolution, we introduce hierarchical randomized smoothing: We partially smooth\nobjects by adding random noise only on a randomly selected subset of their\nentities. By adding noise in a more targeted manner than existing methods we\nobtain stronger robustness guarantees while maintaining high accuracy. We\ninitialize hierarchical smoothing using different noising distributions,\nyielding novel robustness certificates for discrete and continuous domains. We\nexperimentally demonstrate the importance of hierarchical smoothing in image\nand node classification, where it yields superior robustness-accuracy\ntrade-offs. Overall, hierarchical smoothing is an important contribution\ntowards models that are both - certifiably robust to perturbations and\naccurate.\n","authors":["Yan Scholten","Jan Schuchardt","Aleksandar Bojchevski","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2310.16221v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07579v1","updated":"2024-01-15T10:26:47Z","published":"2024-01-15T10:26:47Z","title":"PMFSNet: Polarized Multi-scale Feature Self-attention Network For\n Lightweight Medical Image Segmentation","summary":" Current state-of-the-art medical image segmentation methods prioritize\naccuracy but often at the expense of increased computational demands and larger\nmodel sizes. Applying these large-scale models to the relatively limited scale\nof medical image datasets tends to induce redundant computation, complicating\nthe process without the necessary benefits. This approach not only adds\ncomplexity but also presents challenges for the integration and deployment of\nlightweight models on edge devices. For instance, recent transformer-based\nmodels have excelled in 2D and 3D medical image segmentation due to their\nextensive receptive fields and high parameter count. However, their\neffectiveness comes with a risk of overfitting when applied to small datasets\nand often neglects the vital inductive biases of Convolutional Neural Networks\n(CNNs), essential for local feature representation. In this work, we propose\nPMFSNet, a novel medical imaging segmentation model that effectively balances\nglobal and local feature processing while avoiding the computational redundancy\ntypical in larger models. PMFSNet streamlines the UNet-based hierarchical\nstructure and simplifies the self-attention mechanism's computational\ncomplexity, making it suitable for lightweight applications. 
It incorporates a\nplug-and-play PMFS block, a multi-scale feature enhancement module based on\nattention mechanisms, to capture long-term dependencies. Extensive\ncomprehensive results demonstrate that even with a model (less than 1 million\nparameters), our method achieves superior performance in various segmentation\ntasks across different data scales. It achieves (IoU) metrics of 84.68%,\n82.02%, and 78.82% on public datasets of teeth CT (CBCT), ovarian tumors\nultrasound(MMOTU), and skin lesions dermoscopy images (ISIC 2018),\nrespectively. The source code is available at\nhttps://github.com/yykzjh/PMFSNet.\n","authors":["Jiahui Zhong","Wenhong Tian","Yuanlun Xie","Zhijia Liu","Jie Ou","Taoran Tian","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07572v1","updated":"2024-01-15T10:16:44Z","published":"2024-01-15T10:16:44Z","title":"Exploiting GPT-4 Vision for Zero-shot Point Cloud Understanding","summary":" In this study, we tackle the challenge of classifying the object category in\npoint clouds, which previous works like PointCLIP struggle to address due to\nthe inherent limitations of the CLIP architecture. Our approach leverages GPT-4\nVision (GPT-4V) to overcome these challenges by employing its advanced\ngenerative abilities, enabling a more adaptive and robust classification\nprocess. We adapt the application of GPT-4V to process complex 3D data,\nenabling it to achieve zero-shot recognition capabilities without altering the\nunderlying model architecture. Our methodology also includes a systematic\nstrategy for point cloud image visualization, mitigating domain gap and\nenhancing GPT-4V's efficiency. Experimental validation demonstrates our\napproach's superiority in diverse scenarios, setting a new benchmark in\nzero-shot point cloud classification.\n","authors":["Qi Sun","Xiao Cui","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2401.07572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07571v1","updated":"2024-01-15T10:11:19Z","published":"2024-01-15T10:11:19Z","title":"A Bi-Pyramid Multimodal Fusion Method for the Diagnosis of Bipolar\n Disorders","summary":" Previous research on the diagnosis of Bipolar disorder has mainly focused on\nresting-state functional magnetic resonance imaging. However, their accuracy\ncan not meet the requirements of clinical diagnosis. Efficient multimodal\nfusion strategies have great potential for applications in multimodal data and\ncan further improve the performance of medical diagnosis models. In this work,\nwe utilize both sMRI and fMRI data and propose a novel multimodal diagnosis\nmodel for bipolar disorder. The proposed Patch Pyramid Feature Extraction\nModule extracts sMRI features, and the spatio-temporal pyramid structure\nextracts the fMRI features. Finally, they are fused by a fusion module to\noutput diagnosis results with a classifier. 
Extensive experiments show that our\nproposed method outperforms others in balanced accuracy from 0.657 to 0.732 on\nthe OpenfMRI dataset, and achieves the state of the art.\n","authors":["Guoxin Wang","Sheng Shi","Shan An","Fengmei Fan","Wenshu Ge","Qi Wang","Feng Yu","Zhiren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07571v1.pdf","comment":"Accepted by IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.07567v1","updated":"2024-01-15T09:59:43Z","published":"2024-01-15T09:59:43Z","title":"Bias-Conflict Sample Synthesis and Adversarial Removal Debias Strategy\n for Temporal Sentence Grounding in Video","summary":" Temporal Sentence Grounding in Video (TSGV) is troubled by dataset bias\nissue, which is caused by the uneven temporal distribution of the target\nmoments for samples with similar semantic components in input videos or query\ntexts. Existing methods resort to utilizing prior knowledge about bias to\nartificially break this uneven distribution, which only removes a limited\namount of significant language biases. In this work, we propose the\nbias-conflict sample synthesis and adversarial removal debias strategy\n(BSSARD), which dynamically generates bias-conflict samples by explicitly\nleveraging potentially spurious correlations between single-modality features\nand the temporal position of the target moments. Through adversarial training,\nits bias generators continuously introduce biases and generate bias-conflict\nsamples to deceive its grounding model. Meanwhile, the grounding model\ncontinuously eliminates the introduced biases, which requires it to model\nmulti-modality alignment information. BSSARD will cover most kinds of coupling\nrelationships and disrupt language and visual biases simultaneously. Extensive\nexperiments on Charades-CD and ActivityNet-CD demonstrate the promising\ndebiasing capability of BSSARD. Source codes are available at\nhttps://github.com/qzhb/BSSARD.\n","authors":["Zhaobo Qi","Yibo Yuan","Xiaowen Ruan","Shuhui Wang","Weigang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2401.07567v1.pdf","comment":"accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2306.10898v2","updated":"2024-01-15T09:13:05Z","published":"2023-06-19T12:54:28Z","title":"B-cos Alignment for Inherently Interpretable CNNs and Vision\n Transformers","summary":" We present a new direction for increasing the interpretability of deep neural\nnetworks (DNNs) by promoting weight-input alignment during training. For this,\nwe propose to replace the linear transformations in DNNs by our novel B-cos\ntransformation. As we show, a sequence (network) of such transformations\ninduces a single linear transformation that faithfully summarises the full\nmodel computations. Moreover, the B-cos transformation is designed such that\nthe weights align with relevant signals during optimisation. As a result, those\ninduced linear transformations become highly interpretable and highlight\ntask-relevant features. Importantly, the B-cos transformation is designed to be\ncompatible with existing architectures and we show that it can easily be\nintegrated into virtually all of the latest state of the art models for\ncomputer vision - e.g. ResNets, DenseNets, ConvNext models, as well as Vision\nTransformers - by combining the B-cos-based explanations with normalisation and\nattention layers, all whilst maintaining similar accuracy on ImageNet. 
Finally,\nwe show that the resulting explanations are of high visual quality and perform\nwell under quantitative interpretability metrics.\n","authors":["Moritz Böhle","Navdeeppal Singh","Mario Fritz","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2306.10898v2.pdf","comment":"Extension of B-cos Networks: Alignment is All We Need for\n Interpretability (B\\\"ohle et al., CVPR 2022). Accepted for publication in\n IEEE Transactions on Pattern Analysis and Machine Intelligence. arXiv admin\n note: substantial text overlap with arXiv:2205.10268"},{"id":"http://arxiv.org/abs/2309.17334v2","updated":"2024-01-15T09:05:34Z","published":"2023-09-29T15:46:25Z","title":"Multi-Depth Branch Network for Efficient Image Super-Resolution","summary":" A longstanding challenge in Super-Resolution (SR) is how to efficiently\nenhance high-frequency details in Low-Resolution (LR) images while maintaining\nsemantic coherence. This is particularly crucial in practical applications\nwhere SR models are often deployed on low-power devices. To address this issue,\nwe propose an innovative asymmetric SR architecture featuring Multi-Depth\nBranch Module (MDBM). These MDBMs contain branches of different depths,\ndesigned to capture high- and low-frequency information simultaneously and\nefficiently. The hierarchical structure of MDBM allows the deeper branch to\ngradually accumulate fine-grained local details under the contextual guidance\nof the shallower branch. We visualize this process using feature maps, and\nfurther demonstrate the rationality and effectiveness of this design using\nproposed novel Fourier spectral analysis methods. Moreover, our model exhibits\nmore significant spectral differentiation between branches than existing branch\nnetworks. This suggests that MDBM reduces feature redundancy and offers a more\neffective method for integrating high- and low-frequency information. Extensive\nqualitative and quantitative evaluations on various datasets show that our\nmodel can generate structurally consistent and visually realistic HR images. It\nachieves state-of-the-art (SOTA) results at a very fast inference speed. Our\ncode is available at https://github.com/thy960112/MDBN.\n","authors":["Huiyuan Tian","Li Zhang","Shijian Li","Min Yao","Gang Pan"],"pdf_url":"https://arxiv.org/pdf/2309.17334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07542v1","updated":"2024-01-15T09:03:50Z","published":"2024-01-15T09:03:50Z","title":"Combining Image- and Geometric-based Deep Learning for Shape Regression:\n A Comparison to Pixel-level Methods for Segmentation in Chest X-Ray","summary":" When solving a segmentation task, shaped-base methods can be beneficial\ncompared to pixelwise classification due to geometric understanding of the\ntarget object as shape, preventing the generation of anatomical implausible\npredictions in particular for corrupted data. In this work, we propose a novel\nhybrid method that combines a lightweight CNN backbone with a geometric neural\nnetwork (Point Transformer) for shape regression. 
Using the same CNN encoder,\nthe Point Transformer reaches segmentation quality on per with current\nstate-of-the-art convolutional decoders ($4\\pm1.9$ vs $3.9\\pm2.9$ error in mm\nand $85\\pm13$ vs $88\\pm10$ Dice), but crucially, is more stable w.r.t image\ndistortion, starting to outperform them at a corruption level of 30%.\nFurthermore, we include the nnU-Net as an upper baseline, which has $3.7\\times$\nmore trainable parameters than our proposed method.\n","authors":["Ron Keuth","Mattias Heinrich"],"pdf_url":"https://arxiv.org/pdf/2401.07542v1.pdf","comment":"Submitted to German Conference on Medical Image Computing 2024"},{"id":"http://arxiv.org/abs/2109.13004v2","updated":"2024-01-15T08:44:20Z","published":"2021-09-27T12:39:46Z","title":"Optimising for Interpretability: Convolutional Dynamic Alignment\n Networks","summary":" We introduce a new family of neural network models called Convolutional\nDynamic Alignment Networks (CoDA Nets), which are performant classifiers with a\nhigh degree of inherent interpretability. Their core building blocks are\nDynamic Alignment Units (DAUs), which are optimised to transform their inputs\nwith dynamically computed weight vectors that align with task-relevant\npatterns. As a result, CoDA Nets model the classification prediction through a\nseries of input-dependent linear transformations, allowing for linear\ndecomposition of the output into individual input contributions. Given the\nalignment of the DAUs, the resulting contribution maps align with\ndiscriminative input patterns. These model-inherent decompositions are of high\nvisual quality and outperform existing attribution methods under quantitative\nmetrics. Further, CoDA Nets constitute performant classifiers, achieving on par\nresults to ResNet and VGG models on e.g. CIFAR-10 and TinyImagenet. Lastly,\nCoDA Nets can be combined with conventional neural network models to yield\npowerful classifiers that more easily scale to complex datasets such as\nImagenet whilst exhibiting an increased interpretable depth, i.e., the output\ncan be explained well in terms of contributions from intermediate layers within\nthe network.\n","authors":["Moritz Böhle","Mario Fritz","Bernt Schiele"],"pdf_url":"https://arxiv.org/pdf/2109.13004v2.pdf","comment":"Extension of \"Convolutional Dynamic Alignment Networks for\n Interpretable Classifications\" (B\\\"ohle et al., CVPR 2021). arXiv admin note:\n substantial text overlap with arXiv:2104.00032"},{"id":"http://arxiv.org/abs/2401.06637v2","updated":"2024-01-15T08:23:02Z","published":"2024-01-12T15:29:21Z","title":"Adversarial Examples are Misaligned in Diffusion Model Manifolds","summary":" In recent years, diffusion models (DMs) have drawn significant attention for\ntheir success in approximating data distributions, yielding state-of-the-art\ngenerative results. Nevertheless, the versatility of these models extends\nbeyond their generative capabilities to encompass various vision applications,\nsuch as image inpainting, segmentation, adversarial robustness, among others.\nThis study is dedicated to the investigation of adversarial attacks through the\nlens of diffusion models. However, our objective does not involve enhancing the\nadversarial robustness of image classifiers. Instead, our focus lies in\nutilizing the diffusion model to detect and analyze the anomalies introduced by\nthese attacks on images. 
To that end, we systematically examine the alignment\nof the distributions of adversarial examples when subjected to the process of\ntransformation using diffusion models. The efficacy of this approach is\nassessed across CIFAR-10 and ImageNet datasets, including varying image sizes\nin the latter. The results demonstrate a notable capacity to discriminate\neffectively between benign and attacked images, providing compelling evidence\nthat adversarial instances do not align with the learned manifold of the DMs.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2401.06637v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2401.07529v1","updated":"2024-01-15T08:19:22Z","published":"2024-01-15T08:19:22Z","title":"MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of\n Multimodal Large Language Models in Perception","summary":" Multimodal Large Language Models (MLLMs) have shown their remarkable\nabilities in visual perception and understanding recently. However, how to\ncomprehensively evaluate the capabilities of MLLMs remains a challenge. Most of\nthe existing benchmarks predominantly focus on assessing perception, cognition,\nand reasoning, neglecting the abilities of self-awareness, referring to the\nmodel's recognition of its own capability boundary. In our study, we focus on\nself-awareness in image perception and introduce the knowledge quadrant for\nMLLMs, which clearly defines the knowns and unknowns in perception. Based on\nthis, we propose a novel benchmark specifically designed to evaluate the\nSelf-Aware capabilities in Perception for MLLMs(MM-SAP). MM-SAP encompasses\nthree distinct sub-datasets, each focusing on different aspects of\nself-awareness. We evaluated eight well-known MLLMs using MM-SAP, analyzing\ntheir self-awareness and providing detailed insights. Code and data are\navailable at https://github.com/YHWmz/MM-SAP\n","authors":["Yuhao Wang","Yusheng Liao","Heyang Liu","Hongcheng Liu","Yu Wang","Yanfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03517v2","updated":"2024-01-15T08:15:52Z","published":"2023-11-06T20:51:16Z","title":"SoundCam: A Dataset for Finding Humans Using Room Acoustics","summary":" A room's acoustic properties are a product of the room's geometry, the\nobjects within the room, and their specific positions. A room's acoustic\nproperties can be characterized by its impulse response (RIR) between a source\nand listener location, or roughly inferred from recordings of natural signals\npresent in the room. Variations in the positions of objects in a room can\neffect measurable changes in the room's acoustic properties, as characterized\nby the RIR. Existing datasets of RIRs either do not systematically vary\npositions of objects in an environment, or they consist of only simulated RIRs.\nWe present SoundCam, the largest dataset of unique RIRs from in-the-wild rooms\npublicly released to date. It includes 5,000 10-channel real-world measurements\nof room impulse responses and 2,000 10-channel recordings of music in three\ndifferent rooms, including a controlled acoustic lab, an in-the-wild living\nroom, and a conference room, with different humans in positions throughout each\nroom. 
We show that these measurements can be used for interesting tasks, such\nas detecting and identifying humans, and tracking their positions.\n","authors":["Mason Wang","Samuel Clarke","Jui-Hsien Wang","Ruohan Gao","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2311.03517v2.pdf","comment":"In NeurIPS 2023 Datasets and Benchmarks Track. Project page:\n https://masonlwang.com/soundcam/. Wang and Clarke contributed equally to this\n work"},{"id":"http://arxiv.org/abs/2401.07527v1","updated":"2024-01-15T08:12:51Z","published":"2024-01-15T08:12:51Z","title":"One for All: Toward Unified Foundation Models for Earth Vision","summary":" Foundation models characterized by extensive parameters and trained on\nlarge-scale datasets have demonstrated remarkable efficacy across various\ndownstream tasks for remote sensing data. Current remote sensing foundation\nmodels typically specialize in a single modality or a specific spatial\nresolution range, limiting their versatility for downstream datasets. While\nthere have been attempts to develop multi-modal remote sensing foundation\nmodels, they typically employ separate vision encoders for each modality or\nspatial resolution, necessitating a switch in backbones contingent upon the\ninput data. To address this issue, we introduce a simple yet effective method,\ntermed OFA-Net (One-For-All Network): employing a single, shared Transformer\nbackbone for multiple data modalities with different spatial resolutions. Using\nthe masked image modeling mechanism, we pre-train a single Transformer backbone\non a curated multi-modal dataset with this simple design. Then the backbone\nmodel can be used in different downstream tasks, thus forging a path towards a\nunified foundation backbone model in Earth vision. The proposed method is\nevaluated on 12 distinct downstream tasks and demonstrates promising\nperformance.\n","authors":["Zhitong Xiong","Yi Wang","Fahong Zhang","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.07527v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2305.18455v2","updated":"2024-01-15T07:51:23Z","published":"2023-05-29T04:22:57Z","title":"Diff-Instruct: A Universal Approach for Transferring Knowledge From\n Pre-trained Diffusion Models","summary":" Due to the ease of training, ability to scale, and high sample quality,\ndiffusion models (DMs) have become the preferred option for generative\nmodeling, with numerous pre-trained models available for a wide variety of\ndatasets. Containing intricate information about data distributions,\npre-trained DMs are valuable assets for downstream applications. In this work,\nwe consider learning from pre-trained DMs and transferring their knowledge to\nother generative models in a data-free fashion. Specifically, we propose a\ngeneral framework called Diff-Instruct to instruct the training of arbitrary\ngenerative models as long as the generated samples are differentiable with\nrespect to the model parameters. Our proposed Diff-Instruct is built on a\nrigorous mathematical foundation where the instruction process directly\ncorresponds to minimizing a novel divergence we call Integral Kullback-Leibler\n(IKL) divergence. IKL is tailored for DMs by calculating the integral of the KL\ndivergence along a diffusion process, which we show to be more robust in\ncomparing distributions with misaligned supports. We also reveal non-trivial\nconnections of our method to existing works such as DreamFusion, and generative\nadversarial training. 
To demonstrate the effectiveness and universality of\nDiff-Instruct, we consider two scenarios: distilling pre-trained diffusion\nmodels and refining existing GAN models. The experiments on distilling\npre-trained diffusion models show that Diff-Instruct results in\nstate-of-the-art single-step diffusion-based models. The experiments on\nrefining GAN models show that the Diff-Instruct can consistently improve the\npre-trained generators of GAN models across various settings.\n","authors":["Weijian Luo","Tianyang Hu","Shifeng Zhang","Jiacheng Sun","Zhenguo Li","Zhihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.18455v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07519v1","updated":"2024-01-15T07:50:18Z","published":"2024-01-15T07:50:18Z","title":"InstantID: Zero-shot Identity-Preserving Generation in Seconds","summary":" There has been significant progress in personalized image synthesis with\nmethods such as Textual Inversion, DreamBooth, and LoRA. Yet, their real-world\napplicability is hindered by high storage demands, lengthy fine-tuning\nprocesses, and the need for multiple reference images. Conversely, existing ID\nembedding-based methods, while requiring only a single forward inference, face\nchallenges: they either necessitate extensive fine-tuning across numerous model\nparameters, lack compatibility with community pre-trained models, or fail to\nmaintain high face fidelity. Addressing these limitations, we introduce\nInstantID, a powerful diffusion model-based solution. Our plug-and-play module\nadeptly handles image personalization in various styles using just a single\nfacial image, while ensuring high fidelity. To achieve this, we design a novel\nIdentityNet by imposing strong semantic and weak spatial conditions,\nintegrating facial and landmark images with textual prompts to steer the image\ngeneration. InstantID demonstrates exceptional performance and efficiency,\nproving highly beneficial in real-world applications where identity\npreservation is paramount. Moreover, our work seamlessly integrates with\npopular pre-trained text-to-image diffusion models like SD1.5 and SDXL, serving\nas an adaptable plugin. Our codes and pre-trained checkpoints will be available\nat https://github.com/InstantID/InstantID.\n","authors":["Qixun Wang","Xu Bai","Haofan Wang","Zekui Qin","Anthony Chen"],"pdf_url":"https://arxiv.org/pdf/2401.07519v1.pdf","comment":"Technical Report, project page available at\n https://instantid.github.io/"},{"id":"http://arxiv.org/abs/2306.14448v3","updated":"2024-01-15T07:45:02Z","published":"2023-06-26T06:34:53Z","title":"Progressive Energy-Based Cooperative Learning for Multi-Domain\n Image-to-Image Translation","summary":" This paper studies a novel energy-based cooperative learning framework for\nmulti-domain image-to-image translation. The framework consists of four\ncomponents: descriptor, translator, style encoder, and style generator. The\ndescriptor is a multi-head energy-based model that represents a multi-domain\nimage distribution. The components of translator, style encoder, and style\ngenerator constitute a diversified image generator. Specifically, given an\ninput image from a source domain, the translator turns it into a stylised\noutput image of the target domain according to a style code, which can be\ninferred by the style encoder from a reference image or produced by the style\ngenerator from a random noise. 
Since the style generator is represented as an\ndomain-specific distribution of style codes, the translator can provide a\none-to-many transformation (i.e., diversified generation) between source domain\nand target domain. To train our framework, we propose a likelihood-based\nmulti-domain cooperative learning algorithm to jointly train the multi-domain\ndescriptor and the diversified image generator (including translator, style\nencoder, and style generator modules) via multi-domain MCMC teaching, in which\nthe descriptor guides the diversified image generator to shift its probability\ndensity toward the data distribution, while the diversified image generator\nuses its randomly translated images to initialize the descriptor's Langevin\ndynamics process for efficient sampling.\n","authors":["Weinan Song","Yaxuan Zhu","Lei He","Yingnian Wu","Jianwen Xie"],"pdf_url":"https://arxiv.org/pdf/2306.14448v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12401v3","updated":"2024-01-15T07:32:28Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-levle Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships exist many complicated noises outside the\nsegment-level, making it infeasible to directly express macro action semantics.\nThus, we propose Causal Abstraction Segmentation Refiner (CASR), which can\nrefine TAS results from various models by enhancing video causality in\nmarginalizing frame-level casual relationships. Specifically, we define the\nequivalent frame-level casual model and segment-level causal model, so that the\ncausal adjacency matrix constructed from marginalized frame-level causal\nrelationships has the ability to represent the segmnet-level causal\nrelationships. CASR works out by reducing the difference in the causal\nadjacency matrix between we constructed and pre-segmentation results of\nbackbone models. In addition, we propose a novel evaluation metric Causal Edit\nDistance (CED) to evaluate the causal interpretability. Extensive experimental\nresults on mainstream datasets indicate that CASR significantly surpasses\nexisting various methods in action segmentation performance, as well as in\ncausal explainability and generalization.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04377v2","updated":"2024-01-15T07:22:25Z","published":"2024-01-09T06:52:23Z","title":"Towards Real-World Aerial Vision Guidance with Categorical 6D Pose\n Tracker","summary":" Tracking the object 6-DoF pose is crucial for various downstream robot tasks\nand real-world applications. In this paper, we investigate the real-world robot\ntask of aerial vision guidance for aerial robotics manipulation, utilizing\ncategory-level 6-DoF pose tracking. Aerial conditions inevitably introduce\nspecial challenges, such as rapid viewpoint changes in pitch and roll and\ninter-frame differences. To support these challenges in task, we firstly\nintroduce a robust category-level 6-DoF pose tracker (Robust6DoF). This tracker\nleverages shape and temporal prior knowledge to explore optimal inter-frame\nkeypoint pairs, generated under a priori structural adaptive supervision in a\ncoarse-to-fine manner. 
Notably, our Robust6DoF employs a Spatial-Temporal\nAugmentation module to deal with the problems of the inter-frame differences\nand intra-class shape variations through both temporal dynamic filtering and\nshape-similarity filtering. We further present a Pose-Aware Discrete Servo\nstrategy (PAD-Servo), serving as a decoupling approach to implement the final\naerial vision guidance task. It contains two servo action policies to better\naccommodate the structural properties of aerial robotics manipulation.\nExhaustive experiments on four well-known public benchmarks demonstrate the\nsuperiority of our Robust6DoF. Real-world tests directly verify that our\nRobust6DoF along with PAD-Servo can be readily used in real-world aerial\nrobotic applications.\n","authors":["Jingtao Sun","Yaonan Wang","Danwei Wang"],"pdf_url":"https://arxiv.org/pdf/2401.04377v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07503v1","updated":"2024-01-15T07:06:36Z","published":"2024-01-15T07:06:36Z","title":"PolMERLIN: Self-Supervised Polarimetric Complex SAR Image Despeckling\n with Masked Networks","summary":" Despeckling is a crucial noise reduction task in improving the quality of\nsynthetic aperture radar (SAR) images. Directly obtaining noise-free SAR images\nis a challenging task that has hindered the development of accurate despeckling\nalgorithms. The advent of deep learning has facilitated the study of denoising\nmodels that learn from only noisy SAR images. However, existing methods deal\nsolely with single-polarization images and cannot handle the multi-polarization\nimages captured by modern satellites. In this work, we present an extension of\nthe existing model for generating single-polarization SAR images to handle\nmulti-polarization SAR images. Specifically, we propose a novel self-supervised\ndespeckling approach called channel masking, which exploits the relationship\nbetween polarizations. Additionally, we utilize a spatial masking method that\naddresses pixel-to-pixel correlations to further enhance the performance of our\napproach. By effectively incorporating multiple polarization information, our\nmethod surpasses current state-of-the-art methods in quantitative evaluation in\nboth synthetic and real-world scenarios.\n","authors":["Shunya Kato","Masaki Saito","Katsuhiko Ishiguro","Sol Cummings"],"pdf_url":"https://arxiv.org/pdf/2401.07503v1.pdf","comment":"To appear on IEEE Geoscience and Remote Sensing Letters"},{"id":"http://arxiv.org/abs/2401.07502v1","updated":"2024-01-15T07:03:10Z","published":"2024-01-15T07:03:10Z","title":"Compositional Oil Spill Detection Based on Object Detector and Adapted\n Segment Anything Model from SAR Images","summary":" Semantic segmentation-based methods have attracted extensive attention in oil\nspill detection from SAR images. However, the existing approaches require a\nlarge number of finely annotated segmentation samples in the training stage. To\nalleviate this issue, we propose a composite oil spill detection framework,\nSAM-OIL, comprising an object detector (e.g., YOLOv8), an adapted Segment\nAnything Model (SAM), and an Ordered Mask Fusion (OMF) module. SAM-OIL is the\nfirst application of the powerful SAM in oil spill detection. Specifically, the\nSAM-OIL strategy uses YOLOv8 to obtain the categories and bounding boxes of oil\nspill-related objects, then inputs bounding boxes into the adapted SAM to\nretrieve category-agnostic masks, and finally adopts the Ordered Mask Fusion\n(OMF) module to fuse the masks and categories. 
The adapted SAM, combining a\nfrozen SAM with a learnable Adapter module, can enhance SAM's ability to\nsegment ambiguous objects. The OMF module, a parameter-free method, can\neffectively resolve pixel category conflicts within SAM. Experimental results\ndemonstrate that SAM-OIL surpasses existing semantic segmentation-based oil\nspill detection methods, achieving mIoU of 69.52%. The results also indicated\nthat both OMF and Adapter modules can effectively improve the accuracy in\nSAM-OIL.\n","authors":["Wenhui Wu","Man Sing Wong","Xinyu Yu","Guoqiang Shi","Coco Yin Tung Kwok","Kang Zou"],"pdf_url":"https://arxiv.org/pdf/2401.07502v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.07500v1","updated":"2024-01-15T06:50:09Z","published":"2024-01-15T06:50:09Z","title":"Harnessing Deep Learning and Satellite Imagery for Post-Buyout Land\n Cover Mapping","summary":" Environmental disasters such as floods, hurricanes, and wildfires have\nincreasingly threatened communities worldwide, prompting various mitigation\nstrategies. Among these, property buyouts have emerged as a prominent approach\nto reducing vulnerability to future disasters. This strategy involves\ngovernments purchasing at-risk properties from willing sellers and converting\nthe land into open space, ostensibly reducing future disaster risk and impact.\nHowever, the aftermath of these buyouts, particularly concerning land-use\npatterns and community impacts, remains under-explored. This research aims to\nfill this gap by employing innovative techniques like satellite imagery\nanalysis and deep learning to study these patterns. To achieve this goal, we\nemployed FEMA's Hazard Mitigation Grant Program (HMGP) buyout dataset,\nencompassing over 41,004 addresses of these buyout properties from 1989 to\n2017. Leveraging Google's Maps Static API, we gathered 40,053 satellite images\ncorresponding to these buyout lands. Subsequently, we implemented five\ncutting-edge machine learning models to evaluate their performance in\nclassifying land cover types. Notably, this task involved multi-class\nclassification, and our model achieved an outstanding ROC-AUC score of 98.86%\n","authors":["Hakan T. Otal","Elyse Zavar","Sherri B. Binder","Alex Greer","M. Abdullah Canbaz"],"pdf_url":"https://arxiv.org/pdf/2401.07500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06071v3","updated":"2024-01-15T06:29:17Z","published":"2024-01-11T17:41:57Z","title":"LEGO:Language Enhanced Multi-modal Grounding Model","summary":" Multi-modal large language models have demonstrated impressive performance\nacross various tasks in different modalities. However, existing multi-modal\nmodels primarily emphasize capturing global information within each modality\nwhile neglecting the importance of perceiving local information across\nmodalities. Consequently, these models lack the ability to effectively\nunderstand the fine-grained details of input data, limiting their performance\nin tasks that require a more nuanced understanding. To address this limitation,\nthere is a compelling need to develop models that enable fine-grained\nunderstanding across multiple modalities, thereby enhancing their applicability\nto a wide range of tasks. In this paper, we propose LEGO, a language enhanced\nmulti-modal grounding model. Beyond capturing global information like other\nmulti-modal models, our proposed model excels at tasks demanding a detailed\nunderstanding of local information within the input. 
It demonstrates precise\nidentification and localization of specific regions in images or moments in\nvideos. To achieve this objective, we design a diversified dataset construction\npipeline, resulting in a multi-modal, multi-granularity dataset for model\ntraining. The code, dataset, and demo of our model can be found at https:\n//github.com/lzw-lzw/LEGO.\n","authors":["Zhaowei Li","Qi Xu","Dong Zhang","Hang Song","Yiqing Cai","Qi Qi","Ran Zhou","Junting Pan","Zefeng Li","Van Tu Vu","Zhida Huang","Tao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.06071v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07487v1","updated":"2024-01-15T06:02:30Z","published":"2024-01-15T06:02:30Z","title":"Robo-ABC: Affordance Generalization Beyond Categories via Semantic\n Correspondence for Robot Manipulation","summary":" Enabling robotic manipulation that generalizes to out-of-distribution scenes\nis a crucial step toward open-world embodied intelligence. For human beings,\nthis ability is rooted in the understanding of semantic correspondence among\nobjects, which naturally transfers the interaction experience of familiar\nobjects to novel ones. Although robots lack such a reservoir of interaction\nexperience, the vast availability of human videos on the Internet may serve as\na valuable resource, from which we extract an affordance memory including the\ncontact points. Inspired by the natural way humans think, we propose Robo-ABC:\nwhen confronted with unfamiliar objects that require generalization, the robot\ncan acquire affordance by retrieving objects that share visual or semantic\nsimilarities from the affordance memory. The next step is to map the contact\npoints of the retrieved objects to the new object. While establishing this\ncorrespondence may present formidable challenges at first glance, recent\nresearch finds it naturally arises from pre-trained diffusion models, enabling\naffordance mapping even across disparate object categories. Through the\nRobo-ABC framework, robots may generalize to manipulate out-of-category objects\nin a zero-shot manner without any manual annotation, additional training, part\nsegmentation, pre-coded knowledge, or viewpoint restrictions. Quantitatively,\nRobo-ABC significantly enhances the accuracy of visual affordance retrieval by\na large margin of 31.6% compared to state-of-the-art (SOTA) end-to-end\naffordance models. We also conduct real-world experiments of cross-category\nobject-grasping tasks. Robo-ABC achieved a success rate of 85.7%, proving its\ncapacity for real-world tasks.\n","authors":["Yuanchen Ju","Kaizhe Hu","Guowei Zhang","Gu Zhang","Mingrun Jiang","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2401.07487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05527v3","updated":"2024-01-15T05:38:47Z","published":"2023-09-11T15:11:11Z","title":"ReSimAD: Zero-Shot 3D Domain Transfer for Autonomous Driving with Source\n Reconstruction and Target Simulation","summary":" Domain shifts such as sensor type changes and geographical situation\nvariations are prevalent in Autonomous Driving (AD), which poses a challenge\nsince AD model relying on the previous domain knowledge can be hardly directly\ndeployed to a new domain without additional costs. In this paper, we provide a\nnew perspective and approach of alleviating the domain shifts, by proposing a\nReconstruction-Simulation-Perception (ReSimAD) scheme. 
Specifically, the\nimplicit reconstruction process is based on the knowledge from the previous old\ndomain, aiming to convert the domain-related knowledge into domain-invariant\nrepresentations, e.g., 3D scene-level meshes. Besides, the point clouds\nsimulation process of multiple new domains is conditioned on the above\nreconstructed 3D meshes, where the target-domain-like simulation samples can be\nobtained, thus reducing the cost of collecting and annotating new-domain data\nfor the subsequent perception process. For experiments, we consider different\ncross-domain situations such as Waymo-to-KITTI, Waymo-to-nuScenes,\nWaymo-to-ONCE, etc, to verify the zero-shot target-domain perception using\nReSimAD. Results demonstrate that our method is beneficial to boost the domain\ngeneralization ability, even promising for 3D pre-training.\n","authors":["Bo Zhang","Xinyu Cai","Jiakang Yuan","Donglin Yang","Jianfei Guo","Xiangchao Yan","Renqiu Xia","Botian Shi","Min Dou","Tao Chen","Si Liu","Junchi Yan","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2309.05527v3.pdf","comment":"Code and simulated points are available at\n https://github.com/PJLab-ADG/3DTrans#resimad"},{"id":"http://arxiv.org/abs/2401.02330v2","updated":"2024-01-15T05:24:20Z","published":"2024-01-04T16:07:43Z","title":"LLaVA-Phi: Efficient Multi-Modal Assistant with Small Language Model","summary":" In this paper, we introduce LLaVA-$\\phi$ (LLaVA-Phi), an efficient\nmulti-modal assistant that harnesses the power of the recently advanced small\nlanguage model, Phi-2, to facilitate multi-modal dialogues. LLaVA-Phi marks a\nnotable advancement in the realm of compact multi-modal models. It demonstrates\nthat even smaller language models, with as few as 2.7B parameters, can\neffectively engage in intricate dialogues that integrate both textual and\nvisual elements, provided they are trained with high-quality corpora. Our model\ndelivers commendable performance on publicly available benchmarks that\nencompass visual comprehension, reasoning, and knowledge-based perception.\nBeyond its remarkable performance in multi-modal dialogue tasks, our model\nopens new avenues for applications in time-sensitive environments and systems\nthat require real-time interaction, such as embodied agents. It highlights the\npotential of smaller language models to achieve sophisticated levels of\nunderstanding and interaction, while maintaining greater resource\nefficiency.The project is available at {https://github.com/zhuyiche/llava-phi}.\n","authors":["Yichen Zhu","Minjie Zhu","Ning Liu","Zhicai Ou","Xiaofeng Mou","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.02330v2.pdf","comment":"technique report"},{"id":"http://arxiv.org/abs/2401.02099v2","updated":"2024-01-15T05:23:37Z","published":"2024-01-04T07:11:16Z","title":"CLAPP: Contrastive Language-Audio Pre-training in Passive Underwater\n Vessel Classification","summary":" Existing research on audio classification faces challenges in recognizing\nattributes of passive underwater vessel scenarios and lacks well-annotated\ndatasets due to data privacy concerns. In this study, we introduce CLAPP\n(Contrastive Language-Audio Pre-training in Passive Underwater Vessel\nClassification), a novel model. Our aim is to train a neural network using a\nwide range of vessel audio and vessel state text pairs obtained from an\noceanship dataset. 
CLAPP is capable of directly learning from raw vessel audio\ndata and, when available, from carefully curated labels, enabling improved\nrecognition of vessel attributes in passive underwater vessel scenarios.\nModel's zero-shot capability allows predicting the most relevant vessel state\ndescription for a given vessel audio, without directly optimizing for the task.\nOur approach aims to solve 2 challenges: vessel audio-text classification and\npassive underwater vessel audio attribute recognition. The proposed method\nachieves new state-of-the-art results on both Deepship and Shipsear public\ndatasets, with a notable margin of about 7%-13% for accuracy compared to prior\nmethods on zero-shot task.\n","authors":["Zeyu Li","Jingsheng Gao","Tong Yu","Suncheng Xiang","Jiacheng Ruan","Ting Liu","Yuzhuo Fu"],"pdf_url":"https://arxiv.org/pdf/2401.02099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08747v2","updated":"2024-01-15T05:13:16Z","published":"2023-11-15T07:29:24Z","title":"Improved Dense Nested Attention Network Based on Transformer for\n Infrared Small Target Detection","summary":" Infrared small target detection based on deep learning offers unique\nadvantages in separating small targets from complex and dynamic backgrounds.\nHowever, the features of infrared small targets gradually weaken as the depth\nof convolutional neural network (CNN) increases. To address this issue, we\npropose a novel method for detecting infrared small targets called improved\ndense nested attention network (IDNANet), which is based on the transformer\narchitecture. We preserve the dense nested structure of dense nested attention\nnetwork (DNANet) and introduce the Swin-transformer during feature extraction\nstage to enhance the continuity of features. Furthermore, we integrate the\nACmix attention structure into the dense nested structure to enhance the\nfeatures of intermediate layers. Additionally, we design a weighted dice binary\ncross-entropy (WD-BCE) loss function to mitigate the negative impact of\nforeground-background imbalance in the samples. Moreover, we develop a dataset\nspecifically for infrared small targets, called BIT-SIRST. The dataset\ncomprises a significant amount of real-world targets and manually annotated\nlabels, as well as synthetic data and corresponding labels. We have evaluated\nthe effectiveness of our method through experiments conducted on public\ndatasets. In comparison to other state-of-the-art methods, our approach\noutperforms in terms of probability of detection ($P_d$), false-alarm rate\n($F_a$), and mean intersection of union ($mIoU$). The $mIoU$ reaches 90.89\\% on\nthe NUDT-SIRST dataset and 79.72\\% on the SIRST dataset.\n","authors":["Chun Bao","Jie Cao","Yaqian Ning","Tianhua Zhao","Zhijun Li","Zechen Wang","Li Zhang","Qun Hao"],"pdf_url":"https://arxiv.org/pdf/2311.08747v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07477v1","updated":"2024-01-15T05:10:27Z","published":"2024-01-15T05:10:27Z","title":"CascadeV-Det: Cascade Point Voting for 3D Object Detection","summary":" Anchor-free object detectors are highly efficient in performing point-based\nprediction without the need for extra post-processing of anchors. However,\ndifferent from the 2D grids, the 3D points used in these detectors are often\nfar from the ground truth center, making it challenging to accurately regress\nthe bounding boxes. To address this issue, we propose a Cascade Voting\n(CascadeV) strategy that provides high-quality 3D object detection with\npoint-based prediction. 
Specifically, CascadeV performs cascade detection using\na novel Cascade Voting decoder that combines two new components: Instance Aware\nVoting (IA-Voting) and a Cascade Point Assignment (CPA) module. The IA-Voting\nmodule updates the object features of updated proposal points within the\nbounding box using conditional inverse distance weighting. This approach\nprevents features from being aggregated outside the instance and helps improve\nthe accuracy of object detection. Additionally, since model training can suffer\nfrom a lack of proposal points with high centerness, we have developed the CPA\nmodule to narrow down the positive assignment threshold with cascade stages.\nThis approach relaxes the dependence on proposal centerness in the early stages\nwhile ensuring an ample quantity of positives with high centerness in the later\nstages. Experiments show that FCAF3D with our CascadeV achieves\nstate-of-the-art 3D object detection results with 70.4\\% mAP@0.25 and 51.6\\%\nmAP@0.5 on SUN RGB-D and competitive results on ScanNet. Code will be released\nat https://github.com/Sharpiless/CascadeV-Det\n","authors":["Yingping Liang","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2401.07477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07469v1","updated":"2024-01-15T04:51:39Z","published":"2024-01-15T04:51:39Z","title":"A Deep Hierarchical Feature Sparse Framework for Occluded Person\n Re-Identification","summary":" Most existing methods tackle the problem of occluded person re-identification\n(ReID) by utilizing auxiliary models, resulting in a complicated and\ninefficient ReID framework that is unacceptable for real-time applications. In\nthis work, a speed-up person ReID framework named SUReID is proposed to\nmitigate occlusion interference while speeding up inference. The SUReID\nconsists of three key components: hierarchical token sparsification (HTS)\nstrategy, non-parametric feature alignment knowledge distillation (NPKD), and\nnoise occlusion data augmentation (NODA). The HTS strategy works by pruning the\nredundant tokens in the vision transformer to achieve highly effective\nself-attention computation and eliminate interference from occlusions or\nbackground noise. However, the pruned tokens may contain human part features\nthat contaminate the feature representation and degrade the performance. To\nsolve this problem, the NPKD is employed to supervise the HTS strategy,\nretaining more discriminative tokens and discarding meaningless ones.\nFurthermore, the NODA is designed to introduce more noisy samples, which\nfurther trains the ability of the HTS to disentangle different tokens.\nExperimental results show that the SUReID achieves superior performance with\nsurprisingly fast inference.\n","authors":["Yihu Song","Shuaishi Liu"],"pdf_url":"https://arxiv.org/pdf/2401.07469v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.12340v4","updated":"2024-01-15T04:27:04Z","published":"2023-12-19T17:13:51Z","title":"Scalable Geometric Fracture Assembly via Co-creation Space among\n Assemblers","summary":" Geometric fracture assembly presents a challenging practical task in\narchaeology and 3D computer vision. Previous methods have focused solely on\nassembling fragments based on semantic information, which has limited the\nquantity of objects that can be effectively assembled. Therefore, there is a\nneed to develop a scalable framework for geometric fracture assembly without\nrelying on semantic information. 
To improve the effectiveness of assembling\ngeometric fractures without semantic information, we propose a co-creation\nspace comprising several assemblers capable of gradually and unambiguously\nassembling fractures. Additionally, we introduce a novel loss function, i.e.,\nthe geometric-based collision loss, to address collision issues during the\nfracture assembly process and enhance the results. Our framework exhibits\nbetter performance on both PartNet and Breaking Bad datasets compared to\nexisting state-of-the-art frameworks. Extensive experiments and quantitative\ncomparisons demonstrate the effectiveness of our proposed framework, which\nfeatures linear computational complexity, enhanced abstraction, and improved\ngeneralization. Our code is publicly available at\nhttps://github.com/Ruiyuan-Zhang/CCS.\n","authors":["Ruiyuan Zhang","Jiaxiang Liu","Zexi Li","Hao Dong","Jie Fu","Chao Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12340v4.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2401.07459v1","updated":"2024-01-15T04:08:53Z","published":"2024-01-15T04:08:53Z","title":"Semantic Segmentation in Multiple Adverse Weather Conditions with Domain\n Knowledge Retention","summary":" Semantic segmentation's performance is often compromised when applied to\nunlabeled adverse weather conditions. Unsupervised domain adaptation is a\npotential approach to enhancing the model's adaptability and robustness to\nadverse weather. However, existing methods encounter difficulties when\nsequentially adapting the model to multiple unlabeled adverse weather\nconditions. They struggle to acquire new knowledge while also retaining\npreviously learned knowledge. To address these problems, we propose a semantic\nsegmentation method for multiple adverse weather conditions that incorporates\nadaptive knowledge acquisition, pseudo-label blending, and weather composition\nreplay. Our adaptive knowledge acquisition enables the model to avoid learning\nfrom extreme images that could potentially cause the model to forget. In our\napproach of blending pseudo-labels, we not only utilize the current model but\nalso integrate the previously learned model into the ongoing learning process.\nThis collaboration between the current teacher and the previous model enhances\nthe robustness of the pseudo-labels for the current target. Our weather\ncomposition replay mechanism allows the model to continuously refine its\npreviously learned weather information while simultaneously learning from the\nnew target domain. Our method consistently outperforms the state-of-the-art\nmethods, and obtains the best performance with averaged mIoU (%) of 65.7 and\nthe lowest forgetting (%) of 3.6 against 60.1 and 11.3, on the ACDC datasets\nfor a four-target continual multi-target domain adaptation.\n","authors":["Xin Yang","Wending Yan","Yuan Yuan","Michael Bi Mi","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2401.07459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07457v1","updated":"2024-01-15T04:04:47Z","published":"2024-01-15T04:04:47Z","title":"Concept-Guided Prompt Learning for Generalization in Vision-Language\n Models","summary":" Contrastive Language-Image Pretraining (CLIP) model has exhibited remarkable\nefficacy in establishing cross-modal connections between texts and images,\nyielding impressive performance across a broad spectrum of downstream\napplications through fine-tuning.
However, for generalization tasks, the\ncurrent fine-tuning methods for CLIP, such as CoOp and CoCoOp, demonstrate\nrelatively low performance on some fine-grained datasets. We recognize the\nunderlying reason is that these previous methods only projected global features\ninto the prompt, neglecting the various visual concepts, such as colors,\nshapes, and sizes, which are naturally transferable across domains and play a\ncrucial role in generalization tasks. To address this issue, in this work, we\npropose Concept-Guided Prompt Learning (CPL) for vision-language models.\nSpecifically, we leverage the well-learned knowledge of CLIP to create a visual\nconcept cache to enable concept-guided prompting. In order to refine the text\nfeatures, we further develop a projector that transforms multi-level visual\nfeatures into text features. We observe that this concept-guided prompt\nlearning approach is able to achieve enhanced consistency between visual and\nlinguistic modalities. Extensive experimental results demonstrate that our CPL\nmethod significantly improves generalization capabilities compared to the\ncurrent state-of-the-art methods.\n","authors":["Yi Zhang","Ce Zhang","Ke Yu","Yushun Tang","Zhihai He"],"pdf_url":"https://arxiv.org/pdf/2401.07457v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.07450v1","updated":"2024-01-15T03:38:57Z","published":"2024-01-15T03:38:57Z","title":"Hierarchical Fashion Design with Multi-stage Diffusion Models","summary":" Cross-modal fashion synthesis and editing offer intelligent support to\nfashion designers by enabling the automatic generation and local modification\nof design drafts. While current diffusion models demonstrate commendable\nstability and controllability in image synthesis, they still face significant\nchallenges in generating fashion design from abstract design elements and\nfine-grained editing. Abstract sensory expressions, \\eg office, business, and\nparty, form the high-level design concepts, while measurable aspects like\nsleeve length, collar type, and pant length are considered the low-level\nattributes of clothing. Controlling and editing fashion images using lengthy\ntext descriptions poses a difficulty. In this paper, we propose HieraFashDiff, a\nnovel fashion design method using the shared multi-stage diffusion model\nencompassing high-level design concepts and low-level clothing attributes in a\nhierarchical structure. Specifically, we categorized the input text into\ndifferent levels and fed them at different time steps to the diffusion model\naccording to the criteria of professional clothing designers. HieraFashDiff\nallows designers to add low-level attributes after high-level prompts for\ninteractive editing incrementally. In addition, we design a differentiable loss\nfunction in the sampling process with a mask to keep non-edit\nareas. Comprehensive experiments performed on our newly conducted Hierarchical\nfashion dataset demonstrate that our proposed method outperforms other\nstate-of-the-art competitors.\n","authors":["Zhifeng Xie","Hao li","Huiming Ding","Mengtian Li","Ying Cao"],"pdf_url":"https://arxiv.org/pdf/2401.07450v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.07883v1","updated":"2024-01-15T18:25:18Z","published":"2024-01-15T18:25:18Z","title":"The Chronicles of RAG: The Retriever, the Chunk and the Generator","summary":" Retrieval Augmented Generation (RAG) has become one of the most popular\nparadigms for enabling LLMs to access external data, and also as
a mechanism\nfor grounding to mitigate against hallucinations. When implementing RAG you can\nface several challenges like effective integration of retrieval models,\nefficient representation learning, data diversity, computational efficiency\noptimization, evaluation, and quality of text generation. Given all these\nchallenges, every day a new technique to improve RAG appears, making it\nunfeasible to experiment with all combinations for your problem. In this\ncontext, this paper presents good practices to implement, optimize, and\nevaluate RAG for the Brazilian Portuguese language, focusing on the\nestablishment of a simple pipeline for inference and experiments. We explored a\ndiverse set of methods to answer questions about the first Harry Potter book.\nTo generate the answers we used the OpenAI's gpt-4, gpt-4-1106-preview,\ngpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the\nretriever, our approach achieved an improvement of MRR@10 by 35.4% compared to\nthe baseline. When optimizing the input size in the application, we observed\nthat it is possible to further enhance it by 2.4%. Finally, we present the\ncomplete architecture of the RAG with our recommendations. As result, we moved\nfrom a baseline of 57.88% to a maximum relative score of 98.61%.\n","authors":["Paulo Finardi","Leonardo Avila","Rodrigo Castaldoni","Pedro Gengo","Celio Larcher","Marcos Piau","Pablo Costa","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2401.07883v1.pdf","comment":"16 pages, 15 figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.07769v1","updated":"2024-01-15T15:27:24Z","published":"2024-01-15T15:27:24Z","title":"Deep Evolutional Instant Interest Network for CTR Prediction in\n Trigger-Induced Recommendation","summary":" The recommendation has been playing a key role in many industries, e.g.,\ne-commerce, streaming media, social media, etc. Recently, a new recommendation\nscenario, called Trigger-Induced Recommendation (TIR), where users are able to\nexplicitly express their instant interests via trigger items, is emerging as an\nessential role in many e-commerce platforms, e.g., Alibaba.com and Amazon.\nWithout explicitly modeling the user's instant interest, traditional\nrecommendation methods usually obtain sub-optimal results in TIR. Even though\nthere are a few methods considering the trigger and target items simultaneously\nto solve this problem, they still haven't taken into account temporal\ninformation of user behaviors, the dynamic change of user instant interest when\nthe user scrolls down and the interactions between the trigger and target\nitems. To tackle these problems, we propose a novel method -- Deep Evolutional\nInstant Interest Network (DEI2N), for click-through rate prediction in TIR\nscenarios. Specifically, we design a User Instant Interest Modeling Layer to\npredict the dynamic change of the intensity of instant interest when the user\nscrolls down. Temporal information is utilized in user behavior modeling.\nMoreover, an Interaction Layer is introduced to learn better interactions\nbetween the trigger and target items. We evaluate our method on several offline\nand real-world industrial datasets. Experimental results show that our proposed\nDEI2N outperforms state-of-the-art baselines. 
In addition, online A/B testing\ndemonstrates the superiority over the existing baseline in real-world\nproduction environments.\n","authors":["Zhibo Xiao","Luwei Yang","Tao Zhang","Wen Jiang","Wei Ning","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.07769v1.pdf","comment":"7 pages, 3 figures, reviewing of the 17th ACM International\n Conference on Web Search and Data Mining"},{"id":"http://arxiv.org/abs/2401.03883v2","updated":"2024-01-15T15:09:00Z","published":"2024-01-08T13:31:02Z","title":"The Impact of Differential Privacy on Recommendation Accuracy and\n Popularity Bias","summary":" Collaborative filtering-based recommender systems leverage vast amounts of\nbehavioral user data, which poses severe privacy risks. Thus, often, random\nnoise is added to the data to ensure Differential Privacy (DP). However, to\ndate, it is not well understood, in which ways this impacts personalized\nrecommendations. In this work, we study how DP impacts recommendation accuracy\nand popularity bias, when applied to the training data of state-of-the-art\nrecommendation models. Our findings are three-fold: First, we find that nearly\nall users' recommendations change when DP is applied. Second, recommendation\naccuracy drops substantially while recommended item popularity experiences a\nsharp increase, suggesting that popularity bias worsens. Third, we find that DP\nexacerbates popularity bias more severely for users who prefer unpopular items\nthan for users that prefer popular items.\n","authors":["Peter Müllner","Elisabeth Lex","Markus Schedl","Dominik Kowald"],"pdf_url":"https://arxiv.org/pdf/2401.03883v2.pdf","comment":"Accepted at the IR4Good track at ECIR'24, 17 pages"},{"id":"http://arxiv.org/abs/2401.07521v1","updated":"2024-01-15T07:53:58Z","published":"2024-01-15T07:53:58Z","title":"CREAD: A Classification-Restoration Framework with Error Adaptive\n Discretization for Watch Time Prediction in Video Recommender Systems","summary":" The watch time is a significant indicator of user satisfaction in video\nrecommender systems. However, the prediction of watch time as a target variable\nis often hindered by its highly imbalanced distribution with a scarcity of\nobservations for larger target values and over-populated samples for small\nvalues. State-of-the-art watch time prediction models discretize the continuous\nwatch time into a set of buckets in order to consider the distribution of watch\ntime. However, it is highly uninvestigated how these discrete buckets should be\ncreated from the continuous watch time distribution, and existing\ndiscretization approaches suffer from either a large learning error or a large\nrestoration error. To address this challenge, we propose a\nClassification-Restoration framework with Error-Adaptive-Discretization (CREAD)\nto accurately predict the watch time. The proposed framework contains a\ndiscretization module, a classification module, and a restoration module. It\npredicts the watch time through multiple classification problems. The\ndiscretization process is a key contribution of the CREAD framework. We\ntheoretically analyze the impacts of the discretization on the learning error\nand the restoration error, and then propose the error-adaptive discretization\n(EAD) technique to better balance the two errors, which achieves better\nperformance over traditional discretization approaches. We conduct detailed\noffline evaluations on a public dataset and an industrial dataset, both showing\nperformance gains through the proposed approach. 
Moreover, We have fully\nlaunched our framework to Kwai App, an online video platform, which resulted in\na significant increase in users' video watch time by 0.29% through A/B testing.\nThese results highlight the effectiveness of the CREAD framework in watch time\nprediction in video recommender systems.\n","authors":["Jie Sun","Zhaoying Ding","Xiaoshuang Chen","Qi Chen","Yincheng Wang","Kaiqiao Zhan","Ben Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07521v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.07453v1","updated":"2024-01-15T03:57:15Z","published":"2024-01-15T03:57:15Z","title":"Model Editing at Scale leads to Gradual and Catastrophic Forgetting","summary":" Editing knowledge in large language models is an attractive capability to\nhave which allows us to correct incorrectly learnt facts during pre-training,\nas well as update the model with an ever-growing list of new facts. While\nexisting model editing techniques have shown promise, they are usually\nevaluated using metrics for reliability, specificity and generalization over\none or few edits. We argue that for model editing to have practical utility, we\nmust be able to make multiple edits to the same model. With this in mind, we\nevaluate the current model editing methods at scale, focusing on two state of\nthe art methods: ROME and MEMIT. We find that as the model is edited\nsequentially with multiple facts, it continually forgets previously edited\nfacts and the ability to perform downstream tasks. This forgetting happens in\ntwo phases -- an initial gradual but progressive forgetting phase followed by\nabrupt or catastrophic forgetting phase. Both gradual and catastrophic\nforgetting limit the usefulness of model editing methods at scale -- the former\nmaking model editing less effective as multiple edits are made to the model\nwhile the latter caps the scalability of such model editing methods. Our\nanalysis also highlights other key limitations of ROME and MEMIT at scale. With\nour work, we push for the development and evaluation of model editing methods\nkeeping scalability in mind.\n","authors":["Akshat Gupta","Anurag Rao","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2401.07453v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07445v1","updated":"2024-01-15T03:12:21Z","published":"2024-01-15T03:12:21Z","title":"GACE: Learning Graph-Based Cross-Page Ads Embedding For Click-Through\n Rate Prediction","summary":" Predicting click-through rate (CTR) is the core task of many ads online\nrecommendation systems, which helps improve user experience and increase\nplatform revenue. In this type of recommendation system, we often encounter two\nmain problems: the joint usage of multi-page historical advertising data and\nthe cold start of new ads. In this paper, we proposed GACE, a graph-based\ncross-page ads embedding generation method. It can warm up and generate the\nrepresentation embedding of cold-start and existing ads across various pages.\nSpecifically, we carefully build linkages and a weighted undirected graph model\nconsidering semantic and page-type attributes to guide the direction of feature\nfusion and generation. We designed a variational auto-encoding task as\npre-training module and generated embedding representations for new and old ads\nbased on this task. The results evaluated in the public dataset AliEC from\nRecBole and the real-world industry dataset from Alipay show that our GACE\nmethod is significantly superior to the SOTA method. 
In the online A/B test,\nthe click-through rate on three real-world pages from Alipay has increased by\n3.6%, 2.13%, and 3.02%, respectively. Especially in the cold-start task, the\nCTR increased by 9.96%, 7.51%, and 8.97%, respectively.\n","authors":["Haowen Wang","Yuliang Du","Congyun Jin","Yujiao Li","Yingbo Wang","Tao Sun","Piqi Qin","Cong Fan"],"pdf_url":"https://arxiv.org/pdf/2401.07445v1.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.14084v3","updated":"2024-01-15T02:31:04Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon causes source bias in text retrieval for web search.\nSpecifically, neural retrieval models tend to rank generated texts higher than\nhuman-written texts. In this paper, we extend the study of this bias to\ncross-modal retrieval. Firstly, we successfully construct a suitable benchmark\nto explore the existence of the bias. Subsequent extensive experiments on this\nbenchmark reveal that AI-generated images introduce an invisible relevance bias\nto text-image retrieval models. Specifically, our experiments show that\ntext-image retrieval models tend to rank the AI-generated images higher than\nthe real images, even though the AI-generated images do not exhibit more\nvisually relevant features to the query than real images. This invisible\nrelevance bias is prevalent across retrieval models with varying training data\nand architectures. Furthermore, our subsequent exploration reveals that the\ninclusion of AI-generated images in the training data of the retrieval models\nexacerbates the invisible relevance bias. The above phenomenon triggers a\nvicious cycle, which makes the invisible relevance bias become more and more\nserious. To elucidate the potential causes of invisible relevance and address\nthe aforementioned issues, we introduce an effective training method aimed at\nalleviating the invisible relevance bias. Subsequently, we apply our proposed\ndebiasing method to retroactively identify the causes of invisible relevance,\nrevealing that the AI-generated images induce the image encoder to embed\nadditional information into their representation. This information exhibits a\ncertain consistency across generated images with different semantics and can\nmake the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v3.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2312.16018v2","updated":"2024-01-15T01:57:10Z","published":"2023-12-26T12:12:58Z","title":"RecRanker: Instruction Tuning Large Language Model as Ranker for Top-k\n Recommendation","summary":" Large language models (LLMs) have demonstrated remarkable capabilities and\nhave been extensively deployed across various domains, including recommender\nsystems. Numerous studies have employed specialized \\textit{prompts} to harness\nthe in-context learning capabilities intrinsic to LLMs. For example, LLMs are\nprompted to act as zero-shot rankers for listwise ranking, evaluating candidate\nitems generated by a retrieval model for recommendation. Recent research\nfurther uses instruction tuning techniques to align LLM with human preference\nfor more promising recommendations. 
Despite its potential, current research\noverlooks the integration of multiple ranking tasks to enhance model\nperformance. Moreover, the signal from the conventional recommendation model is\nnot integrated into the LLM, limiting the current system performance.\n In this paper, we introduce RecRanker, tailored for instruction tuning LLM to\nserve as the \\textbf{Ranker} for top-\\textit{k} \\textbf{Rec}ommendations.\nSpecifically, we introduce importance-aware sampling, clustering-based\nsampling, and penalty for repetitive sampling for sampling high-quality,\nrepresentative, and diverse training data. To enhance the prompt, we introduce\nposition shifting strategy to mitigate position bias and augment the prompt\nwith auxiliary information from conventional recommendation models, thereby\nenriching the contextual understanding of the LLM. Subsequently, we utilize the\nsampled data to assemble an instruction-tuning dataset with the augmented\nprompt comprising three distinct ranking tasks: pointwise, pairwise, and\nlistwise rankings. We further propose a hybrid ranking method to enhance the\nmodel performance by ensembling these ranking tasks. Our empirical evaluations\ndemonstrate the effectiveness of our proposed RecRanker in both direct and\nsequential recommendation scenarios.\n","authors":["Sichun Luo","Bowei He","Haohan Zhao","Yinya Huang","Aojun Zhou","Zongpeng Li","Yuanzhang Xiao","Mingjie Zhan","Linqi Song"],"pdf_url":"https://arxiv.org/pdf/2312.16018v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.08016v1","updated":"2024-01-15T23:58:21Z","published":"2024-01-15T23:58:21Z","title":"Contextual Bandits with Stage-wise Constraints","summary":" We study contextual bandits in the presence of a stage-wise constraint (a\nconstraint at each round), when the constraint must be satisfied both with high\nprobability and in expectation. Obviously the setting where the constraint is\nin expectation is a relaxation of the one with high probability. We start with\nthe linear case where both the contextual bandit problem (reward function) and\nthe stage-wise constraint (cost function) are linear. In each of the high\nprobability and in expectation settings, we propose an upper-confidence bound\nalgorithm for the problem and prove a $T$-round regret bound for it. Our\nalgorithms balance exploration and constraint satisfaction using a novel idea\nthat scales the radii of the reward and cost confidence sets with different\nscaling factors. We also prove a lower-bound for this constrained problem, show\nhow our algorithms and analyses can be extended to multiple constraints, and\nprovide simulations to validate our theoretical results. In the high\nprobability setting, we describe the minimum requirements for the action set in\norder for our algorithm to be tractable. In the setting that the constraint is\nin expectation, we further specialize our results to multi-armed bandits and\npropose a computationally efficient algorithm for this setting with regret\nanalysis. Finally, we extend our results to the case where the reward and cost\nfunctions are both non-linear. We propose an algorithm for this case and prove\na regret bound for it that characterize the function class complexity by the\neluder dimension.\n","authors":["Aldo Pacchiano","Mohammad Ghavamzadeh","Peter Bartlett"],"pdf_url":"https://arxiv.org/pdf/2401.08016v1.pdf","comment":"53 pages. 
arXiv admin note: text overlap with arXiv:2006.10185"},{"id":"http://arxiv.org/abs/2305.11554v4","updated":"2024-01-15T23:52:21Z","published":"2023-05-19T09:54:21Z","title":"ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via\n Tool Embeddings","summary":" Augmenting large language models (LLMs) with external tools has emerged as a\npromising approach to solving complex problems. However, traditional methods,\nwhich finetune LLMs with tool demonstration data, can be both costly and\nrestricted to a predefined set of tools. Recent in-context learning paradigm\nalleviates these issues, but the limited context length only allows for a few\nshots of demonstrations, leading to suboptimal understandings of the tools.\nMoreover, when there are numerous tools to choose from, in-context learning\ncould completely fail to work. In this paper, we propose an alternative\napproach, $\\textbf{ToolkenGPT}$, which combines the benefits of both sides. Our\napproach represents each $\\underline{tool}$ as a to$\\underline{ken}$\n($\\textit{toolken}$) and learns an embedding for it, enabling tool calls in the\nsame way as generating a regular word token. Once a toolken is triggered, the\nLLM is prompted to complete arguments for the tool to execute. ToolkenGPT\noffers the flexibility to plug in an arbitrary number of tools by expanding the\nset of toolkens on the fly. In addition, it improves tool use by allowing\nextensive demonstration data for learning the toolken embeddings. In diverse\ndomains, including numerical reasoning, knowledge-based question answering, and\nembodied plan generation, our approach effectively augments LLMs with tools and\nsubstantially outperforms various latest baselines. ToolkenGPT demonstrates the\npromising ability to use relevant tools from a large tool set in complex\nscenarios.\n","authors":["Shibo Hao","Tianyang Liu","Zhen Wang","Zhiting Hu"],"pdf_url":"https://arxiv.org/pdf/2305.11554v4.pdf","comment":"NeurIPS 2023 (oral). Code: https://github.com/Ber666/ToolkenGPT"},{"id":"http://arxiv.org/abs/2401.08002v1","updated":"2024-01-15T23:10:22Z","published":"2024-01-15T23:10:22Z","title":"Discovery of Generalizable TBI Phenotypes Using Multivariate Time-Series\n Clustering","summary":" Traumatic Brain Injury (TBI) presents a broad spectrum of clinical\npresentations and outcomes due to its inherent heterogeneity, leading to\ndiverse recovery trajectories and varied therapeutic responses. While many\nstudies have delved into TBI phenotyping for distinct patient populations,\nidentifying TBI phenotypes that consistently generalize across various settings\nand populations remains a critical research gap. Our research addresses this by\nemploying multivariate time-series clustering to unveil TBI's dynamic\nintricates. Utilizing a self-supervised learning-based approach to clustering\nmultivariate time-Series data with missing values (SLAC-Time), we analyzed both\nthe research-centric TRACK-TBI and the real-world MIMIC-IV datasets.\nRemarkably, the optimal hyperparameters of SLAC-Time and the ideal number of\nclusters remained consistent across these datasets, underscoring SLAC-Time's\nstability across heterogeneous datasets. Our analysis revealed three\ngeneralizable TBI phenotypes ({\\alpha}, \\b{eta}, and {\\gamma}), each exhibiting\ndistinct non-temporal features during emergency department visits, and temporal\nfeature profiles throughout ICU stays. Specifically, phenotype {\\alpha}\nrepresents mild TBI with a remarkably consistent clinical presentation. 
In\ncontrast, phenotype \\b{eta} signifies severe TBI with diverse clinical\nmanifestations, and phenotype {\\gamma} represents a moderate TBI profile in\nterms of severity and clinical diversity. Age is a significant determinant of\nTBI outcomes, with older cohorts recording higher mortality rates. Importantly,\nwhile certain features varied by age, the core characteristics of TBI\nmanifestations tied to each phenotype remain consistent across diverse\npopulations.\n","authors":["Hamid Ghaderi","Brandon Foreman","Chandan K. Reddy","Vignesh Subbian"],"pdf_url":"https://arxiv.org/pdf/2401.08002v1.pdf","comment":"25 pages, 10 figures, 4 tables, submitted to Computers in Biology and\n Medicine"},{"id":"http://arxiv.org/abs/2401.07994v1","updated":"2024-01-15T22:36:31Z","published":"2024-01-15T22:36:31Z","title":"A Novel Approach for Automatic Program Repair using Round-Trip\n Translation with Large Language Models","summary":" Research shows that grammatical mistakes in a sentence can be corrected by\ntranslating it to another language and back using neural machine translation\nwith language models. We investigate whether this correction capability of\nLarge Language Models (LLMs) extends to Automatic Program Repair (APR). Current\ngenerative models for APR are pre-trained on source code and fine-tuned for\nrepair. This paper proposes bypassing the fine-tuning step and using Round-Trip\nTranslation (RTT): translation of code from one programming language to another\nprogramming or natural language, and back. We hypothesize that RTT with LLMs\nrestores the most commonly seen patterns in code during pre-training, i.e.,\nperforms a regression toward the mean, which removes bugs as they are a form of\nnoise w.r.t. the more frequent, natural, bug-free code in the training data. To\ntest this hypothesis, we employ eight recent LLMs pre-trained on code,\nincluding the latest GPT versions, and four common program repair benchmarks in\nJava. We find that RTT with English as an intermediate language repaired 101 of\n164 bugs with GPT-4 on the HumanEval-Java dataset. Moreover, 46 of these are\nunique bugs that are not repaired by other LLMs fine-tuned for APR. Our\nfindings highlight the viability of round-trip translation with LLMs as a\ntechnique for automated program repair and its potential for research in\nsoftware engineering.\n Keywords: automated program repair, large language model, machine translation\n","authors":["Fernando Vallecillos Ruiz","Anastasiia Grishina","Max Hort","Leon Moonen"],"pdf_url":"https://arxiv.org/pdf/2401.07994v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07993v1","updated":"2024-01-15T22:36:11Z","published":"2024-01-15T22:36:11Z","title":"Carrying over algorithm in transformers","summary":" Addition is perhaps one of the simplest arithmetic tasks one can think of and\nis usually performed using the carrying over algorithm. This algorithm consists\nof two tasks: adding digits in the same position and carrying over a one\nwhenever necessary. We study how transformer models implement this algorithm\nand how the two aforementioned tasks are allocated to different parts of the\nnetwork. We first focus on two-layer encoder-only models and show that the\ncarrying over algorithm is implemented in a modular fashion. The first layer is\nmostly responsible for adding digits in the same position. The second layer\nfirst decides, in the attention, which positions need a carried one or not, and\nthen performs the carrying of the one in the final MLP. 
We provide a simple way\nof precisely identifying which neurons are responsible for that task. This\nimplementation of the carrying over algorithm occurs across a range of\nhyperparameters for two as well as three-layer models. For small decoder-only\nmodels, we observe the same implementation and provide suggestive evidence for\nits existence in three 7B large language models.\n","authors":["Jorrit Kruthoff"],"pdf_url":"https://arxiv.org/pdf/2401.07993v1.pdf","comment":"Comments welcome!"},{"id":"http://arxiv.org/abs/2206.08756v3","updated":"2024-01-15T22:35:00Z","published":"2022-06-17T13:15:27Z","title":"Tensor-on-Tensor Regression: Riemannian Optimization,\n Over-parameterization, Statistical-computational Gap, and Their Interplay","summary":" We study the tensor-on-tensor regression, where the goal is to connect tensor\nresponses to tensor covariates with a low Tucker rank parameter tensor/matrix\nwithout the prior knowledge of its intrinsic rank. We propose the Riemannian\ngradient descent (RGD) and Riemannian Gauss-Newton (RGN) methods and cope with\nthe challenge of unknown rank by studying the effect of rank\nover-parameterization. We provide the first convergence guarantee for the\ngeneral tensor-on-tensor regression by showing that RGD and RGN respectively\nconverge linearly and quadratically to a statistically optimal estimate in both\nrank correctly-parameterized and over-parameterized settings. Our theory\nreveals an intriguing phenomenon: Riemannian optimization methods naturally\nadapt to over-parameterization without modifications to their implementation.\nWe also prove the statistical-computational gap in scalar-on-tensor regression\nby a direct low-degree polynomial argument. Our theory demonstrates a \"blessing\nof statistical-computational gap\" phenomenon: in a wide range of scenarios in\ntensor-on-tensor regression for tensors of order three or higher, the\ncomputationally required sample size matches what is needed by moderate rank\nover-parameterization when considering computationally feasible estimators,\nwhile there are no such benefits in the matrix settings. This shows moderate\nrank over-parameterization is essentially \"cost-free\" in terms of sample size\nin tensor-on-tensor regression of order three or higher. Finally, we conduct\nsimulation studies to show the advantages of our proposed methods and to\ncorroborate our theoretical findings.\n","authors":["Yuetian Luo","Anru R. Zhang"],"pdf_url":"https://arxiv.org/pdf/2206.08756v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07991v1","updated":"2024-01-15T22:31:15Z","published":"2024-01-15T22:31:15Z","title":"Robustness Against Adversarial Attacks via Learning Confined Adversarial\n Polytopes","summary":" Deep neural networks (DNNs) could be deceived by generating\nhuman-imperceptible perturbations of clean samples. Therefore, enhancing the\nrobustness of DNNs against adversarial attacks is a crucial task. In this\npaper, we aim to train robust DNNs by limiting the set of outputs reachable via\na norm-bounded perturbation added to a clean sample. We refer to this set as\nadversarial polytope, and each clean sample has a respective adversarial\npolytope. Indeed, if the respective polytopes for all the samples are compact\nsuch that they do not intersect the decision boundaries of the DNN, then the\nDNN is robust against adversarial samples. Hence, the inner-working of our\nalgorithm is based on learning \\textbf{c}onfined \\textbf{a}dversarial\n\\textbf{p}olytopes (CAP). 
By conducting a thorough set of experiments, we\ndemonstrate the effectiveness of CAP over existing adversarial robustness\nmethods in improving the robustness of models against state-of-the-art attacks\nincluding AutoAttack.\n","authors":["Shayan Mohajer Hamidi","Linfeng Ye"],"pdf_url":"https://arxiv.org/pdf/2401.07991v1.pdf","comment":"The paper has been accepted in ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.05739v3","updated":"2024-01-15T22:29:37Z","published":"2023-11-09T20:52:36Z","title":"Deep Learning Architecture for Network-Efficiency at the Edge","summary":" The growing number of AI-driven applications in the mobile devices has led to\nsolutions that integrate deep learning models with the available edge-cloud\nresources; due to multiple benefits such as reduction in on-device energy\nconsumption, improved latency, improved network usage, and certain privacy\nimprovements, split learning, where deep learning models are split away from\nthe mobile device and computed in a distributed manner, has become an\nextensively explored topic. Combined with compression-aware methods where\nlearning adapts to compression of communicated data, the benefits of this\napproach have further improved and could serve as an alternative to established\napproaches like federated learning methods. In this work, we develop an\nadaptive compression-aware split learning method ('deprune') to improve and\ntrain deep learning models so that they are much more network-efficient (use\nless network resources and are faster), which would make them ideal to deploy\nin weaker devices with the help of edge-cloud resources. This method is also\nextended ('prune') to very quickly train deep learning models, through a\ntransfer learning approach, that trades off little accuracy for much more\nnetwork-efficient inference abilities. We show that the 'deprune' method can\nreduce network usage by 4x when compared with a split-learning approach (that\ndoes not use our method) without loss of accuracy, while also improving\naccuracy over compression-aware split-learning by 4 percent. Lastly, we show\nthat the 'prune' method can reduce the training time for certain models by up\nto 6x without affecting the accuracy when compared against a compression-aware\nsplit-learning approach.\n","authors":["Akrit Mudvari","Antero Vainio","Iason Ofeidis","Sasu Tarkoma","Leandros Tassiulas"],"pdf_url":"https://arxiv.org/pdf/2311.05739v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07990v1","updated":"2024-01-15T22:29:23Z","published":"2024-01-15T22:29:23Z","title":"How does self-supervised pretraining improve robustness against noisy\n labels across various medical image classification datasets?","summary":" Noisy labels can significantly impact medical image classification,\nparticularly in deep learning, by corrupting learned features. Self-supervised\npretraining, which doesn't rely on labeled data, can enhance robustness against\nnoisy labels. However, this robustness varies based on factors like the number\nof classes, dataset complexity, and training size. In medical images, subtle\ninter-class differences and modality-specific characteristics add complexity.\nPrevious research hasn't comprehensively explored the interplay between\nself-supervised learning and robustness against noisy labels in medical image\nclassification, considering all these factors. In this study, we address three\nkey questions: i) How does label noise impact various medical image\nclassification datasets? 
ii) Which types of medical image datasets are more\nchallenging to learn and more affected by label noise? iii) How do different\nself-supervised pretraining methods enhance robustness across various medical\nimage datasets? Our results show that DermNet, among five datasets (Fetal\nplane, DermNet, COVID-DU-Ex, MURA, NCT-CRC-HE-100K), is the most challenging\nbut exhibits greater robustness against noisy labels. Additionally, contrastive\nlearning stands out among the eight self-supervised methods as the most\neffective approach to enhance robustness against noisy labels.\n","authors":["Bidur Khanal","Binod Bhattarai","Bishesh Khanal","Cristian Linte"],"pdf_url":"https://arxiv.org/pdf/2401.07990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11578v3","updated":"2024-01-15T22:22:24Z","published":"2023-01-27T07:53:50Z","title":"Learning to Unlearn: Instance-wise Unlearning for Pre-trained\n Classifiers","summary":" Since the recent advent of regulations for data protection (e.g., the General\nData Protection Regulation), there has been increasing demand in deleting\ninformation learned from sensitive data in pre-trained models without\nretraining from scratch. The inherent vulnerability of neural networks towards\nadversarial attacks and unfairness also calls for a robust method to remove or\ncorrect information in an instance-wise fashion, while retaining the predictive\nperformance across remaining data. To this end, we consider instance-wise\nunlearning, of which the goal is to delete information on a set of instances\nfrom a pre-trained model, by either misclassifying each instance away from its\noriginal prediction or relabeling the instance to a different label. We also\npropose two methods that reduce forgetting on the remaining data: 1) utilizing\nadversarial examples to overcome forgetting at the representation-level and 2)\nleveraging weight importance metrics to pinpoint network parameters guilty of\npropagating unwanted information. Both methods only require the pre-trained\nmodel and data instances to forget, allowing painless application to real-life\nsettings where the entire training set is unavailable. Through extensive\nexperimentation on various image classification benchmarks, we show that our\napproach effectively preserves knowledge of remaining data while unlearning\ngiven instances in both single-task and continual unlearning scenarios.\n","authors":["Sungmin Cha","Sungjun Cho","Dasol Hwang","Honglak Lee","Taesup Moon","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2301.11578v3.pdf","comment":"AAAI 2024 camera ready version"},{"id":"http://arxiv.org/abs/2310.04585v2","updated":"2024-01-15T21:54:35Z","published":"2023-10-06T20:57:34Z","title":"Interventions Against Machine-Assisted Statistical Discrimination","summary":" This article studies how to intervene against statistical discrimination,\nwhen it is based on beliefs generated by machine learning, rather than by\nhumans. Unlike beliefs formed by a human mind, machine learning-generated\nbeliefs are verifiable. This allows interventions to move beyond simple,\nbelief-free designs like affirmative action, to more sophisticated ones, that\nconstrain decision makers in ways that depend on what they are thinking. Such\nmind reading interventions can perform well where affirmative action does not,\neven when the beliefs being conditioned on are possibly incorrect and biased.\n","authors":["John Y. 
Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.04585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13304v2","updated":"2024-01-15T21:54:28Z","published":"2023-07-25T07:44:06Z","title":"QuIP: 2-Bit Quantization of Large Language Models With Guarantees","summary":" This work studies post-training parameter quantization in large language\nmodels (LLMs). We introduce quantization with incoherence processing (QuIP), a\nnew method based on the insight that quantization benefits from\n$\\textit{incoherent}$ weight and Hessian matrices, i.e., from the weights being\neven in magnitude and the directions in which it is important to round them\naccurately being unaligned with the coordinate axes. QuIP consists of two\nsteps: (1) an adaptive rounding procedure minimizing a quadratic proxy\nobjective; (2) efficient pre- and post-processing that ensures weight and\nHessian incoherence via multiplication by random orthogonal matrices. We\ncomplement QuIP with the first theoretical analysis for an LLM-scale\nquantization algorithm, and show that our theory also applies to an existing\nmethod, OPTQ. Empirically, we find that our incoherence preprocessing improves\nseveral existing quantization algorithms and yields the first LLM quantization\nmethods that produce viable results using only two bits per weight. Our code\ncan be found at https://github.com/Cornell-RelaxML/QuIP.\n","authors":["Jerry Chee","Yaohui Cai","Volodymyr Kuleshov","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2307.13304v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07899v2","updated":"2024-01-15T21:22:46Z","published":"2023-12-13T05:08:32Z","title":"Morphological Profiling for Drug Discovery in the Era of Deep Learning","summary":" Morphological profiling is a valuable tool in phenotypic drug discovery. The\nadvent of high-throughput automated imaging has enabled the capturing of a wide\nrange of morphological features of cells or organisms in response to\nperturbations at the single-cell resolution. Concurrently, significant advances\nin machine learning and deep learning, especially in computer vision, have led\nto substantial improvements in analyzing large-scale high-content images at\nhigh-throughput. These efforts have facilitated understanding of compound\nmechanism-of-action (MOA), drug repurposing, characterization of cell\nmorphodynamics under perturbation, and ultimately contributing to the\ndevelopment of novel therapeutics. In this review, we provide a comprehensive\noverview of the recent advances in the field of morphological profiling. We\nsummarize the image profiling analysis workflow, survey a broad spectrum of\nanalysis strategies encompassing feature engineering- and deep learning-based\napproaches, and introduce publicly available benchmark datasets. We place a\nparticular emphasis on the application of deep learning in this pipeline,\ncovering cell segmentation, image representation learning, and multimodal\nlearning. 
Additionally, we illuminate the application of morphological\nprofiling in phenotypic drug discovery and highlight potential challenges and\nopportunities in this field.\n","authors":["Qiaosi Tang","Ranjala Ratnayake","Gustavo Seabra","Zhe Jiang","Ruogu Fang","Lina Cui","Yousong Ding","Tamer Kahveci","Jiang Bian","Chenglong Li","Hendrik Luesch","Yanjun Li"],"pdf_url":"https://arxiv.org/pdf/2312.07899v2.pdf","comment":"44 pages, 5 figure, 5 tables"},{"id":"http://arxiv.org/abs/2309.16672v2","updated":"2024-01-15T21:13:58Z","published":"2023-09-28T17:59:58Z","title":"Learning to Transform for Generalizable Instance-wise Invariance","summary":" Computer vision research has long aimed to build systems that are robust to\nspatial transformations found in natural data. Traditionally, this is done\nusing data augmentation or hard-coding invariances into the architecture.\nHowever, too much or too little invariance can hurt, and the correct amount is\nunknown a priori and dependent on the instance. Ideally, the appropriate\ninvariance would be learned from data and inferred at test-time.\n We treat invariance as a prediction problem. Given any image, we use a\nnormalizing flow to predict a distribution over transformations and average the\npredictions over them. Since this distribution only depends on the instance, we\ncan align instances before classifying them and generalize invariance across\nclasses. The same distribution can also be used to adapt to out-of-distribution\nposes. This normalizing flow is trained end-to-end and can learn a much larger\nrange of transformations than Augerino and InstaAug. When used as data\naugmentation, our method shows accuracy and robustness gains on CIFAR 10,\nCIFAR10-LT, and TinyImageNet.\n","authors":["Utkarsh Singhal","Carlos Esteves","Ameesh Makadia","Stella X. Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16672v2.pdf","comment":"Accepted to ICCV 2023"},{"id":"http://arxiv.org/abs/2312.14698v2","updated":"2024-01-15T21:12:03Z","published":"2023-12-22T13:57:29Z","title":"Time-changed normalizing flows for accurate SDE modeling","summary":" The generative paradigm has become increasingly important in machine learning\nand deep learning models. Among popular generative models are normalizing\nflows, which enable exact likelihood estimation by transforming a base\ndistribution through diffeomorphic transformations. Extending the normalizing\nflow framework to handle time-indexed flows gave dynamic normalizing flows, a\npowerful tool to model time series, stochastic processes, and neural stochastic\ndifferential equations (SDEs). In this work, we propose a novel variant of\ndynamic normalizing flows, a Time Changed Normalizing Flow (TCNF), based on\ntime deformation of a Brownian motion which constitutes a versatile and\nextensive family of Gaussian processes. 
This approach enables us to effectively\nmodel some SDEs, that cannot be modeled otherwise, including standard ones such\nas the well-known Ornstein-Uhlenbeck process, and generalizes prior\nmethodologies, leading to improved results and better inference and prediction\ncapability.\n","authors":["Naoufal El Bekri","Lucas Drumetz","Franck Vermet"],"pdf_url":"https://arxiv.org/pdf/2312.14698v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07961v1","updated":"2024-01-15T20:57:50Z","published":"2024-01-15T20:57:50Z","title":"Solution of the Probabilistic Lambert Problem: Connections with Optimal\n Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs","summary":" Lambert's problem concerns with transferring a spacecraft from a given\ninitial to a given terminal position within prescribed flight time via velocity\ncontrol subject to a gravitational force field. We consider a probabilistic\nvariant of the Lambert problem where the knowledge of the endpoint constraints\nin position vectors are replaced by the knowledge of their respective joint\nprobability density functions. We show that the Lambert problem with endpoint\njoint probability density constraints is a generalized optimal mass transport\n(OMT) problem, thereby connecting this classical astrodynamics problem with a\nburgeoning area of research in modern stochastic control and stochastic machine\nlearning. This newfound connection allows us to rigorously establish the\nexistence and uniqueness of solution for the probabilistic Lambert problem. The\nsame connection also helps to numerically solve the probabilistic Lambert\nproblem via diffusion regularization, i.e., by leveraging further connection of\nthe OMT with the Schr\\\"odinger bridge problem (SBP). This also shows that the\nprobabilistic Lambert problem with additive dynamic process noise is in fact a\ngeneralized SBP, and can be solved numerically using the so-called\nSchr\\\"odinger factors, as we do in this work. We explain how the resulting\nanalysis leads to solving a boundary-coupled system of reaction-diffusion PDEs\nwhere the nonlinear gravitational potential appears as the reaction rate. We\npropose novel algorithms for the same, and present illustrative numerical\nresults. Our analysis and the algorithmic framework are nonparametric, i.e., we\nmake neither statistical (e.g., Gaussian, first few moments, mixture or\nexponential family, finite dimensionality of the sufficient statistic) nor\ndynamical (e.g., Taylor series) approximations.\n","authors":["Alexis M. H. Teter","Iman Nodozi","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2401.07961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07958v1","updated":"2024-01-15T20:54:20Z","published":"2024-01-15T20:54:20Z","title":"GD-CAF: Graph Dual-stream Convolutional Attention Fusion for\n Precipitation Nowcasting","summary":" Accurate precipitation nowcasting is essential for various purposes,\nincluding flood prediction, disaster management, optimizing agricultural\nactivities, managing transportation routes and renewable energy. While several\nstudies have addressed this challenging task from a sequence-to-sequence\nperspective, most of them have focused on a single area without considering the\nexisting correlation between multiple disjoint regions. In this paper, we\nformulate precipitation nowcasting as a spatiotemporal graph sequence\nnowcasting problem. 
In particular, we introduce Graph Dual-stream Convolutional\nAttention Fusion (GD-CAF), a novel approach designed to learn from historical\nspatiotemporal graph of precipitation maps and nowcast future time step ahead\nprecipitation at different spatial locations. GD-CAF consists of\nspatio-temporal convolutional attention as well as gated fusion modules which\nare equipped with depthwise-separable convolutional operations. This\nenhancement enables the model to directly process the high-dimensional\nspatiotemporal graph of precipitation maps and exploits higher-order\ncorrelations between the data dimensions. We evaluate our model on seven years\nof precipitation maps across Europe and its neighboring areas collected from\nthe ERA5 dataset, provided by Copernicus. The model receives a fully connected\ngraph in which each node represents historical observations from a specific\nregion on the map. Consequently, each node contains a 3D tensor with time,\nheight, and width dimensions. Experimental results demonstrate that the\nproposed GD-CAF model outperforms the other examined models. Furthermore, the\naveraged seasonal spatial and temporal attention scores over the test set are\nvisualized to provide additional insights about the strongest connections\nbetween different regions or time steps. These visualizations shed light on the\ndecision-making process of our model.\n","authors":["Lorand Vatamany","Siamak Mehrkanoon"],"pdf_url":"https://arxiv.org/pdf/2401.07958v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.07957v1","updated":"2024-01-15T20:47:24Z","published":"2024-01-15T20:47:24Z","title":"Machine Perceptual Quality: Evaluating the Impact of Severe Lossy\n Compression on Audio and Image Models","summary":" In the field of neural data compression, the prevailing focus has been on\noptimizing algorithms for either classical distortion metrics, such as PSNR or\nSSIM, or human perceptual quality. With increasing amounts of data consumed by\nmachines rather than humans, a new paradigm of machine-oriented\ncompression$\\unicode{x2013}$which prioritizes the retention of features salient\nfor machine perception over traditional human-centric\ncriteria$\\unicode{x2013}$has emerged, creating several new challenges to the\ndevelopment, evaluation, and deployment of systems utilizing lossy compression.\nIn particular, it is unclear how different approaches to lossy compression will\naffect the performance of downstream machine perception tasks. To address this\nunder-explored area, we evaluate various perception\nmodels$\\unicode{x2013}$including image classification, image segmentation,\nspeech recognition, and music source separation$\\unicode{x2013}$under severe\nlossy compression. We utilize several popular codecs spanning conventional,\nneural, and generative compression architectures. Our results indicate three\nkey findings: (1) using generative compression, it is feasible to leverage\nhighly compressed data while incurring a negligible impact on machine\nperceptual quality; (2) machine perceptual quality correlates strongly with\ndeep similarity metrics, indicating a crucial role of these metrics in the\ndevelopment of machine-oriented codecs; and (3) using lossy compressed\ndatasets, (e.g. ImageNet) for pre-training can lead to counter-intuitive\nscenarios where lossy compression increases machine perceptual quality rather\nthan degrading it. 
To encourage engagement on this growing area of research,\nour code and experiments are available at:\nhttps://github.com/danjacobellis/MPQ.\n","authors":["Dan Jacobellis","Daniel Cummings","Neeraja J. Yadwadkar"],"pdf_url":"https://arxiv.org/pdf/2401.07957v1.pdf","comment":"10 pages; abridged version published in IEEE Data Compression\n Conference 2024"},{"id":"http://arxiv.org/abs/2401.01841v2","updated":"2024-01-15T20:45:21Z","published":"2024-01-03T17:19:54Z","title":"Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov\n Decision Processes","summary":" A fundamental (and largely open) challenge in sequential decision-making is\ndealing with non-stationary environments, where exogenous environmental\nconditions change over time. Such problems are traditionally modeled as\nnon-stationary Markov decision processes (NSMDP). However, existing approaches\nfor decision-making in NSMDPs have two major shortcomings: first, they assume\nthat the updated environmental dynamics at the current time are known (although\nfuture dynamics can change); and second, planning is largely pessimistic, i.e.,\nthe agent acts ``safely'' to account for the non-stationary evolution of the\nenvironment. We argue that both these assumptions are invalid in practice --\nupdated environmental conditions are rarely known, and as the agent interacts\nwith the environment, it can learn about the updated dynamics and avoid being\npessimistic, at least in states whose dynamics it is confident about. We\npresent a heuristic search algorithm called \\textit{Adaptive Monte Carlo Tree\nSearch (ADA-MCTS)} that addresses these challenges. We show that the agent can\nlearn the updated dynamics of the environment over time and then act as it\nlearns, i.e., if the agent is in a region of the state space about which it has\nupdated knowledge, it can avoid being pessimistic. To quantify ``updated\nknowledge,'' we disintegrate the aleatoric and epistemic uncertainty in the\nagent's updated belief and show how the agent can use these estimates for\ndecision-making. We compare the proposed approach with the multiple\nstate-of-the-art approaches in decision-making across multiple well-established\nopen-source problems and empirically show that our approach is faster and\nhighly adaptive without sacrificing safety.\n","authors":["Baiting Luo","Yunuo Zhang","Abhishek Dubey","Ayan Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2401.01841v2.pdf","comment":"Accepted for publication at the International Conference on\n Autonomous Agents and MultiAgent Systems (AAMAS), 2024"},{"id":"http://arxiv.org/abs/2401.07955v1","updated":"2024-01-15T20:42:16Z","published":"2024-01-15T20:42:16Z","title":"A Study on Large Language Models' Limitations in Multiple-Choice\n Question Answering","summary":" The widespread adoption of Large Language Models (LLMs) has become\ncommonplace, particularly with the emergence of open-source models. More\nimportantly, smaller models are well-suited for integration into consumer\ndevices and are frequently employed either as standalone solutions or as\nsubroutines in various AI tasks. Despite their ubiquitous use, there is no\nsystematic analysis of their specific capabilities and limitations. In this\nstudy, we tackle one of the most widely used tasks - answering Multiple Choice\nQuestion (MCQ). 
We analyze 26 small open-source models and find that 65% of the\nmodels do not understand the task, only 4 models properly select an answer from\nthe given choices, and only 5 of these models are choice order independent.\nThese results are rather alarming given the extensive use of MCQ tests with\nthese models. We recommend exercising caution and testing task understanding\nbefore using MCQ to evaluate LLMs in any field whatsoever.\n","authors":["Aisha Khatun","Daniel G. Brown"],"pdf_url":"https://arxiv.org/pdf/2401.07955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12248v2","updated":"2024-01-15T20:36:44Z","published":"2023-10-18T18:33:41Z","title":"A PAC Learning Algorithm for LTL and Omega-regular Objectives in MDPs","summary":" Linear temporal logic (LTL) and omega-regular objectives -- a superset of LTL\n-- have seen recent use as a way to express non-Markovian objectives in\nreinforcement learning. We introduce a model-based probably approximately\ncorrect (PAC) learning algorithm for omega-regular objectives in Markov\ndecision processes (MDPs). As part of the development of our algorithm, we\nintroduce the epsilon-recurrence time: a measure of the speed at which a policy\nconverges to the satisfaction of the omega-regular objective in the limit. We\nprove that our algorithm only requires a polynomial number of samples in the\nrelevant parameters, and perform experiments which confirm our theory.\n","authors":["Mateo Perez","Fabio Somenzi","Ashutosh Trivedi"],"pdf_url":"https://arxiv.org/pdf/2310.12248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04748v2","updated":"2024-01-15T20:12:22Z","published":"2023-08-09T07:36:21Z","title":"Fuzz4All: Universal Fuzzing with Large Language Models","summary":" Fuzzing has achieved tremendous success in discovering bugs and\nvulnerabilities in various software systems. Systems under test (SUTs) that\ntake in programming or formal language as inputs, e.g., compilers, runtime\nengines, constraint solvers, and software libraries with accessible APIs, are\nespecially important as they are fundamental building blocks of software\ndevelopment. However, existing fuzzers for such systems often target a specific\nlanguage, and thus cannot be easily applied to other languages or even other\nversions of the same language. Moreover, the inputs generated by existing\nfuzzers are often limited to specific features of the input language, and thus\ncan hardly reveal bugs related to other or new features. This paper presents\nFuzz4All, the first fuzzer that is universal in the sense that it can target\nmany different input languages and many different features of these languages.\nThe key idea behind Fuzz4All is to leverage large language models (LLMs) as an\ninput generation and mutation engine, which enables the approach to produce\ndiverse and realistic inputs for any practically relevant language. To realize\nthis potential, we present a novel autoprompting technique, which creates LLM\nprompts that are wellsuited for fuzzing, and a novel LLM-powered fuzzing loop,\nwhich iteratively updates the prompt to create new fuzzing inputs. We evaluate\nFuzz4All on nine systems under test that take in six different languages (C,\nC++, Go, SMT2, Java and Python) as inputs. The evaluation shows, across all six\nlanguages, that universal fuzzing achieves higher coverage than existing,\nlanguage-specific fuzzers. 
Furthermore, Fuzz4All has identified 98 bugs in\nwidely used systems, such as GCC, Clang, Z3, CVC5, OpenJDK, and the Qiskit\nquantum computing platform, with 64 bugs already confirmed by developers as\npreviously unknown.\n","authors":["Chunqiu Steven Xia","Matteo Paltenghi","Jia Le Tian","Michael Pradel","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.04748v2.pdf","comment":"Accepted at ICSE 2024"},{"id":"http://arxiv.org/abs/2401.07937v1","updated":"2024-01-15T19:57:07Z","published":"2024-01-15T19:57:07Z","title":"Integrate Any Omics: Towards genome-wide data integration for patient\n stratification","summary":" High-throughput omics profiling advancements have greatly enhanced cancer\npatient stratification. However, incomplete data in multi-omics integration\npresents a significant challenge, as traditional methods like sample exclusion\nor imputation often compromise biological diversity and dependencies.\nFurthermore, the critical task of accurately classifying new patients with\npartial omics data into existing subtypes is commonly overlooked. To address\nthese issues, we introduce IntegrAO (Integrate Any Omics), an unsupervised\nframework for integrating incomplete multi-omics data and classifying new\nsamples. IntegrAO first combines partially overlapping patient graphs from\ndiverse omics sources and utilizes graph neural networks to produce unified\npatient embeddings. Our systematic evaluation across five cancer cohorts\ninvolving six omics modalities demonstrates IntegrAO's robustness to missing\ndata and its accuracy in classifying new samples with partial profiles. An\nacute myeloid leukemia case study further validates its capability to uncover\nbiological and clinical heterogeneity in incomplete datasets. IntegrAO's\nability to handle heterogeneous and incomplete data makes it an essential tool\nfor precision oncology, offering a holistic approach to patient\ncharacterization.\n","authors":["Shihao Ma","Andy G. X. Zeng","Benjamin Haibe-Kains","Anna Goldenberg","John E Dick","Bo Wang"],"pdf_url":"https://arxiv.org/pdf/2401.07937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07936v1","updated":"2024-01-15T19:53:35Z","published":"2024-01-15T19:53:35Z","title":"A Globally Convergent Algorithm for Neural Network Parameter\n Optimization Based on Difference-of-Convex Functions","summary":" We propose an algorithm for optimizing the parameters of single hidden layer\nneural networks. Specifically, we derive a blockwise difference-of-convex (DC)\nfunctions representation of the objective function. Based on the latter, we\npropose a block coordinate descent (BCD) approach that we combine with a\ntailored difference-of-convex functions algorithm (DCA). We prove global\nconvergence of the proposed algorithm. Furthermore, we mathematically analyze\nthe convergence rate of parameters and the convergence rate in value (i.e., the\ntraining loss). We give conditions under which our algorithm converges linearly\nor even faster depending on the local shape of the loss function. 
We confirm\nour theoretical derivations numerically and compare our algorithm against\nstate-of-the-art gradient-based solvers in terms of both training loss and test\nloss.\n","authors":["Daniel Tschernutter","Mathias Kraus","Stefan Feuerriegel"],"pdf_url":"https://arxiv.org/pdf/2401.07936v1.pdf","comment":"accepted by TMLR"},{"id":"http://arxiv.org/abs/2401.07931v1","updated":"2024-01-15T19:47:14Z","published":"2024-01-15T19:47:14Z","title":"Vertical Federated Image Segmentation","summary":" With the popularization of AI solutions for image based problems, there has\nbeen a growing concern for both data privacy and acquisition. In a large number\nof cases, information is located on separate data silos and it can be difficult\nfor a developer to consolidate all of it in a fashion that is appropriate for\nmachine learning model development. Alongside this, a portion of these\nlocalized data regions may not have access to a labelled ground truth. This\nindicates that they have the capacity to reach conclusions numerically, but are\nnot able to assign classifications amid a lack of pertinent information. Such a\ndetermination is often negligible, especially when attempting to develop image\nbased solutions that often necessitate this capability. With this being the\ncase, we propose an innovative vertical federated learning (VFL) model\narchitecture that can operate under this common set of conditions. This is the\nfirst (and currently the only) implementation of a system that can work under\nthe constraints of a VFL environment and perform image segmentation while\nmaintaining nominal accuracies. We achieved this by utilizing an FCN that\nboasts the ability to operate on federates that lack labelled data and\nprivately share the respective weights with a central server, that of which\nhosts the necessary features for classification. Tests were conducted on the\nCamVid dataset in order to determine the impact of heavy feature compression\nrequired for the transfer of information between federates, as well as to reach\nnominal conclusions about the overall performance metrics when working under\nsuch constraints.\n","authors":["Paul K. Mandal","Cole Leo"],"pdf_url":"https://arxiv.org/pdf/2401.07931v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.07927v1","updated":"2024-01-15T19:39:15Z","published":"2024-01-15T19:39:15Z","title":"Can Large Language Models Explain Themselves?","summary":" Instruction-tuned large language models (LLMs) excel at many tasks, and will\neven provide explanations for their behavior. Since these models are directly\naccessible to the public, there is a risk that convincing and wrong\nexplanations can lead to unsupported confidence in LLMs. Therefore,\ninterpretability-faithfulness of self-explanations is an important\nconsideration for AI Safety. Assessing the interpretability-faithfulness of\nthese explanations, termed self-explanations, is challenging as the models are\ntoo complex for humans to annotate what is a correct explanation. To address\nthis, we propose employing self-consistency checks as a measure of\nfaithfulness. For example, if an LLM says a set of words is important for\nmaking a prediction, then it should not be able to make the same prediction\nwithout these words. While self-consistency checks are a common approach to\nfaithfulness, they have not previously been applied to LLM's self-explanations.\nWe apply self-consistency checks to three types of self-explanations:\ncounterfactuals, importance measures, and redactions. 
Our work demonstrate that\nfaithfulness is both task and model dependent, e.g., for sentiment\nclassification, counterfactual explanations are more faithful for Llama2,\nimportance measures for Mistral, and redaction for Falcon 40B. Finally, our\nfindings are robust to prompt-variations.\n","authors":["Andreas Madsen","Sarath Chandar","Siva Reddy"],"pdf_url":"https://arxiv.org/pdf/2401.07927v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.03374v3","updated":"2024-01-15T19:12:13Z","published":"2023-03-06T18:56:39Z","title":"To Stay or Not to Stay in the Pre-train Basin: Insights on Ensembling in\n Transfer Learning","summary":" Transfer learning and ensembling are two popular techniques for improving the\nperformance and robustness of neural networks. Due to the high cost of\npre-training, ensembles of models fine-tuned from a single pre-trained\ncheckpoint are often used in practice. Such models end up in the same basin of\nthe loss landscape, which we call the pre-train basin, and thus have limited\ndiversity. In this work, we show that ensembles trained from a single\npre-trained checkpoint may be improved by better exploring the pre-train basin,\nhowever, leaving the basin results in losing the benefits of transfer learning\nand in degradation of the ensemble quality. Based on the analysis of existing\nexploration methods, we propose a more effective modification of the Snapshot\nEnsembles (SSE) for transfer learning setup, StarSSE, which results in stronger\nensembles and uniform model soups.\n","authors":["Ildus Sadrtdinov","Dmitrii Pozdeev","Dmitry Vetrov","Ekaterina Lobacheva"],"pdf_url":"https://arxiv.org/pdf/2303.03374v3.pdf","comment":"Published in NeurIPS 2023. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2110.10083v4","updated":"2024-01-15T18:49:36Z","published":"2021-10-19T16:20:49Z","title":"Contrastive Active Inference","summary":" Active inference is a unifying theory for perception and action resting upon\nthe idea that the brain maintains an internal model of the world by minimizing\nfree energy. From a behavioral perspective, active inference agents can be seen\nas self-evidencing beings that act to fulfill their optimistic predictions,\nnamely preferred outcomes or goals. In contrast, reinforcement learning\nrequires human-designed rewards to accomplish any desired outcome. Although\nactive inference could provide a more natural self-supervised objective for\ncontrol, its applicability has been limited because of the shortcomings in\nscaling the approach to complex environments. In this work, we propose a\ncontrastive objective for active inference that strongly reduces the\ncomputational burden in learning the agent's generative model and planning\nfuture actions. Our method performs notably better than likelihood-based active\ninference in image-based tasks, while also being computationally cheaper and\neasier to train. We compare to reinforcement learning agents that have access\nto human-designed reward functions, showing that our approach closely matches\ntheir performance. Finally, we also show that contrastive methods perform\nsignificantly better in the case of distractors in the environment and that our\nmethod is able to generalize goals to variations in the background. 
Website and\ncode: https://contrastive-aif.github.io/\n","authors":["Pietro Mazzaglia","Tim Verbelen","Bart Dhoedt"],"pdf_url":"https://arxiv.org/pdf/2110.10083v4.pdf","comment":"Accepted as a conference paper at 35th Conference on Neural\n Information Processing Systems (NeurIPS 2021)"},{"id":"http://arxiv.org/abs/2401.07889v1","updated":"2024-01-15T18:39:13Z","published":"2024-01-15T18:39:13Z","title":"Machine Learning Techniques to Identify Hand Gestures amidst Forearm\n Muscle Signals","summary":" This study investigated the use of forearm EMG data for distinguishing eight\nhand gestures, employing the Neural Network and Random Forest algorithms on\ndata from ten participants. The Neural Network achieved 97 percent accuracy\nwith 1000-millisecond windows, while the Random Forest achieved 85 percent\naccuracy with 200-millisecond windows. Larger window sizes improved gesture\nclassification due to increased temporal resolution. The Random Forest\nexhibited faster processing at 92 milliseconds, compared to the Neural\nNetwork's 124 milliseconds. In conclusion, the study identified a Neural\nNetwork with a 1000-millisecond stream as the most accurate (97 percent), and a\nRandom Forest with a 200-millisecond stream as the most efficient (85 percent).\nFuture research should focus on increasing sample size, incorporating more hand\ngestures, and exploring different feature extraction methods and modeling\nalgorithms to enhance system accuracy and efficiency.\n","authors":["Ryan Cho","Sunil Patel","Kyu Taek Cho","Jaejin Hwang"],"pdf_url":"https://arxiv.org/pdf/2401.07889v1.pdf","comment":"21 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.07888v1","updated":"2024-01-15T18:32:53Z","published":"2024-01-15T18:32:53Z","title":"Multifidelity domain decomposition-based physics-informed neural\n networks for time-dependent problems","summary":" Multiscale problems are challenging for neural network-based discretizations\nof differential equations, such as physics-informed neural networks (PINNs).\nThis can be (partly) attributed to the so-called spectral bias of neural\nnetworks. To improve the performance of PINNs for time-dependent problems, a\ncombination of multifidelity stacking PINNs and domain decomposition-based\nfinite basis PINNs are employed. In particular, to learn the high-fidelity part\nof the multifidelity model, a domain decomposition in time is employed. The\nperformance is investigated for a pendulum and a two-frequency problem as well\nas the Allen-Cahn equation. It can be observed that the domain decomposition\napproach clearly improves the PINN and stacking PINN approaches.\n","authors":["Alexander Heinlein","Amanda A. Howard","Damien Beecroft","Panos Stinis"],"pdf_url":"https://arxiv.org/pdf/2401.07888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07886v1","updated":"2024-01-15T18:28:17Z","published":"2024-01-15T18:28:17Z","title":"Learned Best-Effort LLM Serving","summary":" Many applications must provide low-latency LLM service to users or risk\nunacceptable user experience. However, over-provisioning resources to serve\nfluctuating request patterns is often prohibitively expensive. In this work, we\npresent a best-effort serving system that employs deep reinforcement learning\nto adjust service quality based on the task distribution and system load. 
Our\nbest-effort system can maintain availability with over 10x higher client\nrequest rates, serves above 96% of peak performance 4.1x more often, and serves\nabove 98% of peak performance 2.3x more often than static serving on\nunpredictable workloads. Our learned router is robust to shifts in both the\narrival and task distribution. Compared to static serving, learned best-effort\nserving allows for cost-efficient serving through increased hardware utility.\nAdditionally, we argue that learned best-effort LLM serving is applicable in\nwide variety of settings and provides application developers great flexibility\nto meet their specific needs.\n","authors":["Siddharth Jha","Coleman Hooper","Xiaoxuan Liu","Sehoon Kim","Kurt Keutzer"],"pdf_url":"https://arxiv.org/pdf/2401.07886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07883v1","updated":"2024-01-15T18:25:18Z","published":"2024-01-15T18:25:18Z","title":"The Chronicles of RAG: The Retriever, the Chunk and the Generator","summary":" Retrieval Augmented Generation (RAG) has become one of the most popular\nparadigms for enabling LLMs to access external data, and also as a mechanism\nfor grounding to mitigate against hallucinations. When implementing RAG you can\nface several challenges like effective integration of retrieval models,\nefficient representation learning, data diversity, computational efficiency\noptimization, evaluation, and quality of text generation. Given all these\nchallenges, every day a new technique to improve RAG appears, making it\nunfeasible to experiment with all combinations for your problem. In this\ncontext, this paper presents good practices to implement, optimize, and\nevaluate RAG for the Brazilian Portuguese language, focusing on the\nestablishment of a simple pipeline for inference and experiments. We explored a\ndiverse set of methods to answer questions about the first Harry Potter book.\nTo generate the answers we used the OpenAI's gpt-4, gpt-4-1106-preview,\ngpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the\nretriever, our approach achieved an improvement of MRR@10 by 35.4% compared to\nthe baseline. When optimizing the input size in the application, we observed\nthat it is possible to further enhance it by 2.4%. Finally, we present the\ncomplete architecture of the RAG with our recommendations. As result, we moved\nfrom a baseline of 57.88% to a maximum relative score of 98.61%.\n","authors":["Paulo Finardi","Leonardo Avila","Rodrigo Castaldoni","Pedro Gengo","Celio Larcher","Marcos Piau","Pablo Costa","Vinicius Caridá"],"pdf_url":"https://arxiv.org/pdf/2401.07883v1.pdf","comment":"16 pages, 15 figures, 9 tables"},{"id":"http://arxiv.org/abs/2401.07874v1","updated":"2024-01-15T18:08:31Z","published":"2024-01-15T18:08:31Z","title":"Do stable neural networks exist for classification problems? -- A new\n view on stability in AI","summary":" In deep learning (DL) the instability phenomenon is widespread and well\ndocumented, most commonly using the classical measure of stability, the\nLipschitz constant. While a small Lipchitz constant is traditionally viewed as\nguarantying stability, it does not capture the instability phenomenon in DL for\nclassification well. The reason is that a classification function -- which is\nthe target function to be approximated -- is necessarily discontinuous, thus\nhaving an 'infinite' Lipchitz constant. 
As a result, the classical approach\nwill deem every classification function unstable, yet basic classification\nfunctions a la 'is there a cat in the image?' will typically be locally very\n'flat' -- and thus locally stable -- except at the decision boundary. The lack\nof an appropriate measure of stability hinders a rigorous theory for stability\nin DL, and consequently, there are no proper approximation theoretic results\nthat can guarantee the existence of stable networks for classification\nfunctions. In this paper we introduce a novel stability measure\n$\\mathscr{S}(f)$, for any classification function $f$, appropriate to study the\nstability of discontinuous functions and their approximations. We further prove\ntwo approximation theorems: First, for any $\\epsilon > 0$ and any\nclassification function $f$ on a \\emph{compact set}, there is a neural network\n(NN) $\\psi$, such that $\\psi - f \\neq 0$ only on a set of measure $< \\epsilon$,\nmoreover, $\\mathscr{S}(\\psi) \\geq \\mathscr{S}(f) - \\epsilon$ (as accurate and\nstable as $f$ up to $\\epsilon$). Second, for any classification function $f$\nand $\\epsilon > 0$, there exists a NN $\\psi$ such that $\\psi = f$ on the set of\npoints that are at least $\\epsilon$ away from the decision boundary.\n","authors":["Z. N. D. Liu","A. C. Hansen"],"pdf_url":"https://arxiv.org/pdf/2401.07874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07871v1","updated":"2024-01-15T18:06:59Z","published":"2024-01-15T18:06:59Z","title":"Explainable Predictive Maintenance: A Survey of Current Methods,\n Challenges and Opportunities","summary":" Predictive maintenance is a well studied collection of techniques that aims\nto prolong the life of a mechanical system by using artificial intelligence and\nmachine learning to predict the optimal time to perform maintenance. The\nmethods allow maintainers of systems and hardware to reduce financial and time\ncosts of upkeep. As these methods are adopted for more serious and potentially\nlife-threatening applications, the human operators need trust the predictive\nsystem. This attracts the field of Explainable AI (XAI) to introduce\nexplainability and interpretability into the predictive system. XAI brings\nmethods to the field of predictive maintenance that can amplify trust in the\nusers while maintaining well-performing systems. This survey on explainable\npredictive maintenance (XPM) discusses and presents the current methods of XAI\nas applied to predictive maintenance while following the Preferred Reporting\nItems for Systematic Reviews and Meta-Analyses (PRISMA) 2020 guidelines. We\ncategorize the different XPM methods into groups that follow the XAI\nliterature. Additionally, we include current challenges and a discussion on\nfuture research directions in XPM.\n","authors":["Logan Cummins","Alex Sommers","Somayeh Bakhtiari Ramezani","Sudip Mittal","Joseph Jabour","Maria Seale","Shahram Rahimi"],"pdf_url":"https://arxiv.org/pdf/2401.07871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07862v1","updated":"2024-01-15T17:52:15Z","published":"2024-01-15T17:52:15Z","title":"Adaptive Neural-Operator Backstepping Control of a Benchmark Hyperbolic\n PDE","summary":" To stabilize PDEs, feedback controllers require gain kernel functions, which\nare themselves governed by PDEs. Furthermore, these gain-kernel PDEs depend on\nthe PDE plants' functional coefficients. The functional coefficients in PDE\nplants are often unknown. 
This requires an adaptive approach to PDE control,\ni.e., an estimation of the plant coefficients conducted concurrently with\ncontrol, where a separate PDE for the gain kernel must be solved at each\ntimestep upon the update in the plant coefficient function estimate. Solving a\nPDE at each timestep is computationally expensive and a barrier to the\nimplementation of real-time adaptive control of PDEs. Recently, results in\nneural operator (NO) approximations of functional mappings have been introduced\ninto PDE control, for replacing the computation of the gain kernel with a\nneural network that is trained, once offline, and reused in real-time for rapid\nsolution of the PDEs. In this paper, we present the first result on applying\nNOs in adaptive PDE control, presented for a benchmark 1-D hyperbolic PDE with\nrecirculation. We establish global stabilization via Lyapunov analysis, in the\nplant and parameter error states, and also present an alternative approach, via\npassive identifiers, which avoids the strong assumptions on kernel\ndifferentiability. We then present numerical simulations demonstrating\nstability and observe speedups up to three orders of magnitude, highlighting\nthe real-time efficacy of neural operators in adaptive control. Our code\n(Github) is made publicly available for future researchers.\n","authors":["Maxence Lamarque","Luke Bhan","Yuanyuan Shi","Miroslav Krstic"],"pdf_url":"https://arxiv.org/pdf/2401.07862v1.pdf","comment":"16.5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2206.09507v2","updated":"2024-01-15T17:35:33Z","published":"2022-06-19T23:37:24Z","title":"Resource-Efficient Separation Transformer","summary":" Transformers have recently achieved state-of-the-art performance in speech\nseparation. These models, however, are computationally demanding and require a\nlot of learnable parameters. This paper explores Transformer-based speech\nseparation with a reduced computational cost. Our main contribution is the\ndevelopment of the Resource-Efficient Separation Transformer (RE-SepFormer), a\nself-attention-based architecture that reduces the computational burden in two\nways. First, it uses non-overlapping blocks in the latent space. Second, it\noperates on compact latent summaries calculated from each chunk. The\nRE-SepFormer reaches a competitive performance on the popular WSJ0-2Mix and\nWHAM! datasets in both causal and non-causal settings. Remarkably, it scales\nsignificantly better than the previous Transformer-based architectures in terms\nof memory and inference time, making it more suitable for processing long\nmixtures.\n","authors":["Luca Della Libera","Cem Subakan","Mirco Ravanelli","Samuele Cornell","Frédéric Lepoutre","François Grondin"],"pdf_url":"https://arxiv.org/pdf/2206.09507v2.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2302.01463v3","updated":"2024-01-15T17:27:40Z","published":"2023-02-02T23:32:24Z","title":"Gradient Descent with Linearly Correlated Noise: Theory and Applications\n to Differential Privacy","summary":" We study gradient descent under linearly correlated noise. Our work is\nmotivated by recent practical methods for optimization with differential\nprivacy (DP), such as DP-FTRL, which achieve strong performance in settings\nwhere privacy amplification techniques are infeasible (such as in federated\nlearning). These methods inject privacy noise through a matrix factorization\nmechanism, making the noise linearly correlated over iterations. 
We propose a\nsimplified setting that distills key facets of these methods and isolates the\nimpact of linearly correlated noise. We analyze the behavior of gradient\ndescent in this setting, for both convex and non-convex functions. Our analysis\nis demonstrably tighter than prior work and recovers multiple important special\ncases exactly (including anticorrelated perturbed gradient descent). We use our\nresults to develop new, effective matrix factorizations for differentially\nprivate optimization, and highlight the benefits of these factorizations\ntheoretically and empirically.\n","authors":["Anastasia Koloskova","Ryan McKenna","Zachary Charles","Keith Rush","Brendan McMahan"],"pdf_url":"https://arxiv.org/pdf/2302.01463v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07844v1","updated":"2024-01-15T17:20:17Z","published":"2024-01-15T17:20:17Z","title":"The ODE Method for Stochastic Approximation and Reinforcement Learning\n with Markovian Noise","summary":" Stochastic approximation is a class of algorithms that update a vector\niteratively, incrementally, and stochastically, including, e.g., stochastic\ngradient descent and temporal difference learning. One fundamental challenge in\nanalyzing a stochastic approximation algorithm is to establish its stability,\ni.e., to show that the stochastic vector iterates are bounded almost surely. In\nthis paper, we extend the celebrated Borkar-Meyn theorem for stability from the\nMartingale difference noise setting to the Markovian noise setting, which\ngreatly improves its applicability in reinforcement learning, especially in\nthose off-policy reinforcement learning algorithms with linear function\napproximation and eligibility traces. Central to our analysis is the\ndiminishing asymptotic rate of change of a few functions, which is implied by\nboth a form of strong law of large numbers and a commonly used V4 Lyapunov\ndrift condition and trivially holds if the Markov chain is finite and\nirreducible.\n","authors":["Shuze Liu","Shuhang Chen","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07836v1","updated":"2024-01-15T17:06:02Z","published":"2024-01-15T17:06:02Z","title":"Two Types of AI Existential Risk: Decisive and Accumulative","summary":" The conventional discourse on existential risks (x-risks) from AI typically\nfocuses on abrupt, dire events caused by advanced AI systems, particularly\nthose that might achieve or surpass human-level intelligence. These events have\nsevere consequences that either lead to human extinction or irreversibly\ncripple human civilization to a point beyond recovery. This discourse, however,\noften neglects the serious possibility of AI x-risks manifesting incrementally\nthrough a series of smaller yet interconnected disruptions, gradually crossing\ncritical thresholds over time. This paper contrasts the conventional \"decisive\nAI x-risk hypothesis\" with an \"accumulative AI x-risk hypothesis.\" While the\nformer envisions an overt AI takeover pathway, characterized by scenarios like\nuncontrollable superintelligence, the latter suggests a different causal\npathway to existential catastrophes. This involves a gradual accumulation of\ncritical AI-induced threats such as severe vulnerabilities and systemic erosion\nof econopolitical structures. The accumulative hypothesis suggests a boiling\nfrog scenario where incremental AI risks slowly converge, undermining\nresilience until a triggering event results in irreversible collapse. 
Through\nsystems analysis, this paper examines the distinct assumptions differentiating\nthese two hypotheses. It is then argued that the accumulative view reconciles\nseemingly incompatible perspectives on AI risks. The implications of\ndifferentiating between these causal pathways -- the decisive and the\naccumulative -- for the governance of AI risks as well as long-term AI safety\nare discussed.\n","authors":["Atoosa Kasirzadeh"],"pdf_url":"https://arxiv.org/pdf/2401.07836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.02423v2","updated":"2024-01-15T16:42:13Z","published":"2022-02-04T22:47:53Z","title":"Improved Information Theoretic Generalization Bounds for Distributed and\n Federated Learning","summary":" We consider information-theoretic bounds on expected generalization error for\nstatistical learning problems in a networked setting. In this setting, there\nare $K$ nodes, each with its own independent dataset, and the models from each\nnode have to be aggregated into a final centralized model. We consider both\nsimple averaging of the models as well as more complicated multi-round\nalgorithms. We give upper bounds on the expected generalization error for a\nvariety of problems, such as those with Bregman divergence or Lipschitz\ncontinuous losses, that demonstrate an improved dependence of $1/K$ on the\nnumber of nodes. These \"per node\" bounds are in terms of the mutual information\nbetween the training dataset and the trained weights at each node, and are\ntherefore useful in describing the generalization properties inherent to having\ncommunication or privacy constraints at each node.\n","authors":["L. P. Barnes","Alex Dytso","H. V. Poor"],"pdf_url":"https://arxiv.org/pdf/2202.02423v2.pdf","comment":"This version of the paper adds an assumption that was missing from\n Theorem 4 for loss functions of type (i). Thanks to Peyman Gholami for\n spotting this bug"},{"id":"http://arxiv.org/abs/2305.16501v2","updated":"2024-01-15T16:39:52Z","published":"2023-05-25T22:07:29Z","title":"Strategic Classification under Unknown Personalized Manipulation","summary":" We study the fundamental mistake bound and sample complexity in the strategic\nclassification, where agents can strategically manipulate their feature vector\nup to an extent in order to be predicted as positive. For example, given a\nclassifier determining college admission, student candidates may try to take\neasier classes to improve their GPA, retake SAT and change schools in an effort\nto fool the classifier. Ball manipulations are a widely studied class of\nmanipulations in the literature, where agents can modify their feature vector\nwithin a bounded radius ball. Unlike most prior work, our work considers\nmanipulations to be personalized, meaning that agents can have different levels\nof manipulation abilities (e.g., varying radii for ball manipulations), and\nunknown to the learner.\n We formalize the learning problem in an interaction model where the learner\nfirst deploys a classifier and the agent manipulates the feature vector within\ntheir manipulation set to game the deployed classifier. We investigate various\nscenarios in terms of the information available to the learner during the\ninteraction, such as observing the original feature vector before or after\ndeployment, observing the manipulated feature vector, or not seeing either the\noriginal or the manipulated feature vector. We begin by providing online\nmistake bounds and PAC sample complexity in these scenarios for ball\nmanipulations. 
We also explore non-ball manipulations and show that, even in\nthe simplest scenario where both the original and the manipulated feature\nvectors are revealed, the mistake bounds and sample complexity are lower\nbounded by $\\Omega(|H|)$ when the target function belongs to a known class $H$.\n","authors":["Han Shao","Avrim Blum","Omar Montasser"],"pdf_url":"https://arxiv.org/pdf/2305.16501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07809v1","updated":"2024-01-15T16:30:12Z","published":"2024-01-15T16:30:12Z","title":"Optimal Data Splitting in Distributed Optimization for Machine Learning","summary":" The distributed optimization problem has become increasingly relevant\nrecently. It has a lot of advantages such as processing a large amount of data\nin less time compared to non-distributed methods. However, most distributed\napproaches suffer from a significant bottleneck - the cost of communications.\nTherefore, a large amount of research has recently been directed at solving\nthis problem. One such approach uses local data similarity. In particular,\nthere exists an algorithm provably optimally exploiting the similarity\nproperty. But this result, as well as results from other works solve the\ncommunication bottleneck by focusing only on the fact that communication is\nsignificantly more expensive than local computing and does not take into\naccount the various capacities of network devices and the different\nrelationship between communication time and local computing expenses. We\nconsider this setup and the objective of this study is to achieve an optimal\nratio of distributed data between the server and local machines for any costs\nof communications and local computations. The running times of the network are\ncompared between uniform and optimal distributions. The superior theoretical\nperformance of our solutions is experimentally validated.\n","authors":["Daniil Medyakov","Gleb Molodtsov","Aleksandr Beznosikov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2401.07809v1.pdf","comment":"17 pages, 2 figures, Doklady Rossijskoj akademii nauk:\n https://journals.rcsi.science/2686-9543/article/view/247131"},{"id":"http://arxiv.org/abs/2401.04464v2","updated":"2024-01-15T16:12:45Z","published":"2024-01-09T09:58:42Z","title":"PhilEO Bench: Evaluating Geo-Spatial Foundation Models","summary":" Massive amounts of unlabelled data are captured by Earth Observation (EO)\nsatellites, with the Sentinel-2 constellation generating 1.6 TB of data daily.\nThis makes Remote Sensing a data-rich domain well suited to Machine Learning\n(ML) solutions. However, a bottleneck in applying ML models to EO is the lack\nof annotated data as annotation is a labour-intensive and costly process. As a\nresult, research in this domain has focused on Self-Supervised Learning and\nFoundation Model approaches. This paper addresses the need to evaluate\ndifferent Foundation Models on a fair and uniform benchmark by introducing the\nPhilEO Bench, a novel evaluation framework for EO Foundation Models. The\nframework comprises of a testbed and a novel 400 GB Sentinel-2 dataset\ncontaining labels for three downstream tasks, building density estimation, road\nsegmentation, and land cover classification. 
We present experiments using our\nframework evaluating different Foundation Models, including Prithvi and SatMAE,\nat multiple n-shots and convergence rates.\n","authors":["Casper Fibaek","Luke Camilleri","Andreas Luyts","Nikolaos Dionelis","Bertrand Le Saux"],"pdf_url":"https://arxiv.org/pdf/2401.04464v2.pdf","comment":"6 pages, 5 figures, Submitted to IGARSS 2024"},{"id":"http://arxiv.org/abs/2401.07796v1","updated":"2024-01-15T16:04:46Z","published":"2024-01-15T16:04:46Z","title":"Fusing Echocardiography Images and Medical Records for Continuous\n Patient Stratification","summary":" Deep learning now enables automatic and robust extraction of cardiac function\ndescriptors from echocardiographic sequences, such as ejection fraction or\nstrain. These descriptors provide fine-grained information that physicians\nconsider, in conjunction with more global variables from the clinical record,\nto assess patients' condition. Drawing on novel transformer models applied to\ntabular data (e.g., variables from electronic health records), we propose a\nmethod that considers all descriptors extracted from medical records and\nechocardiograms to learn the representation of a difficult-to-characterize\ncardiovascular pathology, namely hypertension. Our method first projects each\nvariable into its own representation space using modality-specific approaches.\nThese standardized representations of multimodal data are then fed to a\ntransformer encoder, which learns to merge them into a comprehensive\nrepresentation of the patient through a pretext task of predicting a clinical\nrating. This pretext task is formulated as an ordinal classification to enforce\na pathological continuum in the representation space. We observe the major\ntrends along this continuum for a cohort of 239 hypertensive patients to\ndescribe, with unprecedented gradation, the effect of hypertension on a number\nof cardiac function descriptors. Our analysis shows that i) pretrained weights\nfrom a foundation model allow to reach good performance (83% accuracy) even\nwith limited data (less than 200 training samples), ii) trends across the\npopulation are reproducible between trainings, and iii) for descriptors whose\ninteractions with hypertension are well documented, patterns are consistent\nwith prior physiological knowledge.\n","authors":["Nathan Painchaud","Pierre-Yves Courand","Pierre-Marc Jodoin","Nicolas Duchateau","Olivier Bernard"],"pdf_url":"https://arxiv.org/pdf/2401.07796v1.pdf","comment":"10 pages, submitted to IEEE TMI"},{"id":"http://arxiv.org/abs/2401.07788v1","updated":"2024-01-15T15:54:54Z","published":"2024-01-15T15:54:54Z","title":"Activations and Gradients Compression for Model-Parallel Training","summary":" Large neural networks require enormous computational clusters of machines.\nModel-parallel training, when the model architecture is partitioned\nsequentially between workers, is a popular approach for training modern models.\nInformation compression can be applied to decrease workers communication time,\nas it is often a bottleneck in such systems. This work explores how\nsimultaneous compression of activations and gradients in model-parallel\ndistributed training setup affects convergence. We analyze compression methods\nsuch as quantization and TopK compression, and also experiment with error\ncompensation techniques. Moreover, we employ TopK with AQ-SGD per-batch error\nfeedback approach. We conduct experiments on image classification and language\nmodel fine-tuning tasks. 
Our findings demonstrate that gradients require milder\ncompression rates than activations. We observe that $K=10\\%$ is the lowest TopK\ncompression level, which does not harm model convergence severely. Experiments\nalso show that models trained with TopK perform well only when compression is\nalso applied during inference. We find that error feedback techniques do not\nimprove model-parallel training compared to plain compression, but allow model\ninference without compression with almost no quality drop. Finally, when\napplied with the AQ-SGD approach, TopK stronger than with $ K=30\\%$ worsens\nmodel performance significantly.\n","authors":["Mikhail Rudakov","Aleksandr Beznosikov","Yaroslav Kholodov","Alexander Gasnikov"],"pdf_url":"https://arxiv.org/pdf/2401.07788v1.pdf","comment":"17 pages, 6 figures, 5 tables, Doklady Rossijskoj akademii nauk:\n https://journals.rcsi.science/2686-9543/article/view/247111"},{"id":"http://arxiv.org/abs/2401.07787v1","updated":"2024-01-15T15:53:13Z","published":"2024-01-15T15:53:13Z","title":"Improving OCR Quality in 19th Century Historical Documents Using a\n Combined Machine Learning Based Approach","summary":" This paper addresses a major challenge to historical research on the 19th\ncentury. Large quantities of sources have become digitally available for the\nfirst time, while extraction techniques are lagging behind. Therefore, we\nresearched machine learning (ML) models to recognise and extract complex data\nstructures in a high-value historical primary source, the Schematismus. It\nrecords every single person in the Habsburg civil service above a certain\nhierarchical level between 1702 and 1918 and documents the genesis of the\ncentral administration over two centuries. Its complex and intricate structure\nas well as its enormous size have so far made any more comprehensive analysis\nof the administrative and social structure of the later Habsburg Empire on the\nbasis of this source impossible. We pursued two central objectives: Primarily,\nthe improvement of the OCR quality, for which we considered an improved\nstructure recognition to be essential; in the further course, it turned out\nthat this also made the extraction of the data structure possible. We chose\nFaster R-CNN as base for the ML architecture for structure recognition. In\norder to obtain the required amount of training data quickly and economically,\nwe synthesised Hof- und Staatsschematismus-style data, which we used to train\nour model. The model was then fine-tuned with a smaller set of manually\nannotated historical source data. We then used Tesseract-OCR, which was further\noptimised for the style of our documents, to complete the combined structure\nextraction and OCR process. Results show a significant decrease in the two\nstandard parameters of OCR-performance, WER and CER (where lower values are\nbetter). 
Combined structure detection and fine-tuned OCR improved CER and WER\nvalues by remarkable 71.98 percent (CER) respectively 52.49 percent (WER).\n","authors":["David Fleischhacker","Wolfgang Goederle","Roman Kern"],"pdf_url":"https://arxiv.org/pdf/2401.07787v1.pdf","comment":"29 pages, 23 figures, 7 tables"},{"id":"http://arxiv.org/abs/2301.12755v2","updated":"2024-01-15T15:52:46Z","published":"2023-01-30T10:01:18Z","title":"Efficient Node Selection in Private Personalized Decentralized Learning","summary":" Personalized decentralized learning is a promising paradigm for distributed\nlearning, enabling each node to train a local model on its own data and\ncollaborate with other nodes to improve without sharing any data. However, this\napproach poses significant privacy risks, as nodes may inadvertently disclose\nsensitive information about their data or preferences through their\ncollaboration choices. In this paper, we propose Private Personalized\nDecentralized Learning (PPDL), a novel approach that combines secure\naggregation and correlated adversarial multi-armed bandit optimization to\nprotect node privacy while facilitating efficient node selection. By leveraging\ndependencies between different arms, represented by potential collaborators, we\ndemonstrate that PPDL can effectively identify suitable collaborators solely\nbased on aggregated models. Additionally, we show that PPDL surpasses previous\nnon-private methods in model performance on standard benchmarks under label and\ncovariate shift scenarios.\n","authors":["Edvin Listo Zec","Johan Östman","Olof Mogren","Daniel Gillblad"],"pdf_url":"https://arxiv.org/pdf/2301.12755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.14972v2","updated":"2024-01-15T15:44:10Z","published":"2023-12-20T19:27:59Z","title":"A Trade-off Analysis of Replacing Proprietary LLMs with Open Source SLMs\n in Production","summary":" Many companies rely on APIs of managed AI models such as OpenAI's GPT-4 to\ncreate AI-enabled experiences in their products. Along with the benefits of\nease of use and shortened time to production, this reliance on proprietary APIs\nhas downsides in terms of model control, performance reliability, up-time\npredictability, and cost. At the same time, there has been a flurry of open\nsource small language models (SLMs) that have been made available for\ncommercial use. However, their readiness to replace existing capabilities\nremains unclear, and a systematic approach to test these models is not readily\navailable. In this paper, we present a systematic evaluation methodology for,\nand characterization of, modern open source SLMs and their trade-offs when\nreplacing a proprietary LLM APIs for a real-world product feature. We have\ndesigned SLaM, an automated analysis tool that enables the quantitative and\nqualitative testing of product features utilizing arbitrary SLMs. Using SLaM,\nwe examine both the quality and the performance characteristics of modern SLMs\nrelative to an existing customer-facing OpenAI-based implementation. 
We find\nthat across 9 SLMs and 29 variants, we observe competitive quality-of-results\nfor our use case, significant performance consistency improvement, and a cost\nreduction of 5x-29x when compared to OpenAI GPT-4.\n","authors":["Chandra Irugalbandara","Ashish Mahendra","Roland Daynauth","Tharuka Kasthuri Arachchige","Krisztian Flautner","Lingjia Tang","Yiping Kang","Jason Mars"],"pdf_url":"https://arxiv.org/pdf/2312.14972v2.pdf","comment":"Updated title"},{"id":"http://arxiv.org/abs/2301.10260v2","updated":"2024-01-15T15:34:41Z","published":"2023-01-24T19:00:03Z","title":"Learned Interferometric Imaging for the SPIDER Instrument","summary":" The Segmented Planar Imaging Detector for Electro-Optical Reconnaissance\n(SPIDER) is an optical interferometric imaging device that aims to offer an\nalternative to the large space telescope designs of today with reduced size,\nweight and power consumption. This is achieved through interferometric imaging.\nState-of-the-art methods for reconstructing images from interferometric\nmeasurements adopt proximal optimization techniques, which are computationally\nexpensive and require handcrafted priors. In this work we present two\ndata-driven approaches for reconstructing images from measurements made by the\nSPIDER instrument. These approaches use deep learning to learn prior\ninformation from training data, increasing the reconstruction quality, and\nsignificantly reducing the computation time required to recover images by\norders of magnitude. Reconstruction time is reduced to ${\\sim} 10$\nmilliseconds, opening up the possibility of real-time imaging with SPIDER for\nthe first time. Furthermore, we show that these methods can also be applied in\ndomains where training data is scarce, such as astronomical imaging, by\nleveraging transfer learning from domains where plenty of training data are\navailable.\n","authors":["Matthijs Mars","Marta M. Betcke","Jason D. McEwen"],"pdf_url":"https://arxiv.org/pdf/2301.10260v2.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.07769v1","updated":"2024-01-15T15:27:24Z","published":"2024-01-15T15:27:24Z","title":"Deep Evolutional Instant Interest Network for CTR Prediction in\n Trigger-Induced Recommendation","summary":" The recommendation has been playing a key role in many industries, e.g.,\ne-commerce, streaming media, social media, etc. Recently, a new recommendation\nscenario, called Trigger-Induced Recommendation (TIR), where users are able to\nexplicitly express their instant interests via trigger items, is emerging as an\nessential role in many e-commerce platforms, e.g., Alibaba.com and Amazon.\nWithout explicitly modeling the user's instant interest, traditional\nrecommendation methods usually obtain sub-optimal results in TIR. Even though\nthere are a few methods considering the trigger and target items simultaneously\nto solve this problem, they still haven't taken into account temporal\ninformation of user behaviors, the dynamic change of user instant interest when\nthe user scrolls down and the interactions between the trigger and target\nitems. To tackle these problems, we propose a novel method -- Deep Evolutional\nInstant Interest Network (DEI2N), for click-through rate prediction in TIR\nscenarios. Specifically, we design a User Instant Interest Modeling Layer to\npredict the dynamic change of the intensity of instant interest when the user\nscrolls down. 
Temporal information is utilized in user behavior modeling.\nMoreover, an Interaction Layer is introduced to learn better interactions\nbetween the trigger and target items. We evaluate our method on several offline\nand real-world industrial datasets. Experimental results show that our proposed\nDEI2N outperforms state-of-the-art baselines. In addition, online A/B testing\ndemonstrates the superiority over the existing baseline in real-world\nproduction environments.\n","authors":["Zhibo Xiao","Luwei Yang","Tao Zhang","Wen Jiang","Wei Ning","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.07769v1.pdf","comment":"7 pages, 3 figures, reviewing of the 17th ACM International\n Conference on Web Search and Data Mining"},{"id":"http://arxiv.org/abs/2401.05982v3","updated":"2024-01-15T15:15:34Z","published":"2024-01-11T15:35:32Z","title":"A tree-based varying coefficient model","summary":" The paper introduces a tree-based varying coefficient model (VCM) where the\nvarying coefficients are modelled using the cyclic gradient boosting machine\n(CGBM) from Delong et al. (2023). Modelling the coefficient functions using a\nCGBM allows for dimension-wise early stopping and feature importance scores.\nThe dimension-wise early stopping not only reduces the risk of\ndimension-specific overfitting, but also reveals differences in model\ncomplexity across dimensions. The use of feature importance scores allows for\nsimple feature selection and easy model interpretation. The model is evaluated\non the same simulated and real data examples as those used in Richman and\nW\\\"uthrich (2023), and the results show that it produces results in terms of\nout of sample loss that are comparable to those of their neural network-based\nVCM called LocalGLMnet.\n","authors":["Henning Zakrisson","Mathias Lindholm"],"pdf_url":"https://arxiv.org/pdf/2401.05982v3.pdf","comment":"23 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.07756v1","updated":"2024-01-15T15:09:47Z","published":"2024-01-15T15:09:47Z","title":"Joint Probability Selection and Power Allocation for Federated Learning","summary":" In this paper, we study the performance of federated learning over wireless\nnetworks, where devices with a limited energy budget train a machine learning\nmodel. The federated learning performance depends on the selection of the\nclients participating in the learning at each round. Most existing studies\nsuggest deterministic approaches for the client selection, resulting in\nchallenging optimization problems that are usually solved using heuristics, and\ntherefore without guarantees on the quality of the final solution. We formulate\na new probabilistic approach to jointly select clients and allocate power\noptimally so that the expected number of participating clients is maximized. To\nsolve the problem, a new alternating algorithm is proposed, where at each step,\nthe closed-form solutions for user selection probabilities and power\nallocations are obtained. 
Our numerical results show that the proposed approach\nachieves a significant performance in terms of energy consumption, completion\ntime and accuracy as compared to the studied benchmarks.\n","authors":["Ouiame Marnissi","Hajar EL Hammouti","El Houcine Bergou"],"pdf_url":"https://arxiv.org/pdf/2401.07756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09734v3","updated":"2024-01-15T15:04:10Z","published":"2023-01-23T21:54:25Z","title":"Topological Learning in Multi-Class Data Sets","summary":" We specialize techniques from topological data analysis to the problem of\ncharacterizing the topological complexity (as defined in the body of the paper)\nof a multi-class data set. As a by-product, a topological classifier is defined\nthat uses an open sub-covering of the data set. This sub-covering can be used\nto construct a simplicial complex whose topological features (e.g., Betti\nnumbers) provide information about the classification problem. We use these\ntopological constructs to study the impact of topological complexity on\nlearning in feedforward deep neural networks (DNNs). We hypothesize that\ntopological complexity is negatively correlated with the ability of a fully\nconnected feedforward deep neural network to learn to classify data correctly.\nWe evaluate our topological classification algorithm on multiple constructed\nand open source data sets. We also validate our hypothesis regarding the\nrelationship between topological complexity and learning in DNN's on multiple\ndata sets.\n","authors":["Christopher Griffin","Trevor Karn","Benjamin Apple"],"pdf_url":"https://arxiv.org/pdf/2301.09734v3.pdf","comment":"16 pages, 18 figures. This is a revision of v2"},{"id":"http://arxiv.org/abs/2401.07744v1","updated":"2024-01-15T14:56:04Z","published":"2024-01-15T14:56:04Z","title":"Combining Machine Learning and Ontology: A Systematic Literature Review","summary":" Motivated by the desire to explore the process of combining inductive and\ndeductive reasoning, we conducted a systematic literature review of articles\nthat investigate the integration of machine learning and ontologies. The\nobjective was to identify diverse techniques that incorporate both inductive\nreasoning (performed by machine learning) and deductive reasoning (performed by\nontologies) into artificial intelligence systems. Our review, which included\nthe analysis of 128 studies, allowed us to identify three main categories of\nhybridization between machine learning and ontologies: learning-enhanced\nontologies, semantic data mining, and learning and reasoning systems. We\nprovide a comprehensive examination of all these categories, emphasizing the\nvarious machine learning algorithms utilized in the studies. Furthermore, we\ncompared our classification with similar recent work in the field of hybrid AI\nand neuro-symbolic approaches.\n","authors":["Sarah Ghidalia","Ouassila Labbani Narsis","Aurélie Bertaux","Christophe Nicolle"],"pdf_url":"https://arxiv.org/pdf/2401.07744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09373v2","updated":"2024-01-15T14:52:42Z","published":"2023-05-16T11:56:02Z","title":"Multi-task convolutional neural network for image aesthetic assessment","summary":" As people's aesthetic preferences for images are far from understood, image\naesthetic assessment is a challenging artificial intelligence task. The range\nof factors underlying this task is almost unlimited, but we know that some\naesthetic attributes affect those preferences. 
In this study, we present a\nmulti-task convolutional neural network that takes into account these\nattributes. The proposed neural network jointly learns the attributes along\nwith the overall aesthetic scores of images. This multi-task learning framework\nallows for effective generalization through the utilization of shared\nrepresentations. Our experiments demonstrate that the proposed method\noutperforms the state-of-the-art approaches in predicting overall aesthetic\nscores for images in one benchmark of image aesthetics. We achieve near-human\nperformance in terms of overall aesthetic scores when considering the\nSpearman's rank correlations. Moreover, our model pioneers the application of\nmulti-tasking in another benchmark, serving as a new baseline for future\nresearch. Notably, our approach achieves this performance while using fewer\nparameters compared to existing multi-task neural networks in the literature,\nand consequently makes our method more efficient in terms of computational\ncomplexity.\n","authors":["Derya Soydaner","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2305.09373v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11691v3","updated":"2024-01-15T14:46:57Z","published":"2022-11-21T18:09:03Z","title":"Deep Signature Algorithm for Multi-dimensional Path-Dependent Options","summary":" In this work, we study the deep signature algorithms for path-dependent\noptions. We extend the backward scheme in [Hur\\'e-Pham-Warin. Mathematics of\nComputation 89, no. 324 (2020)] for state-dependent FBSDEs with reflections to\npath-dependent FBSDEs with reflections, by adding the signature layer to the\nbackward scheme. Our algorithm applies to both European and American type\noption pricing problems while the payoff function depends on the whole paths of\nthe underlying forward stock process. We prove the convergence analysis of our\nnumerical algorithm with explicit dependence on the truncation order of the\nsignature and the neural network approximation errors. Numerical examples for\nthe algorithm are provided including: Amerasian option under the Black-Scholes\nmodel, American option with a path-dependent geometric mean payoff function,\nand the Shiryaev's optimal stopping problem.\n","authors":["Erhan Bayraktar","Qi Feng","Zhaoyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2211.11691v3.pdf","comment":"21 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.07733v1","updated":"2024-01-15T14:45:18Z","published":"2024-01-15T14:45:18Z","title":"Conformal Approach To Gaussian Process Surrogate Evaluation With\n Coverage Guarantees","summary":" Gaussian processes (GPs) are a Bayesian machine learning approach widely used\nto construct surrogate models for the uncertainty quantification of computer\nsimulation codes in industrial applications. It provides both a mean predictor\nand an estimate of the posterior prediction variance, the latter being used to\nproduce Bayesian credibility intervals. Interpreting these intervals relies on\nthe Gaussianity of the simulation model as well as the well-specification of\nthe priors which are not always appropriate. We propose to address this issue\nwith the help of conformal prediction. In the present work, a method for\nbuilding adaptive cross-conformal prediction intervals is proposed by weighting\nthe non-conformity score with the posterior standard deviation of the GP. 
The\nresulting conformal prediction intervals exhibit a level of adaptivity akin to\nBayesian credibility sets and display a significant correlation with the\nsurrogate model local approximation error, while being free from the underlying\nmodel assumptions and having frequentist coverage guarantees. These estimators\ncan thus be used for evaluating the quality of a GP surrogate model and can\nassist a decision-maker in the choice of the best prior for the specific\napplication of the GP. The performance of the method is illustrated through a\npanel of numerical examples based on various reference databases. Moreover, the\npotential applicability of the method is demonstrated in the context of\nsurrogate modeling of an expensive-to-evaluate simulator of the clogging\nphenomenon in steam generators of nuclear reactors.\n","authors":["Edgar Jaber","Vincent Blot","Nicolas Brunel","Vincent Chabridon","Emmanuel Remy","Bertrand Iooss","Didier Lucor","Mathilde Mougeot","Alessandro Leite"],"pdf_url":"https://arxiv.org/pdf/2401.07733v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16369v2","updated":"2024-01-15T14:40:23Z","published":"2023-09-28T12:13:23Z","title":"Bringing the Discussion of Minima Sharpness to the Audio Domain: a\n Filter-Normalised Evaluation for Acoustic Scene Classification","summary":" The correlation between the sharpness of loss minima and generalisation in\nthe context of deep neural networks has been subject to discussion for a long\ntime. Whilst mostly investigated in the context of selected benchmark data sets\nin the area of computer vision, we explore this aspect for the acoustic scene\nclassification task of the DCASE2020 challenge data. Our analysis is based on\ntwo-dimensional filter-normalised visualisations and a derived sharpness\nmeasure. Our exploratory analysis shows that sharper minima tend to show better\ngeneralisation than flat minima -even more so for out-of-domain data, recorded\nfrom previously unseen devices-, thus adding to the dispute about better\ngeneralisation capabilities of flat minima. We further find that, in\nparticular, the choice of optimisers is a main driver of the sharpness of\nminima and we discuss resulting limitations with respect to comparability. Our\ncode, trained model states and loss landscape visualisations are publicly\navailable.\n","authors":["Manuel Milling","Andreas Triantafyllopoulos","Iosif Tsangko","Simon David Noel Rampp","Björn Wolfgang Schuller"],"pdf_url":"https://arxiv.org/pdf/2309.16369v2.pdf","comment":"This work has been submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2401.06604v2","updated":"2024-01-15T14:39:10Z","published":"2024-01-12T14:40:55Z","title":"Identifying Policy Gradient Subspaces","summary":" Policy gradient methods hold great potential for solving complex continuous\ncontrol tasks. Still, their training efficiency can be improved by exploiting\nstructure within the optimization problem. Recent work indicates that\nsupervised learning can be accelerated by leveraging the fact that gradients\nlie in a low-dimensional and slowly-changing subspace. In this paper, we\nconduct a thorough evaluation of this phenomenon for two popular deep policy\ngradient methods on various simulated benchmark tasks. Our results demonstrate\nthe existence of such gradient subspaces despite the continuously changing data\ndistribution inherent to reinforcement learning. 
These findings reveal\npromising directions for future work on more efficient reinforcement learning,\ne.g., through improving parameter-space exploration or enabling second-order\noptimization.\n","authors":["Jan Schneider","Pierre Schumacher","Simon Guist","Le Chen","Daniel Häufle","Bernhard Schölkopf","Dieter Büchler"],"pdf_url":"https://arxiv.org/pdf/2401.06604v2.pdf","comment":"21 pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.00744v4","updated":"2024-01-15T14:31:50Z","published":"2024-01-01T12:57:15Z","title":"Harmonizing Covariance and Expressiveness for Deep Hamiltonian\n Regression in Crystalline Material Research: a Hybrid Cascaded Regression\n Framework","summary":" Deep learning for Hamiltonian regression of quantum systems in material\nresearch necessitates satisfying the covariance laws, among which achieving\nSO(3)-equivariance without sacrificing the expressiveness capability of\nnetworks remains an elusive challenge due to the restriction to non-linear\nmappings on guaranteeing theoretical equivariance. To alleviate the\ncovariance-expressiveness dilemma, we propose a hybrid framework with two\ncascaded regression stages. The first stage, i.e., a theoretically-guaranteed\ncovariant neural network modeling symmetry properties of 3D atom systems,\npredicts baseline Hamiltonians with theoretically covariant features extracted,\nassisting the second stage in learning covariance. Meanwhile, the second stage,\npowered by a non-linear 3D graph Transformer network we propose for structural\nmodeling of atomic systems, refines the first stage's output as a fine-grained\nprediction of Hamiltonians with better expressiveness capability. The\ncombination of a theoretically covariant yet inevitably less expressive model\nwith a highly expressive non-linear network enables precise, generalizable\npredictions while maintaining robust covariance under coordinate\ntransformations. Our method achieves state-of-the-art performance in\nHamiltonian prediction for electronic structure calculations, confirmed through\nexperiments on six crystalline material databases. The codes and configuration\nscripts are available in the supplementary material.\n","authors":["Shi Yin","Xinyang Pan","Xudong Zhu","Tianyu Gao","Haochong Zhang","Feng Wu","Lixin He"],"pdf_url":"https://arxiv.org/pdf/2401.00744v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07711v1","updated":"2024-01-15T14:27:03Z","published":"2024-01-15T14:27:03Z","title":"Efficient Nonparametric Tensor Decomposition for Binary and Count Data","summary":" In numerous applications, binary reactions or event counts are observed and\nstored within high-order tensors. Tensor decompositions (TDs) serve as a\npowerful tool to handle such high-dimensional and sparse data. However, many\ntraditional TDs are explicitly or implicitly designed based on the Gaussian\ndistribution, which is unsuitable for discrete data. Moreover, most TDs rely on\npredefined multi-linear structures, such as CP and Tucker formats. Therefore,\nthey may not be effective enough to handle complex real-world datasets. To\naddress these issues, we propose ENTED, an \\underline{E}fficient\n\\underline{N}onparametric \\underline{TE}nsor \\underline{D}ecomposition for\nbinary and count tensors. Specifically, we first employ a nonparametric\nGaussian process (GP) to replace traditional multi-linear structures. Next, we\nutilize the \\pg augmentation which provides a unified framework to establish\nconjugate models for binary and count distributions. 
Finally, to address the\ncomputational issue of GPs, we enhance the model by incorporating sparse\northogonal variational inference of inducing points, which offers a more\neffective covariance approximation within GPs and stochastic natural gradient\nupdates for nonparametric models. We evaluate our model on several real-world\ntensor completion tasks, considering binary and count datasets. The results\nmanifest both better performance and computational advantages of the proposed\nmodel.\n","authors":["Zerui Tao","Toshihisa Tanaka","Qibin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.07711v1.pdf","comment":"AAAI-24"},{"id":"http://arxiv.org/abs/2401.07710v1","updated":"2024-01-15T14:26:44Z","published":"2024-01-15T14:26:44Z","title":"Go-Explore for Residential Energy Management","summary":" Reinforcement learning is commonly applied in residential energy management,\nparticularly for optimizing energy costs. However, RL agents often face\nchallenges when dealing with deceptive and sparse rewards in the energy control\ndomain, especially with stochastic rewards. In such situations, thorough\nexploration becomes crucial for learning an optimal policy. Unfortunately, the\nexploration mechanism can be misled by deceptive reward signals, making\nthorough exploration difficult. Go-Explore is a family of algorithms which\ncombines planning methods and reinforcement learning methods to achieve\nefficient exploration. We use the Go-Explore algorithm to solve the cost-saving\ntask in residential energy management problems and achieve an improvement of up\nto 19.84\\% compared to the well-known reinforcement learning algorithms.\n","authors":["Junlin Lu","Patrick Mannion","Karl Mason"],"pdf_url":"https://arxiv.org/pdf/2401.07710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09677v2","updated":"2024-01-15T14:17:47Z","published":"2023-09-18T11:30:58Z","title":"Single and Few-step Diffusion for Generative Speech Enhancement","summary":" Diffusion models have shown promising results in speech enhancement, using a\ntask-adapted diffusion process for the conditional generation of clean speech\ngiven a noisy mixture. However, at test time, the neural network used for score\nestimation is called multiple times to solve the iterative reverse process.\nThis results in a slow inference process and causes discretization errors that\naccumulate over the sampling trajectory. In this paper, we address these\nlimitations through a two-stage training approach. In the first stage, we train\nthe diffusion model the usual way using the generative denoising score matching\nloss. In the second stage, we compute the enhanced signal by solving the\nreverse process and compare the resulting estimate to the clean speech target\nusing a predictive loss. We show that using this second training stage enables\nachieving the same performance as the baseline model using only 5 function\nevaluations instead of 60 function evaluations. While the performance of usual\ngenerative diffusion algorithms drops dramatically when lowering the number of\nfunction evaluations (NFEs) to obtain single-step diffusion, we show that our\nproposed method keeps a steady performance and therefore largely outperforms\nthe diffusion baseline in this setting and also generalizes better than its\npredictive counterpart.\n","authors":["Bunlong Lay","Jean-Marie Lemercier","Julius Richter","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2309.09677v2.pdf","comment":"copyright 2023 IEEE. 
Personal use of this material is permitted.\n Permission from IEEE must be obtained for all other uses, in any current or\n future media, including reprinting/republishing this material for advertising\n or promotional purposes, creating new collective works, for resale or\n redistribution to servers or lists, or reuse of any copyrighted component of\n this work in other works"},{"id":"http://arxiv.org/abs/2401.07697v1","updated":"2024-01-15T14:14:16Z","published":"2024-01-15T14:14:16Z","title":"Data vs. Model Machine Learning Fairness Testing: An Empirical Study","summary":" Although several fairness definitions and bias mitigation techniques exist in\nthe literature, all existing solutions evaluate fairness of Machine Learning\n(ML) systems after the training stage. In this paper, we take the first steps\ntowards evaluating a more holistic approach by testing for fairness both before\nand after model training. We evaluate the effectiveness of the proposed\napproach and position it within the ML development lifecycle, using an\nempirical analysis of the relationship between model dependent and independent\nfairness metrics. The study uses 2 fairness metrics, 4 ML algorithms, 5\nreal-world datasets and 1600 fairness evaluation cycles. We find a linear\nrelationship between data and model fairness metrics when the distribution and\nthe size of the training data changes. Our results indicate that testing for\nfairness prior to training can be a ``cheap'' and effective means of catching a\nbiased data collection process early; detecting data drifts in production\nsystems and minimising execution of full training cycles thus reducing\ndevelopment time and costs.\n","authors":["Arumoy Shome","Luis Cruz","Arie van Deursen"],"pdf_url":"https://arxiv.org/pdf/2401.07697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16900v4","updated":"2024-01-15T14:07:55Z","published":"2023-08-31T17:58:28Z","title":"Learning to Taste: A Multimodal Wine Dataset","summary":" We present WineSensed, a large multimodal wine dataset for studying the\nrelations between visual perception, language, and flavor. The dataset\nencompasses 897k images of wine labels and 824k reviews of wines curated from\nthe Vivino platform. It has over 350k unique bottlings, annotated with year,\nregion, rating, alcohol percentage, price, and grape composition. We obtained\nfine-grained flavor annotations on a subset by conducting a wine-tasting\nexperiment with 256 participants who were asked to rank wines based on their\nsimilarity in flavor, resulting in more than 5k pairwise flavor distances. We\npropose a low-dimensional concept embedding algorithm that combines human\nexperience with automatic machine similarity kernels. We demonstrate that this\nshared concept embedding space improves upon separate embedding spaces for\ncoarse flavor classification (alcohol percentage, country, grape, price,\nrating) and aligns with the intricate human perception of flavor.\n","authors":["Thoranna Bender","Simon Moe Sørensen","Alireza Kashani","K. Eldjarn Hjorleifsson","Grethe Hyldig","Søren Hauberg","Serge Belongie","Frederik Warburg"],"pdf_url":"https://arxiv.org/pdf/2308.16900v4.pdf","comment":"Accepted to NeurIPS 2023. 
See project page:\n https://thoranna.github.io/learning_to_taste/"},{"id":"http://arxiv.org/abs/2401.07694v1","updated":"2024-01-15T14:04:50Z","published":"2024-01-15T14:04:50Z","title":"Stochastic optimization with arbitrary recurrent data sampling","summary":" For obtaining optimal first-order convergence guarantee for stochastic\noptimization, it is necessary to use a recurrent data sampling algorithm that\nsamples every data point with sufficient frequency. Most commonly used data\nsampling algorithms (e.g., i.i.d., MCMC, random reshuffling) are indeed\nrecurrent under mild assumptions. In this work, we show that for a particular\nclass of stochastic optimization algorithms, we do not need any other property\n(e.g., independence, exponential mixing, and reshuffling) than recurrence in\ndata sampling algorithms to guarantee the optimal rate of first-order\nconvergence. Namely, using regularized versions of Minimization by Incremental\nSurrogate Optimization (MISO), we show that for non-convex and possibly\nnon-smooth objective functions, the expected optimality gap converges at an\noptimal rate $O(n^{-1/2})$ under general recurrent sampling schemes.\nFurthermore, the implied constant depends explicitly on the `speed of\nrecurrence', measured by the expected amount of time to visit a given data\npoint either averaged (`target time') or supremized (`hitting time') over the\ncurrent location. We demonstrate theoretically and empirically that convergence\ncan be accelerated by selecting sampling algorithms that cover the data set\nmost effectively. We discuss applications of our general framework to\ndecentralized optimization and distributed non-negative matrix factorization.\n","authors":["William G. Powell","Hanbaek Lyu"],"pdf_url":"https://arxiv.org/pdf/2401.07694v1.pdf","comment":"41 pages, 3 figures, 1 table"},{"id":"http://arxiv.org/abs/2306.03072v3","updated":"2024-01-15T13:58:51Z","published":"2023-06-05T17:49:43Z","title":"Explore to Generalize in Zero-Shot RL","summary":" We study zero-shot generalization in reinforcement learning-optimizing a\npolicy on a set of training tasks to perform well on a similar but unseen test\ntask. To mitigate overfitting, previous work explored different notions of\ninvariance to the task. However, on problems such as the ProcGen Maze, an\nadequate solution that is invariant to the task visualization does not exist,\nand therefore invariance-based approaches fail. Our insight is that learning a\npolicy that effectively $\\textit{explores}$ the domain is harder to memorize\nthan a policy that maximizes reward for a specific task, and therefore we\nexpect such learned behavior to generalize well; we indeed demonstrate this\nempirically on several domains that are difficult for invariance-based\napproaches. Our $\\textit{Explore to Generalize}$ algorithm (ExpGen) builds on\nthis insight: we train an additional ensemble of agents that optimize reward.\nAt test time, either the ensemble agrees on an action, and we generalize well,\nor we take exploratory actions, which generalize well and drive us to a novel\npart of the state space, where the ensemble may potentially agree again. We\nshow that our approach is the state-of-the-art on tasks of the ProcGen\nchallenge that have thus far eluded effective generalization, yielding a\nsuccess rate of $83\\%$ on the Maze task and $74\\%$ on Heist with $200$ training\nlevels. 
ExpGen can also be combined with an invariance based approach to gain\nthe best of both worlds, setting new state-of-the-art results on ProcGen.\n","authors":["Ev Zisselman","Itai Lavie","Daniel Soudry","Aviv Tamar"],"pdf_url":"https://arxiv.org/pdf/2306.03072v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01326v2","updated":"2024-01-15T13:39:38Z","published":"2024-01-02T18:32:14Z","title":"An Autoregressive Text-to-Graph Framework for Joint Entity and Relation\n Extraction","summary":" In this paper, we propose a novel method for joint entity and relation\nextraction from unstructured text by framing it as a conditional sequence\ngeneration problem. In contrast to conventional generative information\nextraction models that are left-to-right token-level generators, our approach\nis \\textit{span-based}. It generates a linearized graph where nodes represent\ntext spans and edges represent relation triplets. Our method employs a\ntransformer encoder-decoder architecture with pointing mechanism on a dynamic\nvocabulary of spans and relation types. Our model can capture the structural\ncharacteristics and boundaries of entities and relations through span\nrepresentations while simultaneously grounding the generated output in the\noriginal text thanks to the pointing mechanism. Evaluation on benchmark\ndatasets validates the effectiveness of our approach, demonstrating competitive\nresults. Code is available at https://github.com/urchade/ATG.\n","authors":["Urchade Zaratiana","Nadi Tomeh","Pierre Holat","Thierry Charnois"],"pdf_url":"https://arxiv.org/pdf/2401.01326v2.pdf","comment":"AAAI 2024 (camera ready version)"},{"id":"http://arxiv.org/abs/2401.07671v1","updated":"2024-01-15T13:35:21Z","published":"2024-01-15T13:35:21Z","title":"CLSA-CIM: A Cross-Layer Scheduling Approach for Computing-in-Memory\n Architectures","summary":" The demand for efficient machine learning (ML) accelerators is growing\nrapidly, driving the development of novel computing concepts such as resistive\nrandom access memory (RRAM)-based tiled computing-in-memory (CIM)\narchitectures. CIM allows to compute within the memory unit, resulting in\nfaster data processing and reduced power consumption. Efficient compiler\nalgorithms are essential to exploit the potential of tiled CIM architectures.\nWhile conventional ML compilers focus on code generation for CPUs, GPUs, and\nother von Neumann architectures, adaptations are needed to cover CIM\narchitectures. Cross-layer scheduling is a promising approach, as it enhances\nthe utilization of CIM cores, thereby accelerating computations. Although\nsimilar concepts are implicitly used in previous work, there is a lack of clear\nand quantifiable algorithmic definitions for cross-layer scheduling for tiled\nCIM architectures. To close this gap, we present CLSA-CIM, a cross-layer\nscheduling algorithm for tiled CIM architectures. We integrate CLSA-CIM with\nexisting weight-mapping strategies and compare performance against\nstate-of-the-art (SOTA) scheduling algorithms. 
CLSA-CIM improves the\nutilization by up to 17.9 x , resulting in an overall speedup increase of up to\n29.2 x compared to SOTA.\n","authors":["Rebecca Pelke","Jose Cubero-Cascante","Nils Bosbach","Felix Staudigl","Rainer Leupers","Jan Moritz Joseph"],"pdf_url":"https://arxiv.org/pdf/2401.07671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17503v4","updated":"2024-01-15T13:12:36Z","published":"2023-03-29T02:41:23Z","title":"Pgx: Hardware-Accelerated Parallel Game Simulators for Reinforcement\n Learning","summary":" We propose Pgx, a suite of board game reinforcement learning (RL)\nenvironments written in JAX and optimized for GPU/TPU accelerators. By\nleveraging JAX's auto-vectorization and parallelization over accelerators, Pgx\ncan efficiently scale to thousands of simultaneous simulations over\naccelerators. In our experiments on a DGX-A100 workstation, we discovered that\nPgx can simulate RL environments 10-100x faster than existing implementations\navailable in Python. Pgx includes RL environments commonly used as benchmarks\nin RL research, such as backgammon, chess, shogi, and Go. Additionally, Pgx\noffers miniature game sets and baseline models to facilitate rapid research\ncycles. We demonstrate the efficient training of the Gumbel AlphaZero algorithm\nwith Pgx environments. Overall, Pgx provides high-performance environment\nsimulators for researchers to accelerate their RL experiments. Pgx is available\nat http://github.com/sotetsuk/pgx.\n","authors":["Sotetsu Koyamada","Shinri Okano","Soichiro Nishimori","Yu Murata","Keigo Habara","Haruka Kita","Shin Ishii"],"pdf_url":"https://arxiv.org/pdf/2303.17503v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02810v2","updated":"2024-01-15T13:10:12Z","published":"2024-01-05T13:45:08Z","title":"Physics-Informed Neural Networks for High-Frequency and Multi-Scale\n Problems using Transfer Learning","summary":" Physics-informed neural network (PINN) is a data-driven solver for partial\nand ordinary differential equations(ODEs/PDEs). It provides a unified framework\nto address both forward and inverse problems. However, the complexity of the\nobjective function often leads to training failures. This issue is particularly\nprominent when solving high-frequency and multi-scale problems. We proposed\nusing transfer learning to boost the robustness and convergence of training\nPINN, starting training from low-frequency problems and gradually approaching\nhigh-frequency problems. Through two case studies, we discovered that transfer\nlearning can effectively train PINN to approximate solutions from low-frequency\nproblems to high-frequency problems without increasing network parameters.\nFurthermore, it requires fewer data points and less training time. We\nelaborately described our training strategy, including optimizer selection, and\nsuggested guidelines for using transfer learning to train neural networks for\nsolving more complex problems.\n","authors":["Abdul Hannan Mustajab","Hao Lyu","Zarghaam Rizvi","Frank Wuttke"],"pdf_url":"https://arxiv.org/pdf/2401.02810v2.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2305.19706v3","updated":"2024-01-15T13:06:10Z","published":"2023-05-31T10:03:04Z","title":"Necessary and Sufficient Conditions for Optimal Decision Trees using\n Dynamic Programming","summary":" Global optimization of decision trees has shown to be promising in terms of\naccuracy, size, and consequently human comprehensibility. 
However, many of the\nmethods used rely on general-purpose solvers for which scalability remains an\nissue. Dynamic programming methods have been shown to scale much better because\nthey exploit the tree structure by solving subtrees as independent subproblems.\nHowever, this only works when an objective can be optimized separately for\nsubtrees. We explore this relationship in detail and show the necessary and\nsufficient conditions for such separability and generalize previous dynamic\nprogramming approaches into a framework that can optimize any combination of\nseparable objectives and constraints. Experiments on five application domains\nshow the general applicability of this framework, while outperforming the\nscalability of general-purpose solvers by a large margin.\n","authors":["Jacobus G. M. van der Linden","Mathijs M. de Weerdt","Emir Demirović"],"pdf_url":"https://arxiv.org/pdf/2305.19706v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.09666v2","updated":"2024-01-15T12:54:05Z","published":"2023-06-16T07:45:32Z","title":"A Smooth Binary Mechanism for Efficient Private Continual Observation","summary":" In privacy under continual observation we study how to release differentially\nprivate estimates based on a dataset that evolves over time. The problem of\nreleasing private prefix sums of $x_1,x_2,x_3,\\dots \\in\\{0,1\\}$ (where the\nvalue of each $x_i$ is to be private) is particularly well-studied, and a\ngeneralized form is used in state-of-the-art methods for private stochastic\ngradient descent (SGD). The seminal binary mechanism privately releases the\nfirst $t$ prefix sums with noise of variance polylogarithmic in $t$. Recently,\nHenzinger et al. and Denisov et al. showed that it is possible to improve on\nthe binary mechanism in two ways: The variance of the noise can be reduced by a\n(large) constant factor, and also made more even across time steps. However,\ntheir algorithms for generating the noise distribution are not as efficient as\none would like in terms of computation time and (in particular) space. We\naddress the efficiency problem by presenting a simple alternative to the binary\nmechanism in which 1) generating the noise takes constant average time per\nvalue, 2) the variance is reduced by a factor about 4 compared to the binary\nmechanism, and 3) the noise distribution at each step is identical.\nEmpirically, a simple Python implementation of our approach outperforms the\nrunning time of the approach of Henzinger et al., as well as an attempt to\nimprove their algorithm using high-performance algorithms for multiplication\nwith Toeplitz matrices.\n","authors":["Joel Daniel Andersson","Rasmus Pagh"],"pdf_url":"https://arxiv.org/pdf/2306.09666v2.pdf","comment":"Appeared at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.07657v1","updated":"2024-01-15T12:53:58Z","published":"2024-01-15T12:53:58Z","title":"Empirical Evidence for the Fragment level Understanding on Drug\n Molecular Structure of LLMs","summary":" AI for drug discovery has been a research hotspot in recent years, and\nSMILES-based language models has been increasingly applied in drug molecular\ndesign. However, no work has explored whether and how language models\nunderstand the chemical spatial structure from 1D sequences. In this work, we\npre-train a transformer model on chemical language and fine-tune it toward drug\ndesign objectives, and investigate the correspondence between high-frequency\nSMILES substrings and molecular fragments. 
The results indicate that language\nmodels can understand chemical structures from the perspective of molecular\nfragments, and the structural knowledge learned through fine-tuning is\nreflected in the high-frequency SMILES substrings generated by the model.\n","authors":["Xiuyuan Hu","Guoqing Liu","Yang Zhao","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07657v1.pdf","comment":"Accepted by AAAI 2024 workshop: Large Language Models for Biological\n Discoveries (LLMs4Bio)"},{"id":"http://arxiv.org/abs/2401.07656v1","updated":"2024-01-15T12:52:56Z","published":"2024-01-15T12:52:56Z","title":"Learning Explainable and Better Performing Representations of POMDP\n Strategies","summary":" Strategies for partially observable Markov decision processes (POMDP)\ntypically require memory. One way to represent this memory is via automata. We\npresent a method to learn an automaton representation of a strategy using the\nL*-algorithm. Compared to the tabular representation of a strategy, the\nresulting automaton is dramatically smaller and thus also more explainable.\nMoreover, in the learning process, our heuristics may even improve the\nstrategy's performance. In contrast to approaches that synthesize an automaton\ndirectly from the POMDP thereby solving it, our approach is incomparably more\nscalable.\n","authors":["Alexander Bork","Debraj Chakraborty","Kush Grover","Jan Kretinsky","Stefanie Mohr"],"pdf_url":"https://arxiv.org/pdf/2401.07656v1.pdf","comment":"Technical report for the submission to TACAS 24"},{"id":"http://arxiv.org/abs/2401.07655v1","updated":"2024-01-15T12:51:13Z","published":"2024-01-15T12:51:13Z","title":"MLAD: A Unified Model for Multi-system Log Anomaly Detection","summary":" In spite of the rapid advancements in unsupervised log anomaly detection\ntechniques, the current mainstream models still necessitate specific training\nfor individual system datasets, resulting in costly procedures and limited\nscalability due to dataset size, thereby leading to performance bottlenecks.\nFurthermore, numerous models lack cognitive reasoning capabilities, posing\nchallenges in direct transferability to similar systems for effective anomaly\ndetection. Additionally, akin to reconstruction networks, these models often\nencounter the \"identical shortcut\" predicament, wherein the majority of system\nlogs are classified as normal, erroneously predicting normal classes when\nconfronted with rare anomaly logs due to reconstruction errors.\n To address the aforementioned issues, we propose MLAD, a novel anomaly\ndetection model that incorporates semantic relational reasoning across multiple\nsystems. Specifically, we employ Sentence-bert to capture the similarities\nbetween log sequences and convert them into highly-dimensional learnable\nsemantic vectors. Subsequently, we revamp the formulas of the Attention layer\nto discern the significance of each keyword in the sequence and model the\noverall distribution of the multi-system dataset through appropriate vector\nspace diffusion. 
Lastly, we employ a Gaussian mixture model to highlight the\nuncertainty of rare words pertaining to the \"identical shortcut\" problem,\noptimizing the vector space of the samples using the maximum expectation model.\nExperiments on three real-world datasets demonstrate the superiority of MLAD.\n","authors":["Runqiang Zang","Hongcheng Guo","Jian Yang","Jiaheng Liu","Zhoujun Li","Tieqiao Zheng","Xu Shi","Liangfan Zheng","Bo Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07646v1","updated":"2024-01-15T12:42:15Z","published":"2024-01-15T12:42:15Z","title":"Multifractal-spectral features enhance classification of anomalous\n diffusion","summary":" Anomalous diffusion processes pose a unique challenge in classification and\ncharacterization. Previously (Mangalam et al., 2023, Physical Review Research\n5, 023144), we established a framework for understanding anomalous diffusion\nusing multifractal formalism. The present study delves into the potential of\nmultifractal spectral features for effectively distinguishing anomalous\ndiffusion trajectories from five widely used models: fractional Brownian\nmotion, scaled Brownian motion, continuous time random walk, annealed transient\ntime motion, and L\\'evy walk. To accomplish this, we generate extensive\ndatasets comprising $10^6$ trajectories from these five anomalous diffusion\nmodels and extract multiple multifractal spectra from each trajectory. Our\ninvestigation entails a thorough analysis of neural network performance,\nencompassing features derived from varying numbers of spectra. Furthermore, we\nexplore the integration of multifractal spectra into traditional feature\ndatasets, enabling us to assess their impact comprehensively. To ensure a\nstatistically meaningful comparison, we categorize features into concept groups\nand train neural networks using features from each designated group. Notably,\nseveral feature groups demonstrate similar levels of accuracy, with the highest\nperformance observed in groups utilizing moving-window characteristics and\n$p$-variation features. Multifractal spectral features, particularly those\nderived from three spectra involving different timescales and cutoffs, closely\nfollow, highlighting their robust discriminatory potential. Remarkably, a\nneural network exclusively trained on features from a single multifractal\nspectrum exhibits commendable performance, surpassing other feature groups. Our\nfindings underscore the diverse and potent efficacy of multifractal spectral\nfeatures in enhancing classification of anomalous diffusion.\n","authors":["Henrik Seckler","Ralf Metzler","Damian G. Kelty-Stephen","Madhur Mangalam"],"pdf_url":"https://arxiv.org/pdf/2401.07646v1.pdf","comment":"23 pages, 6 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.07942v1","updated":"2024-01-15T20:09:56Z","published":"2024-01-15T20:09:56Z","title":"Transformer-based Video Saliency Prediction with High Temporal Dimension\n Decoding","summary":" In recent years, finding an effective and efficient strategy for exploiting\nspatial and temporal information has been a hot research topic in video\nsaliency prediction (VSP). With the emergence of spatio-temporal transformers,\nthe weakness of the prior strategies, e.g., 3D convolutional networks and\nLSTM-based networks, for capturing long-range dependencies has been effectively\ncompensated. 
While VSP has drawn benefits from spatio-temporal transformers,\nfinding the most effective way for aggregating temporal features is still\nchallenging. To address this concern, we propose a transformer-based video\nsaliency prediction approach with high temporal dimension decoding network\n(THTD-Net). This strategy accounts for the lack of complex hierarchical\ninteractions between features that are extracted from the transformer-based\nspatio-temporal encoder: in particular, it does not require multiple decoders\nand aims at gradually reducing temporal features' dimensions in the decoder.\nThis decoder-based architecture yields comparable performance to multi-branch\nand over-complicated models on common benchmarks such as DHF1K, UCF-sports and\nHollywood-2.\n","authors":["Morteza Moradi","Simone Palazzo","Concetto Spampinato"],"pdf_url":"https://arxiv.org/pdf/2401.07942v1.pdf","comment":"8 pages, 2 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.12401v3","updated":"2024-01-15T07:32:28Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-level Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships contain many complicated noises outside the\nsegment level, making it infeasible to directly express macro action semantics.\nThus, we propose Causal Abstraction Segmentation Refiner (CASR), which can\nrefine TAS results from various models by enhancing video causality in\nmarginalizing frame-level causal relationships. Specifically, we define the\nequivalent frame-level causal model and segment-level causal model, so that the\ncausal adjacency matrix constructed from marginalized frame-level causal\nrelationships has the ability to represent the segment-level causal\nrelationships. CASR works by reducing the difference in the causal adjacency\nmatrix between the one we constructed and the pre-segmentation results of\nbackbone models. In addition, we propose a novel evaluation metric Causal Edit\nDistance (CED) to evaluate the causal interpretability. Extensive experimental\nresults on mainstream datasets indicate that CASR significantly surpasses\nvarious existing methods in action segmentation performance, as well as in\ncausal explainability and generalization.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12798v3","updated":"2024-01-15T03:02:01Z","published":"2023-10-19T14:52:58Z","title":"MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and\n Uni-Modal Adapter","summary":" Language Models (LMs) have demonstrated impressive molecule understanding\nability on various 1D text-related tasks. However, they inherently lack 2D\ngraph perception - a critical ability of human professionals in comprehending\nmolecules' topological structures. To bridge this gap, we propose MolCA:\nMolecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal\nAdapter. MolCA enables an LM (e.g., Galactica) to understand both text- and\ngraph-based molecular contents via the cross-modal projector. Specifically, the\ncross-modal projector is implemented as a Q-Former to connect a graph encoder's\nrepresentation space and an LM's text space.
Further, MolCA employs a uni-modal\nadapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks.\nUnlike previous studies that couple an LM with a graph encoder via cross-modal\ncontrastive learning, MolCA retains the LM's ability of open-ended text\ngeneration and augments it with 2D graph information. To showcase its\neffectiveness, we extensively benchmark MolCA on tasks of molecule captioning,\nIUPAC name prediction, and molecule-text retrieval, on which MolCA\nsignificantly outperforms the baselines. Our codes and checkpoints can be found\nat https://github.com/acharkq/MolCA.\n","authors":["Zhiyuan Liu","Sihang Li","Yanchen Luo","Hao Fei","Yixin Cao","Kenji Kawaguchi","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.12798v3.pdf","comment":"EMNLP main conference. 9 pages"},{"id":"http://arxiv.org/abs/2401.07411v1","updated":"2024-01-15T01:35:42Z","published":"2024-01-15T01:35:42Z","title":"Startup Delay Aware Short Video Ordering: Problem, Model, and A\n Reinforcement Learning based Algorithm","summary":" Short video applications have attracted billions of users on the Internet and\ncan satisfy diverse users' fragmented spare time with content-rich and\nduration-short videos. To achieve fast playback at user side, existing short\nvideo systems typically enforce burst transmission of initial segment of each\nvideo when being requested for improved quality of user experiences. However,\nsuch a way of burst transmissions can cause unexpected large startup delays at\nuser side. This is because users may frequently switch videos when sequentially\nwatching a list of short videos recommended by the server side, which can cause\nexcessive burst transmissions of initial segments of different short videos and\nthus quickly deplete the network transmission capacity. In this paper, we adopt\ntoken bucket to characterize the video transmission path between video server\nand each user, and accordingly study how to effectively reduce the startup\ndelay of short videos by effectively arranging the viewing order of a video\nlist at the server side. We formulate the optimal video ordering problem for\nminimizing the maximum video startup delay as a combinatorial optimization\nproblem and prove its NP-hardness. We accordingly propose a Partially Shared\nActor Critic reinforcement learning algorithm (PSAC) to learn optimized video\nordering strategy. Numerical results based on a real dataset provided by a\nlarge-scale short video service provider demonstrate that the proposed PSAC\nalgorithm can significantly reduce the video startup delay compared to baseline\nalgorithms.\n","authors":["Zhipeng Gao","Chunxi Li","Yongxiang Zhao","Baoxian Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.07411v1.pdf","comment":null}]},"2024-01-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.09407v1","updated":"2024-01-17T18:45:13Z","published":"2024-01-17T18:45:13Z","title":"Deciphering Textual Authenticity: A Generalized Strategy through the\n Lens of Large Language Semantics for Detecting Human vs. Machine-Generated\n Text","summary":" With the recent proliferation of Large Language Models (LLMs), there has been\nan increasing demand for tools to detect machine-generated text. 
The effective\ndetection of machine-generated text faces two pertinent problems: First, existing\napproaches are severely limited in generalizing against real-world scenarios, where\nmachine-generated text is produced by a variety of generators, including but\nnot limited to GPT-4 and Dolly, and spans diverse domains, ranging from\nacademic manuscripts to social media posts. Second, existing detection\nmethodologies treat texts produced by LLMs through a restrictive binary\nclassification lens, neglecting the nuanced diversity of artifacts generated by\ndifferent LLMs. In this work, we undertake a systematic study on the detection\nof machine-generated text in real-world scenarios. We first study the\neffectiveness of state-of-the-art approaches and find that they are severely\nlimited against text produced by diverse generators and domains in the real\nworld. Furthermore, t-SNE visualizations of the embeddings from a pretrained\nLLM's encoder show that they cannot reliably distinguish between human and\nmachine-generated text. Based on our findings, we introduce a novel system,\nT5LLMCipher, for detecting machine-generated text using a pretrained T5 encoder\ncombined with LLM embedding sub-clustering to address the text produced by\ndiverse generators and domains in the real world. We evaluate our approach\nacross 9 machine-generated text systems and 9 domains and find that our\napproach provides state-of-the-art generalization ability, with an average\nincrease in F1 score on machine-generated text of 19.6\\% on unseen generators\nand domains compared to the top-performing existing approaches, and correctly\nattributes the generator of text with an accuracy of 93.6\\%.\n","authors":["Mazal Bethany","Brandon Wherry","Emet Bethany","Nishant Vishwamitra","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2401.09407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09395v1","updated":"2024-01-17T18:13:07Z","published":"2024-01-17T18:13:07Z","title":"Stuck in the Quicksand of Numeracy, Far from AGI Summit: Evaluating\n LLMs' Mathematical Competency through Ontology-guided Perturbations","summary":" Recent advancements in Large Language Models (LLMs) have showcased striking\nresults on existing logical reasoning benchmarks, with some models even\nsurpassing human performance. However, the true depth of their competencies and\nrobustness, in mathematical reasoning tasks, remains an open question. In\nresponse, we develop (i) an ontology of perturbations of maths questions, (ii)\na semi-automatic method of perturbation, and (iii) a dataset of perturbed maths\nquestions to probe the limits of LLM capabilities in mathematical reasoning\ntasks. These controlled perturbations span across multiple fine dimensions of\nthe structural and representational aspects of maths questions. Using GPT-4, we\ngenerated the MORE dataset by perturbing five randomly selected seed questions\nfrom GSM8K. This process was guided by our ontology and involved a thorough\nautomatic and manual filtering process, yielding a set of 216 maths problems.\nWe conducted a comprehensive evaluation of both closed-source and open-source\nLLMs on MORE. The results show a significant performance drop across all the\nmodels against the perturbed questions. This strongly suggests that current\nLLMs lack robust mathematical skills and deep reasoning abilities. This\nresearch not only identifies multiple gaps in the capabilities of current\nmodels, but also highlights multiple potential directions for future\ndevelopment.
Our dataset will be made publicly available at\nhttps://huggingface.co/datasets/declare-lab/GSM8k_MORE.\n","authors":["Pengfei Hong","Deepanway Ghosal","Navonil Majumder","Somak Aditya","Rada Mihalcea","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2401.09395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05268v2","updated":"2024-01-17T17:57:24Z","published":"2024-01-10T16:57:24Z","title":"AUTOACT: Automatic Agent Learning from Scratch via Self-Planning","summary":" Language agents have achieved considerable performance on various complex\ntasks. Despite the incessant exploration in this field, existing language agent\nsystems still struggle with costly, non-reproducible data reliance and face the\nchallenge of compelling a single model for multiple functions. To this end, we\nintroduce AutoAct, an automatic agent learning framework that does not rely on\nlarge-scale annotated data and synthetic trajectories from closed-source models\n(e.g., GPT-4). Given limited data with a tool library, AutoAct first\nautomatically synthesizes planning trajectories without any assistance from\nhumans or strong closed-source models. Then, AutoAct leverages a\ndivision-of-labor strategy to automatically differentiate based on the target\ntask information and synthesized trajectories, producing a sub-agent group to\ncomplete the task. We conduct comprehensive experiments with different LLMs,\nwhich demonstrates that AutoAct yields better or parallel performance compared\nto various strong baselines. We even notice that AutoAct, when using the\nLlama-2-13b model, can achieve performance comparable to that of the zero-shot\nGPT-3.5-Turbo agent. Code will be available at\nhttps://github.com/zjunlp/AutoAct.\n","authors":["Shuofei Qiao","Ningyu Zhang","Runnan Fang","Yujie Luo","Wangchunshu Zhou","Yuchen Eleanor Jiang","Chengfei Lv","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2401.05268v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.14517v2","updated":"2024-01-17T17:41:18Z","published":"2023-09-25T20:23:51Z","title":"Watch Your Language: Investigating Content Moderation with Large\n Language Models","summary":" Large language models (LLMs) have exploded in popularity due to their ability\nto perform a wide array of natural language tasks. Text-based content\nmoderation is one LLM use case that has received recent enthusiasm, however,\nthere is little research investigating how LLMs perform in content moderation\nsettings. In this work, we evaluate a suite of commodity LLMs on two common\ncontent moderation tasks: rule-based community moderation and toxic content\ndetection. For rule-based community moderation, we instantiate 95 subcommunity\nspecific LLMs by prompting GPT-3.5 with rules from 95 Reddit subcommunities. We\nfind that GPT-3.5 is effective at rule-based moderation for many communities,\nachieving a median accuracy of 64% and a median precision of 83%. For toxicity\ndetection, we evaluate a suite of commodity LLMs (GPT-3, GPT-3.5, GPT-4, Gemini\nPro, LLAMA 2) and show that LLMs significantly outperform currently widespread\ntoxicity classifiers. However, recent increases in model size add only marginal\nbenefit to toxicity detection, suggesting a potential performance plateau for\nLLMs on toxicity detection tasks. 
We conclude by outlining avenues for future\nwork in studying LLMs and content moderation.\n","authors":["Deepak Kumar","Yousef AbuHashem","Zakir Durumeric"],"pdf_url":"https://arxiv.org/pdf/2309.14517v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06855v2","updated":"2024-01-17T17:23:20Z","published":"2024-01-12T19:02:48Z","title":"Fine-grained Hallucination Detection and Editing for Language Models","summary":" Large language models (LMs) are prone to generate diverse factually incorrect\nstatements, which are widely called hallucinations. Current approaches\npredominantly focus on coarse-grained automatic hallucination detection or\nediting, overlooking nuanced error levels. In this paper, we propose a novel\ntask -- automatic fine-grained hallucination detection -- and present a\ncomprehensive taxonomy encompassing six hierarchically defined types of\nhallucination. To facilitate evaluation, we introduce a new benchmark that\nincludes fine-grained human judgments on two LM outputs across various domains.\nOur analysis reveals that ChatGPT and Llama 2-Chat exhibit hallucinations in\n60% and 75% of their outputs, respectively, and a majority of these\nhallucinations fall into categories that have been underexplored. As an initial\nstep to address this, we train FAVA, a retrieval-augmented LM by carefully\ndesigning synthetic data generations to detect and correct fine-grained\nhallucinations. On our benchmark, our automatic and human evaluations show that\nFAVA significantly outperforms ChatGPT on fine-grained hallucination detection\nby a large margin though a large room for future improvement still exists.\nFAVA's suggested edits also improve the factuality of LM-generated text,\nresulting in 5-10% FActScore improvements.\n","authors":["Abhika Mishra","Akari Asai","Vidhisha Balachandran","Yizhong Wang","Graham Neubig","Yulia Tsvetkov","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2401.06855v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09343v1","updated":"2024-01-17T17:08:36Z","published":"2024-01-17T17:08:36Z","title":"Efficient slot labelling","summary":" Slot labelling is an essential component of any dialogue system, aiming to\nfind important arguments in every user turn. Common approaches involve large\npre-trained language models (PLMs) like BERT or RoBERTa, but they face\nchallenges such as high computational requirements and dependence on\npre-training data. In this work, we propose a lightweight method which performs\non par or better than the state-of-the-art PLM-based methods, while having\nalmost 10x less trainable parameters. This makes it especially applicable for\nreal-life industry scenarios.\n","authors":["Vladimir Vlasov"],"pdf_url":"https://arxiv.org/pdf/2401.09343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15316v2","updated":"2024-01-17T17:07:37Z","published":"2023-12-23T18:14:56Z","title":"Paralinguistics-Enhanced Large Language Modeling of Spoken Dialogue","summary":" Large Language Models (LLMs) have demonstrated superior abilities in tasks\nsuch as chatting, reasoning, and question-answering. However, standard LLMs may\nignore crucial paralinguistic information, such as sentiment, emotion, and\nspeaking style, which are essential for achieving natural, human-like spoken\nconversation, especially when such information is conveyed by acoustic cues. 
We\ntherefore propose Paralinguistics-enhanced Generative Pretrained Transformer\n(ParalinGPT), an LLM that utilizes text and speech modalities to better model\nthe linguistic content and paralinguistic attributes of spoken dialogue. The\nmodel takes the conversational context of text, speech embeddings, and\nparalinguistic attributes as input prompts within a serialized multitasking\nmultimodal framework. Specifically, our framework serializes tasks in the order\nof current paralinguistic attribute prediction, response paralinguistic\nattribute prediction, and response text generation with autoregressive\nconditioning. We utilize the Switchboard-1 corpus, including its sentiment\nlabels as the paralinguistic attribute, as our spoken dialogue dataset.\nExperimental results indicate the proposed serialized multitasking method\noutperforms typical sequence classification techniques on current and response\nsentiment classification. Furthermore, leveraging conversational context and\nspeech embeddings significantly improves both response text generation and\nsentiment prediction. Our proposed framework achieves relative improvements of\n6.7%, 12.0%, and 3.5% in current sentiment accuracy, response sentiment\naccuracy, and response text BLEU score, respectively.\n","authors":["Guan-Ting Lin","Prashanth Gurunath Shivakumar","Ankur Gandhe","Chao-Han Huck Yang","Yile Gu","Shalini Ghosh","Andreas Stolcke","Hung-yi Lee","Ivan Bulyko"],"pdf_url":"https://arxiv.org/pdf/2312.15316v2.pdf","comment":"Accepted by ICASSP 2024. Camera-ready version"},{"id":"http://arxiv.org/abs/2401.09340v1","updated":"2024-01-17T17:04:35Z","published":"2024-01-17T17:04:35Z","title":"SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene\n Understanding","summary":" 3D vision-language grounding, which focuses on aligning language with the 3D\nphysical environment, stands as a cornerstone in the development of embodied\nagents. In comparison to recent advancements in the 2D domain, grounding\nlanguage in 3D scenes faces several significant challenges: (i) the inherent\ncomplexity of 3D scenes due to the diverse object configurations, their rich\nattributes, and intricate relationships; (ii) the scarcity of paired 3D\nvision-language data to support grounded learning; and (iii) the absence of a\nunified learning framework to distill knowledge from grounded 3D data. In this\nwork, we aim to address these three major challenges in 3D vision-language by\nexamining the potential of systematically upscaling 3D vision-language learning\nin indoor environments. We introduce the first million-scale 3D vision-language\ndataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising\n2.5M vision-language pairs derived from both human annotations and our scalable\nscene-graph-based generation approach. We demonstrate that this scaling allows\nfor a unified pre-training framework, Grounded Pre-training for Scenes (GPS),\nfor 3D vision-language learning. Through extensive experiments, we showcase the\neffectiveness of GPS by achieving state-of-the-art performance on all existing\n3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is\nunveiled through zero-shot transfer experiments in the challenging 3D\nvision-language tasks. 
Project website: https://scene-verse.github.io .\n","authors":["Baoxiong Jia","Yixin Chen","Huangyue Yu","Yan Wang","Xuesong Niu","Tengyu Liu","Qing Li","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2401.09340v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2311.12023v2","updated":"2024-01-17T17:01:57Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09334v1","updated":"2024-01-17T16:57:19Z","published":"2024-01-17T16:57:19Z","title":"Large Language Models Are Neurosymbolic Reasoners","summary":" A wide range of real-world applications is characterized by their symbolic\nnature, necessitating a strong capability for symbolic reasoning. This paper\ninvestigates the potential application of Large Language Models (LLMs) as\nsymbolic reasoners. We focus on text-based games, significant benchmarks for\nagents with natural language capabilities, particularly in symbolic tasks like\nmath, map reading, sorting, and applying common sense in text-based worlds. To\nfacilitate these agents, we propose an LLM agent designed to tackle symbolic\nchallenges and achieve in-game objectives. We begin by initializing the LLM\nagent and informing it of its role. The agent then receives observations and a\nset of valid actions from the text-based games, along with a specific symbolic\nmodule. With these inputs, the LLM agent chooses an action and interacts with\nthe game environments. 
Our experimental results demonstrate that our method\nsignificantly enhances the capability of LLMs as automated agents for symbolic\nreasoning, and our LLM agent is effective in text-based games involving\nsymbolic tasks, achieving an average performance of 88% across all tasks.\n","authors":["Meng Fang","Shilong Deng","Yudi Zhang","Zijing Shi","Ling Chen","Mykola Pechenizkiy","Jun Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09334v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.09333v1","updated":"2024-01-17T16:57:18Z","published":"2024-01-17T16:57:18Z","title":"Machines Do See Color: A Guideline to Classify Different Forms of Racist\n Discourse in Large Corpora","summary":" Current methods to identify and classify racist language in text rely on\nsmall-n qualitative approaches or large-n approaches focusing exclusively on\novert forms of racist discourse. This article provides a step-by-step\ngeneralizable guideline to identify and classify different forms of racist\ndiscourse in large corpora. In our approach, we start by conceptualizing racism\nand its different manifestations. We then contextualize these racist\nmanifestations to the time and place of interest, which allows researchers to\nidentify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a\ncross-lingual model for supervised text classification with a cutting-edge\ncontextual understanding of text. We show that XLM-R and XLM-R-Racismo, our\npretrained model, outperform other state-of-the-art approaches in classifying\nracism in large corpora. We illustrate our approach using a corpus of tweets\nrelating to the Ecuadorian ind\\'igena community between 2018 and 2021.\n","authors":["Diana Davila Gordillo","Joan Timoneda","Sebastian Vallejo Vera"],"pdf_url":"https://arxiv.org/pdf/2401.09333v1.pdf","comment":"37 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2309.13426v2","updated":"2024-01-17T16:36:58Z","published":"2023-09-23T16:32:59Z","title":"A Chat About Boring Problems: Studying GPT-based text normalization","summary":" Text normalization - the conversion of text from written to spoken form - is\ntraditionally assumed to be an ill-formed task for language models. In this\nwork, we argue otherwise. We empirically show the capacity of Large-Language\nModels (LLM) for text normalization in few-shot scenarios. Combining\nself-consistency reasoning with linguistic-informed prompt engineering, we find\nLLM based text normalization to achieve error rates around 40\\% lower than top\nnormalization systems. Further, upon error analysis, we note key limitations in\nthe conventional design of text normalization tasks. We create a new taxonomy\nof text normalization errors and apply it to results from GPT-3.5-Turbo and\nGPT-4.0. Through this new framework, we can identify strengths and weaknesses\nof GPT-based TN, opening opportunities for future work.\n","authors":["Yang Zhang","Travis M. Bartley","Mariana Graterol-Fuenmayor","Vitaly Lavrukhin","Evelina Bakhturina","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2309.13426v2.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2307.07421v2","updated":"2024-01-17T16:12:38Z","published":"2023-07-12T12:51:23Z","title":"SummaryMixing: A Linear-Complexity Alternative to Self-Attention for\n Speech Recognition and Understanding","summary":" Modern speech processing systems rely on self-attention. 
Unfortunately, token\nmixing with self-attention takes quadratic time in the length of the speech\nutterance, slowing down inference as well as training and increasing memory\nconsumption. Cheaper alternatives to self-attention for ASR have been\ndeveloped, but they fail to consistently reach the same level of accuracy. This\npaper, therefore, proposes a novel linear-time alternative to self-attention.\nIt summarises an utterance with the mean over vectors for all time steps. This\nsingle summary is then combined with time-specific information. We call this\nmethod \"SummaryMixing\". Introducing SummaryMixing in state-of-the-art ASR\nmodels makes it feasible to preserve or exceed previous speech recognition\nperformance while lowering the training and inference times by up to 28$\\%$ and\nreducing the memory budget by a factor of two. The benefits of SummaryMixing\ncan also be generalized to other speech-processing tasks, such as speech\nunderstanding.\n","authors":["Titouan Parcollet","Rogier van Dalen","Shucong Zhang","Sourav Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.07421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09248v1","updated":"2024-01-17T14:52:26Z","published":"2024-01-17T14:52:26Z","title":"Learning from Emotions, Demographic Information and Implicit User\n Feedback in Task-Oriented Document-Grounded Dialogues","summary":" The success of task-oriented and document-grounded dialogue systems depends\non users accepting and enjoying using them. To achieve this, recently published\nwork in the field of Human-Computer Interaction suggests that the combination\nof considering demographic information, user emotions and learning from the\nimplicit feedback in their utterances, is particularly important. However,\nthese findings have not yet been transferred to the field of Natural Language\nProcessing, where these data are primarily studied separately. Accordingly, no\nsufficiently annotated dataset is available. To address this gap, we introduce\nFEDI, the first English dialogue dataset for task-oriented document-grounded\ndialogues annotated with demographic information, user emotions and implicit\nfeedback. Our experiments with FLAN-T5, GPT-2 and LLaMA-2 show that these data\nhave the potential to improve task completion and the factual consistency of\nthe generated responses and user acceptance.\n","authors":["Dominic Petrak","Thy Thy Tran","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2401.09248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09244v1","updated":"2024-01-17T14:44:27Z","published":"2024-01-17T14:44:27Z","title":"Cross-lingual Offensive Language Detection: A Systematic Review of\n Datasets, Transfer Approaches and Challenges","summary":" The growing prevalence and rapid evolution of offensive language in social\nmedia amplify the complexities of detection, particularly highlighting the\nchallenges in identifying such content across diverse languages. This survey\npresents a systematic and comprehensive exploration of Cross-Lingual Transfer\nLearning (CLTL) techniques in offensive language detection in social media. Our\nstudy stands as the first holistic overview to focus exclusively on the\ncross-lingual scenario in this domain. We analyse 67 relevant papers and\ncategorise these studies across various dimensions, including the\ncharacteristics of multilingual datasets used, the cross-lingual resources\nemployed, and the specific CLTL strategies implemented. 
According to \"what to\ntransfer\", we also summarise three main CLTL transfer approaches: instance,\nfeature, and parameter transfer. Additionally, we shed light on the current\nchallenges and future research opportunities in this field. Furthermore, we\nhave made our survey resources available online, including two comprehensive\ntables that provide accessible references to the multilingual datasets and CLTL\nmethods used in the reviewed literature.\n","authors":["Aiqi Jiang","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.09244v1.pdf","comment":"35 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.04350v3","updated":"2024-01-17T14:41:55Z","published":"2023-12-07T15:12:12Z","title":"CLadder: Assessing Causal Reasoning in Language Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsights into the causal reasoning abilities of LLMs. Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v3.pdf","comment":"NeurIPS 2023; updated with CLadder dataset v1.5"},{"id":"http://arxiv.org/abs/2401.09220v1","updated":"2024-01-17T14:02:36Z","published":"2024-01-17T14:02:36Z","title":"UniVIE: A Unified Label Space Approach to Visual Information Extraction\n from Form-like Documents","summary":" Existing methods for Visual Information Extraction (VIE) from form-like\ndocuments typically fragment the process into separate subtasks, such as key\ninformation extraction, key-value pair extraction, and choice group extraction.\nHowever, these approaches often overlook the hierarchical structure of form\ndocuments, including hierarchical key-value pairs and hierarchical choice\ngroups. To address these limitations, we present a new perspective, reframing\nVIE as a relation prediction problem and unifying labels of different tasks\ninto a single label space. This unified approach allows for the definition of\nvarious relation types and effectively tackles hierarchical relationships in\nform-like documents. In line with this perspective, we present UniVIE, a\nunified model that addresses the VIE problem comprehensively. 
UniVIE functions\nusing a coarse-to-fine strategy. It initially generates tree proposals through\na tree proposal network, which are subsequently refined into hierarchical trees\nby a relation decoder module. To enhance the relation prediction capabilities\nof UniVIE, we incorporate two novel tree constraints into the relation decoder:\na tree attention mask and a tree level embedding. Extensive experimental\nevaluations on both our in-house dataset HierForms and a publicly available\ndataset SIBR, substantiate that our method achieves state-of-the-art results,\nunderscoring the effectiveness and potential of our unified approach in\nadvancing the field of VIE.\n","authors":["Kai Hu","Jiawei Wang","Weihong Lin","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.09220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12519v2","updated":"2024-01-17T13:09:30Z","published":"2023-08-24T03:11:45Z","title":"Rational Decision-Making Agent with Internalized Utility Judgment","summary":" Large language models (LLMs) have demonstrated remarkable advancements and\nhave attracted significant efforts to develop LLMs into agents capable of\nexecuting intricate multi-step decision-making tasks beyond traditional NLP\napplications. Existing approaches to LLM-based decision-making predominantly\nbuild upon the manually-designed external performance metrics to guide the\ndecision-making process. However, reliance on the external performance metrics\nas prior is problematic in real-world scenarios, where such prior may be\nunavailable, flawed, or even erroneous. For genuine autonomous decision making,\nit is imperative for the agent to develop its rationality from its posterior\nexperiences to judge decisions independently. Central to the development of\nrationality is the construction of an internalized utility judgment, capable of\nassigning numerical utilities to each decision. This paper proposes RadAgent\n(Rational Decision-Making Agent), which fosters the development of its\nrationality through an iterative framework involving Experience Exploration and\nUtility Learning. Within this framework, Elo-based Utility Construction is\ndevised to assign Elo scores to individual decision steps to judge their\nutilities via pairwise comparisons. Consequently, these Elo scores guide the\ndecision-making process to derive optimal outcomes. Experimental results on the\nToolBench dataset demonstrate RadAgent's superiority over baselines, achieving\nover 10% improvement in Pass Rate on diverse tasks. It offers higher-quality\nsolutions and reduces costs (ChatGPT API calls), highlighting its effectiveness\nand efficiency.\n","authors":["Yining Ye","Xin Cong","Shizuo Tian","Yujia Qin","Chong Liu","Yankai Lin","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2308.12519v2.pdf","comment":"Received 8,6,6,6 scores on ICLR 2024"},{"id":"http://arxiv.org/abs/2401.02906v2","updated":"2024-01-17T12:58:36Z","published":"2024-01-05T17:05:42Z","title":"MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance","summary":" The deployment of multimodal large language models (MLLMs) has brought forth\na unique vulnerability: susceptibility to malicious attacks through visual\ninputs. We delve into the novel challenge of defending MLLMs against such\nattacks. We discovered that images act as a \"foreign language\" that is not\nconsidered during alignment, which can make MLLMs prone to producing harmful\nresponses. 
Unfortunately, unlike the discrete tokens considered in text-based\nLLMs, the continuous nature of image signals presents significant alignment\nchallenges, which poses difficulty to thoroughly cover the possible scenarios.\nThis vulnerability is exacerbated by the fact that open-source MLLMs are\npredominantly fine-tuned on limited image-text pairs that is much less than the\nextensive text-based pretraining corpus, which makes the MLLMs more prone to\ncatastrophic forgetting of their original abilities during explicit alignment\ntuning. To tackle these challenges, we introduce MLLM-Protector, a\nplug-and-play strategy combining a lightweight harm detector and a response\ndetoxifier. The harm detector's role is to identify potentially harmful outputs\nfrom the MLLM, while the detoxifier corrects these outputs to ensure the\nresponse stipulates to the safety standards. This approach effectively\nmitigates the risks posed by malicious visual inputs without compromising the\nmodel's overall performance. Our results demonstrate that MLLM-Protector offers\na robust solution to a previously unaddressed aspect of MLLM security.\n","authors":["Renjie Pi","Tianyang Han","Yueqi Xie","Rui Pan","Qing Lian","Hanze Dong","Jipeng Zhang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.02906v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09175v1","updated":"2024-01-17T12:31:45Z","published":"2024-01-17T12:31:45Z","title":"QAnswer: Towards Question Answering Search over Websites","summary":" Question Answering (QA) is increasingly used by search engines to provide\nresults to their end-users, yet very few websites currently use QA technologies\nfor their search functionality. To illustrate the potential of QA technologies\nfor the website search practitioner, we demonstrate web searches that combine\nQA over knowledge graphs and QA over free text -- each being usually tackled\nseparately. We also discuss the different benefits and drawbacks of both\napproaches for web site searches. We use the case studies made of websites\nhosted by the Wikimedia Foundation (namely Wikipedia and Wikidata). Differently\nfrom a search engine (e.g. Google, Bing, etc), the data are indexed integrally,\ni.e. we do not index only a subset, and they are indexed exclusively, i.e. we\nindex only data available on the corresponding website.\n","authors":["Kunpeng Guo","Clement Defretiere","Dennis Diefenbach","Christophe Gravier","Antoine Gourru"],"pdf_url":"https://arxiv.org/pdf/2401.09175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09168v1","updated":"2024-01-17T12:21:20Z","published":"2024-01-17T12:21:20Z","title":"Fine-tuning Strategies for Domain Specific Question Answering under Low\n Annotation Budget Constraints","summary":" The progress introduced by pre-trained language models and their fine-tuning\nhas resulted in significant improvements in most downstream NLP tasks. The\nunsupervised training of a language model combined with further target task\nfine-tuning has become the standard QA fine-tuning procedure. In this work, we\ndemonstrate that this strategy is sub-optimal for fine-tuning QA models,\nespecially under a low QA annotation budget, which is a usual setting in\npractice due to the extractive QA labeling cost. We draw our conclusions by\nconducting an exhaustive analysis of the performance of the alternatives of the\nsequential fine-tuning strategy on different QA datasets. 
Based on the\nexperiments performed, we observed that the best strategy to fine-tune the QA\nmodel in low-budget settings is taking a pre-trained language model (PLM) and\nthen fine-tuning PLM with a dataset composed of the target dataset and SQuAD\ndataset. With zero extra annotation effort, the best strategy outperforms the\nstandard strategy by 2.28% to 6.48%. Our experiments provide one of the first\ninvestigations on how to best fine-tune a QA system under a low budget and are\ntherefore of the utmost practical interest to the QA practitioners.\n","authors":["Kunpeng Guo","Dennis Diefenbach","Antoine Gourru","Christophe Gravier"],"pdf_url":"https://arxiv.org/pdf/2401.09168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07895v5","updated":"2024-01-17T12:02:33Z","published":"2023-05-13T11:28:37Z","title":"On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark.Our study encompasses 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results showcased in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Biao Yang","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09150v1","updated":"2024-01-17T11:50:53Z","published":"2024-01-17T11:50:53Z","title":"Bridging Research and Readers: A Multi-Modal Automated Academic Papers\n Interpretation System","summary":" In the contemporary information era, significantly accelerated by the advent\nof Large-scale Language Models, the proliferation of scientific literature is\nreaching unprecedented levels. Researchers urgently require efficient tools for\nreading and summarizing academic papers, uncovering significant scientific\nliterature, and employing diverse interpretative methodologies. To address this\nburgeoning demand, the role of automated scientific literature interpretation\nsystems has become paramount. 
However, prevailing models, both commercial and\nopen-source, confront notable challenges: they often overlook multimodal data,\ngrapple with summarizing over-length texts, and lack diverse user interfaces.\nIn response, we introduce an open-source multi-modal automated academic paper\ninterpretation system (MMAPIS) with three-step process stages, incorporating\nLLMs to augment its functionality. Our system first employs the hybrid modality\npreprocessing and alignment module to extract plain text, and tables or figures\nfrom documents separately. It then aligns this information based on the section\nnames they belong to, ensuring that data with identical section names are\ncategorized under the same section. Following this, we introduce a hierarchical\ndiscourse-aware summarization method. It utilizes the extracted section names\nto divide the article into shorter text segments, facilitating specific\nsummarizations both within and between sections via LLMs with specific prompts.\nFinally, we have designed four types of diversified user interfaces, including\npaper recommendation, multimodal Q\\&A, audio broadcasting, and interpretation\nblog, which can be widely applied across various scenarios. Our qualitative and\nquantitative evaluations underscore the system's superiority, especially in\nscientific summarization, where it outperforms solutions relying solely on\nGPT-4.\n","authors":["Feng Jiang","Kuang Wang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2401.09150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09135v1","updated":"2024-01-17T11:17:04Z","published":"2024-01-17T11:17:04Z","title":"Asynchronous Local-SGD Training for Language Modeling","summary":" Local stochastic gradient descent (Local-SGD), also referred to as federated\naveraging, is an approach to distributed optimization where each device\nperforms more than one SGD update per communication. This work presents an\nempirical study of {\\it asynchronous} Local-SGD for training language models;\nthat is, each worker updates the global parameters as soon as it has finished\nits SGD steps. We conduct a comprehensive investigation by examining how worker\nhardware heterogeneity, model size, number of workers, and optimizer could\nimpact the learning performance. We find that with naive implementations,\nasynchronous Local-SGD takes more iterations to converge than its synchronous\ncounterpart despite updating the (global) model parameters more frequently. We\nidentify momentum acceleration on the global parameters when worker gradients\nare stale as a key challenge. We propose a novel method that utilizes a delayed\nNesterov momentum update and adjusts the workers' local training steps based on\ntheir computation speed. This approach, evaluated with models up to 150M\nparameters on the C4 dataset, matches the performance of synchronous Local-SGD\nin terms of perplexity per update step, and significantly surpasses it in terms\nof wall clock time.\n","authors":["Bo Liu","Rachita Chhaparia","Arthur Douillard","Satyen Kale","Andrei A. 
Rusu","Jiajun Shen","Arthur Szlam","Marc'Aurelio Ranzato"],"pdf_url":"https://arxiv.org/pdf/2401.09135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01070v2","updated":"2024-01-17T10:56:08Z","published":"2023-11-02T08:37:30Z","title":"Multilingual DistilWhisper: Efficient Distillation of Multi-task Speech\n Models via Language-Specific Experts","summary":" Whisper is a multitask and multilingual speech model covering 99 languages.\nIt yields commendable automatic speech recognition (ASR) results in a subset of\nits covered languages, but the model still underperforms on a non-negligible\nnumber of under-represented languages, a problem exacerbated in smaller model\nversions. In this work, we propose DistilWhisper, an approach able to bridge\nthe performance gap in ASR for these languages while retaining the advantages\nof multitask and multilingual capabilities. Our approach involves two key\nstrategies: lightweight modular ASR fine-tuning of whisper-small using\nlanguage-specific experts, and knowledge distillation from whisper-large-v2.\nThis dual approach allows us to effectively boost ASR performance while keeping\nthe robustness inherited from the multitask and multilingual pre-training.\nResults demonstrate that our approach is more effective than standard\nfine-tuning or LoRA adapters, boosting performance in the targeted languages\nfor both in- and out-of-domain test sets, while introducing only a negligible\nparameter overhead at inference.\n","authors":["Thomas Palmeira Ferraz","Marcely Zanon Boito","Caroline Brun","Vassilina Nikoulina"],"pdf_url":"https://arxiv.org/pdf/2311.01070v2.pdf","comment":"Accepted to IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.09084v2","updated":"2024-01-17T10:26:04Z","published":"2023-12-14T16:16:35Z","title":"Language Modeling on a SpiNNaker 2 Neuromorphic Chip","summary":" As large language models continue to scale in size rapidly, so too does the\ncomputational power required to run them. Event-based networks on neuromorphic\ndevices offer a potential way to reduce energy consumption for inference\nsignificantly. However, to date, most event-based networks that can run on\nneuromorphic hardware, including spiking neural networks (SNNs), have not\nachieved task performance even on par with LSTM models for language modeling.\nAs a result, language modeling on neuromorphic devices has seemed a distant\nprospect. In this work, we demonstrate the first-ever implementation of a\nlanguage model on a neuromorphic device - specifically the SpiNNaker 2 chip -\nbased on a recently published event-based architecture called the EGRU.\nSpiNNaker 2 is a many-core neuromorphic chip designed for large-scale\nasynchronous processing, while the EGRU is architected to leverage such\nhardware efficiently while maintaining competitive task performance. This\nimplementation marks the first time a neuromorphic language model matches\nLSTMs, setting the stage for taking task performance to the level of large\nlanguage models. We also demonstrate results on a gesture recognition task\nbased on inputs from a DVS camera. 
Overall, our results showcase the\nfeasibility of this neuro-inspired neural network in hardware, highlighting\nsignificant gains versus conventional hardware in energy efficiency for the\ncommon use case of single batch inference.\n","authors":["Khaleelulla Khan Nazeer","Mark Schöne","Rishav Mukherji","Bernhard Vogginger","Christian Mayr","David Kappel","Anand Subramoney"],"pdf_url":"https://arxiv.org/pdf/2312.09084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05268v2","updated":"2024-01-17T10:09:46Z","published":"2023-11-09T10:50:36Z","title":"Modelling prospective memory and resilient situated communications via\n Wizard of Oz","summary":" This abstract presents a scenario for human-robot action in a home setting\ninvolving an older adult and a robot. The scenario is designed to explore the\nenvisioned modelling of memory for communication with a socially assistive\nrobots (SAR). The scenario will enable the gathering of data on failures of\nspeech technology and human-robot communication involving shared memory that\nmay occur during daily activities such as a music-listening activity.\n","authors":["Yanzhe Li","Frank Broz","Mark Neerincx"],"pdf_url":"https://arxiv.org/pdf/2311.05268v2.pdf","comment":"In WTF Workshop Proceedings (arXiv:2401.04108) held in conjunction\n with the ACM conference on Conversational User Interfaces (CUI), 19 - 21/07\n 2023, in Eindhoven, The Netherlands"},{"id":"http://arxiv.org/abs/2401.09082v1","updated":"2024-01-17T09:44:03Z","published":"2024-01-17T09:44:03Z","title":"What makes for a 'good' social actor? Using respect as a lens to\n evaluate interactions with language agents","summary":" With the growing popularity of dialogue agents based on large language models\n(LLMs), urgent attention has been drawn to finding ways to ensure their\nbehaviour is ethical and appropriate. These are largely interpreted in terms of\nthe 'HHH' criteria: making outputs more helpful and honest, and avoiding\nharmful (biased, toxic, or inaccurate) statements. Whilst this semantic focus\nis useful from the perspective of viewing LLM agents as mere mediums for\ninformation, it fails to account for pragmatic factors that can make the same\nutterance seem more or less offensive or tactless in different social\nsituations. We propose an approach to ethics that is more centred on relational\nand situational factors, exploring what it means for a system, as a social\nactor, to treat an individual respectfully in a (series of) interaction(s). Our\nwork anticipates a set of largely unexplored risks at the level of situated\ninteraction, and offers practical suggestions to help LLM technologies behave\nas 'good' social actors and treat people respectfully.\n","authors":["Lize Alberts","Geoff Keeling","Amanda McCroskery"],"pdf_url":"https://arxiv.org/pdf/2401.09082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09074v1","updated":"2024-01-17T09:23:59Z","published":"2024-01-17T09:23:59Z","title":"Code Simulation Challenges for Large Language Models","summary":" We investigate the extent to which Large Language Models (LLMs) can simulate\nthe execution of computer code and algorithms. We begin by looking straight\nline programs, and show that current LLMs demonstrate poor performance even\nwith such simple programs -- performance rapidly degrades with the length of\ncode. We then investigate the ability of LLMs to simulate programs that contain\ncritical paths and redundant instructions. 
We also go beyond straight line\nprogram simulation with sorting algorithms and nested loops, and we show the\ncomputational complexity of a routine directly affects the ability of an LLM to\nsimulate its execution. We observe that LLMs execute instructions sequentially\nand with a low error margin only for short programs or standard procedures.\nLLMs' code simulation is in tension with their pattern recognition and\nmemorisation capabilities: on tasks where memorisation is detrimental, we\npropose a novel prompting method to simulate code execution line by line.\nEmpirically, our new Chain of Simulation (CoSm) method improves on the standard\nChain of Thought prompting approach by avoiding the pitfalls of memorisation.\n","authors":["Emanuele La Malfa","Christoph Weinhuber","Orazio Torre","Fangru Lin","Anthony Cohn","Nigel Shadbolt","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2401.09074v1.pdf","comment":"main paper (10 pages) + Appendix (11 pages)"},{"id":"http://arxiv.org/abs/2309.12697v2","updated":"2024-01-17T08:50:59Z","published":"2023-09-22T08:11:01Z","title":"Semantic similarity prediction is better than other semantic similarity\n measures","summary":" Semantic similarity between natural language texts is typically measured\neither by looking at the overlap between subsequences (e.g., BLEU) or by using\nembeddings (e.g., BERTScore, S-BERT). Within this paper, we argue that when we\nare only interested in measuring the semantic similarity, it is better to\ndirectly predict the similarity using a fine-tuned model for such a task. Using\na fine-tuned model for the Semantic Textual Similarity Benchmark tasks (STS-B)\nfrom the GLUE benchmark, we define the STSScore approach and show that the\nresulting similarity is better aligned with our expectations on a robust\nsemantic similarity measure than other approaches.\n","authors":["Steffen Herbold"],"pdf_url":"https://arxiv.org/pdf/2309.12697v2.pdf","comment":"Accepted at TMLR: https://openreview.net/forum?id=bfsNmgN5je"},{"id":"http://arxiv.org/abs/2401.09042v1","updated":"2024-01-17T08:22:52Z","published":"2024-01-17T08:22:52Z","title":"LLMs for Relational Reasoning: How Far are We?","summary":" Large language models (LLMs) have revolutionized many areas (e.g. natural\nlanguage processing, software engineering, etc.) by achieving state-of-the-art\nperformance on extensive downstream tasks. Aiming to achieve robust and general\nartificial intelligence, there has been a surge of interest in investigating\nthe reasoning ability of the LLMs. Whereas the textual and numerical reasoning\nbenchmarks adopted by previous works are rather shallow and simple, it is hard\nto conclude that the LLMs possess strong reasoning ability by merely achieving\npositive results on these benchmarks. Recent efforts have demonstrated that the\nLLMs are poor at solving sequential decision-making problems that require\ncommon-sense planning by evaluating their performance on the reinforcement\nlearning benchmarks. In this work, we conduct an in-depth assessment of several\nstate-of-the-art LLMs' reasoning ability based on the inductive logic\nprogramming (ILP) benchmark, which is broadly recognized as a representative\nand challenging measurement for evaluating logic program induction/synthesis\nsystems as it requires inducing strict cause-effect logic to achieve robust\ndeduction on independent and identically distributed (IID) and\nout-of-distribution (OOD) test samples. 
Our evaluations illustrate that\ncompared with the neural program induction systems which are much smaller in\nmodel size, the state-of-the-art LLMs are much poorer in terms of reasoning\nability by achieving much lower performance and generalization using either\nnatural language prompting or truth-value matrix prompting.\n","authors":["Zhiming Li","Yushi Cao","Xiufeng Xu","Junzhe Jiang","Xu Liu","Yon Shin Teo","Shang-wei Lin","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09042v1.pdf","comment":"Accepted by The First International Workshop on Large Language Models\n for Code (ICSE 2024)"},{"id":"http://arxiv.org/abs/2401.09041v1","updated":"2024-01-17T08:16:05Z","published":"2024-01-17T08:16:05Z","title":"Textual Summarisation of Large Sets: Towards a General Approach","summary":" We are developing techniques to generate summary descriptions of sets of\nobjects. In this paper, we present and evaluate a rule-based NLG technique for\nsummarising sets of bibliographical references in academic papers. This extends\nour previous work on summarising sets of consumer products and shows how our\nmodel generalises across these two very different domains.\n","authors":["Kittipitch Kuptavanich","Ehud Reiter","Kees Van Deemter","Advaith Siddharthan"],"pdf_url":"https://arxiv.org/pdf/2401.09041v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.08846v2","updated":"2024-01-17T08:05:07Z","published":"2023-12-14T12:02:24Z","title":"TiMix: Text-aware Image Mixing for Effective Vision-Language\n Pre-training","summary":" Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances\nmodern Vision-Language Pre-training (VLP) models by aligning visual and\nlinguistic modalities. Due to noises in web-harvested text-image pairs,\nhowever, scaling up training data volume in SMCL presents considerable\nobstacles in terms of computational cost and data inefficiency. To improve data\nefficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates\nmix-based data augmentation techniques into SMCL, yielding significant\nperformance improvements without significantly increasing computational\noverhead. We provide a theoretical analysis of TiMixfrom a mutual information\n(MI) perspective, showing that mixed data samples for cross-modal contrastive\nlearning implicitly serve as a regularizer for the contrastive loss. The\nexperimental results demonstrate that TiMix exhibits a comparable performance\non downstream tasks, even with a reduced amount of training data and shorter\ntraining time, when benchmarked against existing methods. This work empirically\nand theoretically demonstrates the potential of data mixing for data-efficient\nand computationally viable VLP, benefiting broader VLP model adoption in\npractical scenarios.\n","authors":["Chaoya Jiang","Wei ye","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.08846v2.pdf","comment":"Accepted on AAAI2024"},{"id":"http://arxiv.org/abs/2401.09023v1","updated":"2024-01-17T07:36:22Z","published":"2024-01-17T07:36:22Z","title":"Explain Thyself Bully: Sentiment Aided Cyberbullying Detection with\n Explanation","summary":" Cyberbullying has become a big issue with the popularity of different social\nmedia networks and online communication apps. While plenty of research is going\non to develop better models for cyberbullying detection in monolingual\nlanguage, there is very little research on the code-mixed languages and\nexplainability aspect of cyberbullying. 
Recent laws like \"right to\nexplanations\" of General Data Protection Regulation, have spurred research in\ndeveloping interpretable models rather than focusing on performance. Motivated\nby this we develop the first interpretable multi-task model called {\\em mExCB}\nfor automatic cyberbullying detection from code-mixed languages which can\nsimultaneously solve several tasks, cyberbullying detection,\nexplanation/rationale identification, target group detection and sentiment\nanalysis. We have introduced {\\em BullyExplain}, the first benchmark dataset\nfor explainable cyberbullying detection in code-mixed language. Each post in\n{\\em BullyExplain} dataset is annotated with four labels, i.e., {\\em bully\nlabel, sentiment label, target and rationales (explainability)}, i.e., which\nphrases are being responsible for annotating the post as a bully. The proposed\nmultitask framework (mExCB) based on CNN and GRU with word and sub-sentence\n(SS) level attention is able to outperform several baselines and state of the\nart models when applied on {\\em BullyExplain} dataset.\n","authors":["Krishanu Maity","Prince Jha","Raghav Jain","Sriparna Saha","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2401.09023v1.pdf","comment":"ICDAR 2023"},{"id":"http://arxiv.org/abs/2401.09003v1","updated":"2024-01-17T06:48:16Z","published":"2024-01-17T06:48:16Z","title":"Augmenting Math Word Problems via Iterative Question Composing","summary":" Despite recent progress in improving the mathematical reasoning ability of\nlarge language models(LLMs), solving competition-level math problems without\nthe use of external tools remains challenging for open-source LLMs. In this\nwork, we introduce the MMIQC dataset, a mixture of processed web data and\nsynthetic question-response pairs, to equip base models with better\nmathematical reasoning skills. Mistral-7B-MMIQC, the model obtained by\nfine-tuning Mistral-7B(arXiv:2310.06825) on MMIQC, achieves 36.0\\% accuracy on\nMATH(arXiv:2103.03874), 5.8\\% higher than the previous (model size $\\sim$7B)\nSOTA. Our experiments also show that a large part of the improvement attributes\nto our novel augmentation method IQC(Iterative Question Composing), where we\niteratively ask an LLM to compose new questions from the given seed problems\nand do rejection sampling from another LLM. MMIQC has now been released on\nhttps://huggingface.co/datasets/Vivacem/MMIQC.\n","authors":["Haoxiong Liu","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2401.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08350v2","updated":"2024-01-17T06:47:29Z","published":"2024-01-16T13:30:09Z","title":"Salute the Classic: Revisiting Challenges of Machine Translation in the\n Age of Large Language Models","summary":" The evolution of Neural Machine Translation (NMT) has been significantly\ninfluenced by six core challenges (Koehn and Knowles, 2017), which have acted\nas benchmarks for progress in this field. This study revisits these challenges,\noffering insights into their ongoing relevance in the context of advanced Large\nLanguage Models (LLMs): domain mismatch, amount of parallel data, rare word\nprediction, translation of long sentences, attention model as word alignment,\nand sub-optimal beam search. Our empirical findings indicate that LLMs\neffectively lessen the reliance on parallel data for major languages in the\npretraining phase. 
Additionally, the LLM-based translation system significantly\nenhances the translation of long sentences that contain approximately 80 words\nand shows the capability to translate documents of up to 512 words. However,\ndespite these significant improvements, the challenges of domain mismatch and\nprediction of rare words persist. While the challenges of word alignment and\nbeam search, specifically associated with NMT, may not apply to LLMs, we\nidentify three new challenges for LLMs in translation tasks: inference\nefficiency, translation of low-resource languages in the pretraining phase, and\nhuman-aligned evaluation. The datasets and models are released at\nhttps://github.com/pangjh3/LLM4MT.\n","authors":["Jianhui Pang","Fanghua Ye","Longyue Wang","Dian Yu","Derek F. Wong","Shuming Shi","Zhaopeng Tu"],"pdf_url":"https://arxiv.org/pdf/2401.08350v2.pdf","comment":"17 pages. Longyue Wang is the Corresponding Author"},{"id":"http://arxiv.org/abs/2401.09002v1","updated":"2024-01-17T06:42:44Z","published":"2024-01-17T06:42:44Z","title":"AttackEval: How to Evaluate the Effectiveness of Jailbreak Attacking on\n Large Language Models","summary":" In our research, we pioneer a novel approach to evaluate the effectiveness of\njailbreak attacks on Large Language Models (LLMs), such as GPT-4 and LLaMa2,\ndiverging from traditional robustness-focused binary evaluations. Our study\nintroduces two distinct evaluation frameworks: a coarse-grained evaluation and\na fine-grained evaluation. Each framework, using a scoring range from 0 to 1,\noffers a unique perspective, enabling a more comprehensive and nuanced\nevaluation of attack effectiveness and empowering attackers to refine their\nattack prompts with greater understanding. Furthermore, we have developed a\ncomprehensive ground truth dataset specifically tailored for jailbreak tasks.\nThis dataset not only serves as a crucial benchmark for our current study but\nalso establishes a foundational resource for future research, enabling\nconsistent and comparative analyses in this evolving field. Upon meticulous\ncomparison with traditional evaluation methods, we discovered that our\nevaluation aligns with the baseline's trend while offering a more profound and\ndetailed assessment. We believe that by accurately evaluating the effectiveness\nof attack prompts in the Jailbreak task, our work lays a solid foundation for\nassessing a wider array of similar or even more complex tasks in the realm of\nprompt injection, potentially revolutionizing this field.\n","authors":["Dong shu","Mingyu Jin","Suiyuan Zhu","Beichen Wang","Zihao Zhou","Chong Zhang","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08992v1","updated":"2024-01-17T06:01:16Z","published":"2024-01-17T06:01:16Z","title":"Efficient Adapter Finetuning for Tail Languages in Streaming\n Multilingual ASR","summary":" The end-to-end ASR model is often desired in the streaming multilingual\nscenario since it is easier to deploy and can benefit from pre-trained speech\nmodels such as powerful foundation models. Meanwhile, the heterogeneous nature\nand imbalanced data abundance of different languages may cause performance\ndegradation, leading to asynchronous peak performance for different languages\nduring training, especially on tail ones. Sometimes even the data itself may\nbecome unavailable as a result of the enhanced privacy protection. 
Existing\nwork tend to significantly increase the model size or learn language-specific\ndecoders to accommodate each language separately. In this study, we explore\nsimple yet effective Language-Dependent Adapter (LDA) finetuning under a\ncascaded Conformer transducer framework enhanced by teacher pseudo-labeling for\ntail languages in the streaming multilingual ASR. The adapter only accounts for\n0.4% of the full model per language. It is plugged into the frozen foundation\nmodel and is the only trainable module during the finetuning process with noisy\nstudent training. The final model merges the adapter parameters from different\ncheckpoints for different languages. The model performance is validated on a\nchallenging multilingual dictation dataset, which includes 39 tail languages\nacross Latin, Greek, Arabic, etc. Our proposed method brings 12.2% word error\nrate reduction on average and up to 37.5% on a single locale. Furthermore, we\nshow that our parameter-efficient LDA can match the quality of the full model\nfinetuning, thus greatly alleviating the asynchronous peak performance issue.\n","authors":["Junwen Bai","Bo Li","Qiujia Li","Tara N. Sainath","Trevor Strohman"],"pdf_url":"https://arxiv.org/pdf/2401.08992v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.07544v2","updated":"2024-01-17T05:28:18Z","published":"2024-01-15T09:09:14Z","title":"See the Unseen: Better Context-Consistent Knowledge-Editing by Noises","summary":" Knowledge-editing updates knowledge of large language models (LLMs) and\ncontributes to the interpretability and application of LLMs. However, knowledge\napplying is context-consistent: LLMs can recall the same knowledge in different\ncontexts. Existing works ignore this property and the editing lacks\ngeneralization. In this paper, we empirically find that the effects of\ndifferent contexts upon LLMs in recalling the same knowledge follow a\nGaussian-like distribution. We then sample Gaussian noises to simulate the\neffects of different contexts when updating LLMs. By such, we can make LLMs see\nthe unseen contexts where the edited knowledge will be applied, therefore\nimproving the editing generalization. Experimental results on three LLMs\ndemonstrate the effectiveness of our methods and also distinguish our methods\nfrom the others of fine-tuning LLMs by noises.\n","authors":["Youcheng Huang","Wenqiang Lei","Zheng Zhang","Jiancheng Lv","Shuicheng Yan"],"pdf_url":"https://arxiv.org/pdf/2401.07544v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16522v3","updated":"2024-01-17T05:04:12Z","published":"2023-11-28T05:00:27Z","title":"Dynamic Fault Characteristics Evaluation in Power Grid","summary":" To enhance the intelligence degree in operation and maintenance, a novel\nmethod for fault detection in power grids is proposed. The proposed GNN-based\napproach first identifies fault nodes through a specialized feature extraction\nmethod coupled with a knowledge graph. By incorporating temporal data, the\nmethod leverages the status of nodes from preceding and subsequent time periods\nto help current fault detection. To validate the effectiveness of the node\nfeatures, a correlation analysis of the output features from each node was\nconducted. 
The results from experiments show that this method can accurately\nlocate fault nodes in simulation scenarios with a remarkable accuracy.\nAdditionally, the graph neural network based feature modeling allows for a\nqualitative examination of how faults spread across nodes, which provides\nvaluable insights for analyzing fault nodes.\n","authors":["Hao Pei","Si Lin","Chuanfu Li","Che Wang","Haoming Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.16522v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13708v3","updated":"2024-01-17T05:03:03Z","published":"2023-11-22T21:59:46Z","title":"Dynamic Fault Analysis in Substations Based on Knowledge Graphs","summary":" To address the challenge of identifying hidden danger in substations from\nunstructured text, a novel dynamic analysis method is proposed. We first\nextract relevant information from the unstructured text, and then leverages a\nflexible distributed search engine built on Elastic-Search to handle the data.\nFollowing this, the hidden Markov model is employed to train the data within\nthe engine. The Viterbi algorithm is integrated to decipher the hidden state\nsequences, facilitating the segmentation and labeling of entities related to\nhidden dangers. The final step involves using the Neo4j graph database to\ndynamically create a knowledge graph that visualizes hidden dangers in the\nsubstation. The effectiveness of the proposed method is demonstrated through a\ncase analysis from a specific substation with hidden dangers revealed in the\ntext records.\n","authors":["Weiwei Li","Xing Liu","Wei Wang","Lu Chen","Sizhe Li","Hui Fan"],"pdf_url":"https://arxiv.org/pdf/2311.13708v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08973v1","updated":"2024-01-17T04:52:40Z","published":"2024-01-17T04:52:40Z","title":"OCTO+: A Suite for Automatic Open-Vocabulary Object Placement in Mixed\n Reality","summary":" One key challenge in Augmented Reality is the placement of virtual content in\nnatural locations. Most existing automated techniques can only work with a\nclosed-vocabulary, fixed set of objects. In this paper, we introduce and\nevaluate several methods for automatic object placement using recent advances\nin open-vocabulary vision-language models. Through a multifaceted evaluation,\nwe identify a new state-of-the-art method, OCTO+. We also introduce a benchmark\nfor automatically evaluating the placement of virtual objects in augmented\nreality, alleviating the need for costly user studies. Through this, in\naddition to human evaluations, we find that OCTO+ places objects in a valid\nregion over 70% of the time, outperforming other methods on a range of metrics.\n","authors":["Aditya Sharma","Luke Yoffe","Tobias Höllerer"],"pdf_url":"https://arxiv.org/pdf/2401.08973v1.pdf","comment":"2024 IEEE International Conference on Artificial Intelligence and\n eXtended and Virtual Reality (AIXVR)"},{"id":"http://arxiv.org/abs/2401.08967v1","updated":"2024-01-17T04:43:21Z","published":"2024-01-17T04:43:21Z","title":"ReFT: Reasoning with Reinforced Fine-Tuning","summary":" One way to enhance the reasoning capability of Large Language Models (LLMs)\nis to conduct Supervised Fine-Tuning (SFT) using Chain-of-Thought (CoT)\nannotations. This approach does not show sufficiently strong generalization\nability, however, because the training only relies on the given CoT data. In\nmath problem-solving, for example, there is usually only one annotated\nreasoning path for each question in the training data. 
Intuitively, it would be\nbetter for the algorithm to learn from multiple annotated reasoning paths given\na question. To address this issue, we propose a simple yet effective approach\ncalled Reinforced Fine-Tuning (ReFT) to enhance the generalizability of\nlearning LLMs for reasoning, with math problem-solving as an example. ReFT\nfirst warmups the model with SFT, and then employs on-line reinforcement\nlearning, specifically the PPO algorithm in this paper, to further fine-tune\nthe model, where an abundance of reasoning paths are automatically sampled\ngiven the question and the rewards are naturally derived from the ground-truth\nanswers. Extensive experiments on GSM8K, MathQA, and SVAMP datasets show that\nReFT significantly outperforms SFT, and the performance can be potentially\nfurther boosted by combining inference-time strategies such as majority voting\nand re-ranking. Note that ReFT obtains the improvement by learning from the\nsame training questions as SFT, without relying on extra or augmented training\nquestions. This indicates a superior generalization ability for ReFT.\n","authors":["Trung Quoc Luong","Xinbo Zhang","Zhanming Jie","Peng Sun","Xiaoran Jin","Hang Li"],"pdf_url":"https://arxiv.org/pdf/2401.08967v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2401.06792v2","updated":"2024-01-17T04:40:13Z","published":"2024-01-08T03:52:40Z","title":"LightHouse: A Survey of AGI Hallucination","summary":" With the development of artificial intelligence, large-scale models have\nbecome increasingly intelligent. However, numerous studies indicate that\nhallucinations within these large models are a bottleneck hindering the\ndevelopment of AI research. In the pursuit of achieving strong artificial\nintelligence, a significant volume of research effort is being invested in the\nAGI (Artificial General Intelligence) hallucination research. Previous\nexplorations have been conducted in researching hallucinations within LLMs\n(Large Language Models). As for multimodal AGI, research on hallucinations is\nstill in an early stage. To further the progress of research in the domain of\nhallucinatory phenomena, we present a bird's eye view of hallucinations in AGI,\nsummarizing the current work on AGI hallucinations and proposing some\ndirections for future research.\n","authors":["Feng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.06792v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16042v2","updated":"2024-01-17T04:07:06Z","published":"2023-09-27T21:53:56Z","title":"Towards Best Practices of Activation Patching in Language Models:\n Metrics and Methods","summary":" Mechanistic interpretability seeks to understand the internal mechanisms of\nmachine learning models, where localization -- identifying the important model\ncomponents -- is a key step. Activation patching, also known as causal tracing\nor interchange intervention, is a standard technique for this task (Vig et al.,\n2020), but the literature contains many variants with little consensus on the\nchoice of hyperparameters or methodology. In this work, we systematically\nexamine the impact of methodological details in activation patching, including\nevaluation metrics and corruption methods. In several settings of localization\nand circuit discovery in language models, we find that varying these\nhyperparameters could lead to disparate interpretability results. Backed by\nempirical observations, we give conceptual arguments for why certain metrics or\nmethods may be preferred. 
Finally, we provide recommendations for the best\npractices of activation patching going forwards.\n","authors":["Fred Zhang","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2309.16042v2.pdf","comment":"27 pages. ICLR 2024"},{"id":"http://arxiv.org/abs/2306.13649v3","updated":"2024-01-17T03:23:23Z","published":"2023-06-23T17:56:26Z","title":"On-Policy Distillation of Language Models: Learning from Self-Generated\n Mistakes","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\nreduce its inference cost and memory footprint, by training a smaller student\nmodel. However, current KD methods for auto-regressive sequence models suffer\nfrom distribution mismatch between output sequences seen during training and\nthose generated by the student during inference. To address this issue, we\nintroduce Generalized Knowledge Distillation (GKD). Instead of solely relying\non a fixed set of output sequences, GKD trains the student on its\nself-generated output sequences by leveraging feedback from the teacher on such\nsequences. Unlike supervised KD approaches, GKD also offers the flexibility to\nemploy alternative loss functions between the student and teacher, which can be\nuseful when the student lacks the expressivity to mimic the teacher's\ndistribution. Furthermore, GKD facilitates the seamless integration of\ndistillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for\ndistilling auto-regressive language models on summarization, translation, and\narithmetic reasoning tasks, and task-agnostic distillation for\ninstruction-tuning.\n","authors":["Rishabh Agarwal","Nino Vieillard","Yongchao Zhou","Piotr Stanczyk","Sabela Ramos","Matthieu Geist","Olivier Bachem"],"pdf_url":"https://arxiv.org/pdf/2306.13649v3.pdf","comment":"Accepted at ICLR 2024. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2310.11374v4","updated":"2024-01-17T02:44:56Z","published":"2023-10-17T16:15:34Z","title":"DialogueLLM: Context and Emotion Knowledge-Tuned Large Language Models\n for Emotion Recognition in Conversations","summary":" Large language models (LLMs) and their variants have shown extraordinary\nefficacy across numerous downstream natural language processing (NLP) tasks,\nwhich has presented a new vision for the development of NLP. Despite their\nremarkable performance in natural language generating (NLG), LLMs lack a\ndistinct focus on the emotion understanding domain. As a result, using LLMs for\nemotion recognition may lead to suboptimal and inadequate precision. Another\nlimitation of LLMs is that they are typical trained without leveraging\nmulti-modal information. To overcome these limitations, we propose DialogueLLM,\na context and emotion knowledge tuned LLM that is obtained by fine-tuning LLaMA\nmodels with 13,638 multi-modal (i.e., texts and videos) emotional dialogues.\nThe visual information is considered as the supplementary knowledge to\nconstruct high-quality instructions. We offer a comprehensive evaluation of our\nproposed model on three benchmarking emotion recognition in conversations (ERC)\ndatasets and compare the results against the SOTA baselines and other SOTA\nLLMs. 
Additionally, DialogueLLM-7B can be easily trained using LoRA on a 40GB\nA100 GPU in 5 hours, facilitating reproducibility for other researchers.\n","authors":["Yazhou Zhang","Mengyao Wang","Youxi Wu","Prayag Tiwari","Qiuchi Li","Benyou Wang","Jing Qin"],"pdf_url":"https://arxiv.org/pdf/2310.11374v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08919v1","updated":"2024-01-17T02:04:59Z","published":"2024-01-17T02:04:59Z","title":"Partial Diacritization: A Context-Contrastive Inference Approach","summary":" Diacritization plays a pivotal role in improving readability and\ndisambiguating the meaning of Arabic texts. Efforts have so far focused on\nmarking every eligible character (Full Diacritization). Comparatively\noverlooked, Partial Diacritzation (PD) is the selection of a subset of\ncharacters to be marked to aid comprehension where needed. Research has\nindicated that excessive diacritic marks can hinder skilled readers--reducing\nreading speed and accuracy. We conduct a behavioral experiment and show that\npartially marked text is often easier to read than fully marked text, and\nsometimes easier than plain text. In this light, we introduce\nContext-Contrastive Partial Diacritization (CCPD)--a novel approach to PD which\nintegrates seamlessly with existing Arabic diacritization systems. CCPD\nprocesses each word twice, once with context and once without, and diacritizes\nonly the characters with disparities between the two inferences. Further, we\nintroduce novel indicators for measuring partial diacritization quality (SR,\nPDER, HDER, ERE), essential for establishing this as a machine learning task.\nLastly, we introduce TD2, a Transformer-variant of an established model which\noffers a markedly different per formance profile on our proposed indicators\ncompared to all other known systems.\n","authors":["Muhammad ElNokrashy","Badr AlKhamissi"],"pdf_url":"https://arxiv.org/pdf/2401.08919v1.pdf","comment":"13 equations, 5 tables, 5 figures"},{"id":"http://arxiv.org/abs/2401.09647v1","updated":"2024-01-17T23:32:56Z","published":"2024-01-17T23:32:56Z","title":"Characterizing Online Eating Disorder Communities with Large Language\n Models","summary":" The rise in eating disorders, a dangerous mental health condition with high\nmortality and morbidity, has been linked to the proliferation of idealized body\nimages on social media. However, the link between social media and eating\ndisorders is far more complex. We argue that social media platforms create a\nfeedback loop that amplifies the growth of content and communities that promote\neating disorders like anorexia and bulimia. Specifically, social media\nplatforms make it easy for vulnerable individuals to find and connect to\nlike-minded others, while group dynamic processes encourage them to stay\nengaged within communities that promote and glorify harmful behaviors linked to\neating disorders. We characterize this dynamic empirically through a\ncombination of network and language analysis. We describe a novel framework\nthat leverages large language models to analyze the discourse within online\ncommunities and probe their attitudes on topics related to eating disorders to\nidentify potentially harmful content. 
Our work emphasizes the need for better\nsocial media moderation to disrupt harmful feedback loops and protect\nvulnerable individuals.\n","authors":["Minh Duc Chu","Aryan Karnati","Zihao He","Kristina Lerman"],"pdf_url":"https://arxiv.org/pdf/2401.09647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09646v1","updated":"2024-01-17T23:29:46Z","published":"2024-01-17T23:29:46Z","title":"ClimateGPT: Towards AI Synthesizing Interdisciplinary Research on\n Climate Change","summary":" This paper introduces ClimateGPT, a model family of domain-specific large\nlanguage models that synthesize interdisciplinary research on climate change.\nWe trained two 7B models from scratch on a science-oriented dataset of 300B\ntokens. For the first model, the 4.2B domain-specific tokens were included\nduring pre-training and the second was adapted to the climate domain after\npre-training. Additionally, ClimateGPT-7B, 13B and 70B are continuously\npre-trained from Llama~2 on a domain-specific dataset of 4.2B tokens. Each\nmodel is instruction fine-tuned on a high-quality and human-generated\ndomain-specific dataset that has been created in close cooperation with climate\nscientists. To reduce the number of hallucinations, we optimize the model for\nretrieval augmentation and propose a hierarchical retrieval strategy. To\nincrease the accessibility of our model to non-English speakers, we propose to\nmake use of cascaded machine translation and show that this approach can\nperform comparably to natively multilingual models while being easier to scale\nto a large number of languages. Further, to address the intrinsic\ninterdisciplinary aspect of climate change we consider different research\nperspectives. Therefore, the model can produce in-depth answers focusing on\ndifferent perspectives in addition to an overall answer. We propose a suite of\nautomatic climate-specific benchmarks to evaluate LLMs. On these benchmarks,\nClimateGPT-7B performs on par with the ten times larger Llama-2-70B Chat model\nwhile not degrading results on general domain benchmarks. Our human evaluation\nconfirms the trends we saw in our benchmarks. All models were trained and\nevaluated using renewable energy and are released publicly.\n","authors":["David Thulke","Yingbo Gao","Petrus Pelser","Rein Brune","Rricha Jalota","Floris Fok","Michael Ramos","Ian van Wyk","Abdallah Nasir","Hayden Goldstein","Taylor Tragemann","Katie Nguyen","Ariana Fowler","Andrew Stanco","Jon Gabriel","Jordan Taylor","Dean Moro","Evgenii Tsymbalov","Juliette de Waal","Evgeny Matusov","Mudar Yaghi","Mohammad Shihadah","Hermann Ney","Christian Dugast","Jonathan Dotan","Daniel Erasmus"],"pdf_url":"https://arxiv.org/pdf/2401.09646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09637v1","updated":"2024-01-17T23:14:52Z","published":"2024-01-17T23:14:52Z","title":"Impact of Large Language Model Assistance on Patients Reading Clinical\n Notes: A Mixed-Methods Study","summary":" Patients derive numerous benefits from reading their clinical notes,\nincluding an increased sense of control over their health and improved\nunderstanding of their care plan. However, complex medical concepts and jargon\nwithin clinical notes hinder patient comprehension and may lead to anxiety. We\ndeveloped a patient-facing tool to make clinical notes more readable,\nleveraging large language models (LLMs) to simplify, extract information from,\nand add context to notes. 
We prompt engineered GPT-4 to perform these\naugmentation tasks on real clinical notes donated by breast cancer survivors\nand synthetic notes generated by a clinician, a total of 12 notes with 3868\nwords. In June 2023, 200 female-identifying US-based participants were randomly\nassigned three clinical notes with varying levels of augmentations using our\ntool. Participants answered questions about each note, evaluating their\nunderstanding of follow-up actions and self-reported confidence. We found that\naugmentations were associated with a significant increase in action\nunderstanding score (0.63 $\\pm$ 0.04 for select augmentations, compared to 0.54\n$\\pm$ 0.02 for the control) with p=0.002. In-depth interviews of\nself-identifying breast cancer patients (N=7) were also conducted via video\nconferencing. Augmentations, especially definitions, elicited positive\nresponses among the seven participants, with some concerns about relying on\nLLMs. Augmentations were evaluated for errors by clinicians, and we found\nmisleading errors occur, with errors more common in real donated notes than\nsynthetic notes, illustrating the importance of carefully written clinical\nnotes. Augmentations improve some but not all readability metrics. This work\ndemonstrates the potential of LLMs to improve patients' experience with\nclinical notes at a lower burden to clinicians. However, having a human in the\nloop is important to correct potential model errors.\n","authors":["Niklas Mannhardt","Elizabeth Bondi-Kelly","Barbara Lam","Chloe O'Connell","Mercy Asiedu","Hussein Mozannar","Monica Agrawal","Alejandro Buendia","Tatiana Urman","Irbaz B. Riaz","Catherine E. Ricciardi","Marzyeh Ghassemi","David Sontag"],"pdf_url":"https://arxiv.org/pdf/2401.09637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07525v2","updated":"2024-01-17T23:06:15Z","published":"2024-01-15T07:57:58Z","title":"TAROT: A Hierarchical Framework with Multitask Co-Pretraining on\n Semi-Structured Data towards Effective Person-Job Fit","summary":" Person-job fit is an essential part of online recruitment platforms in\nserving various downstream applications like Job Search and Candidate\nRecommendation. Recently, pretrained large language models have further\nenhanced the effectiveness by leveraging richer textual information in user\nprofiles and job descriptions apart from user behavior features and job\nmetadata. However, the general domain-oriented design struggles to capture the\nunique structural information within user profiles and job descriptions,\nleading to a loss of latent semantic correlations. We propose TAROT, a\nhierarchical multitask co-pretraining framework, to better utilize structural\nand semantic information for informative text embeddings. TAROT targets\nsemi-structured text in profiles and jobs, and it is co-pretained with\nmulti-grained pretraining tasks to constrain the acquired semantic information\nat each level. Experiments on a real-world LinkedIn dataset show significant\nperformance improvements, proving its effectiveness in person-job fit tasks.\n","authors":["Yihan Cao","Xu Chen","Lun Du","Hao Chen","Qiang Fu","Shi Han","Yushu Du","Yanbin Kang","Guangming Lu","Zi Li"],"pdf_url":"https://arxiv.org/pdf/2401.07525v2.pdf","comment":"ICASSP 2024 camera ready. 
5 pages, 1 figure, 3 tables"},{"id":"http://arxiv.org/abs/2310.08744v2","updated":"2024-01-17T23:03:05Z","published":"2023-10-12T22:12:28Z","title":"Circuit Component Reuse Across Tasks in Transformer Language Models","summary":" Recent work in mechanistic interpretability has shown that behaviors in\nlanguage models can be successfully reverse-engineered through circuit\nanalysis. A common criticism, however, is that each circuit is task-specific,\nand thus such analysis cannot contribute to understanding the models at a\nhigher level. In this work, we present evidence that insights (both low-level\nfindings about specific heads and higher-level findings about general\nalgorithms) can indeed generalize across tasks. Specifically, we study the\ncircuit discovered in Wang et al. (2022) for the Indirect Object Identification\n(IOI) task and 1.) show that it reproduces on a larger GPT2 model, and 2.) that\nit is mostly reused to solve a seemingly different task: Colored Objects\n(Ippolito & Callison-Burch, 2023). We provide evidence that the process\nunderlying both tasks is functionally very similar, and contains about a 78%\noverlap in in-circuit attention heads. We further present a proof-of-concept\nintervention experiment, in which we adjust four attention heads in middle\nlayers in order to 'repair' the Colored Objects circuit and make it behave like\nthe IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the\nColored Objects task and explain most sources of error. The intervention\naffects downstream attention heads in specific ways predicted by their\ninteractions in the IOI circuit, indicating that this subcircuit behavior is\ninvariant to the different task inputs. Overall, our results provide evidence\nthat it may yet be possible to explain large language models' behavior in terms\nof a relatively small number of interpretable task-general algorithmic building\nblocks and computational components.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2310.08744v2.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2312.14345v2","updated":"2024-01-17T22:05:50Z","published":"2023-12-22T00:30:10Z","title":"Logic-Scaffolding: Personalized Aspect-Instructed Recommendation\n Explanation Generation using LLMs","summary":" The unique capabilities of Large Language Models (LLMs), such as the natural\nlanguage text generation ability, position them as strong candidates for\nproviding explanation for recommendations. However, despite the size of the\nLLM, most existing models struggle to produce zero-shot explanations reliably.\nTo address this issue, we propose a framework called Logic-Scaffolding, that\ncombines the ideas of aspect-based explanation and chain-of-thought prompting\nto generate explanations through intermediate reasoning steps. 
In this paper,\nwe share our experience in building the framework and present an interactive\ndemonstration for exploring our results.\n","authors":["Behnam Rahdari","Hao Ding","Ziwei Fan","Yifei Ma","Zhuotong Chen","Anoop Deoras","Branislav Kveton"],"pdf_url":"https://arxiv.org/pdf/2312.14345v2.pdf","comment":"The 17th ACM International Conference on Web Search and Data Mining\n (WSDM 2024)"},{"id":"http://arxiv.org/abs/2401.09615v1","updated":"2024-01-17T21:55:15Z","published":"2024-01-17T21:55:15Z","title":"Learning Shortcuts: On the Misleading Promise of NLU in Language Models","summary":" The advent of large language models (LLMs) has enabled significant\nperformance gains in the field of natural language processing. However, recent\nstudies have found that LLMs often resort to shortcuts when performing tasks,\ncreating an illusion of enhanced performance while lacking generalizability in\ntheir decision rules. This phenomenon introduces challenges in accurately\nassessing natural language understanding in LLMs. Our paper provides a concise\nsurvey of relevant research in this area and puts forth a perspective on the\nimplications of shortcut learning in the evaluation of language models,\nspecifically for NLU tasks. This paper urges more research efforts to be put\ntowards deepening our comprehension of shortcut learning, contributing to the\ndevelopment of more robust language models, and raising the standards of NLU\nevaluation in real-world scenarios.\n","authors":["Geetanjali Bihani","Julia Taylor Rayz"],"pdf_url":"https://arxiv.org/pdf/2401.09615v1.pdf","comment":"Accepted at HICSS-SDPS 2024"},{"id":"http://arxiv.org/abs/2401.05566v3","updated":"2024-01-17T20:26:01Z","published":"2024-01-10T22:14:35Z","title":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety\n Training","summary":" Humans are capable of strategically deceptive behavior: behaving helpfully in\nmost situations, but then behaving very differently in order to pursue\nalternative objectives when given the opportunity. If an AI system learned such\na deceptive strategy, could we detect it and remove it using current\nstate-of-the-art safety training techniques? To study this question, we\nconstruct proof-of-concept examples of deceptive behavior in large language\nmodels (LLMs). For example, we train models that write secure code when the\nprompt states that the year is 2023, but insert exploitable code when the\nstated year is 2024. We find that such backdoor behavior can be made\npersistent, so that it is not removed by standard safety training techniques,\nincluding supervised fine-tuning, reinforcement learning, and adversarial\ntraining (eliciting unsafe behavior and then training to remove it). The\nbackdoor behavior is most persistent in the largest models and in models\ntrained to produce chain-of-thought reasoning about deceiving the training\nprocess, with the persistence remaining even when the chain-of-thought is\ndistilled away. Furthermore, rather than removing backdoors, we find that\nadversarial training can teach models to better recognize their backdoor\ntriggers, effectively hiding the unsafe behavior. Our results suggest that,\nonce a model exhibits deceptive behavior, standard techniques could fail to\nremove such deception and create a false impression of safety.\n","authors":["Evan Hubinger","Carson Denison","Jesse Mu","Mike Lambert","Meg Tong","Monte MacDiarmid","Tamera Lanham","Daniel M. 
Ziegler","Tim Maxwell","Newton Cheng","Adam Jermyn","Amanda Askell","Ansh Radhakrishnan","Cem Anil","David Duvenaud","Deep Ganguli","Fazl Barez","Jack Clark","Kamal Ndousse","Kshitij Sachan","Michael Sellitto","Mrinank Sharma","Nova DasSarma","Roger Grosse","Shauna Kravec","Yuntao Bai","Zachary Witten","Marina Favaro","Jan Brauner","Holden Karnofsky","Paul Christiano","Samuel R. Bowman","Logan Graham","Jared Kaplan","Sören Mindermann","Ryan Greenblatt","Buck Shlegeris","Nicholas Schiefer","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2401.05566v3.pdf","comment":"updated to add missing acknowledgements"},{"id":"http://arxiv.org/abs/2401.07927v2","updated":"2024-01-17T20:14:05Z","published":"2024-01-15T19:39:15Z","title":"Are self-explanations from Large Language Models faithful?","summary":" Instruction-tuned large language models (LLMs) excel at many tasks, and will\neven provide explanations for their behavior. Since these models are directly\naccessible to the public, there is a risk that convincing and wrong\nexplanations can lead to unsupported confidence in LLMs. Therefore,\ninterpretability-faithfulness of self-explanations is an important\nconsideration for AI Safety. Assessing the interpretability-faithfulness of\nthese explanations, termed self-explanations, is challenging as the models are\ntoo complex for humans to annotate what is a correct explanation. To address\nthis, we propose employing self-consistency checks as a measure of\nfaithfulness. For example, if an LLM says a set of words is important for\nmaking a prediction, then it should not be able to make the same prediction\nwithout these words. While self-consistency checks are a common approach to\nfaithfulness, they have not previously been applied to LLM's self-explanations.\nWe apply self-consistency checks to three types of self-explanations:\ncounterfactuals, importance measures, and redactions. Our work demonstrate that\nfaithfulness is both task and model dependent, e.g., for sentiment\nclassification, counterfactual explanations are more faithful for Llama2,\nimportance measures for Mistral, and redaction for Falcon 40B. Finally, our\nfindings are robust to prompt-variations.\n","authors":["Andreas Madsen","Sarath Chandar","Siva Reddy"],"pdf_url":"https://arxiv.org/pdf/2401.07927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10321v2","updated":"2024-01-17T20:11:38Z","published":"2023-12-16T05:01:23Z","title":"LLM-SQL-Solver: Can LLMs Determine SQL Equivalence?","summary":" Judging the equivalence between two SQL queries is a fundamental problem with\nmany practical applications in data management and SQL generation (i.e.,\nevaluating the quality of generated SQL queries in text-to-SQL task). While the\nresearch community has reasoned about SQL equivalence for decades, it poses\nconsiderable difficulties and no complete solutions exist. Recently, Large\nLanguage Models (LLMs) have shown strong reasoning capability in conversation,\nquestion answering and solving mathematics challenges. In this paper, we study\nif LLMs can be used to determine the equivalence between SQL queries under two\nnotions of SQL equivalence (semantic equivalence and relaxed equivalence). To\nassist LLMs in generating high quality responses, we present two prompting\ntechniques: Miniature & Mull and Explain & Compare. The former technique is\nused to evaluate the semantic equivalence in which it asks LLMs to execute a\nquery on a simple database instance and then explore if a counterexample exists\nby modifying the database. 
The latter technique is used to evaluate the relaxed\nequivalence in which it asks LLMs to explain the queries and then compare if\nthey contain significant logical differences. Our experiments demonstrate using\nour techniques, LLMs is a promising tool to help data engineers in writing\nsemantically equivalent SQL queries, however challenges still persist, and is a\nbetter metric for evaluating SQL generation than the popular execution\naccuracy.\n","authors":["Fuheng Zhao","Lawrence Lim","Ishtiyaque Ahmad","Divyakant Agrawal","Amr El Abbadi"],"pdf_url":"https://arxiv.org/pdf/2312.10321v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08406v2","updated":"2024-01-17T20:03:15Z","published":"2024-01-16T14:44:47Z","title":"RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on\n Agriculture","summary":" There are two common ways in which developers are incorporating proprietary\nand domain-specific data when building applications of Large Language Models\n(LLMs): Retrieval-Augmented Generation (RAG) and Fine-Tuning. RAG augments the\nprompt with the external data, while fine-Tuning incorporates the additional\nknowledge into the model itself. However, the pros and cons of both approaches\nare not well understood. In this paper, we propose a pipeline for fine-tuning\nand RAG, and present the tradeoffs of both for multiple popular LLMs, including\nLlama2-13B, GPT-3.5, and GPT-4. Our pipeline consists of multiple stages,\nincluding extracting information from PDFs, generating questions and answers,\nusing them for fine-tuning, and leveraging GPT-4 for evaluating the results. We\npropose metrics to assess the performance of different stages of the RAG and\nfine-Tuning pipeline. We conduct an in-depth study on an agricultural dataset.\nAgriculture as an industry has not seen much penetration of AI, and we study a\npotentially disruptive application - what if we could provide location-specific\ninsights to a farmer? Our results show the effectiveness of our dataset\ngeneration pipeline in capturing geographic-specific knowledge, and the\nquantitative and qualitative benefits of RAG and fine-tuning. We see an\naccuracy increase of over 6 p.p. when fine-tuning the model and this is\ncumulative with RAG, which increases accuracy by 5 p.p. further. In one\nparticular experiment, we also demonstrate that the fine-tuned model leverages\ninformation from across geographies to answer specific questions, increasing\nanswer similarity from 47% to 72%. Overall, the results point to how systems\nbuilt using LLMs can be adapted to respond and incorporate knowledge across a\ndimension that is critical for a specific industry, paving the way for further\napplications of LLMs in other industrial domains.\n","authors":["Angels Balaguer","Vinamra Benara","Renato Luiz de Freitas Cunha","Roberto de M. Estevão Filho","Todd Hendry","Daniel Holstein","Jennifer Marsman","Nick Mecklenburg","Sara Malvar","Leonardo O. Nunes","Rafael Padilha","Morris Sharp","Bruno Silva","Swati Sharma","Vijay Aski","Ranveer Chandra"],"pdf_url":"https://arxiv.org/pdf/2401.08406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09566v1","updated":"2024-01-17T19:43:43Z","published":"2024-01-17T19:43:43Z","title":"Aligning Large Language Models with Counterfactual DPO","summary":" Advancements in large language models (LLMs) have demonstrated remarkable\ncapabilities across a diverse range of applications. 
These models excel in\ngenerating text completions that are contextually coherent and cover an\nextensive array of subjects. However, the vast datasets required for their\ntraining make aligning response styles during the pretraining and instruction\ntuning phases challenging. Consequently, an additional alignment phase is\ntypically employed, wherein the model is further trained with human preference\ndata to better align its outputs with human expectations. While this process\ndoesn't introduce new capabilities per se, it does accentuate generation styles\ninnate to the model. This paper explores the utilization of counterfactual\nprompting within the framework of Direct Preference Optimization (DPO) to align\nthe model's style without relying on human intervention. We demonstrate that\nthis method effectively instils desirable behaviour, mitigates undesirable\nones, and encourages the model to disregard inappropriate instructions. Our\nfindings suggest that counterfactual prompting with DPO presents a low-resource\nway to fine-tune LLMs to meet the demands for responsible and ethically aligned\nAI systems.\n","authors":["Bradley Butcher"],"pdf_url":"https://arxiv.org/pdf/2401.09566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05535v2","updated":"2024-01-17T19:39:42Z","published":"2023-05-24T12:09:42Z","title":"Detecting Check-Worthy Claims in Political Debates, Speeches, and\n Interviews Using Audio Data","summary":" Developing tools to automatically detect check-worthy claims in political\ndebates and speeches can greatly help moderators of debates, journalists, and\nfact-checkers. While previous work on this problem has focused exclusively on\nthe text modality, here we explore the utility of the audio modality as an\nadditional input. We create a new multimodal dataset (text and audio in\nEnglish) containing 48 hours of speech from past political debates in the USA.\nWe then experimentally demonstrate that, in the case of multiple speakers,\nadding the audio modality yields sizable improvements over using the text\nmodality alone; moreover, an audio-only model could outperform a text-only one\nfor a single speaker. With the aim to enable future research, we make all our\ndata and code publicly available at\nhttps://github.com/petar-iv/audio-checkworthiness-detection.\n","authors":["Petar Ivanov","Ivan Koychev","Momchil Hardalov","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2306.05535v2.pdf","comment":"Check-Worthiness, Fact-Checking, Fake News, Misinformation,\n Disinformation, Political Debates, Multimodality"},{"id":"http://arxiv.org/abs/2401.09555v1","updated":"2024-01-17T19:13:05Z","published":"2024-01-17T19:13:05Z","title":"Improving Classification Performance With Human Feedback: Label a few,\n we label the rest","summary":" In the realm of artificial intelligence, where a vast majority of data is\nunstructured, obtaining substantial amounts of labeled data to train supervised\nmachine learning models poses a significant challenge. To address this, we\ndelve into few-shot and active learning, where are goal is to improve AI models\nwith human feedback on a few labeled examples. This paper focuses on\nunderstanding how a continuous feedback loop can refine models, thereby\nenhancing their accuracy, recall, and precision through incremental human\ninput. By employing Large Language Models (LLMs) such as GPT-3.5, BERT, and\nSetFit, we aim to analyze the efficacy of using a limited number of labeled\nexamples to substantially improve model accuracy. 
We benchmark this approach on\nthe Financial Phrasebank, Banking, Craigslist, Trec, Amazon Reviews datasets to\nprove that with just a few labeled examples, we are able to surpass the\naccuracy of zero shot large language models to provide enhanced text\nclassification performance. We demonstrate that rather than needing to manually\nlabel millions of rows of data, we just need to label a few and the model can\neffectively predict the rest.\n","authors":["Natan Vidra","Thomas Clifford","Katherine Jijo","Eden Chung","Liang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09553v1","updated":"2024-01-17T19:11:30Z","published":"2024-01-17T19:11:30Z","title":"BERTologyNavigator: Advanced Question Answering with BERT-based\n Semantics","summary":" The development and integration of knowledge graphs and language models has\nsignificance in artificial intelligence and natural language processing. In\nthis study, we introduce the BERTologyNavigator -- a two-phased system that\ncombines relation extraction techniques and BERT embeddings to navigate the\nrelationships within the DBLP Knowledge Graph (KG). Our approach focuses on\nextracting one-hop relations and labelled candidate pairs in the first phases.\nThis is followed by employing BERT's CLS embeddings and additional heuristics\nfor relation selection in the second phase. Our system reaches an F1 score of\n0.2175 on the DBLP QuAD Final test dataset for Scholarly QALD and 0.98 F1 score\non the subset of the DBLP QuAD test dataset during the QA phase.\n","authors":["Shreya Rajpal","Ricardo Usbeck"],"pdf_url":"https://arxiv.org/pdf/2401.09553v1.pdf","comment":"Accepted in Scholarly QALD Challenge @ ISWC 2023"},{"id":"http://arxiv.org/abs/2306.09212v2","updated":"2024-01-17T19:09:57Z","published":"2023-06-15T15:49:51Z","title":"CMMLU: Measuring massive multitask language understanding in Chinese","summary":" As the capabilities of large language models (LLMs) continue to advance,\nevaluating their performance becomes increasingly crucial and challenging. This\npaper aims to bridge this gap by introducing CMMLU, a comprehensive Chinese\nbenchmark that covers various subjects, including natural science, social\nsciences, engineering, and humanities. We conduct a thorough evaluation of 18\nadvanced multilingual- and Chinese-oriented LLMs, assessing their performance\nacross different subjects and settings. The results reveal that most existing\nLLMs struggle to achieve an average accuracy of 50%, even when provided with\nin-context examples and chain-of-thought prompts, whereas the random baseline\nstands at 25%. This highlights significant room for improvement in LLMs.\nAdditionally, we conduct extensive experiments to identify factors impacting\nthe models' performance and propose directions for enhancing LLMs. 
CMMLU fills\nthe gap in evaluating the knowledge and reasoning capabilities of large\nlanguage models within the Chinese context.\n","authors":["Haonan Li","Yixuan Zhang","Fajri Koto","Yifei Yang","Hai Zhao","Yeyun Gong","Nan Duan","Timothy Baldwin"],"pdf_url":"https://arxiv.org/pdf/2306.09212v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.09419v1","updated":"2024-01-17T18:57:53Z","published":"2024-01-17T18:57:53Z","title":"GARField: Group Anything with Radiance Fields","summary":" Grouping is inherently ambiguous due to the multiple levels of granularity in\nwhich one can decompose a scene -- should the wheels of an excavator be\nconsidered separate or part of the whole? We present Group Anything with\nRadiance Fields (GARField), an approach for decomposing 3D scenes into a\nhierarchy of semantically meaningful groups from posed image inputs. To do this\nwe embrace group ambiguity through physical scale: by optimizing a\nscale-conditioned 3D affinity feature field, a point in the world can belong to\ndifferent groups of different sizes. We optimize this field from a set of 2D\nmasks provided by Segment Anything (SAM) in a way that respects coarse-to-fine\nhierarchy, using scale to consistently fuse conflicting masks from different\nviewpoints. From this field we can derive a hierarchy of possible groupings via\nautomatic tree construction or user interaction. We evaluate GARField on a\nvariety of in-the-wild scenes and find it effectively extracts groups at many\nlevels: clusters of objects, objects, and various subparts. GARField inherently\nrepresents multi-view consistent groupings and produces higher fidelity groups\nthan the input SAM masks. GARField's hierarchical grouping could have exciting\ndownstream applications such as 3D asset extraction or dynamic scene\nunderstanding. See the project website at https://www.garfield.studio/\n","authors":["Chung Min Kim","Mingxuan Wu","Justin Kerr","Ken Goldberg","Matthew Tancik","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2401.09419v1.pdf","comment":"Project site: https://www.garfield.studio/ First three authors\n contributed equally"},{"id":"http://arxiv.org/abs/2401.09417v1","updated":"2024-01-17T18:56:18Z","published":"2024-01-17T18:56:18Z","title":"Vision Mamba: Efficient Visual Representation Learning with\n Bidirectional State Space Model","summary":" Recently the state space models (SSMs) with efficient hardware-aware designs,\ni.e., Mamba, have shown great potential for long sequence modeling. Building\nefficient and generic vision backbones purely upon SSMs is an appealing\ndirection. However, representing visual data is challenging for SSMs due to the\nposition-sensitivity of visual data and the requirement of global context for\nvisual understanding. In this paper, we show that the reliance of visual\nrepresentation learning on self-attention is not necessary and propose a new\ngeneric vision backbone with bidirectional Mamba blocks (Vim), which marks the\nimage sequences with position embeddings and compresses the visual\nrepresentation with bidirectional state space models. On ImageNet\nclassification, COCO object detection, and ADE20k semantic segmentation tasks,\nVim achieves higher performance compared to well-established vision\ntransformers like DeiT, while also demonstrating significantly improved\ncomputation & memory efficiency. 
For example, Vim is 2.8$\\times$ faster than\nDeiT and saves 86.8% GPU memory when performing batch inference to extract\nfeatures on images with a resolution of 1248$\\times$1248. The results\ndemonstrate that Vim is capable of overcoming the computation & memory\nconstraints on performing Transformer-style understanding for high-resolution\nimages and it has great potential to become the next-generation backbone for\nvision foundation models. Code is available at https://github.com/hustvl/Vim.\n","authors":["Lianghui Zhu","Bencheng Liao","Qian Zhang","Xinlong Wang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09417v1.pdf","comment":"Work in progress. Code is available at https://github.com/hustvl/Vim"},{"id":"http://arxiv.org/abs/2401.09416v1","updated":"2024-01-17T18:55:49Z","published":"2024-01-17T18:55:49Z","title":"TextureDreamer: Image-guided Texture Synthesis through Geometry-aware\n Diffusion","summary":" We present TextureDreamer, a novel image-guided texture synthesis method to\ntransfer relightable textures from a small number of input images (3 to 5) to\ntarget 3D shapes across arbitrary categories. Texture creation is a pivotal\nchallenge in vision and graphics. Industrial companies hire experienced artists\nto manually craft textures for 3D assets. Classical methods require densely\nsampled views and accurately aligned geometry, while learning-based methods are\nconfined to category-specific shapes within the dataset. In contrast,\nTextureDreamer can transfer highly detailed, intricate textures from real-world\nenvironments to arbitrary objects with only a few casually captured images,\npotentially significantly democratizing texture creation. Our core idea,\npersonalized geometry-aware score distillation (PGSD), draws inspiration from\nrecent advancements in diffuse models, including personalized modeling for\ntexture information extraction, variational score distillation for detailed\nappearance synthesis, and explicit geometry guidance with ControlNet. Our\nintegration and several essential modifications substantially improve the\ntexture quality. Experiments on real images spanning different categories show\nthat TextureDreamer can successfully transfer highly realistic, semantic\nmeaningful texture to arbitrary objects, surpassing the visual quality of\nprevious state-of-the-art.\n","authors":["Yu-Ying Yeh","Jia-Bin Huang","Changil Kim","Lei Xiao","Thu Nguyen-Phuoc","Numair Khan","Cheng Zhang","Manmohan Chandraker","Carl S Marshall","Zhao Dong","Zhengqin Li"],"pdf_url":"https://arxiv.org/pdf/2401.09416v1.pdf","comment":"Project page: https://texturedreamer.github.io"},{"id":"http://arxiv.org/abs/2401.09414v1","updated":"2024-01-17T18:55:12Z","published":"2024-01-17T18:55:12Z","title":"Vlogger: Make Your Dream A Vlog","summary":" In this work, we present Vlogger, a generic AI system for generating a\nminute-level video blog (i.e., vlog) of user descriptions. Different from short\nvideos with a few seconds, vlog often contains a complex storyline with\ndiversified scenes, which is challenging for most existing video generation\napproaches. To break through this bottleneck, our Vlogger smartly leverages\nLarge Language Model (LLM) as Director and decomposes a long video generation\ntask of vlog into four key stages, where we invoke various foundation models to\nplay the critical roles of vlog professionals, including (1) Script, (2) Actor,\n(3) ShowMaker, and (4) Voicer. 
With such a design of mimicking human beings,\nour Vlogger can generate vlogs through explainable cooperation of top-down\nplanning and bottom-up shooting. Moreover, we introduce a novel video diffusion\nmodel, ShowMaker, which serves as a videographer in our Vlogger for generating\nthe video snippet of each shooting scene. By incorporating Script and Actor\nattentively as textual and visual prompts, it can effectively enhance\nspatial-temporal coherence in the snippet. Besides, we design a concise mixed\ntraining paradigm for ShowMaker, boosting its capacity for both T2V generation\nand prediction. Finally, the extensive experiments show that our method\nachieves state-of-the-art performance on zero-shot T2V generation and\nprediction tasks. More importantly, Vlogger can generate over 5-minute vlogs\nfrom open-world descriptions, without loss of video coherence on script and\nactor. The code and model is all available at\nhttps://github.com/zhuangshaobin/Vlogger.\n","authors":["Shaobin Zhuang","Kunchang Li","Xinyuan Chen","Yaohui Wang","Ziwei Liu","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09414v1.pdf","comment":"16 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2401.09413v1","updated":"2024-01-17T18:51:53Z","published":"2024-01-17T18:51:53Z","title":"POP-3D: Open-Vocabulary 3D Occupancy Prediction from Images","summary":" We describe an approach to predict open-vocabulary 3D semantic voxel\noccupancy map from input 2D images with the objective of enabling 3D grounding,\nsegmentation and retrieval of free-form language queries. This is a challenging\nproblem because of the 2D-3D ambiguity and the open-vocabulary nature of the\ntarget tasks, where obtaining annotated training data in 3D is difficult. The\ncontributions of this work are three-fold. First, we design a new model\narchitecture for open-vocabulary 3D semantic occupancy prediction. The\narchitecture consists of a 2D-3D encoder together with occupancy prediction and\n3D-language heads. The output is a dense voxel map of 3D grounded language\nembeddings enabling a range of open-vocabulary tasks. Second, we develop a\ntri-modal self-supervised learning algorithm that leverages three modalities:\n(i) images, (ii) language and (iii) LiDAR point clouds, and enables training\nthe proposed architecture using a strong pre-trained vision-language model\nwithout the need for any 3D manual language annotations. Finally, we\ndemonstrate quantitatively the strengths of the proposed model on several\nopen-vocabulary tasks: Zero-shot 3D semantic segmentation using existing\ndatasets; 3D grounding and retrieval of free-form language queries, using a\nsmall dataset that we propose as an extension of nuScenes. You can find the\nproject page here https://vobecant.github.io/POP3D.\n","authors":["Antonin Vobecky","Oriane Siméoni","David Hurych","Spyros Gidaris","Andrei Bursuc","Patrick Pérez","Josef Sivic"],"pdf_url":"https://arxiv.org/pdf/2401.09413v1.pdf","comment":"accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.09386v1","updated":"2024-01-17T17:59:03Z","published":"2024-01-17T17:59:03Z","title":"Tri$^{2}$-plane: Volumetric Avatar Reconstruction with Feature Pyramid","summary":" Recent years have witnessed considerable achievements in facial avatar\nreconstruction with neural volume rendering. Despite notable advancements, the\nreconstruction of complex and dynamic head movements from monocular videos\nstill suffers from capturing and restoring fine-grained details. 
In this work,\nwe propose a novel approach, named Tri$^2$-plane, for monocular photo-realistic\nvolumetric head avatar reconstructions. Distinct from the existing works that\nrely on a single tri-plane deformation field for dynamic facial modeling, the\nproposed Tri$^2$-plane leverages the principle of feature pyramids and three\ntop-to-down lateral connections tri-planes for details improvement. It samples\nand renders facial details at multiple scales, transitioning from the entire\nface to specific local regions and then to even more refined sub-regions.\nMoreover, we incorporate a camera-based geometry-aware sliding window method as\nan augmentation in training, which improves the robustness beyond the canonical\nspace, with a particular improvement in cross-identity generation capabilities.\nExperimental outcomes indicate that the Tri$^2$-plane not only surpasses\nexisting methodologies but also achieves superior performance across both\nquantitative metrics and qualitative assessments through experiments.\n","authors":["Luchuan Song","Pinxin Liu","Lele Chen","Celong Liu","Chenliang Xu"],"pdf_url":"https://arxiv.org/pdf/2401.09386v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.09384v1","updated":"2024-01-17T17:55:06Z","published":"2024-01-17T17:55:06Z","title":"Diverse Part Synthesis for 3D Shape Creation","summary":" Methods that use neural networks for synthesizing 3D shapes in the form of a\npart-based representation have been introduced over the last few years. These\nmethods represent shapes as a graph or hierarchy of parts and enable a variety\nof applications such as shape sampling and reconstruction. However, current\nmethods do not allow easily regenerating individual shape parts according to\nuser preferences. In this paper, we investigate techniques that allow the user\nto generate multiple, diverse suggestions for individual parts. Specifically,\nwe experiment with multimodal deep generative models that allow sampling\ndiverse suggestions for shape parts and focus on models which have not been\nconsidered in previous work on shape synthesis. To provide a comparative study\nof these techniques, we introduce a method for synthesizing 3D shapes in a\npart-based representation and evaluate all the part suggestion techniques\nwithin this synthesis method. In our method, which is inspired by previous\nwork, shapes are represented as a set of parts in the form of implicit\nfunctions which are then positioned in space to form the final shape. Synthesis\nin this representation is enabled by a neural network architecture based on an\nimplicit decoder and a spatial transformer. We compare the various multimodal\ngenerative models by evaluating their performance in generating part\nsuggestions. 
Our contribution is to show with qualitative and quantitative\nevaluations which of the new techniques for multimodal part generation perform\nthe best and that a synthesis method based on the top-performing techniques\nallows the user to more finely control the parts that are generated in the 3D\nshapes while maintaining high shape fidelity when reconstructing shapes.\n","authors":["Yanran Guan","Oliver van Kaick"],"pdf_url":"https://arxiv.org/pdf/2401.09384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09340v1","updated":"2024-01-17T17:04:35Z","published":"2024-01-17T17:04:35Z","title":"SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene\n Understanding","summary":" 3D vision-language grounding, which focuses on aligning language with the 3D\nphysical environment, stands as a cornerstone in the development of embodied\nagents. In comparison to recent advancements in the 2D domain, grounding\nlanguage in 3D scenes faces several significant challenges: (i) the inherent\ncomplexity of 3D scenes due to the diverse object configurations, their rich\nattributes, and intricate relationships; (ii) the scarcity of paired 3D\nvision-language data to support grounded learning; and (iii) the absence of a\nunified learning framework to distill knowledge from grounded 3D data. In this\nwork, we aim to address these three major challenges in 3D vision-language by\nexamining the potential of systematically upscaling 3D vision-language learning\nin indoor environments. We introduce the first million-scale 3D vision-language\ndataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising\n2.5M vision-language pairs derived from both human annotations and our scalable\nscene-graph-based generation approach. We demonstrate that this scaling allows\nfor a unified pre-training framework, Grounded Pre-training for Scenes (GPS),\nfor 3D vision-language learning. Through extensive experiments, we showcase the\neffectiveness of GPS by achieving state-of-the-art performance on all existing\n3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is\nunveiled through zero-shot transfer experiments in the challenging 3D\nvision-language tasks. Project website: https://scene-verse.github.io .\n","authors":["Baoxiong Jia","Yixin Chen","Huangyue Yu","Yan Wang","Xuesong Niu","Tengyu Liu","Qing Li","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2401.09340v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2401.09336v1","updated":"2024-01-17T16:58:10Z","published":"2024-01-17T16:58:10Z","title":"To deform or not: treatment-aware longitudinal registration for breast\n DCE-MRI during neoadjuvant chemotherapy via unsupervised keypoints detection","summary":" Clinicians compare breast DCE-MRI after neoadjuvant chemotherapy (NAC) with\npre-treatment scans to evaluate the response to NAC. Clinical evidence supports\nthat accurate longitudinal deformable registration without deforming treated\ntumor regions is key to quantifying tumor changes. We propose a conditional\npyramid registration network based on unsupervised keypoint detection and\nselective volume-preserving to quantify changes over time. In this approach, we\nextract the structural and the abnormal keypoints from DCE-MRI, apply the\nstructural keypoints for the registration algorithm to restrict large\ndeformation, and employ volume-preserving loss based on abnormal keypoints to\nkeep the volume of the tumor unchanged after registration. 
We use a clinical\ndataset with 1630 MRI scans from 314 patients treated with NAC. The results\ndemonstrate that our method registers with better performance and better volume\npreservation of the tumors. Furthermore, a local-global-combining biomarker\nbased on the proposed method achieves high accuracy in pathological complete\nresponse (pCR) prediction, indicating that predictive information exists\noutside tumor regions. The biomarkers could potentially be used to avoid\nunnecessary surgeries for certain patients. It may be valuable for clinicians\nand/or computer systems to conduct follow-up tumor segmentation and response\nprediction on images registered by our method. Our code is available on\n\\url{https://github.com/fiy2W/Treatment-aware-Longitudinal-Registration}.\n","authors":["Luyi Han","Tao Tan","Tianyu Zhang","Yuan Gao","Xin Wang","Valentina Longo","Sofía Ventura-Díaz","Anna D'Angelo","Jonas Teuwen","Ritse Mann"],"pdf_url":"https://arxiv.org/pdf/2401.09336v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09331v1","updated":"2024-01-17T16:52:20Z","published":"2024-01-17T16:52:20Z","title":"Event-Based Visual Odometry on Non-Holonomic Ground Vehicles","summary":" Despite the promise of superior performance under challenging conditions,\nevent-based motion estimation remains a hard problem owing to the difficulty of\nextracting and tracking stable features from event streams. In order to\nrobustify the estimation, it is generally believed that fusion with other\nsensors is a requirement. In this work, we demonstrate reliable, purely\nevent-based visual odometry on planar ground vehicles by employing the\nconstrained non-holonomic motion model of Ackermann steering platforms. We\nextend single feature n-linearities for regular frame-based cameras to the case\nof quasi time-continuous event-tracks, and achieve a polynomial form via\nvariable degree Taylor expansions. Robust averaging over multiple event tracks\nis simply achieved via histogram voting. As demonstrated on both simulated and\nreal data, our algorithm achieves accurate and robust estimates of the\nvehicle's instantaneous rotational velocity, and thus results that are\ncomparable to the delta rotations obtained by frame-based sensors under normal\nconditions. We furthermore significantly outperform the more traditional\nalternatives in challenging illumination scenarios. The code is available at\n\\url{https://github.com/gowanting/NHEVO}.\n","authors":["Wanting Xu","Si'ao Zhang","Li Cui","Xin Peng","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2401.09331v1.pdf","comment":"Accepted by 3DV 2024"},{"id":"http://arxiv.org/abs/2401.09328v1","updated":"2024-01-17T16:51:28Z","published":"2024-01-17T16:51:28Z","title":"Online Stability Improvement of Groebner Basis Solvers using Deep\n Learning","summary":" Over the past decade, the Gr\\\"obner basis theory and automatic solver\ngeneration have lead to a large number of solutions to geometric vision\nproblems. In practically all cases, the derived solvers apply a fixed\nelimination template to calculate the Gr\\\"obner basis and thereby identify the\nzero-dimensional variety of the original polynomial constraints. However, it is\nclear that different variable or monomial orderings lead to different\nelimination templates, and we show that they may present a large variability in\naccuracy for a certain instance of a problem. The present paper has two\ncontributions. 
We first show that for a common class of problems in geometric\nvision, variable reordering simply translates into a permutation of the columns\nof the initial coefficient matrix, and that -- as a result -- one and the same\nelimination template can be reused in different ways, each one leading to\npotentially different accuracy. We then prove that the original set of\ncoefficients may contain sufficient information to train a classifier for\nonline selection of a good solver, most notably at the cost of only a small\ncomputational overhead. We demonstrate wide applicability at the hand of\ngeneric dense polynomial problem solvers, as well as a concrete solver from\ngeometric vision.\n","authors":["Wanting Xu","Lan Hu","Manolis C. Tsakiris","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2401.09328v1.pdf","comment":"Accepted by 3DV 2019"},{"id":"http://arxiv.org/abs/2401.09325v1","updated":"2024-01-17T16:48:55Z","published":"2024-01-17T16:48:55Z","title":"Siamese Meets Diffusion Network: SMDNet for Enhanced Change Detection in\n High-Resolution RS Imagery","summary":" Recently, the application of deep learning to change detection (CD) has\nsignificantly progressed in remote sensing images. In recent years, CD tasks\nhave mostly used architectures such as CNN and Transformer to identify these\nchanges. However, these architectures have shortcomings in representing\nboundary details and are prone to false alarms and missed detections under\ncomplex lighting and weather conditions. For that, we propose a new network,\nSiamese Meets Diffusion Network (SMDNet). This network combines the Siam-U2Net\nFeature Differential Encoder (SU-FDE) and the denoising diffusion implicit\nmodel to improve the accuracy of image edge change detection and enhance the\nmodel's robustness under environmental changes. First, we propose an innovative\nSU-FDE module that utilizes shared weight features to capture differences\nbetween time series images and identify similarities between features to\nenhance edge detail detection. Furthermore, we add an attention mechanism to\nidentify key coarse features to improve the model's sensitivity and accuracy.\nFinally, the diffusion model of progressive sampling is used to fuse key coarse\nfeatures, and the noise reduction ability of the diffusion model and the\nadvantages of capturing the probability distribution of image data are used to\nenhance the adaptability of the model in different environments. Our method's\ncombination of feature extraction and diffusion models demonstrates\neffectiveness in change detection in remote sensing images. The performance\nevaluation of SMDNet on LEVIR-CD, DSIFN-CD, and CDD datasets yields validated\nF1 scores of 90.99%, 88.40%, and 88.47%, respectively. This substantiates the\nadvanced capabilities of our model in accurately identifying variations and\nintricate details.\n","authors":["Jia Jia","Geunho Lee","Zhibo Wang","Lyu Zhi","Yuchu He"],"pdf_url":"https://arxiv.org/pdf/2401.09325v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2305.19858v2","updated":"2024-01-17T16:41:01Z","published":"2023-05-31T13:48:51Z","title":"Enhancing image quality prediction with self-supervised visual masking","summary":" Full-reference image quality metrics (FR-IQMs) aim to measure the visual\ndifferences between a pair of reference and distorted images, with the goal of\naccurately predicting human judgments. 
However, existing FR-IQMs, including\ntraditional ones like PSNR and SSIM and even perceptual ones such as HDR-VDP,\nLPIPS, and DISTS, still fall short in capturing the complexities and nuances of\nhuman perception. In this work, rather than devising a novel IQM model, we seek\nto improve upon the perceptual quality of existing FR-IQM methods. We achieve\nthis by considering visual masking, an important characteristic of the human\nvisual system that changes its sensitivity to distortions as a function of\nlocal image content. Specifically, for a given FR-IQM metric, we propose to\npredict a visual masking model that modulates reference and distorted images in\na way that penalizes the visual errors based on their visibility. Since the\nground truth visual masks are difficult to obtain, we demonstrate how they can\nbe derived in a self-supervised manner solely based on mean opinion scores\n(MOS) collected from an FR-IQM dataset. Our approach results in enhanced FR-IQM\nmetrics that are more in line with human prediction both visually and\nquantitatively.\n","authors":["Uğur Çoğalan","Mojtaba Bemana","Hans-Peter Seidel","Karol Myszkowski"],"pdf_url":"https://arxiv.org/pdf/2305.19858v2.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2305.18171v3","updated":"2024-01-17T16:38:47Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings; the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the loss saturation problem under\nmassive false negatives; second, mixed sample data augmentation for\nprobabilistic matching. Experimental results on MS-COCO Caption and two\nextended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of\nPCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is\nalso evaluated under noisy image-text correspondences. In addition, the\npotential applicability of PCME++ in automatic prompt tuning for zero-shot\nclassification is shown. The code is available at\nhttps://github.com/naver-ai/pcmepp.\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v3.pdf","comment":"ICLR 2024; Code: https://github.com/naver-ai/pcmepp. Project page:\n https://naver-ai.github.io/pcmepp/. 26 pages, 2.4 MB"},{"id":"http://arxiv.org/abs/2401.09296v1","updated":"2024-01-17T15:56:57Z","published":"2024-01-17T15:56:57Z","title":"Tight Fusion of Events and Inertial Measurements for Direct Velocity\n Estimation","summary":" Traditional visual-inertial state estimation targets absolute camera poses\nand spatial landmark locations while first-order kinematics are typically\nresolved as an implicitly estimated sub-state. 
However, this poses a risk in\nvelocity-based control scenarios, as the quality of the estimation of\nkinematics depends on the stability of absolute camera and landmark coordinates\nestimation. To address this issue, we propose a novel solution to tight\nvisual-inertial fusion directly at the level of first-order kinematics by\nemploying a dynamic vision sensor instead of a normal camera. More\nspecifically, we leverage trifocal tensor geometry to establish an incidence\nrelation that directly depends on events and camera velocity, and demonstrate\nhow velocity estimates in highly dynamic situations can be obtained over short\ntime intervals. Noise and outliers are dealt with using a nested two-layer\nRANSAC scheme. Additionally, smooth velocity signals are obtained from a tight\nfusion with pre-integrated inertial signals using a sliding window optimizer.\nExperiments on both simulated and real data demonstrate that the proposed tight\nevent-inertial fusion leads to continuous and reliable velocity estimation in\nhighly dynamic scenarios independently of absolute coordinates. Furthermore, in\nextreme cases, it achieves more stable and more accurate estimation of\nkinematics than traditional, point-position-based visual-inertial odometry.\n","authors":["Wanting Xu","Xin Peng","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2401.09296v1.pdf","comment":"Accepted by IEEE Transactions on Robotics (T-RO)"},{"id":"http://arxiv.org/abs/2305.16494v3","updated":"2024-01-17T15:38:27Z","published":"2023-05-25T21:51:23Z","title":"Diffusion-Based Adversarial Sample Generation for Improved Stealthiness\n and Controllability","summary":" Neural networks are known to be susceptible to adversarial samples: small\nvariations of natural examples crafted to deliberately mislead the models.\nWhile they can be easily generated using gradient-based techniques in digital\nand physical scenarios, they often differ greatly from the actual data\ndistribution of natural images, resulting in a trade-off between strength and\nstealthiness. In this paper, we propose a novel framework dubbed\nDiffusion-Based Projected Gradient Descent (Diff-PGD) for generating realistic\nadversarial samples. By exploiting a gradient guided by a diffusion model,\nDiff-PGD ensures that adversarial samples remain close to the original data\ndistribution while maintaining their effectiveness. Moreover, our framework can\nbe easily customized for specific tasks such as digital attacks, physical-world\nattacks, and style-based attacks. Compared with existing methods for generating\nnatural-style adversarial samples, our framework enables the separation of\noptimizing adversarial loss from other surrogate losses (e.g.,\ncontent/smoothness/style loss), making it more stable and controllable.\nFinally, we demonstrate that the samples generated using Diff-PGD have better\ntransferability and anti-purification power than traditional gradient-based\nmethods. Code will be released in https://github.com/xavihart/Diff-PGD\n","authors":["Haotian Xue","Alexandre Araujo","Bin Hu","Yongxin Chen"],"pdf_url":"https://arxiv.org/pdf/2305.16494v3.pdf","comment":"Accepted as a conference paper in NeurIPS'2023. 
Code repo:\n https://github.com/xavihart/Diff-PGD"},{"id":"http://arxiv.org/abs/2401.09283v1","updated":"2024-01-17T15:37:00Z","published":"2024-01-17T15:37:00Z","title":"A gradient-based approach to fast and accurate head motion compensation\n in cone-beam CT","summary":" Cone-beam computed tomography (CBCT) systems, with their portability, present\na promising avenue for direct point-of-care medical imaging, particularly in\ncritical scenarios such as acute stroke assessment. However, the integration of\nCBCT into clinical workflows faces challenges, primarily linked to long scan\nduration resulting in patient motion during scanning and leading to image\nquality degradation in the reconstructed volumes. This paper introduces a novel\napproach to CBCT motion estimation using a gradient-based optimization\nalgorithm, which leverages generalized derivatives of the backprojection\noperator for cone-beam CT geometries. Building on that, a fully differentiable\ntarget function is formulated which grades the quality of the current motion\nestimate in reconstruction space. We drastically accelerate motion estimation\nyielding a 19-fold speed-up compared to existing methods. Additionally, we\ninvestigate the architecture of networks used for quality metric regression and\npropose predicting voxel-wise quality maps, favoring autoencoder-like\narchitectures over contracting ones. This modification improves gradient flow,\nleading to more accurate motion estimation. The presented method is evaluated\nthrough realistic experiments on head anatomy. It achieves a reduction in\nreprojection error from an initial average of 3mm to 0.61mm after motion\ncompensation and consistently demonstrates superior performance compared to\nexisting approaches. The analytic Jacobian for the backprojection operation,\nwhich is at the core of the proposed method, is made publicly available. In\nsummary, this paper contributes to the advancement of CBCT integration into\nclinical workflows by proposing a robust motion estimation approach that\nenhances efficiency and accuracy, addressing critical challenges in\ntime-sensitive scenarios.\n","authors":["Mareike Thies","Fabian Wagner","Noah Maul","Haijun Yu","Manuela Meier","Linda-Sophie Schneider","Mingxuan Gu","Siyuan Mei","Lukas Folle","Andreas Maier"],"pdf_url":"https://arxiv.org/pdf/2401.09283v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.12143v3","updated":"2024-01-17T15:25:22Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Diffusion Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. 
Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07825v2","updated":"2024-01-17T15:23:00Z","published":"2024-01-15T16:53:20Z","title":"Phenotyping calcification in vascular tissues using artificial\n intelligence","summary":" Vascular calcification is implicated as an important factor in major adverse\ncardiovascular events (MACE), including heart attack and stroke. A controversy\nremains over how to integrate the diverse forms of vascular calcification into\nclinical risk assessment tools. Even the commonly used calcium score for\ncoronary arteries, which assumes risk scales positively with total\ncalcification, has important inconsistencies. Fundamental studies are needed to\ndetermine how risk is influenced by the diverse calcification phenotypes.\nHowever, studies of these kinds are hindered by the lack of high-throughput,\nobjective, and non-destructive tools for classifying calcification in imaging\ndata sets. Here, we introduce a new classification system for phenotyping\ncalcification along with a semi-automated, non-destructive pipeline that can\ndistinguish these phenotypes in even atherosclerotic tissues. The pipeline\nincludes a deep-learning-based framework for segmenting lipid pools in noisy\nmicro-CT images and an unsupervised clustering framework for categorizing\ncalcification based on size, clustering, and topology. This approach is\nillustrated for five vascular specimens, providing phenotyping for thousands of\ncalcification particles across as many as 3200 images in less than seven hours.\nAverage Dice Similarity Coefficients of 0.96 and 0.87 could be achieved for\ntissue and lipid pool, respectively, with training and validation needed on\nonly 13 images despite the high heterogeneity in these tissues. By introducing\nan efficient and comprehensive approach to phenotyping calcification, this work\nenables large-scale studies to identify a more reliable indicator of the risk\nof cardiovascular events, a leading cause of global mortality and morbidity.\n","authors":["Mehdi Ramezanpour","Anne M. Robertson","Yasutaka Tobe","Xiaowei Jia","Juan R. Cebral"],"pdf_url":"https://arxiv.org/pdf/2401.07825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09271v1","updated":"2024-01-17T15:20:10Z","published":"2024-01-17T15:20:10Z","title":"PixelDINO: Semi-Supervised Semantic Segmentation for Detecting\n Permafrost Disturbances","summary":" Arctic Permafrost is facing significant changes due to global climate change.\nAs these regions are largely inaccessible, remote sensing plays a crucial role\nin better understanding the underlying processes not just on a local scale, but\nacross the Arctic. In this study, we focus on the remote detection of\nretrogressive thaw slumps (RTS), a permafrost disturbance comparable to\nlandslides induced by thawing. 
For such analyses from space, deep learning has\nbecome an indispensable tool, but limited labelled training data remains a\nchallenge for training accurate models. To improve model generalization across\nthe Arctic without the need for additional labelled data, we present a\nsemi-supervised learning approach to train semantic segmentation models to\ndetect RTS. Our framework called PixelDINO is trained in parallel on labelled\ndata as well as unlabelled data. For the unlabelled data, the model segments\nthe imagery into self-taught pseudo-classes and the training procedure ensures\nconsistency of these pseudo-classes across strong augmentations of the input\ndata. Our experimental results demonstrate that PixelDINO can improve model\nperformance both over supervised baseline methods as well as existing\nsemi-supervised semantic segmentation approaches, highlighting its potential\nfor training robust models that generalize well to regions that were not\nincluded in the training data. The project page containing code and other\nmaterials for this study can be found at\n\\url{https://khdlr.github.io/PixelDINO/}.\n","authors":["Konrad Heidler","Ingmar Nitze","Guido Grosse","Xiao Xiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2401.09271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09266v1","updated":"2024-01-17T15:15:46Z","published":"2024-01-17T15:15:46Z","title":"P$^2$OT: Progressive Partial Optimal Transport for Deep Imbalanced\n Clustering","summary":" Deep clustering, which learns representation and semantic clustering without\nlabels information, poses a great challenge for deep learning-based approaches.\nDespite significant progress in recent years, most existing methods focus on\nuniformly distributed datasets, significantly limiting the practical\napplicability of their methods. In this paper, we first introduce a more\npractical problem setting named deep imbalanced clustering, where the\nunderlying classes exhibit an imbalance distribution. To tackle this problem,\nwe propose a novel pseudo-labeling-based learning framework. Our framework\nformulates pseudo-label generation as a progressive partial optimal transport\nproblem, which progressively transports each sample to imbalanced clusters\nunder prior distribution constraints, thus generating imbalance-aware\npseudo-labels and learning from high-confident samples. In addition, we\ntransform the initial formulation into an unbalanced optimal transport problem\nwith augmented constraints, which can be solved efficiently by a fast matrix\nscaling algorithm. Experiments on various datasets, including a human-curated\nlong-tailed CIFAR100, challenging ImageNet-R, and large-scale subsets of\nfine-grained iNaturalist2018 datasets, demonstrate the superiority of our\nmethod.\n","authors":["Chuyu Zhang","Hui Ren","Xuming He"],"pdf_url":"https://arxiv.org/pdf/2401.09266v1.pdf","comment":"Accepted by ICLR2024"},{"id":"http://arxiv.org/abs/2310.10224v3","updated":"2024-01-17T15:13:37Z","published":"2023-10-16T09:34:06Z","title":"Generalizing Medical Image Representations via Quaternion Wavelet\n Networks","summary":" Neural network generalizability is becoming a broad research field due to the\nincreasing availability of datasets from different sources and for various\ntasks. This issue is even wider when processing medical data, where a lack of\nmethodological standards causes large variations being provided by different\nimaging centers or acquired with various devices and cofactors. 
To overcome\nthese limitations, we introduce a novel, generalizable, data- and task-agnostic\nframework able to extract salient features from medical images. The proposed\nquaternion wavelet network (QUAVE) can be easily integrated with any\npre-existing medical image analysis or synthesis task, and it can be involved\nwith real, quaternion, or hypercomplex-valued models, generalizing their\nadoption to single-channel data. QUAVE first extracts different sub-bands\nthrough the quaternion wavelet transform, resulting in both\nlow-frequency/approximation bands and high-frequency/fine-grained features.\nThen, it weighs the most representative set of sub-bands to be involved as\ninput to any other neural model for image processing, replacing standard data\nsamples. We conduct an extensive experimental evaluation comprising different\ndatasets, diverse image analysis, and synthesis tasks including reconstruction,\nsegmentation, and modality translation. We also evaluate QUAVE in combination\nwith both real and quaternion-valued models. Results demonstrate the\neffectiveness and the generalizability of the proposed framework that improves\nnetwork performance while being flexible to be adopted in manifold scenarios\nand robust to domain shifts. The full code is available at:\nhttps://github.com/ispamm/QWT.\n","authors":["Luigi Sigillo","Eleonora Grassucci","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.10224v3.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2310.04741v4","updated":"2024-01-17T15:10:26Z","published":"2023-10-07T08:54:43Z","title":"Balancing stability and plasticity in continual learning: the\n readout-decomposition of activation change (RDAC) framework","summary":" Continual learning (CL) algorithms strive to acquire new knowledge while\npreserving prior information. However, this stability-plasticity trade-off\nremains a central challenge. This paper introduces a framework that dissects\nthis trade-off, offering valuable insights into CL algorithms. The\nReadout-Decomposition of Activation Change (RDAC) framework first addresses the\nstability-plasticity dilemma and its relation to catastrophic forgetting. It\nrelates learning-induced activation changes in the range of prior readouts to\nthe degree of stability and changes in the null space to the degree of\nplasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the\nframework clarifies the stability-plasticity trade-offs of the popular\nregularization algorithms Synaptic intelligence (SI), Elastic-weight\nconsolidation (EWC), and learning without Forgetting (LwF), and replay-based\nalgorithms Gradient episodic memory (GEM), and data replay. GEM and data replay\npreserved stability and plasticity, while SI, EWC, and LwF traded off\nplasticity for stability. The inability of the regularization algorithms to\nmaintain plasticity was linked to them restricting the change of activations in\nthe null space of the prior readout. Additionally, for one-hidden-layer linear\nneural networks, we derived a gradient decomposition algorithm to restrict\nactivation change only in the range of the prior readouts, to maintain high\nstability while not further sacrificing plasticity. Results demonstrate that\nthe algorithm maintained stability without significant plasticity loss. The\nRDAC framework informs the behavior of existing CL algorithms and paves the way\nfor novel CL approaches. 
Finally, it sheds light on the connection between\nlearning-induced activation/representation changes and the stability-plasticity\ndilemma, also offering insights into representational drift in biological\nsystems.\n","authors":["Daniel Anthes","Sushrut Thorat","Peter König","Tim C. Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2310.04741v4.pdf","comment":"15 pages, 5 figures, Revision"},{"id":"http://arxiv.org/abs/2401.09258v1","updated":"2024-01-17T15:05:00Z","published":"2024-01-17T15:05:00Z","title":"An Efficient Generalizable Framework for Visuomotor Policies via\n Control-aware Augmentation and Privilege-guided Distillation","summary":" Visuomotor policies, which learn control mechanisms directly from\nhigh-dimensional visual observations, confront challenges in adapting to new\nenvironments with intricate visual variations. Data augmentation emerges as a\npromising method for bridging these generalization gaps by enriching data\nvariety. However, straightforwardly augmenting the entire observation shall\nimpose excessive burdens on policy learning and may even result in performance\ndegradation. In this paper, we propose to improve the generalization ability of\nvisuomotor policies as well as preserve training stability from two aspects: 1)\nWe learn a control-aware mask through a self-supervised reconstruction task\nwith three auxiliary losses and then apply strong augmentation only to those\ncontrol-irrelevant regions based on the mask to reduce the generalization gaps.\n2) To address training instability issues prevalent in visual reinforcement\nlearning (RL), we distill the knowledge from a pretrained RL expert processing\nlow-level environment states, to the student visuomotor policy. The policy is\nsubsequently deployed to unseen environments without any further finetuning. We\nconducted comparison and ablation studies across various benchmarks: the\nDMControl Generalization Benchmark (DMC-GB), the enhanced Robot Manipulation\nDistraction Benchmark (RMDB), and a specialized long-horizontal drawer-opening\nrobotic task. The extensive experimental results well demonstrate the\neffectiveness of our method, e.g., showing a 17\\% improvement over previous\nmethods in the video-hard setting of DMC-GB.\n","authors":["Yinuo Zhao","Kun Wu","Tianjiao Yi","Zhiyuan Xu","Xiaozhu Ju","Zhengping Che","Qinru Qiu","Chi Harold Liu","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.09258v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05055v3","updated":"2024-01-17T14:59:30Z","published":"2023-10-08T07:41:15Z","title":"FairTune: Optimizing Parameter Efficient Fine Tuning for Fairness in\n Medical Image Analysis","summary":" Training models with robust group fairness properties is crucial in ethically\nsensitive application areas such as medical diagnosis. Despite the growing body\nof work aiming to minimise demographic bias in AI, this problem remains\nchallenging. A key reason for this challenge is the fairness generalisation\ngap: High-capacity deep learning models can fit all training data nearly\nperfectly, and thus also exhibit perfect fairness during training. In this\ncase, bias emerges only during testing when generalisation performance differs\nacross subgroups. This motivates us to take a bi-level optimisation perspective\non fair learning: Optimising the learning strategy based on validation\nfairness. 
Specifically, we consider the highly effective workflow of adapting\npre-trained models to downstream medical imaging tasks using\nparameter-efficient fine-tuning (PEFT) techniques. There is a trade-off between\nupdating more parameters, enabling a better fit to the task of interest vs.\nfewer parameters, potentially reducing the generalisation gap. To manage this\ntradeoff, we propose FairTune, a framework to optimise the choice of PEFT\nparameters with respect to fairness. We demonstrate empirically that FairTune\nleads to improved fairness on a range of medical imaging datasets. The code is\navailable at https://github.com/Raman1121/FairTune\n","authors":["Raman Dutt","Ondrej Bohdal","Sotirios A. Tsaftaris","Timothy Hospedales"],"pdf_url":"https://arxiv.org/pdf/2310.05055v3.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09252v1","updated":"2024-01-17T14:57:27Z","published":"2024-01-17T14:57:27Z","title":"3D Scene Geometry Estimation from 360$^\\circ$ Imagery: A Survey","summary":" This paper provides a comprehensive survey on pioneer and state-of-the-art 3D\nscene geometry estimation methodologies based on single, two, or multiple\nimages captured under the omnidirectional optics. We first revisit the basic\nconcepts of the spherical camera model, and review the most common acquisition\ntechnologies and representation formats suitable for omnidirectional (also\ncalled 360$^\\circ$, spherical or panoramic) images and videos. We then survey\nmonocular layout and depth inference approaches, highlighting the recent\nadvances in learning-based solutions suited for spherical data. The classical\nstereo matching is then revised on the spherical domain, where methodologies\nfor detecting and describing sparse and dense features become crucial. The\nstereo matching concepts are then extrapolated for multiple view camera setups,\ncategorizing them among light fields, multi-view stereo, and structure from\nmotion (or visual simultaneous localization and mapping). We also compile and\ndiscuss commonly adopted datasets and figures of merit indicated for each\npurpose and list recent results for completeness. We conclude this paper by\npointing out current and future trends.\n","authors":["Thiago Lopes Trugillo da Silveira","Paulo Gamarra Lessa Pinto","Jeffri Erwin Murrugarra Llerena","Claudio Rosito Jung"],"pdf_url":"https://arxiv.org/pdf/2401.09252v1.pdf","comment":"Published in ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2401.09245v1","updated":"2024-01-17T14:47:26Z","published":"2024-01-17T14:47:26Z","title":"Uncertainty estimates for semantic segmentation: providing enhanced\n reliability for automated motor claims handling","summary":" Deep neural network models for image segmentation can be a powerful tool for\nthe automation of motor claims handling processes in the insurance industry. A\ncrucial aspect is the reliability of the model outputs when facing adverse\nconditions, such as low quality photos taken by claimants to document damages.\nWe explore the use of a meta-classification model to assess the precision of\nsegments predicted by a model trained for the semantic segmentation of car body\nparts. Different sets of features correlated with the quality of a segment are\ncompared, and an AUROC score of 0.915 is achieved for distinguishing between\nhigh- and low-quality segments. 
By removing low-quality segments, the average\nmIoU of the segmentation output is improved by 16 percentage points and the\nnumber of wrongly predicted segments is reduced by 77%.\n","authors":["Jan Küchler","Daniel Kröll","Sebastian Schoenen","Andreas Witte"],"pdf_url":"https://arxiv.org/pdf/2401.09245v1.pdf","comment":"9 pages, 7 figures, 2 tables, submitted to MVAA"},{"id":"http://arxiv.org/abs/2304.14660v7","updated":"2024-01-17T14:42:40Z","published":"2023-04-28T07:23:31Z","title":"Segment Anything Model for Medical Images?","summary":" The Segment Anything Model (SAM) is the first foundation model for general\nimage segmentation. It has achieved impressive results on various natural image\nsegmentation tasks. However, medical image segmentation (MIS) is more\nchallenging because of the complex modalities, fine anatomical structures,\nuncertain and complex object boundaries, and wide-range object scales. To fully\nvalidate SAM's performance on medical data, we collected and sorted 53\nopen-source datasets and built a large medical segmentation dataset with 18\nmodalities, 84 objects, 125 object-modality paired targets, 1050K 2D images,\nand 6033K masks. We comprehensively analyzed different models and strategies on\nthe so-called COSMOS 1050K dataset. Our findings mainly include the following:\n1) SAM showed remarkable performance in some specific objects but was unstable,\nimperfect, or even totally failed in other situations. 2) SAM with the large\nViT-H showed better overall performance than that with the small ViT-B. 3) SAM\nperformed better with manual hints, especially box, than the Everything mode.\n4) SAM could help human annotation with high labeling quality and less time. 5)\nSAM was sensitive to the randomness in the center point and tight box prompts,\nand may suffer from a serious performance drop. 6) SAM performed better than\ninteractive methods with one or a few points, but will be outpaced as the\nnumber of points increases. 7) SAM's performance correlated to different\nfactors, including boundary complexity, intensity differences, etc. 8)\nFinetuning the SAM on specific medical tasks could improve its average DICE\nperformance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that\nthis comprehensive report can help researchers explore the potential of SAM\napplications in MIS, and guide how to appropriately use and develop SAM.\n","authors":["Yuhao Huang","Xin Yang","Lian Liu","Han Zhou","Ao Chang","Xinrui Zhou","Rusi Chen","Junxuan Yu","Jiongquan Chen","Chaoyu Chen","Sijing Liu","Haozhe Chi","Xindi Hu","Kejuan Yue","Lei Li","Vicente Grau","Deng-Ping Fan","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2304.14660v7.pdf","comment":"Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2401.09239v1","updated":"2024-01-17T14:39:55Z","published":"2024-01-17T14:39:55Z","title":"DaFoEs: Mixing Datasets towards the generalization of vision-state\n deep-learning Force Estimation in Minimally Invasive Robotic Surgery","summary":" Precisely determining the contact force during safe interaction in Minimally\nInvasive Robotic Surgery (MIRS) is still an open research challenge. Inspired\nby post-operative qualitative analysis from surgical videos, the use of\ncross-modality data driven deep neural network models has been one of the\nnewest approaches to predict sensorless force trends. However, these methods\nrequired for large and variable datasets which are not currently available. 
In\nthis paper, we present a new vision-haptic dataset (DaFoEs) with variable soft\nenvironments for the training of deep neural models. In order to reduce the\nbias from a single dataset, we present a pipeline to generalize different\nvision and state data inputs for mixed dataset training, using a previously\nvalidated dataset with a different setup. Finally, we present a variable\nencoder-decoder architecture to predict the forces exerted by the laparoscopic\ntool using a single input or a sequence of inputs. For input sequences, we use a\nrecurrent decoder, named with the prefix R, and a new temporal sampling to\nrepresent the acceleration of the tool. During our training, we demonstrate\nthat single dataset training tends to overfit to the training data domain, but\nhas difficulty translating the results across new domains. However,\ndataset mixing presents a good translation with a mean relative estimated force\nerror of 5% and 12% for the recurrent and non-recurrent models, respectively.\nOur method also marginally increases the effectiveness of transformers for\nforce estimation, by up to ~15%, as the volume of available data is\nincreased by 150%. In conclusion, we demonstrate that mixing experimental\nsetups for vision-state force estimation in MIRS is a possible approach towards\nthe general solution of the problem.\n","authors":["Mikel De Iturrate Reyzabal","Mingcong Chen","Wei Huang","Sebastien Ourselin","Hongbin Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09239v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06637v3","updated":"2024-01-17T14:37:58Z","published":"2024-01-12T15:29:21Z","title":"Adversarial Examples are Misaligned in Diffusion Model Manifolds","summary":" In recent years, diffusion models (DMs) have drawn significant attention for\ntheir success in approximating data distributions, yielding state-of-the-art\ngenerative results. Nevertheless, the versatility of these models extends\nbeyond their generative capabilities to encompass various vision applications,\nsuch as image inpainting, segmentation, adversarial robustness, among others.\nThis study is dedicated to the investigation of adversarial attacks through the\nlens of diffusion models. However, our objective does not involve enhancing the\nadversarial robustness of image classifiers. Instead, our focus lies in\nutilizing the diffusion model to detect and analyze the anomalies introduced by\nthese attacks on images. To that end, we systematically examine the alignment\nof the distributions of adversarial examples when subjected to the process of\ntransformation using diffusion models. The efficacy of this approach is\nassessed across CIFAR-10 and ImageNet datasets, including varying image sizes\nin the latter. The results demonstrate a notable capacity to discriminate\neffectively between benign and attacked images, providing compelling evidence\nthat adversarial instances do not align with the learned manifold of the DMs.\n","authors":["Peter Lorenz","Ricard Durall","Janis Keuper"],"pdf_url":"https://arxiv.org/pdf/2401.06637v3.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2401.09232v1","updated":"2024-01-17T14:17:59Z","published":"2024-01-17T14:17:59Z","title":"Dynamic Relation Transformer for Contextual Text Block Detection","summary":" Contextual Text Block Detection (CTBD) is the task of identifying coherent\ntext blocks within the complexity of natural scenes. 
Previous methodologies\nhave treated CTBD as either a visual relation extraction challenge within\ncomputer vision or as a sequence modeling problem from the perspective of\nnatural language processing. We introduce a new framework that frames CTBD as a\ngraph generation problem. This methodology consists of two essential\nprocedures: identifying individual text units as graph nodes and discerning the\nsequential reading order relationships among these units as graph edges.\nLeveraging the cutting-edge capabilities of DQ-DETR for node detection, our\nframework innovates further by integrating a novel mechanism, a Dynamic\nRelation Transformer (DRFormer), dedicated to edge generation. DRFormer\nincorporates a dual interactive transformer decoder that deftly manages a\ndynamic graph structure refinement process. Through this iterative process, the\nmodel systematically enhances the graph's fidelity, ultimately resulting in\nimproved precision in detecting contextual text blocks. Comprehensive\nexperimental evaluations conducted on both SCUT-CTW-Context and ReCTS-Context\ndatasets substantiate that our method achieves state-of-the-art results,\nunderscoring the effectiveness and potential of our graph generation framework\nin advancing the field of CTBD.\n","authors":["Jiawei Wang","Shunchi Zhang","Kai Hu","Chixiang Ma","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.09232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01812v2","updated":"2024-01-17T14:04:11Z","published":"2023-10-03T05:55:11Z","title":"PPT: Token Pruning and Pooling for Efficient Vision Transformers","summary":" Vision Transformers (ViTs) have emerged as powerful models in the field of\ncomputer vision, delivering superior performance across various vision tasks.\nHowever, the high computational complexity poses a significant barrier to their\npractical applications in real-world scenarios. Motivated by the fact that not\nall tokens contribute equally to the final predictions and fewer tokens bring\nless computational cost, reducing redundant tokens has become a prevailing\nparadigm for accelerating vision transformers. However, we argue that it is not\noptimal to either only reduce inattentive redundancy by token pruning, or only\nreduce duplicative redundancy by token merging. To this end, in this paper we\npropose a novel acceleration framework, namely token Pruning & Pooling\nTransformers (PPT), to adaptively tackle these two types of redundancy in\ndifferent layers. By heuristically integrating both token pruning and token\npooling techniques in ViTs without additional trainable parameters, PPT\neffectively reduces the model complexity while maintaining its predictive\naccuracy. For example, PPT reduces over 37% FLOPs and improves the throughput\nby over 45% for DeiT-S without any accuracy drop on the ImageNet dataset. 
The\ncode is available at https://github.com/xjwu1024/PPT and\nhttps://github.com/mindspore-lab/models/\n","authors":["Xinjian Wu","Fanhu Zeng","Xiudong Wang","Yunhe Wang","Xinghao Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01812v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.10973v3","updated":"2024-01-17T13:50:09Z","published":"2022-08-23T13:45:15Z","title":"Robust and Large-Payload DNN Watermarking via Fixed,\n Distribution-Optimized, Weights","summary":" The design of an effective multi-bit watermarking algorithm hinges upon\nfinding a good trade-off between the three fundamental requirements forming the\nwatermarking trade-off triangle, namely, robustness against network\nmodifications, payload, and unobtrusiveness, ensuring minimal impact on the\nperformance of the watermarked network. In this paper, we first revisit the\nnature of the watermarking trade-off triangle for the DNN case, then we exploit\nour findings to propose a white-box, multi-bit watermarking method achieving\nvery large payload and strong robustness against network modification. In the\nproposed system, the weights hosting the watermark are set prior to training,\nmaking sure that their amplitude is large enough to bear the target payload and\nsurvive network modifications, notably retraining, and are left unchanged\nthroughout the training process. The distribution of the weights carrying the\nwatermark is theoretically optimised to ensure the secrecy of the watermark and\nmake sure that the watermarked weights are indistinguishable from the\nnon-watermarked ones. The proposed method can achieve outstanding performance,\nwith no significant impact on network accuracy, including robustness against\nnetwork modifications, retraining and transfer learning, while ensuring a\npayload which is out of reach of state of the art methods achieving a lower -\nor at most comparable - robustness.\n","authors":["Benedetta Tondi","Andrea Costanzo","Mauro Barni"],"pdf_url":"https://arxiv.org/pdf/2208.10973v3.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.04960v2","updated":"2024-01-17T13:47:32Z","published":"2023-12-08T10:50:02Z","title":"MIMIR: Masked Image Modeling for Mutual Information-based Adversarial\n Robustness","summary":" Vision Transformers (ViTs) achieve superior performance on various tasks\ncompared to convolutional neural networks (CNNs), but ViTs are also vulnerable\nto adversarial attacks. Adversarial training is one of the most successful\nmethods to build robust CNN models. Thus, recent works explored new\nmethodologies for adversarial training of ViTs based on the differences between\nViTs and CNNs, such as better training strategies, preventing attention from\nfocusing on a single block, or discarding low-attention embeddings. However,\nthese methods still follow the design of traditional supervised adversarial\ntraining, limiting the potential of adversarial training on ViTs. This paper\nproposes a novel defense method, MIMIR, which aims to build a different\nadversarial training methodology by utilizing Masked Image Modeling at\npre-training. We create an autoencoder that accepts adversarial examples as\ninput but takes the clean examples as the modeling target. Then, we create a\nmutual information (MI) penalty following the idea of the Information\nBottleneck. Among the two information source inputs and corresponding\nadversarial perturbation, the perturbation information is eliminated due to the\nconstraint of the modeling target. 
Next, we provide a theoretical analysis of\nMIMIR using the bounds of the MI penalty. We also design two adaptive attacks\nwhen the adversary is aware of the MIMIR defense and show that MIMIR still\nperforms well. The experimental results show that MIMIR improves (natural and\nadversarial) accuracy on average by 4.19% on CIFAR-10 and 5.52% on ImageNet-1K,\ncompared to baselines. On Tiny-ImageNet, we obtained improved natural accuracy\nof 2.99\\% on average and comparable adversarial accuracy. Our code and trained\nmodels are publicly available https://github.com/xiaoyunxxy/MIMIR.\n","authors":["Xiaoyun Xu","Shujian Yu","Jingzheng Wu","Stjepan Picek"],"pdf_url":"https://arxiv.org/pdf/2312.04960v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09195v1","updated":"2024-01-17T13:07:22Z","published":"2024-01-17T13:07:22Z","title":"Training-Free Semantic Video Composition via Pre-trained Diffusion Model","summary":" The video composition task aims to integrate specified foregrounds and\nbackgrounds from different videos into a harmonious composite. Current\napproaches, predominantly trained on videos with adjusted foreground color and\nlighting, struggle to address deep semantic disparities beyond superficial\nadjustments, such as domain gaps. Therefore, we propose a training-free\npipeline employing a pre-trained diffusion model imbued with semantic prior\nknowledge, which can process composite videos with broader semantic\ndisparities. Specifically, we process the video frames in a cascading manner\nand handle each frame in two processes with the diffusion model. In the\ninversion process, we propose Balanced Partial Inversion to obtain generation\ninitial points that balance reversibility and modifiability. Then, in the\ngeneration process, we further propose Inter-Frame Augmented attention to\naugment foreground continuity across frames. Experimental results reveal that\nour pipeline successfully ensures the visual harmony and inter-frame coherence\nof the outputs, demonstrating efficacy in managing broader semantic\ndisparities.\n","authors":["Jiaqi Guo","Sitong Su","Junchen Zhu","Lianli Gao","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2401.09195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09190v1","updated":"2024-01-17T13:00:57Z","published":"2024-01-17T13:00:57Z","title":"Exploring the Role of Convolutional Neural Networks (CNN) in Dental\n Radiography Segmentation: A Comprehensive Systematic Literature Review","summary":" In the field of dentistry, there is a growing demand for increased precision\nin diagnostic tools, with a specific focus on advanced imaging techniques such\nas computed tomography, cone beam computed tomography, magnetic resonance\nimaging, ultrasound, and traditional intra-oral periapical X-rays. Deep\nlearning has emerged as a pivotal tool in this context, enabling the\nimplementation of automated segmentation techniques crucial for extracting\nessential diagnostic data. This integration of cutting-edge technology\naddresses the urgent need for effective management of dental conditions, which,\nif left undetected, can have a significant impact on human health. The\nimpressive track record of deep learning across various domains, including\ndentistry, underscores its potential to revolutionize early detection and\ntreatment of oral health issues. Objective: Having demonstrated significant\nresults in diagnosis and prediction, deep convolutional neural networks (CNNs)\nrepresent an emerging field of multidisciplinary research. 
The goals of this\nstudy were to provide a concise overview of the state of the art, standardize\nthe current debate, and establish baselines for future research. Method: In\nthis study, a systematic literature review is employed as a methodology to\nidentify and select relevant studies that specifically investigate the deep\nlearning technique for dental imaging analysis. This study elucidates the\nmethodological approach, including the systematic collection of data,\nstatistical analysis, and subsequent dissemination of outcomes. Conclusion:\nThis work demonstrates how Convolutional Neural Networks (CNNs) can be employed\nto analyze images, serving as effective tools for detecting dental pathologies.\nAlthough this research acknowledged some limitations, CNNs utilized for\nsegmenting and categorizing teeth exhibited their highest level of performance\noverall.\n","authors":["Walid Brahmi","Imen Jdey","Fadoua Drira"],"pdf_url":"https://arxiv.org/pdf/2401.09190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02906v2","updated":"2024-01-17T12:58:36Z","published":"2024-01-05T17:05:42Z","title":"MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance","summary":" The deployment of multimodal large language models (MLLMs) has brought forth\na unique vulnerability: susceptibility to malicious attacks through visual\ninputs. We delve into the novel challenge of defending MLLMs against such\nattacks. We discovered that images act as a \"foreign language\" that is not\nconsidered during alignment, which can make MLLMs prone to producing harmful\nresponses. Unfortunately, unlike the discrete tokens considered in text-based\nLLMs, the continuous nature of image signals presents significant alignment\nchallenges, which poses difficulty to thoroughly cover the possible scenarios.\nThis vulnerability is exacerbated by the fact that open-source MLLMs are\npredominantly fine-tuned on limited image-text pairs that is much less than the\nextensive text-based pretraining corpus, which makes the MLLMs more prone to\ncatastrophic forgetting of their original abilities during explicit alignment\ntuning. To tackle these challenges, we introduce MLLM-Protector, a\nplug-and-play strategy combining a lightweight harm detector and a response\ndetoxifier. The harm detector's role is to identify potentially harmful outputs\nfrom the MLLM, while the detoxifier corrects these outputs to ensure the\nresponse stipulates to the safety standards. This approach effectively\nmitigates the risks posed by malicious visual inputs without compromising the\nmodel's overall performance. Our results demonstrate that MLLM-Protector offers\na robust solution to a previously unaddressed aspect of MLLM security.\n","authors":["Renjie Pi","Tianyang Han","Yueqi Xie","Rui Pan","Qing Lian","Hanze Dong","Jipeng Zhang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.02906v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09180v1","updated":"2024-01-17T12:43:28Z","published":"2024-01-17T12:43:28Z","title":"Unsupervised Multiple Domain Translation through Controlled\n Disentanglement in Variational Autoencoder","summary":" Unsupervised Multiple Domain Translation is the task of transforming data\nfrom one domain to other domains without having paired data to train the\nsystems. Typically, methods based on Generative Adversarial Networks (GANs) are\nused to address this task. However, our proposal exclusively relies on a\nmodified version of a Variational Autoencoder. 
This modification consists of\nthe use of two latent variables disentangled in a controlled way by design. One\nof this latent variables is imposed to depend exclusively on the domain, while\nthe other one must depend on the rest of the variability factors of the data.\nAdditionally, the conditions imposed over the domain latent variable allow for\nbetter control and understanding of the latent space. We empirically\ndemonstrate that our approach works on different vision datasets improving the\nperformance of other well known methods. Finally, we prove that, indeed, one of\nthe latent variables stores all the information related to the domain and the\nother one hardly contains any domain information.\n","authors":["Almudévar Antonio","Mariotte Théo","Ortega Alfonso","Tahon Marie"],"pdf_url":"https://arxiv.org/pdf/2401.09180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15583v5","updated":"2024-01-17T12:13:50Z","published":"2023-05-24T21:39:27Z","title":"Alleviating Exposure Bias in Diffusion Models through Sampling with\n Shifted Time Steps","summary":" Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the\nsynthesis of high-quality images. However, their inference process\ncharacteristically requires numerous, potentially hundreds, of iterative steps,\nwhich could exaggerate the problem of exposure bias due to the training and\ninference discrepancy. Previous work has attempted to mitigate this issue by\nperturbing inputs during training, which consequently mandates the retraining\nof the DPM. In this work, we conduct a systematic study of exposure bias in DPM\nand, intriguingly, we find that the exposure bias could be alleviated with a\nnovel sampling method that we propose, without retraining the model. We\nempirically and theoretically show that, during inference, for each backward\ntime step $t$ and corresponding state $\\hat{x}_t$, there might exist another\ntime step $t_s$ which exhibits superior coupling with $\\hat{x}_t$. Based on\nthis finding, we introduce a sampling method named Time-Shift Sampler. Our\nframework can be seamlessly integrated to existing sampling algorithms, such as\nDDPM, DDIM and other high-order solvers, inducing merely minimal additional\ncomputations. Experimental results show our method brings significant and\nconsistent improvements in FID scores on different datasets and sampling\nmethods. For example, integrating Time-Shift Sampler to F-PNDM yields a\nFID=3.88, achieving 44.49\\% improvements as compared to F-PNDM, on CIFAR-10\nwith 10 sampling steps, which is more performant than the vanilla DDIM with 100\nsampling steps. Our code is available at https://github.com/Mingxiao-Li/TS-DPM.\n","authors":["Mingxiao Li","Tingyu Qu","Ruicong Yao","Wei Sun","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2305.15583v5.pdf","comment":"Accepted at International Conference on Learning Representations\n (ICLR2024)"},{"id":"http://arxiv.org/abs/2401.05217v2","updated":"2024-01-17T12:09:46Z","published":"2024-01-10T15:30:19Z","title":"Exploring Vulnerabilities of No-Reference Image Quality Assessment\n Models: A Query-Based Black-Box Method","summary":" No-Reference Image Quality Assessment (NR-IQA) aims to predict image quality\nscores consistent with human perception without relying on pristine reference\nimages, serving as a crucial component in various visual tasks. 
Ensuring the\nrobustness of NR-IQA methods is vital for reliable comparisons of different\nimage processing techniques and consistent user experiences in recommendations.\nThe attack methods for NR-IQA provide a powerful instrument to test the\nrobustness of NR-IQA. However, current attack methods of NR-IQA heavily rely on\nthe gradient of the NR-IQA model, leading to limitations when the gradient\ninformation is unavailable. In this paper, we present a pioneering query-based\nblack box attack against NR-IQA methods. We propose the concept of score\nboundary and leverage an adaptive iterative approach with multiple score\nboundaries. Meanwhile, the initial attack directions are also designed to\nleverage the characteristics of the Human Visual System (HVS). Experiments show\nour method outperforms all compared state-of-the-art attack methods and is far\nahead of previous black-box methods. The effective NR-IQA model DBCNN suffers a\nSpearman's rank-order correlation coefficient (SROCC) decline of 0.6381\nattacked by our method, revealing the vulnerability of NR-IQA models to\nblack-box attacks. The proposed attack method also provides a potent tool for\nfurther exploration into NR-IQA robustness.\n","authors":["Chenxi Yang","Yujia Liu","Dingquan Li","Tingting Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.05217v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09160v1","updated":"2024-01-17T12:08:30Z","published":"2024-01-17T12:08:30Z","title":"DK-SLAM: Monocular Visual SLAM with Deep Keypoints Adaptive Learning,\n Tracking and Loop-Closing","summary":" Unreliable feature extraction and matching in handcrafted features undermine\nthe performance of visual SLAM in complex real-world scenarios. While learned\nlocal features, leveraging CNNs, demonstrate proficiency in capturing\nhigh-level information and excel in matching benchmarks, they encounter\nchallenges in continuous motion scenes, resulting in poor generalization and\nimpacting loop detection accuracy. To address these issues, we present DK-SLAM,\na monocular visual SLAM system with adaptive deep local features. MAML\noptimizes the training of these features, and we introduce a coarse-to-fine\nfeature tracking approach. Initially, a direct method approximates the relative\npose between consecutive frames, followed by a feature matching method for\nrefined pose estimation. To counter cumulative positioning errors, a novel\nonline learning binary feature-based online loop closure module identifies loop\nnodes within a sequence. Experimental results underscore DK-SLAM's efficacy,\noutperforms representative SLAM solutions, such as ORB-SLAM3 on publicly\navailable datasets.\n","authors":["Hao Qu","Lilian Zhang","Jun Mao","Junbo Tie","Xiaofeng He","Xiaoping Hu","Yifei Shi","Changhao Chen"],"pdf_url":"https://arxiv.org/pdf/2401.09160v1.pdf","comment":"In submission"},{"id":"http://arxiv.org/abs/2305.07895v5","updated":"2024-01-17T12:02:33Z","published":"2023-05-13T11:28:37Z","title":"On the Hidden Mystery of OCR in Large Multimodal Models","summary":" Large models have recently played a dominant role in natural language\nprocessing and multimodal vision-language learning. However, their\neffectiveness in text-related visual tasks remains relatively unexplored. 
In\nthis paper, we conducted a comprehensive evaluation of Large Multimodal Models,\nsuch as GPT4V and Gemini, in various text-related visual tasks including Text\nRecognition, Scene Text-Centric Visual Question Answering (VQA),\nDocument-Oriented VQA, Key Information Extraction (KIE), and Handwritten\nMathematical Expression Recognition (HMER). To facilitate the assessment of\nOptical Character Recognition (OCR) capabilities in Large Multimodal Models, we\npropose OCRBench, a comprehensive evaluation benchmark.Our study encompasses 29\ndatasets, making it the most comprehensive OCR evaluation benchmark available.\nFurthermore, our study reveals both the strengths and weaknesses of these\nmodels, particularly in handling multilingual text, handwritten text,\nnon-semantic text, and mathematical expression recognition. Most importantly,\nthe baseline results showcased in this study could provide a foundational\nframework for the conception and assessment of innovative strategies targeted\nat enhancing zero-shot multimodal techniques. The evaluation pipeline and\nbenchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.\n","authors":["Yuliang Liu","Zhang Li","Biao Yang","Chunyuan Li","Xucheng Yin","Cheng-lin Liu","Lianwen Jin","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2305.07895v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09146v1","updated":"2024-01-17T11:40:05Z","published":"2024-01-17T11:40:05Z","title":"Continuous Piecewise-Affine Based Motion Model for Image Animation","summary":" Image animation aims to bring static images to life according to driving\nvideos and create engaging visual content that can be used for various purposes\nsuch as animation, entertainment, and education. Recent unsupervised methods\nutilize affine and thin-plate spline transformations based on keypoints to\ntransfer the motion in driving frames to the source image. However, limited by\nthe expressive power of the transformations used, these methods always produce\npoor results when the gap between the motion in the driving frame and the\nsource image is large. To address this issue, we propose to model motion from\nthe source image to the driving frame in highly-expressive diffeomorphism\nspaces. Firstly, we introduce Continuous Piecewise-Affine based (CPAB)\ntransformation to model the motion and present a well-designed inference\nalgorithm to generate CPAB transformation from control keypoints. Secondly, we\npropose a SAM-guided keypoint semantic loss to further constrain the keypoint\nextraction process and improve the semantic consistency between the\ncorresponding keypoints on the source and driving images. Finally, we design a\nstructure alignment loss to align the structure-related features extracted from\ndriving and generated images, thus helping the generator generate results that\nare more consistent with the driving action. Extensive experiments on four\ndatasets demonstrate the effectiveness of our method against state-of-the-art\ncompetitors quantitatively and qualitatively. 
Code will be publicly available\nat: https://github.com/DevilPG/AAAI2024-CPABMM.\n","authors":["Hexiang Wang","Fengqi Liu","Qianyu Zhou","Ran Yi","Xin Tan","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.09146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09140v1","updated":"2024-01-17T11:28:28Z","published":"2024-01-17T11:28:28Z","title":"Relative Pose for Nonrigid Multi-Perspective Cameras: The Static Case","summary":" Multi-perspective cameras with potentially non-overlapping fields of view\nhave become an important exteroceptive sensing modality in a number of\napplications such as intelligent vehicles, drones, and mixed reality headsets.\nIn this work, we challenge one of the basic assumptions made in these\nscenarios, which is that the multi-camera rig is rigid. More specifically, we\nare considering the problem of estimating the relative pose between a static\nnon-rigid rig in different spatial orientations while taking into account the\neffect of gravity onto the system. The deformable physical connections between\neach camera and the body center are approximated by a simple cantilever model,\nand inserted into the generalized epipolar constraint. Our results lead us to\nthe important insight that the latent parameters of the deformation model,\nmeaning the gravity vector in both views, become observable. We present a\nconcise analysis of the observability of all variables based on noise,\noutliers, and rig rigidity for two different algorithms. The first one is a\nvision-only alternative, while the second one makes use of additional gravity\nmeasurements. To conclude, we demonstrate the ability to sense gravity in a\nreal-world example, and discuss practical implications.\n","authors":["Min Li","Jiaqi Yang","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2401.09140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09133v1","updated":"2024-01-17T11:15:09Z","published":"2024-01-17T11:15:09Z","title":"SM$^3$: Self-Supervised Multi-task Modeling with Multi-view 2D Images\n for Articulated Objects","summary":" Reconstructing real-world objects and estimating their movable joint\nstructures are pivotal technologies within the field of robotics. Previous\nresearch has predominantly focused on supervised approaches, relying on\nextensively annotated datasets to model articulated objects within limited\ncategories. However, this approach falls short of effectively addressing the\ndiversity present in the real world. To tackle this issue, we propose a\nself-supervised interaction perception method, referred to as SM$^3$, which\nleverages multi-view RGB images captured before and after interaction to model\narticulated objects, identify the movable parts, and infer the parameters of\ntheir rotating joints. By constructing 3D geometries and textures from the\ncaptured 2D images, SM$^3$ achieves integrated optimization of movable part and\njoint parameters during the reconstruction process, obviating the need for\nannotations. Furthermore, we introduce the MMArt dataset, an extension of\nPartNet-Mobility, encompassing multi-view and multi-modal data of articulated\nobjects spanning diverse categories. 
Evaluations demonstrate that SM$^3$\nsurpasses existing benchmarks across various categories and objects, while its\nadaptability in real-world scenarios has been thoroughly validated.\n","authors":["Haowen Wang","Zhen Zhao","Zhao Jin","Zhengping Che","Liang Qiao","Yakun Huang","Zhipeng Fan","Xiuquan Qiao","Jian Tang"],"pdf_url":"https://arxiv.org/pdf/2401.09133v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09126v1","updated":"2024-01-17T11:02:52Z","published":"2024-01-17T11:02:52Z","title":"Objects With Lighting: A Real-World Dataset for Evaluating\n Reconstruction and Rendering for Object Relighting","summary":" Reconstructing an object from photos and placing it virtually in a new\nenvironment goes beyond the standard novel view synthesis task as the\nappearance of the object has to not only adapt to the novel viewpoint but also\nto the new lighting conditions and yet evaluations of inverse rendering methods\nrely on novel view synthesis data or simplistic synthetic datasets for\nquantitative analysis. This work presents a real-world dataset for measuring\nthe reconstruction and rendering of objects for relighting. To this end, we\ncapture the environment lighting and ground truth images of the same objects in\nmultiple environments allowing to reconstruct the objects from images taken in\none environment and quantify the quality of the rendered views for the unseen\nlighting environments. Further, we introduce a simple baseline composed of\noff-the-shelf methods and test several state-of-the-art methods on the\nrelighting task and show that novel view synthesis is not a reliable proxy to\nmeasure performance. Code and dataset are available at\nhttps://github.com/isl-org/objects-with-lighting .\n","authors":["Benjamin Ummenhofer","Sanskar Agrawal","Rene Sepulveda","Yixing Lao","Kai Zhang","Tianhang Cheng","Stephan Richter","Shenlong Wang","German Ros"],"pdf_url":"https://arxiv.org/pdf/2401.09126v1.pdf","comment":"Accepted at 3DV 2024, Oral presentation. For the project page see\n https://github.com/isl-org/objects-with-lighting"},{"id":"http://arxiv.org/abs/2401.09112v1","updated":"2024-01-17T10:26:17Z","published":"2024-01-17T10:26:17Z","title":"Stream Query Denoising for Vectorized HD Map Construction","summary":" To enhance perception performance in complex and extensive scenarios within\nthe realm of autonomous driving, there has been a noteworthy focus on temporal\nmodeling, with a particular emphasis on streaming methods. The prevailing trend\nin streaming models involves the utilization of stream queries for the\npropagation of temporal information. Despite the prevalence of this approach,\nthe direct application of the streaming paradigm to the construction of\nvectorized high-definition maps (HD-maps) fails to fully harness the inherent\npotential of temporal information. This paper introduces the Stream Query\nDenoising (SQD) strategy as a novel approach for temporal modeling in\nhigh-definition map (HD-map) construction. SQD is designed to facilitate the\nlearning of temporal consistency among map elements within the streaming model.\nThe methodology involves denoising the queries that have been perturbed by the\naddition of noise to the ground-truth information from the preceding frame.\nThis denoising process aims to reconstruct the ground-truth information for the\ncurrent frame, thereby simulating the prediction process inherent in stream\nqueries. The SQD strategy can be applied to those streaming methods (e.g.,\nStreamMapNet) to enhance the temporal modeling. 
The proposed SQD-MapNet is the\nStreamMapNet equipped with SQD. Extensive experiments on nuScenes and\nArgoverse2 show that our method is remarkably superior to other existing\nmethods across all settings of close range and long range. The code will be\navailable soon.\n","authors":["Shuo Wang","Fan Jia","Yingfei Liu","Yucheng Zhao","Zehui Chen","Tiancai Wang","Chi Zhang","Xiangyu Zhang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.09112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09109v1","updated":"2024-01-17T10:21:08Z","published":"2024-01-17T10:21:08Z","title":"Trapped in texture bias? A large scale comparison of deep instance\n segmentation","summary":" Do deep learning models for instance segmentation generalize to novel objects\nin a systematic way? For classification, such behavior has been questioned. In\nthis study, we aim to understand if certain design decisions such as framework,\narchitecture or pre-training contribute to the semantic understanding of\ninstance segmentation. To answer this question, we consider a special case of\nrobustness and compare pre-trained models on a challenging benchmark for\nobject-centric, out-of-distribution texture. We do not introduce another method\nin this work. Instead, we take a step back and evaluate a broad range of\nexisting literature. This includes Cascade and Mask R-CNN, Swin Transformer,\nBMask, YOLACT(++), DETR, BCNet, SOTR and SOLOv2. We find that YOLACT++, SOTR\nand SOLOv2 are significantly more robust to out-of-distribution texture than\nother frameworks. In addition, we show that deeper and dynamic architectures\nimprove robustness whereas training schedules, data augmentation and\npre-training have only a minor impact. In summary we evaluate 68 models on 61\nversions of MS COCO for a total of 4148 evaluations.\n","authors":["Johannes Theodoridis","Jessica Hofmann","Johannes Maucher","Andreas Schilling"],"pdf_url":"https://arxiv.org/pdf/2401.09109v1.pdf","comment":"Accepted at ECCV 2022. Code:\n https://github.com/JohannesTheo/trapped-in-texture-bias"},{"id":"http://arxiv.org/abs/2309.08480v2","updated":"2024-01-17T10:09:14Z","published":"2023-09-15T15:36:50Z","title":"PoseFix: Correcting 3D Human Poses with Natural Language","summary":" Automatically producing instructions to modify one's posture could open the\ndoor to endless applications, such as personalized coaching and in-home\nphysical therapy. Tackling the reverse problem (i.e., refining a 3D pose based\non some natural language feedback) could help for assisted 3D character\nanimation or robot teaching, for instance. Although a few recent works explore\nthe connections between natural language and 3D human pose, none focus on\ndescribing 3D body pose differences. In this paper, we tackle the problem of\ncorrecting 3D human poses with natural language. To this end, we introduce the\nPoseFix dataset, which consists of several thousand paired 3D poses and their\ncorresponding text feedback, that describe how the source pose needs to be\nmodified to obtain the target pose. 
We demonstrate the potential of this\ndataset on two tasks: (1) text-based pose editing, that aims at generating\ncorrected 3D body poses given a query pose and a text modifier; and (2)\ncorrectional text generation, where instructions are generated based on the\ndifferences between two body poses.\n","authors":["Ginger Delmas","Philippe Weinzaepfel","Francesc Moreno-Noguer","Grégory Rogez"],"pdf_url":"https://arxiv.org/pdf/2309.08480v2.pdf","comment":"Published in ICCV 2023"},{"id":"http://arxiv.org/abs/2401.09101v1","updated":"2024-01-17T10:06:12Z","published":"2024-01-17T10:06:12Z","title":"PIN-SLAM: LiDAR SLAM Using a Point-Based Implicit Neural Representation\n for Achieving Global Map Consistency","summary":" Accurate and robust localization and mapping are essential components for\nmost autonomous robots. In this paper, we propose a SLAM system for building\nglobally consistent maps, called PIN-SLAM, that is based on an elastic and\ncompact point-based implicit neural map representation. Taking range\nmeasurements as input, our approach alternates between incremental learning of\nthe local implicit signed distance field and the pose estimation given the\ncurrent local map using a correspondence-free, point-to-implicit model\nregistration. Our implicit map is based on sparse optimizable neural points,\nwhich are inherently elastic and deformable with the global pose adjustment\nwhen closing a loop. Loops are also detected using the neural point features.\nExtensive experiments validate that PIN-SLAM is robust to various environments\nand versatile to different range sensors such as LiDAR and RGB-D cameras.\nPIN-SLAM achieves pose estimation accuracy better or on par with the\nstate-of-the-art LiDAR odometry or SLAM systems and outperforms the recent\nneural implicit SLAM approaches while maintaining a more consistent, and highly\ncompact implicit map that can be reconstructed as accurate and complete meshes.\nFinally, thanks to the voxel hashing for efficient neural points indexing and\nthe fast implicit map-based registration without closest point association,\nPIN-SLAM can run at the sensor frame rate on a moderate GPU. Codes will be\navailable at: https://github.com/PRBonn/PIN_SLAM.\n","authors":["Yue Pan","Xingguang Zhong","Louis Wiesmann","Thorbjörn Posewsky","Jens Behley","Cyrill Stachniss"],"pdf_url":"https://arxiv.org/pdf/2401.09101v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.09084v1","updated":"2024-01-17T09:46:13Z","published":"2024-01-17T09:46:13Z","title":"UniVG: Towards UNIfied-modal Video Generation","summary":" Diffusion based video generation has received extensive attention and\nachieved considerable success within both the academic and industrial\ncommunities. However, current efforts are mainly concentrated on\nsingle-objective or single-task video generation, such as generation driven by\ntext, by image, or by a combination of text and image. This cannot fully meet\nthe needs of real-world application scenarios, as users are likely to input\nimages and text conditions in a flexible manner, either individually or in\ncombination. To address this, we propose a Unified-modal Video Genearation\nsystem that is capable of handling multiple video generation tasks across text\nand image modalities. 
To this end, we revisit the various video generation\ntasks within our system from the perspective of generative freedom, and\nclassify them into high-freedom and low-freedom video generation categories.\nFor high-freedom video generation, we employ Multi-condition Cross Attention to\ngenerate videos that align with the semantics of the input images or text. For\nlow-freedom video generation, we introduce Biased Gaussian Noise to replace the\npure random Gaussian Noise, which helps to better preserve the content of the\ninput conditions. Our method achieves the lowest Fr\\'echet Video Distance (FVD)\non the public academic benchmark MSR-VTT, surpasses the current open-source\nmethods in human evaluations, and is on par with the current close-source\nmethod Gen2. For more samples, visit https://univg-baidu.github.io.\n","authors":["Ludan Ruan","Lei Tian","Chuanwei Huang","Xu Zhang","Xinyan Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.09084v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09083v1","updated":"2024-01-17T09:44:07Z","published":"2024-01-17T09:44:07Z","title":"Remote Sensing ChatGPT: Solving Remote Sensing Tasks with ChatGPT and\n Visual Models","summary":" Recently, the flourishing large language models(LLM), especially ChatGPT,\nhave shown exceptional performance in language understanding, reasoning, and\ninteraction, attracting users and researchers from multiple fields and domains.\nAlthough LLMs have shown great capacity to perform human-like task\naccomplishment in natural language and natural image, their potential in\nhandling remote sensing interpretation tasks has not yet been fully explored.\nMoreover, the lack of automation in remote sensing task planning hinders the\naccessibility of remote sensing interpretation techniques, especially to\nnon-remote sensing experts from multiple research fields. To this end, we\npresent Remote Sensing ChatGPT, an LLM-powered agent that utilizes ChatGPT to\nconnect various AI-based remote sensing models to solve complicated\ninterpretation tasks. More specifically, given a user request and a remote\nsensing image, we utilized ChatGPT to understand user requests, perform task\nplanning according to the tasks' functions, execute each subtask iteratively,\nand generate the final response according to the output of each subtask.\nConsidering that LLM is trained with natural language and is not capable of\ndirectly perceiving visual concepts as contained in remote sensing images, we\ndesigned visual cues that inject visual information into ChatGPT. With Remote\nSensing ChatGPT, users can simply send a remote sensing image with the\ncorresponding request, and get the interpretation results as well as language\nfeedback from Remote Sensing ChatGPT. Experiments and examples show that Remote\nSensing ChatGPT can tackle a wide range of remote sensing tasks and can be\nextended to more tasks with more sophisticated models such as the remote\nsensing foundation model. The code and demo of Remote Sensing ChatGPT is\npublicly available at https://github.com/HaonanGuo/Remote-Sensing-ChatGPT .\n","authors":["Haonan Guo","Xin Su","Chen Wu","Bo Du","Liangpei Zhang","Deren Li"],"pdf_url":"https://arxiv.org/pdf/2401.09083v1.pdf","comment":"The manuscript is submitted to IEEE International Geoscience and\n Remote Sensing Symposium(IGARSS2024). 
Looking forward to seeing you in July!"},{"id":"http://arxiv.org/abs/2312.04118v2","updated":"2024-01-17T09:43:14Z","published":"2023-12-07T08:18:40Z","title":"Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic\n Play","summary":" Infants' ability to recognize and categorize objects develops gradually. The\nsecond year of life is marked by both the emergence of more semantic visual\nrepresentations and a better understanding of word meaning. This suggests that\nlanguage input may play an important role in shaping visual representations.\nHowever, even in suitable contexts for word learning like dyadic play sessions,\ncaregivers utterances are sparse and ambiguous, often referring to objects that\nare different from the one to which the child attends. Here, we systematically\ninvestigate to what extent caregivers' utterances can nevertheless enhance\nvisual representations. For this we propose a computational model of visual\nrepresentation learning during dyadic play. We introduce a synthetic dataset of\nego-centric images perceived by a toddler-agent that moves and rotates toy\nobjects in different parts of its home environment while hearing caregivers'\nutterances, modeled as captions. We propose to model toddlers' learning as\nsimultaneously aligning representations for 1) close-in-time images and 2)\nco-occurring images and utterances. We show that utterances with statistics\nmatching those of real caregivers give rise to representations supporting\nimproved category recognition. Our analysis reveals that a small\ndecrease/increase in object-relevant naming frequencies can drastically impact\nthe learned representations. This affects the attention on object names within\nan utterance, which is required for efficient visuo-linguistic alignment.\nOverall, our results support the hypothesis that caregivers' naming utterances\ncan improve toddlers' visual representations.\n","authors":["Timothy Schaumlöffel","Arthur Aubret","Gemma Roig","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2312.04118v2.pdf","comment":"Proceedings of the 2023 IEEE International Conference on Development\n and Learning (ICDL)"},{"id":"http://arxiv.org/abs/2204.02779v4","updated":"2024-01-17T09:38:09Z","published":"2022-04-05T15:02:18Z","title":"A Dempster-Shafer approach to trustworthy AI with application to fetal\n brain MRI segmentation","summary":" Deep learning models for medical image segmentation can fail unexpectedly and\nspectacularly for pathological cases and images acquired at different centers\nthan training images, with labeling errors that violate expert knowledge. Such\nerrors undermine the trustworthiness of deep learning models for medical image\nsegmentation. Mechanisms for detecting and correcting such failures are\nessential for safely translating this technology into clinics and are likely to\nbe a requirement of future regulations on artificial intelligence (AI). In this\nwork, we propose a trustworthy AI theoretical framework and a practical system\nthat can augment any backbone AI system using a fallback method and a fail-safe\nmechanism based on Dempster-Shafer theory. Our approach relies on an actionable\ndefinition of trustworthy AI. Our method automatically discards the voxel-level\nlabeling predicted by the backbone AI that violate expert knowledge and relies\non a fallback for those voxels. 
We demonstrate the effectiveness of the\nproposed trustworthy AI approach on the largest reported annotated dataset of\nfetal MRI consisting of 540 manually annotated fetal brain 3D T2w MRIs from 13\ncenters. Our trustworthy AI method improves the robustness of a\nstate-of-the-art backbone AI for fetal brain MRIs acquired across various\ncenters and for fetuses with various brain abnormalities.\n","authors":["Lucas Fidon","Michael Aertsen","Florian Kofler","Andrea Bink","Anna L. David","Thomas Deprest","Doaa Emam","Frédéric Guffens","András Jakab","Gregor Kasprian","Patric Kienast","Andrew Melbourne","Bjoern Menze","Nada Mufti","Ivana Pogledic","Daniela Prayer","Marlene Stuempflen","Esther Van Elslander","Sébastien Ourselin","Jan Deprest","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2204.02779v4.pdf","comment":"Published in IEEE TPAMI. Minor revision compared to the previous\n version"},{"id":"http://arxiv.org/abs/2401.05686v2","updated":"2024-01-17T09:28:01Z","published":"2024-01-11T06:22:40Z","title":"Self Expanding Convolutional Neural Networks","summary":" In this paper, we present a novel method for dynamically expanding\nConvolutional Neural Networks (CNNs) during training, aimed at meeting the\nincreasing demand for efficient and sustainable deep learning models. Our\napproach, drawing from the seminal work on Self-Expanding Neural Networks\n(SENN), employs a natural expansion score as an expansion criteria to address\nthe common issue of over-parameterization in deep convolutional neural\nnetworks, thereby ensuring that the model's complexity is finely tuned to the\ntask's specific needs. A significant benefit of this method is its eco-friendly\nnature, as it obviates the necessity of training multiple models of different\nsizes. We employ a strategy where a single model is dynamically expanded,\nfacilitating the extraction of checkpoints at various complexity levels,\neffectively reducing computational resource use and energy consumption while\nalso expediting the development cycle by offering diverse model complexities\nfrom a single training session. We evaluate our method on the CIFAR-10 dataset\nand our experimental results validate this approach, demonstrating that\ndynamically adding layers not only maintains but also improves CNN performance,\nunderscoring the effectiveness of our expansion criteria. This approach marks a\nconsiderable advancement in developing adaptive, scalable, and environmentally\nconsiderate neural network architectures, addressing key challenges in the\nfield of deep learning.\n","authors":["Blaise Appolinary","Alex Deaconu","Sophia Yang"," Qingze"," Li"],"pdf_url":"https://arxiv.org/pdf/2401.05686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09067v1","updated":"2024-01-17T09:01:29Z","published":"2024-01-17T09:01:29Z","title":"Towards Continual Learning Desiderata via HSIC-Bottleneck\n Orthogonalization and Equiangular Embedding","summary":" Deep neural networks are susceptible to catastrophic forgetting when trained\non sequential tasks. Various continual learning (CL) methods often rely on\nexemplar buffers or/and network expansion for balancing model stability and\nplasticity, which, however, compromises their practical value due to privacy\nand memory concerns. Instead, this paper considers a strict yet realistic\nsetting, where the training data from previous tasks is unavailable and the\nmodel size remains relatively constant during sequential training. 
To achieve\nsuch desiderata, we propose a conceptually simple yet effective method that\nattributes forgetting to layer-wise parameter overwriting and the resulting\ndecision boundary distortion. This is achieved by the synergy between two key\ncomponents: HSIC-Bottleneck Orthogonalization (HBO) implements non-overwritten\nparameter updates mediated by Hilbert-Schmidt independence criterion in an\northogonal space and EquiAngular Embedding (EAE) enhances decision boundary\nadaptation between old and new tasks with predefined basis vectors. Extensive\nexperiments demonstrate that our method achieves competitive accuracy\nperformance, even with absolute superiority of zero exemplar buffer and 1.02x\nthe base model.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Qining Ren","Kenji Kawaguchi","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.09067v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.09059v1","updated":"2024-01-17T08:48:29Z","published":"2024-01-17T08:48:29Z","title":"Autonomous Catheterization with Open-source Simulator and Expert\n Trajectory","summary":" Endovascular robots have been actively developed in both academia and\nindustry. However, progress toward autonomous catheterization is often hampered\nby the widespread use of closed-source simulators and physical phantoms.\nAdditionally, the acquisition of large-scale datasets for training machine\nlearning algorithms with endovascular robots is usually infeasible due to\nexpensive medical procedures. In this chapter, we introduce CathSim, the first\nopen-source simulator for endovascular intervention to address these\nlimitations. CathSim emphasizes real-time performance to enable rapid\ndevelopment and testing of learning algorithms. We validate CathSim against the\nreal robot and show that our simulator can successfully mimic the behavior of\nthe real robot. Based on CathSim, we develop a multimodal expert navigation\nnetwork and demonstrate its effectiveness in downstream endovascular navigation\ntasks. The intensive experimental results suggest that CathSim has the\npotential to significantly accelerate research in the autonomous\ncatheterization field. Our project is publicly available at\nhttps://github.com/airvlab/cathsim.\n","authors":["Tudor Jianu","Baoru Huang","Tuan Vo","Minh Nhat Vu","Jingxuan Kang","Hoan Nguyen","Olatunji Omisore","Pierre Berthet-Rayne","Sebastiano Fichera","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2401.09059v1.pdf","comment":"Code: https://github.com/airvlab/cathsim"},{"id":"http://arxiv.org/abs/2401.09057v1","updated":"2024-01-17T08:46:47Z","published":"2024-01-17T08:46:47Z","title":"CrossVideo: Self-supervised Cross-modal Contrastive Learning for Point\n Cloud Video Understanding","summary":" This paper introduces a novel approach named CrossVideo, which aims to\nenhance self-supervised cross-modal contrastive learning in the field of point\ncloud video understanding. Traditional supervised learning methods encounter\nlimitations due to data scarcity and challenges in label acquisition. To\naddress these issues, we propose a self-supervised learning method that\nleverages the cross-modal relationship between point cloud videos and image\nvideos to acquire meaningful feature representations. Intra-modal and\ncross-modal contrastive learning techniques are employed to facilitate\neffective comprehension of point cloud video. We also propose a multi-level\ncontrastive approach for both modalities. 
Through extensive experiments, we\ndemonstrate that our method significantly surpasses previous state-of-the-art\napproaches, and we conduct comprehensive ablation studies to validate the\neffectiveness of our proposed designs.\n","authors":["Yunze Liu","Changxi Chen","Zifan Wang","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2401.09057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07126v2","updated":"2024-01-17T08:39:42Z","published":"2024-01-13T18:01:44Z","title":"IVIM-Morph: Motion-compensated quantitative Intra-voxel Incoherent\n Motion (IVIM) analysis for functional fetal lung maturity assessment from\n diffusion-weighted MRI data","summary":" Quantitative analysis of pseudo-diffusion in diffusion-weighted magnetic\nresonance imaging (DWI) data shows potential for assessing fetal lung\nmaturation and generating valuable imaging biomarkers. Yet, the clinical\nutility of DWI data is hindered by unavoidable fetal motion during acquisition.\nWe present IVIM-morph, a self-supervised deep neural network model for\nmotion-corrected quantitative analysis of DWI data using the Intra-voxel\nIncoherent Motion (IVIM) model. IVIM-morph combines two sub-networks, a\nregistration sub-network, and an IVIM model fitting sub-network, enabling\nsimultaneous estimation of IVIM model parameters and motion. To promote\nphysically plausible image registration, we introduce a biophysically informed\nloss function that effectively balances registration and model-fitting quality.\nWe validated the efficacy of IVIM-morph by establishing a correlation between\nthe predicted IVIM model parameters of the lung and gestational age (GA) using\nfetal DWI data of 39 subjects. IVIM-morph exhibited a notably improved\ncorrelation with gestational age (GA) when performing in-vivo quantitative\nanalysis of fetal lung DWI data during the canalicular phase. IVIM-morph shows\npotential in developing valuable biomarkers for non-invasive assessment of\nfetal lung maturity with DWI data. Moreover, its adaptability opens the door to\npotential applications in other clinical contexts where motion compensation is\nessential for quantitative DWI analysis. The IVIM-morph code is readily\navailable at: https://github.com/TechnionComputationalMRILab/qDWI-Morph.\n","authors":["Noga Kertes","Yael Zaffrani-Reznikov","Onur Afacan","Sila Kurugol","Simon K. Warfield","Moti Freiman"],"pdf_url":"https://arxiv.org/pdf/2401.07126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06920v2","updated":"2024-01-17T08:35:13Z","published":"2023-03-13T08:37:59Z","title":"Pixel-wise Gradient Uncertainty for Convolutional Neural Networks\n applied to Out-of-Distribution Segmentation","summary":" In recent years, deep neural networks have defined the state-of-the-art in\nsemantic segmentation where their predictions are constrained to a predefined\nset of semantic classes. They are to be deployed in applications such as\nautomated driving, although their categorically confined expressive power runs\ncontrary to such open world scenarios. Thus, the detection and segmentation of\nobjects from outside their predefined semantic space, i.e., out-of-distribution\n(OoD) objects, is of highest interest. Since uncertainty estimation methods\nlike softmax entropy or Bayesian models are sensitive to erroneous predictions,\nthese methods are a natural baseline for OoD detection. Here, we present a\nmethod for obtaining uncertainty scores from pixel-wise loss gradients which\ncan be computed efficiently during inference. 
Our approach is simple to\nimplement for a large class of models, does not require any additional training\nor auxiliary data and can be readily used on pre-trained segmentation models.\nOur experiments show the ability of our method to identify wrong pixel\nclassifications and to estimate prediction quality at negligible computational\noverhead. In particular, we observe superior performance in terms of OoD\nsegmentation to comparable baselines on the SegmentMeIfYouCan benchmark,\nclearly outperforming other methods.\n","authors":["Kira Maag","Tobias Riedlinger"],"pdf_url":"https://arxiv.org/pdf/2303.06920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09050v1","updated":"2024-01-17T08:32:07Z","published":"2024-01-17T08:32:07Z","title":"Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation\n with Deterministic Sampling Prior","summary":" Score distillation sampling (SDS) and its variants have greatly boosted the\ndevelopment of text-to-3D generation, but are vulnerable to geometry collapse\nand poor textures yet. To solve this issue, we first deeply analyze the SDS and\nfind that its distillation sampling process indeed corresponds to the\ntrajectory sampling of a stochastic differential equation (SDE): SDS samples\nalong an SDE trajectory to yield a less noisy sample which then serves as a\nguidance to optimize a 3D model. However, the randomness in SDE sampling often\nleads to a diverse and unpredictable sample which is not always less noisy, and\nthus is not a consistently correct guidance, explaining the vulnerability of\nSDS. Since for any SDE, there always exists an ordinary differential equation\n(ODE) whose trajectory sampling can deterministically and consistently converge\nto the desired target point as the SDE, we propose a novel and effective\n\"Consistent3D\" method that explores the ODE deterministic sampling prior for\ntext-to-3D generation. Specifically, at each training iteration, given a\nrendered image by a 3D model, we first estimate its desired 3D score function\nby a pre-trained 2D diffusion model, and build an ODE for trajectory sampling.\nNext, we design a consistency distillation sampling loss which samples along\nthe ODE trajectory to generate two adjacent samples and uses the less noisy\nsample to guide another more noisy one for distilling the deterministic prior\ninto the 3D model. Experimental results show the efficacy of our Consistent3D\nin generating high-fidelity and diverse 3D objects and large-scale scenes, as\nshown in Fig. 1. The codes are available at\nhttps://github.com/sail-sg/Consistent3D.\n","authors":["Zike Wu","Pan Zhou","Xuanyu Yi","Xiaoding Yuan","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09049v1","updated":"2024-01-17T08:31:58Z","published":"2024-01-17T08:31:58Z","title":"Enhancing Lidar-based Object Detection in Adverse Weather using Offset\n Sequences in Time","summary":" Automated vehicles require an accurate perception of their surroundings for\nsafe and efficient driving. Lidar-based object detection is a widely used\nmethod for environment perception, but its performance is significantly\naffected by adverse weather conditions such as rain and fog. In this work, we\ninvestigate various strategies for enhancing the robustness of lidar-based\nobject detection by processing sequential data samples generated by lidar\nsensors. 
Our approaches leverage temporal information to improve a lidar object\ndetection model, without the need for additional filtering or pre-processing\nsteps. We compare $10$ different neural network architectures that process\npoint cloud sequences including a novel augmentation strategy introducing a\ntemporal offset between frames of a sequence during training and evaluate the\neffectiveness of all strategies on lidar point clouds under adverse weather\nconditions through experiments. Our research provides a comprehensive study of\neffective methods for mitigating the effects of adverse weather on the\nreliability of lidar-based object detection using sequential data that are\nevaluated using public datasets such as nuScenes, Dense, and the Canadian\nAdverse Driving Conditions Dataset. Our findings demonstrate that our novel\nmethod, involving temporal offset augmentation through randomized frame\nskipping in sequences, enhances object detection accuracy compared to both the\nbaseline model (Pillar-based Object Detection) and no augmentation.\n","authors":["Raphael van Kempen","Tim Rehbronn","Abin Jose","Johannes Stegmaier","Bastian Lampe","Timo Woopen","Lutz Eckstein"],"pdf_url":"https://arxiv.org/pdf/2401.09049v1.pdf","comment":"Published as part of the III. International Conference on Electrical,\n Computer and Energy Technologies (ICECET 2023), Cape Town, South Africa,\n November 16-17, 2023"},{"id":"http://arxiv.org/abs/2401.09048v1","updated":"2024-01-17T08:30:47Z","published":"2024-01-17T08:30:47Z","title":"Compose and Conquer: Diffusion-Based 3D Depth Aware Composable Image\n Synthesis","summary":" Addressing the limitations of text as a source of accurate layout\nrepresentation in text-conditional diffusion models, many works incorporate\nadditional signals to condition certain attributes within a generated image.\nAlthough successful, previous works do not account for the specific\nlocalization of said attributes extended into the three dimensional plane. In\nthis context, we present a conditional diffusion model that integrates control\nover three-dimensional object placement with disentangled representations of\nglobal stylistic semantics from multiple exemplar images. Specifically, we\nfirst introduce \\textit{depth disentanglement training} to leverage the\nrelative depth of objects as an estimator, allowing the model to identify the\nabsolute positions of unseen objects through the use of synthetic image\ntriplets. We also introduce \\textit{soft guidance}, a method for imposing\nglobal semantics onto targeted regions without the use of any additional\nlocalization cues. Our integrated framework, \\textsc{Compose and Conquer\n(CnC)}, unifies these techniques to localize multiple conditions in a\ndisentangled manner. We demonstrate that our approach allows perception of\nobjects at varying depths while offering a versatile framework for composing\nlocalized objects with different global semantics. 
Code:\nhttps://github.com/tomtom1103/compose-and-conquer/\n","authors":["Jonghyun Lee","Hansam Cho","Youngjoon Yoo","Seoung Bum Kim","Yonghyun Jeong"],"pdf_url":"https://arxiv.org/pdf/2401.09048v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09047v1","updated":"2024-01-17T08:30:32Z","published":"2024-01-17T08:30:32Z","title":"VideoCrafter2: Overcoming Data Limitations for High-Quality Video\n Diffusion Models","summary":" Text-to-video generation aims to produce a video based on a given prompt.\nRecently, several commercial video models have been able to generate plausible\nvideos with minimal noise, excellent details, and high aesthetic scores.\nHowever, these models rely on large-scale, well-filtered, high-quality videos\nthat are not accessible to the community. Many existing research works, which\ntrain models using the low-quality WebVid-10M dataset, struggle to generate\nhigh-quality videos because the models are optimized to fit WebVid-10M. In this\nwork, we explore the training scheme of video models extended from Stable\nDiffusion and investigate the feasibility of leveraging low-quality videos and\nsynthesized high-quality images to obtain a high-quality video model. We first\nanalyze the connection between the spatial and temporal modules of video models\nand the distribution shift to low-quality videos. We observe that full training\nof all modules results in a stronger coupling between spatial and temporal\nmodules than only training temporal modules. Based on this stronger coupling,\nwe shift the distribution to higher quality without motion degradation by\nfinetuning spatial modules with high-quality images, resulting in a generic\nhigh-quality video model. Evaluations are conducted to demonstrate the\nsuperiority of the proposed method, particularly in picture quality, motion,\nand concept composition.\n","authors":["Haoxin Chen","Yong Zhang","Xiaodong Cun","Menghan Xia","Xintao Wang","Chao Weng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.09047v1.pdf","comment":"Homepage: https://ailab-cvc.github.io/videocrafter; Github:\n https://github.com/AILab-CVC/VideoCrafter"},{"id":"http://arxiv.org/abs/2312.08846v2","updated":"2024-01-17T08:05:07Z","published":"2023-12-14T12:02:24Z","title":"TiMix: Text-aware Image Mixing for Effective Vision-Language\n Pre-training","summary":" Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances\nmodern Vision-Language Pre-training (VLP) models by aligning visual and\nlinguistic modalities. Due to noises in web-harvested text-image pairs,\nhowever, scaling up training data volume in SMCL presents considerable\nobstacles in terms of computational cost and data inefficiency. To improve data\nefficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates\nmix-based data augmentation techniques into SMCL, yielding significant\nperformance improvements without significantly increasing computational\noverhead. We provide a theoretical analysis of TiMixfrom a mutual information\n(MI) perspective, showing that mixed data samples for cross-modal contrastive\nlearning implicitly serve as a regularizer for the contrastive loss. The\nexperimental results demonstrate that TiMix exhibits a comparable performance\non downstream tasks, even with a reduced amount of training data and shorter\ntraining time, when benchmarked against existing methods. 
This work empirically\nand theoretically demonstrates the potential of data mixing for data-efficient\nand computationally viable VLP, benefiting broader VLP model adoption in\npractical scenarios.\n","authors":["Chaoya Jiang","Wei ye","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.08846v2.pdf","comment":"Accepted on AAAI2024"},{"id":"http://arxiv.org/abs/2401.09029v1","updated":"2024-01-17T07:54:49Z","published":"2024-01-17T07:54:49Z","title":"Cross-modality Guidance-aided Multi-modal Learning with Dual Attention\n for MRI Brain Tumor Grading","summary":" Brain tumor represents one of the most fatal cancers around the world, and is\nvery common in children and the elderly. Accurate identification of the type\nand grade of tumor in the early stages plays an important role in choosing a\nprecise treatment plan. The Magnetic Resonance Imaging (MRI) protocols of\ndifferent sequences provide clinicians with important contradictory information\nto identify tumor regions. However, manual assessment is time-consuming and\nerror-prone due to big amount of data and the diversity of brain tumor types.\nHence, there is an unmet need for MRI automated brain tumor diagnosis. We\nobserve that the predictive capability of uni-modality models is limited and\ntheir performance varies widely across modalities, and the commonly used\nmodality fusion methods would introduce potential noise, which results in\nsignificant performance degradation. To overcome these challenges, we propose a\nnovel cross-modality guidance-aided multi-modal learning with dual attention\nfor addressing the task of MRI brain tumor grading. To balance the tradeoff\nbetween model efficiency and efficacy, we employ ResNet Mix Convolution as the\nbackbone network for feature extraction. Besides, dual attention is applied to\ncapture the semantic interdependencies in spatial and slice dimensions\nrespectively. To facilitate information interaction among modalities, we design\na cross-modality guidance-aided module where the primary modality guides the\nother secondary modalities during the process of training, which can\neffectively leverage the complementary information of different MRI modalities\nand meanwhile alleviate the impact of the possible noise.\n","authors":["Dunyuan Xu","Xi Wang","Jinyue Cai","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2401.09029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08747v3","updated":"2024-01-17T07:47:29Z","published":"2023-11-15T07:29:24Z","title":"Improved Dense Nested Attention Network Based on Transformer for\n Infrared Small Target Detection","summary":" Infrared small target detection based on deep learning offers unique\nadvantages in separating small targets from complex and dynamic backgrounds.\nHowever, the features of infrared small targets gradually weaken as the depth\nof convolutional neural network (CNN) increases. To address this issue, we\npropose a novel method for detecting infrared small targets called improved\ndense nested attention network (IDNANet), which is based on the transformer\narchitecture. We preserve the dense nested structure of dense nested attention\nnetwork (DNANet) and introduce the Swin-transformer during feature extraction\nstage to enhance the continuity of features. Furthermore, we integrate the\nACmix attention structure into the dense nested structure to enhance the\nfeatures of intermediate layers. 
Additionally, we design a weighted dice binary\ncross-entropy (WD-BCE) loss function to mitigate the negative impact of\nforeground-background imbalance in the samples. Moreover, we develop a dataset\nspecifically for infrared small targets, called BIT-SIRST. The dataset\ncomprises a significant amount of real-world targets and manually annotated\nlabels, as well as synthetic data and corresponding labels. We have evaluated\nthe effectiveness of our method through experiments conducted on public\ndatasets. In comparison to other state-of-the-art methods, our approach\noutperforms in terms of probability of detection ($P_d$), false-alarm rate\n($F_a$), and mean intersection of union ($mIoU$). The $mIoU$ reaches 90.89\\% on\nthe NUDT-SIRST dataset and 79.72\\% on the SIRST dataset. The BIT-SIRST dataset\nand codes are available openly at\n\\href{https://github.com/EdwardBao1006/bit\\_sirst}{\\color[HTML]{B22222}{https://github.com/EdwardBao1006/bit\\_sirst}}.\n","authors":["Chun Bao","Jie Cao","Yaqian Ning","Tianhua Zhao","Zhijun Li","Zechen Wang","Li Zhang","Qun Hao"],"pdf_url":"https://arxiv.org/pdf/2311.08747v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12048v3","updated":"2024-01-17T07:47:13Z","published":"2023-06-21T06:40:31Z","title":"Online Unsupervised Video Object Segmentation via Contrastive Motion\n Clustering","summary":" Online unsupervised video object segmentation (UVOS) uses the previous frames\nas its input to automatically separate the primary object(s) from a streaming\nvideo without using any further manual annotation. A major challenge is that\nthe model has no access to the future and must rely solely on the history,\ni.e., the segmentation mask is predicted from the current frame as soon as it\nis captured. In this work, a novel contrastive motion clustering algorithm with\nan optical flow as its input is proposed for the online UVOS by exploiting the\ncommon fate principle that visual elements tend to be perceived as a group if\nthey possess the same motion pattern. We build a simple and effective\nauto-encoder to iteratively summarize non-learnable prototypical bases for the\nmotion pattern, while the bases in turn help learn the representation of the\nembedding network. Further, a contrastive learning strategy based on a boundary\nprior is developed to improve foreground and background feature discrimination\nin the representation learning stage. The proposed algorithm can be optimized\non arbitrarily-scale data i.e., frame, clip, dataset) and performed in an\nonline fashion. Experiments on $\\textit{DAVIS}_{\\textit{16}}$, $\\textit{FBMS}$,\nand $\\textit{SegTrackV2}$ datasets show that the accuracy of our method\nsurpasses the previous state-of-the-art (SoTA) online UVOS method by a margin\nof 0.8%, 2.9%, and 1.1%, respectively. Furthermore, by using an online deep\nsubspace clustering to tackle the motion grouping, our method is able to\nachieve higher accuracy at $3\\times$ faster inference time compared to SoTA\nonline UVOS method, and making a good trade-off between effectiveness and\nefficiency. 
Our code is available at https://github.com/xilin1991/ClusterNet.\n","authors":["Lin Xi","Weihai Chen","Xingming Wu","Zhong Liu","Zhengguo Li"],"pdf_url":"https://arxiv.org/pdf/2306.12048v3.pdf","comment":"Accepted by IEEE Transactions on Circuits and Systems for Video\n Technology (TCSVT)"},{"id":"http://arxiv.org/abs/2401.06506v3","updated":"2024-01-17T07:44:50Z","published":"2024-01-12T11:02:12Z","title":"Frequency Masking for Universal Deepfake Detection","summary":" We study universal deepfake detection. Our goal is to detect synthetic images\nfrom a range of generative AI approaches, particularly from emerging ones which\nare unseen during training of the deepfake detector. Universal deepfake\ndetection requires outstanding generalization capability. Motivated by recently\nproposed masked image modeling which has demonstrated excellent generalization\nin self-supervised pre-training, we make the first attempt to explore masked\nimage modeling for universal deepfake detection. We study spatial and frequency\ndomain masking in training deepfake detectors. Based on empirical analysis, we\npropose a novel deepfake detector via frequency masking. Our focus on frequency\ndomain is different from the majority, which primarily target spatial domain\ndetection. Our comparative analyses reveal substantial performance gains over\nexisting methods. Code and models are publicly available.\n","authors":["Chandler Timm Doloriel","Ngai-Man Cheung"],"pdf_url":"https://arxiv.org/pdf/2401.06506v3.pdf","comment":"Accepted to IEEE ICASSP-2024"},{"id":"http://arxiv.org/abs/2110.09772v3","updated":"2024-01-17T07:38:17Z","published":"2021-10-19T07:29:14Z","title":"Synergy between 3DMM and 3D Landmarks for Accurate 3D Facial Geometry","summary":" This work studies learning from a synergy process of 3D Morphable Models\n(3DMM) and 3D facial landmarks to predict complete 3D facial geometry,\nincluding 3D alignment, face orientation, and 3D face modeling. Our synergy\nprocess leverages a representation cycle for 3DMM parameters and 3D landmarks.\n3D landmarks can be extracted and refined from face meshes built by 3DMM\nparameters. We next reverse the representation direction and show that\npredicting 3DMM parameters from sparse 3D landmarks improves the information\nflow. Together we create a synergy process that utilizes the relation between\n3D landmarks and 3DMM parameters, and they collaboratively contribute to better\nperformance. We extensively validate our contribution on full tasks of facial\ngeometry prediction and show our superior and robust performance on these tasks\nfor various scenarios. Particularly, we adopt only simple and widely-used\nnetwork operations to attain fast and accurate facial geometry prediction.\nCodes and data: https://choyingw.github.io/works/SynergyNet/\n","authors":["Cho-Ying Wu","Qiangeng Xu","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2110.09772v3.pdf","comment":"Accepted at 3DV 2021. This conference version supersedes\n arXiv:2104.08403"},{"id":"http://arxiv.org/abs/2401.09019v1","updated":"2024-01-17T07:30:52Z","published":"2024-01-17T07:30:52Z","title":"Change Detection Between Optical Remote Sensing Imagery and Map Data via\n Segment Anything Model (SAM)","summary":" Unsupervised multimodal change detection is pivotal for time-sensitive tasks\nand comprehensive multi-temporal Earth monitoring. 
In this study, we explore\nunsupervised multimodal change detection between two key remote sensing data\nsources: optical high-resolution imagery and OpenStreetMap (OSM) data.\nSpecifically, we propose to utilize the vision foundation model Segmentation\nAnything Model (SAM), for addressing our task. Leveraging SAM's exceptional\nzero-shot transfer capability, high-quality segmentation maps of optical images\ncan be obtained. Thus, we can directly compare these two heterogeneous data\nforms in the so-called segmentation domain. We then introduce two strategies\nfor guiding SAM's segmentation process: the 'no-prompt' and 'box/mask prompt'\nmethods. The two strategies are designed to detect land-cover changes in\ngeneral scenarios and to identify new land-cover objects within existing\nbackgrounds, respectively. Experimental results on three datasets indicate that\nthe proposed approach can achieve more competitive results compared to\nrepresentative unsupervised multimodal change detection methods.\n","authors":["Hongruixuan Chen","Jian Song","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2401.09019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04351v2","updated":"2024-01-17T07:20:30Z","published":"2023-04-10T02:02:57Z","title":"Evaluate Geometry of Radiance Fields with Low-frequency Color Prior","summary":" A radiance field is an effective representation of 3D scenes, which has been\nwidely adopted in novel-view synthesis and 3D reconstruction. It is still an\nopen and challenging problem to evaluate the geometry, i.e., the density field,\nas the ground-truth is almost impossible to obtain. One alternative indirect\nsolution is to transform the density field into a point-cloud and compute its\nChamfer Distance with the scanned ground-truth. However, many widely-used\ndatasets have no point-cloud ground-truth since the scanning process along with\nthe equipment is expensive and complicated. To this end, we propose a novel\nmetric, named Inverse Mean Residual Color (IMRC), which can evaluate the\ngeometry only with the observation images. Our key insight is that the better\nthe geometry, the lower-frequency the computed color field. From this insight,\ngiven a reconstructed density field and observation images, we design a\nclosed-form method to approximate the color field with low-frequency spherical\nharmonics, and compute the inverse mean residual color. Then the higher the\nIMRC, the better the geometry. Qualitative and quantitative experimental\nresults verify the effectiveness of our proposed IMRC metric. We also benchmark\nseveral state-of-the-art methods using IMRC to promote future related research.\nOur code is available at https://github.com/qihangGH/IMRC.\n","authors":["Qihang Fang","Yafei Song","Keqiang Li","Li Shen","Huaiyu Wu","Gang Xiong","Liefeng Bo"],"pdf_url":"https://arxiv.org/pdf/2304.04351v2.pdf","comment":"This paper has been accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.08332v2","updated":"2024-01-17T07:18:11Z","published":"2024-01-16T12:53:42Z","title":"Generative Denoise Distillation: Simple Stochastic Noises Induce\n Efficient Knowledge Transfer for Dense Prediction","summary":" Knowledge distillation is the process of transferring knowledge from a more\npowerful large model (teacher) to a simpler counterpart (student). Numerous\ncurrent approaches involve the student imitating the knowledge of the teacher\ndirectly. 
However, redundancy still exists in the learned representations\nthrough these prevalent methods, which tend to learn each spatial location's\nfeatures indiscriminately. To derive a more compact representation (concept\nfeature) from the teacher, inspired by human cognition, we suggest an\ninnovative method, termed Generative Denoise Distillation (GDD), where\nstochastic noises are added to the concept feature of the student to embed them\ninto the generated instance feature from a shallow network. Then, the generated\ninstance feature is aligned with the knowledge of the instance from the\nteacher. We extensively experiment with object detection, instance\nsegmentation, and semantic segmentation to demonstrate the versatility and\neffectiveness of our method. Notably, GDD achieves new state-of-the-art\nperformance in the tasks mentioned above. We have achieved substantial\nimprovements in semantic segmentation by enhancing PspNet and DeepLabV3, both\nof which are based on ResNet-18, resulting in mIoU scores of 74.67 and 77.69,\nrespectively, surpassing their previous scores of 69.85 and 73.20 on the\nCityscapes dataset of 20 categories. The source code is available at\nhttps://github.com/ZhgLiu/GDD.\n","authors":["Zhaoge Liu","Xiaohao Xu","Yunkang Cao","Weiming Shen"],"pdf_url":"https://arxiv.org/pdf/2401.08332v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09008v1","updated":"2024-01-17T07:06:56Z","published":"2024-01-17T07:06:56Z","title":"Hybrid of DiffStride and Spectral Pooling in Convolutional Neural\n Networks","summary":" Stride determines the distance between adjacent filter positions as the\nfilter moves across the input. A fixed stride causes important information\ncontained in the image can not be captured, so that important information is\nnot classified. Therefore, in previous research, the DiffStride Method was\napplied, namely the Strided Convolution Method with which it can learn its own\nstride value. Severe Quantization and a constraining lower bound on preserved\ninformation are arises with Max Pooling Downsampling Method. Spectral Pooling\nreduce the constraint lower bound on preserved information by cutting off the\nrepresentation in the frequency domain. In this research a CNN Model is\nproposed with the Downsampling Learnable Stride Technique performed by\nBackpropagation combined with the Spectral Pooling Technique. Diffstride and\nSpectral Pooling techniques are expected to maintain most of the information\ncontained in the image. In this study, we compare the Hybrid Method, which is a\ncombined implementation of Spectral Pooling and DiffStride against the Baseline\nMethod, which is the DiffStride implementation on ResNet 18. The accuracy\nresult of the DiffStride combination with Spectral Pooling improves over\nDiffStride which is baseline method by 0.0094. 
This shows that the Hybrid\nMethod can maintain most of the information by cutting off the representation in\nthe frequency domain and determining the stride of the learning result through\nBackpropagation.\n","authors":["Sulthan Rafif","Mochamad Arfan Ravy Wahyu Pratama","Mohammad Faris Azhar","Ahmad Mustafidul Ibad","Lailil Muflikhah","Novanto Yudistira"],"pdf_url":"https://arxiv.org/pdf/2401.09008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09006v1","updated":"2024-01-17T06:59:32Z","published":"2024-01-17T06:59:32Z","title":"Generalized Face Liveness Detection via De-spoofing Face Generator","summary":" Previous Face Anti-spoofing (FAS) works face the challenge of generalizing in\nunseen domains. One of the major problems is that most existing FAS datasets\nare relatively small and lack data diversity. However, we find that there are\nnumerous real faces that can be easily acquired under various conditions, which\nare neglected by previous FAS works. In this paper, we propose an Anomalous cue\nGuided FAS (AG-FAS) method, which leverages real faces for improving model\ngeneralization via a De-spoofing Face Generator (DFG). Specifically, the DFG\ntrained only on the real faces gains the knowledge of what a real face should\nbe like and can generate a \"real\" version of the face corresponding to any\ngiven input face. The difference between the generated \"real\" face and the\ninput face can provide an anomalous cue for the downstream FAS task. We then\npropose an Anomalous cue Guided FAS feature extraction Network (AG-Net) to\nfurther improve the FAS feature generalization via a cross-attention\ntransformer. Extensive experiments on a total of nine public datasets show our\nmethod achieves state-of-the-art results under cross-domain evaluations with\nunseen scenarios and unknown presentation attacks.\n","authors":["Xingming Long","Shiguang Shan","Jie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09006v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2401.03145v2","updated":"2024-01-17T06:45:29Z","published":"2024-01-06T07:30:41Z","title":"Self-supervised Feature Adaptation for 3D Industrial Anomaly Detection","summary":" Industrial anomaly detection is generally addressed as an unsupervised task\nthat aims at locating defects with only normal training samples. Recently,\nnumerous 2D anomaly detection methods have been proposed and have achieved\npromising results; however, using only the 2D RGB data as input is not\nsufficient to identify imperceptible geometric surface anomalies. Hence, in\nthis work, we focus on multi-modal anomaly detection. Specifically, we\ninvestigate early multi-modal approaches that attempted to utilize models\npre-trained on large-scale visual datasets, i.e., ImageNet, to construct\nfeature databases. We empirically find that directly using these\npre-trained models is not optimal: it can either fail to detect subtle defects\nor mistake abnormal features for normal ones. 
This may be attributed to the\ndomain gap between target industrial data and source data. To address this problem,\nwe propose a Local-to-global Self-supervised Feature Adaptation (LSFA) method\nto finetune the adaptors and learn task-oriented representation toward anomaly\ndetection. Both intra-modal adaptation and cross-modal alignment are optimized\nfrom a local-to-global perspective in LSFA to ensure the representation quality\nand consistency in the inference stage. Extensive experiments demonstrate that\nour method not only brings a significant performance boost to feature embedding\nbased approaches, but also outperforms previous State-of-The-Art (SoTA) methods\nprominently on both MVTec-3D AD and Eyecandies datasets, e.g., LSFA achieves\n97.1% I-AUROC on MVTec-3D, surpassing the previous SoTA by +3.4%.\n","authors":["Yuanpeng Tu","Boshen Zhang","Liang Liu","Yuxi Li","Xuhai Chen","Jiangning Zhang","Yabiao Wang","Chengjie Wang","Cai Rong Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.03145v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10829v2","updated":"2024-01-17T06:37:18Z","published":"2023-04-21T09:12:29Z","title":"Deep Attention Unet: A Network Model with Global Feature Perception\n Ability","summary":" Remote sensing image segmentation is a specific task of remote sensing image\ninterpretation. A good remote sensing image segmentation algorithm can provide\nguidance for environmental protection, agricultural production, and urban\nconstruction. This paper proposes a new type of UNet image segmentation\nalgorithm based on a channel self-attention mechanism and residual connection\ncalled . In my experiment, the new network model improved mIOU by 2.48%\ncompared to traditional UNet on the FoodNet dataset. The image segmentation\nalgorithm proposed in this article enhances the internal connections between\ndifferent items in the image, thus achieving better segmentation results for\nremote sensing images with occlusion.\n","authors":["Jiacheng Li"],"pdf_url":"https://arxiv.org/pdf/2304.10829v2.pdf","comment":"The experiment was inadequate and the experimental method needed\n major changes"},{"id":"http://arxiv.org/abs/2006.07802v2","updated":"2024-01-17T06:30:19Z","published":"2020-06-14T05:03:52Z","title":"Geometry-Aware Instance Segmentation with Disparity Maps","summary":" Most previous works of outdoor instance segmentation for images only use\ncolor information. We explore a novel direction of sensor fusion to exploit\nstereo cameras. Geometric information from disparities helps separate\noverlapping objects of the same or different classes. Moreover, geometric\ninformation penalizes region proposals with unlikely 3D shapes thus suppressing\nfalse positive detections. Mask regression is based on 2D, 2.5D, and 3D ROI\nusing the pseudo-lidar and image-based representations. These mask predictions\nare fused by a mask scoring process. However, public datasets only adopt stereo\nsystems with shorter baseline and focal length, which limit measuring ranges of\nstereo cameras. We collect and utilize High-Quality Driving Stereo (HQDS)\ndataset, using much longer baseline and focal length with higher resolution.\nOur performance attains state of the art. Please refer to our project page. 
The\nfull paper is available here.\n","authors":["Cho-Ying Wu","Xiaoyan Hu","Michael Happold","Qiangeng Xu","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2006.07802v2.pdf","comment":"CVPR 2020 Workshop of Scalability in Autonomous Driving (WSAD).\n Please refer to WSAD site for details; fix typos"},{"id":"http://arxiv.org/abs/2401.08998v1","updated":"2024-01-17T06:22:47Z","published":"2024-01-17T06:22:47Z","title":"Attack and Reset for Unlearning: Exploiting Adversarial Noise toward\n Machine Unlearning through Parameter Re-initialization","summary":" With growing concerns surrounding privacy and regulatory compliance, the\nconcept of machine unlearning has gained prominence, aiming to selectively\nforget or erase specific learned information from a trained model. In response\nto this critical need, we introduce a novel approach called Attack-and-Reset\nfor Unlearning (ARU). This algorithm leverages meticulously crafted adversarial\nnoise to generate a parameter mask, effectively resetting certain parameters\nand rendering them unlearnable. ARU outperforms current state-of-the-art\nresults on two facial machine-unlearning benchmark datasets, MUFAC and MUCAC.\nIn particular, we present the steps involved in attacking and masking that\nstrategically filter and re-initialize network parameters biased towards the\nforget set. Our work represents a significant advancement in rendering data\nunexploitable to deep learning models through parameter re-initialization,\nachieved by harnessing adversarial noise to craft a mask.\n","authors":["Yoonhwa Jung","Ikhyun Cho","Shun-Hsiang Hsu","Julia Hockenmaier"],"pdf_url":"https://arxiv.org/pdf/2401.08998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.05109v2","updated":"2024-01-17T05:39:34Z","published":"2023-02-10T08:21:01Z","title":"Adjacent-Level Feature Cross-Fusion With 3-D CNN for Remote Sensing\n Image Change Detection","summary":" Deep learning-based change detection (CD) using remote sensing images has\nreceived increasing attention in recent years. However, how to effectively\nextract and fuse the deep features of bi-temporal images for improving the\naccuracy of CD is still a challenge. To address that, a novel adjacent-level\nfeature fusion network with 3D convolution (named AFCF3D-Net) is proposed in\nthis article. First, through the inner fusion property of 3D convolution, we\ndesign a new feature fusion way that can simultaneously extract and fuse the\nfeature information from bi-temporal images. Then, to alleviate the semantic\ngap between low-level features and high-level features, we propose an\nadjacent-level feature cross-fusion (AFCF) module to aggregate complementary\nfeature information between the adjacent levels. Furthermore, the full-scale\nskip connection strategy is introduced to improve the capability of pixel-wise\nprediction and the compactness of changed objects in the results. Finally, the\nproposed AFCF3D-Net has been validated on the three challenging remote sensing\nCD datasets: the Wuhan building dataset (WHU-CD), the LEVIR building dataset\n(LEVIR-CD), and the Sun Yat-Sen University dataset (SYSU-CD). The results of\nquantitative analysis and qualitative comparison demonstrate that the proposed\nAFCF3D-Net achieves better performance compared to other state-of-the-art\nmethods. 
The code for this work is available at\nhttps://github.com/wm-Githuber/AFCF3D-Net.\n","authors":["Yuanxin Ye","Mengmeng Wang","Liang Zhou","Guangyang Lei","Jianwei Fan","Yao Qin"],"pdf_url":"https://arxiv.org/pdf/2302.05109v2.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2003.06945v4","updated":"2024-01-17T05:29:28Z","published":"2020-03-15T23:23:26Z","title":"Scene Completeness-Aware Lidar Depth Completion for Driving Scenario","summary":" This paper introduces Scene Completeness-Aware Depth Completion (SCADC) to\ncomplete raw lidar scans into dense depth maps with fine and complete scene\nstructures. Recent sparse depth completion for lidars only focuses on the lower\nscenes and produces irregular estimations on the upper because existing\ndatasets, such as KITTI, do not provide groundtruth for upper areas. These\nareas are considered less important since they are usually sky or trees of less\nscene understanding interest. However, we argue that in several driving\nscenarios such as large trucks or cars with loads, objects could extend to the\nupper parts of scenes. Thus depth maps with structured upper scene estimation\nare important for RGBD algorithms. SCADC adopts stereo images that produce\ndisparities with better scene completeness but are generally less precise than\nlidars, to help sparse lidar depth completion. To our knowledge, we are the\nfirst to focus on scene completeness of sparse depth completion. We validate\nour SCADC on both depth estimate precision and scene-completeness on KITTI.\nMoreover, we experiment on less-explored outdoor RGBD semantic segmentation\nwith scene completeness-aware D-input to validate our method.\n","authors":["Cho-Ying Wu","Ulrich Neumann"],"pdf_url":"https://arxiv.org/pdf/2003.06945v4.pdf","comment":"Present at ICASSP 2021; fix typos"},{"id":"http://arxiv.org/abs/2401.08973v1","updated":"2024-01-17T04:52:40Z","published":"2024-01-17T04:52:40Z","title":"OCTO+: A Suite for Automatic Open-Vocabulary Object Placement in Mixed\n Reality","summary":" One key challenge in Augmented Reality is the placement of virtual content in\nnatural locations. Most existing automated techniques can only work with a\nclosed-vocabulary, fixed set of objects. In this paper, we introduce and\nevaluate several methods for automatic object placement using recent advances\nin open-vocabulary vision-language models. Through a multifaceted evaluation,\nwe identify a new state-of-the-art method, OCTO+. We also introduce a benchmark\nfor automatically evaluating the placement of virtual objects in augmented\nreality, alleviating the need for costly user studies. Through this, in\naddition to human evaluations, we find that OCTO+ places objects in a valid\nregion over 70% of the time, outperforming other methods on a range of metrics.\n","authors":["Aditya Sharma","Luke Yoffe","Tobias Höllerer"],"pdf_url":"https://arxiv.org/pdf/2401.08973v1.pdf","comment":"2024 IEEE International Conference on Artificial Intelligence and\n eXtended and Virtual Reality (AIXVR)"},{"id":"http://arxiv.org/abs/2401.08972v1","updated":"2024-01-17T04:52:32Z","published":"2024-01-17T04:52:32Z","title":"Hearing Loss Detection from Facial Expressions in One-on-one\n Conversations","summary":" Individuals with impaired hearing experience difficulty in conversations,\nespecially in noisy environments. This difficulty often manifests as a change\nin behavior and may be captured via facial expressions, such as the expression\nof discomfort or fatigue. 
In this work, we build on this idea and introduce the\nproblem of detecting hearing loss from an individual's facial expressions\nduring a conversation. Building machine learning models that can represent\nhearing-related facial expression changes is a challenge. In addition, models\nneed to disentangle spurious age-related correlations from hearing-driven\nexpressions. To this end, we propose a self-supervised pre-training strategy\ntailored for the modeling of expression variations. We also use adversarial\nrepresentation learning to mitigate the age bias. We evaluate our approach on a\nlarge-scale egocentric dataset with real-world conversational scenarios\ninvolving subjects with hearing loss and show that our method for hearing loss\ndetection achieves superior performance over baselines.\n","authors":["Yufeng Yin","Ishwarya Ananthabhotla","Vamsi Krishna Ithapu","Stavros Petridis","Yu-Hsiang Wu","Christi Miller"],"pdf_url":"https://arxiv.org/pdf/2401.08972v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08154v2","updated":"2024-01-17T04:44:37Z","published":"2024-01-16T06:53:03Z","title":"Learned Image Compression with ROI-Weighted Distortion and Bit\n Allocation","summary":" This one page paper describes our method for the track of image compression.\nTo achieve better perceptual quality, we use the adversarial loss to generate\nrealistic textures, use region of interest (ROI) mask to guide the bit\nallocation for different regions. Our Team name is TLIC.\n","authors":["Wei Jiang","Yongqi Zhai","Hangyu Li","Ronggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08154v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.08968v1","updated":"2024-01-17T04:43:45Z","published":"2024-01-17T04:43:45Z","title":"COCO is \"ALL'' You Need for Visual Instruction Fine-tuning","summary":" Multi-modal Large Language Models (MLLMs) are increasingly prominent in the\nfield of artificial intelligence. Visual instruction fine-tuning (IFT) is a\nvital process for aligning MLLMs' output with user's intentions. High-quality\nand diversified instruction following data is the key to this fine-tuning\nprocess. Recent studies propose to construct visual IFT datasets through a\nmultifaceted approach: transforming existing datasets with rule-based\ntemplates, employing GPT-4 for rewriting annotations, and utilizing GPT-4V for\nvisual dataset pseudo-labeling. LLaVA-1.5 adopted similar approach and\nconstruct LLaVA-mix-665k, which is one of the simplest, most widely used, yet\nmost effective IFT datasets today. Notably, when properly fine-tuned with this\ndataset, MLLMs can achieve state-of-the-art performance on several benchmarks.\nHowever, we noticed that models trained with this dataset often struggle to\nfollow user instructions properly in multi-round dialog. In addition, tradition\ncaption and VQA evaluation benchmarks, with their closed-form evaluation\nstructure, are not fully equipped to assess the capabilities of modern\nopen-ended generative MLLMs. This problem is not unique to the LLaVA-mix-665k\ndataset, but may be a potential issue in all IFT datasets constructed from\nimage captioning or VQA sources, though the extent of this issue may vary. We\nargue that datasets with diverse and high-quality detailed instruction\nfollowing annotations are essential and adequate for MLLMs IFT. In this work,\nwe establish a new IFT dataset, with images sourced from the COCO dataset along\nwith more diverse instructions. 
Our experiments show that when fine-tuned with\nour proposed dataset, MLLMs achieve better performance on open-ended evaluation\nbenchmarks in both single-round and multi-round dialog settings.\n","authors":["Xiaotian Han","Yiqi Wang","Bohan Zhai","Quanzeng You","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2401.08968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08965v1","updated":"2024-01-17T04:40:30Z","published":"2024-01-17T04:40:30Z","title":"Dynamic DNNs and Runtime Management for Efficient Inference on\n Mobile/Embedded Devices","summary":" Deep neural network (DNN) inference is increasingly being executed on mobile\nand embedded platforms due to several key advantages in latency, privacy and\nalways-on availability. However, due to limited computing resources, efficient\nDNN deployment on mobile and embedded platforms is challenging. Although many\nhardware accelerators and static model compression methods were proposed by\nprevious works, at system runtime, multiple applications are typically executed\nconcurrently and compete for hardware resources. This raises two main\nchallenges: Runtime Hardware Availability and Runtime Application Variability.\nPrevious works have addressed these challenges through either dynamic neural\nnetworks that contain sub-networks with different performance trade-offs or\nruntime hardware resource management. In this thesis, we proposed a combined\nmethod: a system was developed for DNN performance trade-off management,\ncombining the runtime trade-off opportunities in both algorithms and hardware\nto meet dynamically changing application performance targets and hardware\nconstraints in real time. We co-designed novel Dynamic Super-Networks to\nmaximise runtime system-level performance and energy efficiency on\nheterogeneous hardware platforms. Compared with SOTA, our experimental results\nusing ImageNet on the GPU of Jetson Xavier NX show our model is 2.4x faster for\nsimilar ImageNet Top-1 accuracy, or 5.1% higher accuracy at similar latency. We\nalso designed a hierarchical runtime resource manager that tunes both dynamic\nneural networks and DVFS at runtime. Compared with the Linux DVFS governor\nschedutil, our runtime approach achieves up to a 19% energy reduction and a 9%\nlatency reduction in a single-model deployment scenario, and an 89% energy\nreduction and a 23% latency reduction in a two concurrent model deployment\nscenario.\n","authors":["Lei Xun","Jonathon Hare","Geoff V. Merrett"],"pdf_url":"https://arxiv.org/pdf/2401.08965v1.pdf","comment":"Accepted at Design, Automation & Test in Europe Conference (DATE)\n 2024, PhD Forum"},{"id":"http://arxiv.org/abs/2401.00436v4","updated":"2024-01-17T04:21:47Z","published":"2023-12-31T09:24:28Z","title":"Diff-PCR: Diffusion-Based Correspondence Searching in Doubly Stochastic\n Matrix Space for Point Cloud Registration","summary":" Efficiently finding optimal correspondences between point clouds is crucial\nfor solving both rigid and non-rigid point cloud registration problems.\nExisting methods often rely on geometric or semantic feature embedding to\nestablish correspondences and estimate transformations or flow fields.\nRecently, state-of-the-art methods have employed RAFT-like iterative updates to\nrefine the solution. However, these methods have certain limitations. Firstly,\ntheir iterative refinement design lacks transparency, and their iterative\nupdates follow a fixed path during the refinement process, which can lead to\nsuboptimal results. 
Secondly, these methods overlook the importance of refining\nor optimizing correspondences (or matching matrices) as a precursor to solving\ntransformations or flow fields. They typically compute candidate\ncorrespondences based on distances in the point feature space. However, they\nonly project the candidate matching matrix into some matrix space once with\nSinkhorn or dual softmax operations to obtain final correspondences. This\none-shot projected matching matrix may be far from the globally optimal one,\nand these approaches do not consider the distribution of the target matching\nmatrix. In this paper, we propose a novel approach that exploits the Denoising\nDiffusion Model to predict a searching gradient for the optimal matching matrix\nwithin the Doubly Stochastic Matrix Space. During the reverse denoising\nprocess, our method iteratively searches for better solutions along this\ndenoising gradient, which points towards the maximum likelihood direction of\nthe target matching matrix. Our method offers flexibility by allowing the\nsearch to start from any initial matching matrix provided by the online\nbackbone or white noise. Experimental evaluations on the 3DMatch/3DLoMatch and\n4DMatch/4DLoMatch datasets demonstrate the effectiveness of our newly designed\nframework.\n","authors":["Qianliang Wu","Haobo Jiang","Yaqing Ding","Lei Luo","Jin Xie","Jian Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00436v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00519v2","updated":"2024-01-17T03:56:02Z","published":"2023-07-02T09:02:53Z","title":"Image Background Serves as Good Proxy for Out-of-distribution Data","summary":" Out-of-distribution (OOD) detection empowers the model trained on the closed\nimage set to identify unknown data in the open world. Though many prior\ntechniques have yielded considerable improvements in this research direction,\ntwo crucial obstacles still remain. Firstly, a unified perspective has yet to\nbe presented to view the developed arts with individual designs, which is vital\nfor providing insights into future work. Secondly, we expect sufficient natural\nOOD supervision to promote the generation of compact boundaries between the\nin-distribution (ID) and OOD data without collecting explicit OOD samples. To\ntackle these issues, we propose a general probabilistic framework to interpret\nmany existing methods and an OOD-data-free model, namely\n\\textbf{S}elf-supervised \\textbf{S}ampling for \\textbf{O}OD \\textbf{D}etection\n(SSOD). SSOD efficiently exploits natural OOD signals from the ID data based on\nthe local property of convolution. With these supervisions, it jointly\noptimizes the OOD detection and conventional ID classification in an end-to-end\nmanner. Extensive experiments reveal that SSOD establishes competitive\nstate-of-the-art performance on many large-scale benchmarks, outperforming the\nbest previous method by a large margin, \\eg, reporting \\textbf{-6.28\\%} FPR95\nand \\textbf{+0.77\\%} AUROC on ImageNet, \\textbf{-19.01\\%} FPR95 and\n\\textbf{+3.04\\%} AUROC on CIFAR-10, and top-ranked performance on hard OOD\ndatasets, \\ie, ImageNet-O and OpenImage-O.\n","authors":["Sen Pei"],"pdf_url":"https://arxiv.org/pdf/2307.00519v2.pdf","comment":"ICLR 2024. 
arXiv admin note: text overlap with arXiv:2301.06657"},{"id":"http://arxiv.org/abs/2401.08943v1","updated":"2024-01-17T03:34:38Z","published":"2024-01-17T03:34:38Z","title":"Fluid Dynamic DNNs for Reliable and Adaptive Distributed Inference on\n Edge Devices","summary":" Distributed inference is a popular approach for efficient DNN inference at\nthe edge. However, traditional Static and Dynamic DNNs are not\ndistribution-friendly, causing system reliability and adaptability issues. In\nthis paper, we introduce Fluid Dynamic DNNs (Fluid DyDNNs), tailored for\ndistributed inference. Distinct from Static and Dynamic DNNs, Fluid DyDNNs\nutilize a novel nested incremental training algorithm to enable independent and\ncombined operation of its sub-networks, enhancing system reliability and\nadaptability. Evaluation on embedded Arm CPUs with a DNN model and the MNIST\ndataset, shows that in scenarios of single device failure, Fluid DyDNNs ensure\ncontinued inference, whereas Static and Dynamic DNNs fail. When devices are\nfully operational, Fluid DyDNNs can operate in either a High-Accuracy mode and\nachieve comparable accuracy with Static DNNs, or in a High-Throughput mode and\nachieve 2.5x and 2x throughput compared with Static and Dynamic DNNs,\nrespectively.\n","authors":["Lei Xun","Mingyu Hu","Hengrui Zhao","Amit Kumar Singh","Jonathon Hare","Geoff V. Merrett"],"pdf_url":"https://arxiv.org/pdf/2401.08943v1.pdf","comment":"Accepted at Design, Automation & Test in Europe Conference (DATE)\n 2024"},{"id":"http://arxiv.org/abs/2401.08937v1","updated":"2024-01-17T03:18:02Z","published":"2024-01-17T03:18:02Z","title":"ICON: Incremental CONfidence for Joint Pose and Radiance Field\n Optimization","summary":" Neural Radiance Fields (NeRF) exhibit remarkable performance for Novel View\nSynthesis (NVS) given a set of 2D images. However, NeRF training requires\naccurate camera pose for each input view, typically obtained by\nStructure-from-Motion (SfM) pipelines. Recent works have attempted to relax\nthis constraint, but they still often rely on decent initial poses which they\ncan refine. Here we aim at removing the requirement for pose initialization. We\npresent Incremental CONfidence (ICON), an optimization procedure for training\nNeRFs from 2D video frames. ICON only assumes smooth camera motion to estimate\ninitial guess for poses. Further, ICON introduces ``confidence\": an adaptive\nmeasure of model quality used to dynamically reweight gradients. ICON relies on\nhigh-confidence poses to learn NeRF, and high-confidence 3D structure (as\nencoded by NeRF) to learn poses. We show that ICON, without prior pose\ninitialization, achieves superior performance in both CO3D and HO3D versus\nmethods which use SfM pose.\n","authors":["Weiyao Wang","Pierre Gleize","Hao Tang","Xingyu Chen","Kevin J Liang","Matt Feiszli"],"pdf_url":"https://arxiv.org/pdf/2401.08937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08932v1","updated":"2024-01-17T03:02:31Z","published":"2024-01-17T03:02:31Z","title":"Learning to detect cloud and snow in remote sensing images from noisy\n labels","summary":" Detecting clouds and snow in remote sensing images is an essential\npreprocessing task for remote sensing imagery. 
Previous works draw inspiration\nfrom semantic segmentation models in computer vision, with most research\nfocusing on improving model architectures to enhance detection performance.\nHowever, unlike natural images, the complexity of scenes and the diversity of\ncloud types in remote sensing images result in many inaccurate labels in cloud\nand snow detection datasets, introducing unnecessary noises into the training\nand testing processes. By constructing a new dataset and proposing a novel\ntraining strategy with the curriculum learning paradigm, we guide the model in\nreducing overfitting to noisy labels. Additionally, we design a more\nappropriate model performance evaluation method, that alleviates the\nperformance assessment bias caused by noisy labels. By conducting experiments\non models with UNet and Segformer, we have validated the effectiveness of our\nproposed method. This paper is the first to consider the impact of label noise\non the detection of clouds and snow in remote sensing images.\n","authors":["Zili Liu","Hao Chen","Wenyuan Li","Keyan Chen","Zipeng Qi","Chenyang Liu","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2401.08932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08930v1","updated":"2024-01-17T02:59:34Z","published":"2024-01-17T02:59:34Z","title":"3D Human Pose Analysis via Diffusion Synthesis","summary":" Diffusion models have demonstrated remarkable success in generative modeling.\nIn this paper, we propose PADS (Pose Analysis by Diffusion Synthesis), a novel\nframework designed to address various challenges in 3D human pose analysis\nthrough a unified pipeline. Central to PADS are two distinctive strategies: i)\nlearning a task-agnostic pose prior using a diffusion synthesis process to\neffectively capture the kinematic constraints in human pose data, and ii)\nunifying multiple pose analysis tasks like estimation, completion, denoising,\netc, as instances of inverse problems. The learned pose prior will be treated\nas a regularization imposing on task-specific constraints, guiding the\noptimization process through a series of conditional denoising steps. PADS\nrepresents the first diffusion-based framework for tackling general 3D human\npose analysis within the inverse problem framework. Its performance has been\nvalidated on different benchmarks, signaling the adaptability and robustness of\nthis pipeline.\n","authors":["Haorui Ji","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2401.08930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07486v3","updated":"2024-01-17T02:58:20Z","published":"2023-04-15T06:35:06Z","title":"Region-Enhanced Feature Learning for Scene Semantic Segmentation","summary":" Semantic segmentation in complex scenes relies not only on object appearance\nbut also on object location and the surrounding environment. Nonetheless, it is\ndifficult to model long-range context in the format of pairwise point\ncorrelations due to the huge computational cost for large-scale point clouds.\nIn this paper, we propose using regions as the intermediate representation of\npoint clouds instead of fine-grained points or voxels to reduce the\ncomputational burden. We introduce a novel Region-Enhanced Feature Learning\nNetwork (REFL-Net) that leverages region correlations to enhance point feature\nlearning. We design a region-based feature enhancement (RFE) module, which\nconsists of a Semantic-Spatial Region Extraction stage and a Region Dependency\nModeling stage. 
In the first stage, the input points are grouped into a set of\nregions based on their semantic and spatial proximity. In the second stage, we\nexplore inter-region semantic and spatial relationships by employing a\nself-attention block on region features and then fuse point features with the\nregion features to obtain more discriminative representations. Our proposed RFE\nmodule is plug-and-play and can be integrated with common semantic segmentation\nbackbones. We conduct extensive experiments on ScanNetV2 and S3DIS datasets and\nevaluate our RFE module with different segmentation backbones. Our REFL-Net\nachieves 1.8% mIoU gain on ScanNetV2 and 1.7% mIoU gain on S3DIS with\nnegligible computational cost compared with backbone models. Both quantitative\nand qualitative results show the powerful long-range context modeling ability\nand strong generalization ability of our REFL-Net.\n","authors":["Xin Kang","Chaoqun Wang","Xuejin Chen"],"pdf_url":"https://arxiv.org/pdf/2304.07486v3.pdf","comment":"Accepted by IEEE Transactions on Multimedia 2023"},{"id":"http://arxiv.org/abs/2306.08834v2","updated":"2024-01-17T02:49:33Z","published":"2023-06-15T03:38:09Z","title":"ScrollTimes: Tracing the Provenance of Paintings as a Window into\n History","summary":" The study of cultural artifact provenance, tracing ownership and\npreservation, holds significant importance in archaeology and art history.\nModern technology has advanced this field, yet challenges persist, including\nrecognizing evidence from diverse sources, integrating sociocultural context,\nand enhancing interactive automation for comprehensive provenance analysis. In\ncollaboration with art historians, we examined the handscroll, a traditional\nChinese painting form that provides a rich source of historical data and a\nunique opportunity to explore history through cultural artifacts. We present a\nthree-tiered methodology encompassing artifact, contextual, and provenance\nlevels, designed to create a \"Biography\" for handscroll. Our approach\nincorporates the application of image processing techniques and language models\nto extract, validate, and augment elements within handscroll using various\ncultural heritage databases. To facilitate efficient analysis of non-contiguous\nextracted elements, we have developed a distinctive layout. Additionally, we\nintroduce ScrollTimes, a visual analysis system tailored to support the\nthree-tiered analysis of handscroll, allowing art historians to interactively\ncreate biographies tailored to their interests. Validated through case studies\nand expert interviews, our approach offers a window into history, fostering a\nholistic understanding of handscroll provenance and historical significance.\n","authors":["Wei Zhang","Wong Kam-Kwai","Yitian Chen","Ailing Jia","Luwei Wang","Jian-Wei Zhang","Lechao Cheng","Huamin Qu","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2306.08834v2.pdf","comment":"Accepted by IEEE Transactions on Visualization and Computer Graphics\n (TVCG)"},{"id":"http://arxiv.org/abs/2311.06031v4","updated":"2024-01-17T02:46:53Z","published":"2023-11-10T12:38:16Z","title":"Diagonal Hierarchical Consistency Learning for Semi-supervised Medical\n Image Segmentation","summary":" Medical image segmentation, which is essential for many clinical\napplications, has achieved almost human-level performance via data-driven deep\nlearning technologies. Nevertheless, its performance is predicated upon the\ncostly process of manually annotating a vast amount of medical images. 
To this\nend, we propose a novel framework for robust semi-supervised medical image\nsegmentation using diagonal hierarchical consistency learning (DiHC-Net).\nFirst, it is composed of multiple sub-models with identical multi-scale\narchitecture but with distinct sub-layers, such as up-sampling and\nnormalisation layers. Second, with mutual consistency, a novel consistency\nregularisation is enforced between one model's intermediate and final\nprediction and soft pseudo labels from other models in a diagonal hierarchical\nfashion. A series of experiments verifies the efficacy of our simple framework,\noutperforming all previous approaches on public benchmark dataset on organ and\ntumour.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2311.06031v4.pdf","comment":"4 pages, 2 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2401.08926v1","updated":"2024-01-17T02:25:42Z","published":"2024-01-17T02:25:42Z","title":"Uncertainty-aware No-Reference Point Cloud Quality Assessment","summary":" The evolution of compression and enhancement algorithms necessitates an\naccurate quality assessment for point clouds. Previous works consistently\nregard point cloud quality assessment (PCQA) as a MOS regression problem and\ndevise a deterministic mapping, ignoring the stochasticity in generating MOS\nfrom subjective tests. Besides, the viewpoint switching of 3D point clouds in\nsubjective tests reinforces the judging stochasticity of different subjects\ncompared with traditional images. This work presents the first probabilistic\narchitecture for no-reference PCQA, motivated by the labeling process of\nexisting datasets. The proposed method can model the quality judging\nstochasticity of subjects through a tailored conditional variational\nautoencoder (CVAE) and produces multiple intermediate quality ratings. These\nintermediate ratings simulate the judgments from different subjects and are\nthen integrated into an accurate quality prediction, mimicking the generation\nprocess of a ground truth MOS. Specifically, our method incorporates a Prior\nModule, a Posterior Module, and a Quality Rating Generator, where the former\ntwo modules are introduced to model the judging stochasticity in subjective\ntests, while the latter is developed to generate diverse quality ratings.\nExtensive experiments indicate that our approach outperforms previous\ncutting-edge methods by a large margin and exhibits gratifying cross-dataset\nrobustness.\n","authors":["Songlin Fan","Zixuan Guo","Wei Gao","Ge Li"],"pdf_url":"https://arxiv.org/pdf/2401.08926v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08923v1","updated":"2024-01-17T02:12:57Z","published":"2024-01-17T02:12:57Z","title":"Subwavelength Imaging using a Solid-Immersion Diffractive Optical\n Processor","summary":" Phase imaging is widely used in biomedical imaging, sensing, and material\ncharacterization, among other fields. However, direct imaging of phase objects\nwith subwavelength resolution remains a challenge. Here, we demonstrate\nsubwavelength imaging of phase and amplitude objects based on all-optical\ndiffractive encoding and decoding. To resolve subwavelength features of an\nobject, the diffractive imager uses a thin, high-index solid-immersion layer to\ntransmit high-frequency information of the object to a spatially-optimized\ndiffractive encoder, which converts/encodes high-frequency information of the\ninput into low-frequency spatial modes for transmission through air. 
The\nsubsequent diffractive decoder layers (in air) are jointly designed with the\nencoder using deep-learning-based optimization, and communicate with the\nencoder layer to create magnified images of input objects at its output,\nrevealing subwavelength features that would otherwise be washed away due to\ndiffraction limit. We demonstrate that this all-optical collaboration between a\ndiffractive solid-immersion encoder and the following decoder layers in air can\nresolve subwavelength phase and amplitude features of input objects in a highly\ncompact design. To experimentally demonstrate its proof-of-concept, we used\nterahertz radiation and developed a fabrication method for creating monolithic\nmulti-layer diffractive processors. Through these monolithically fabricated\ndiffractive encoder-decoder pairs, we demonstrated phase-to-intensity\ntransformations and all-optically reconstructed subwavelength phase features of\ninput objects by directly transforming them into magnified intensity features\nat the output. This solid-immersion-based diffractive imager, with its compact\nand cost-effective design, can find wide-ranging applications in bioimaging,\nendoscopy, sensing and materials characterization.\n","authors":["Jingtian Hu","Kun Liao","Niyazi Ulas Dinc","Carlo Gigli","Bijie Bai","Tianyi Gan","Xurong Li","Hanlong Chen","Xilin Yang","Yuhang Li","Cagatay Isil","Md Sadman Sakib Rahman","Jingxi Li","Xiaoyong Hu","Mona Jarrahi","Demetri Psaltis","Aydogan Ozcan"],"pdf_url":"https://arxiv.org/pdf/2401.08923v1.pdf","comment":"32 Pages, 9 Figures"},{"id":"http://arxiv.org/abs/2401.08920v1","updated":"2024-01-17T02:05:21Z","published":"2024-01-17T02:05:21Z","title":"Idempotence and Perceptual Image Compression","summary":" Idempotence is the stability of image codec to re-compression. At the first\nglance, it is unrelated to perceptual image compression. However, we find that\ntheoretically: 1) Conditional generative model-based perceptual codec satisfies\nidempotence; 2) Unconditional generative model with idempotence constraint is\nequivalent to conditional generative codec. Based on this newfound equivalence,\nwe propose a new paradigm of perceptual image codec by inverting unconditional\ngenerative model with idempotence constraints. Our codec is theoretically\nequivalent to conditional generative codec, and it does not require training\nnew models. Instead, it only requires a pre-trained mean-square-error codec and\nunconditional generative model. Empirically, we show that our proposed approach\noutperforms state-of-the-art methods such as HiFiC and ILLM, in terms of\nFr\\'echet Inception Distance (FID). The source code is provided in\nhttps://github.com/tongdaxu/Idempotence-and-Perceptual-Image-Compression.\n","authors":["Tongda Xu","Ziran Zhu","Dailan He","Yanghao Li","Lina Guo","Yuanyuan Wang","Zhe Wang","Hongwei Qin","Yan Wang","Jingjing Liu","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.08920v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2311.02992v2","updated":"2024-01-17T01:56:27Z","published":"2023-11-06T09:55:19Z","title":"NEURO HAND: A weakly supervised Hierarchical Attention Network for\n interpretable neuroimaging abnormality Detection","summary":" Clinical neuroimaging data is naturally hierarchical. Different magnetic\nresonance imaging (MRI) sequences within a series, different slices covering\nthe head, and different regions within each slice all confer different\ninformation. 
In this work we present a hierarchical attention network for\nabnormality detection using MRI scans obtained in a clinical hospital setting.\nThe proposed network is suitable for non-volumetric data (i.e. stacks of\nhigh-resolution MRI slices), and can be trained from binary examination-level\nlabels. We show that this hierarchical approach leads to improved\nclassification, while providing interpretability through either coarse inter-\nand intra-slice abnormality localisation, or giving importance scores for\ndifferent slices and sequences, making our model suitable for use as an\nautomated triaging system in radiology departments.\n","authors":["David A. Wood"],"pdf_url":"https://arxiv.org/pdf/2311.02992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08913v1","updated":"2024-01-17T01:55:59Z","published":"2024-01-17T01:55:59Z","title":"Efficient Image Super-Resolution via Symmetric Visual Attention Network","summary":" An important development direction in the Single-Image Super-Resolution\n(SISR) algorithms is to improve the efficiency of the algorithms. Recently,\nefficient Super-Resolution (SR) research focuses on reducing model complexity\nand improving efficiency through improved deep small kernel convolution,\nleading to a small receptive field. The large receptive field obtained by large\nkernel convolution can significantly improve image quality, but the\ncomputational cost is too high. To improve the reconstruction details of\nefficient super-resolution reconstruction, we propose a Symmetric Visual\nAttention Network (SVAN) by applying large receptive fields. The SVAN\ndecomposes a large kernel convolution into three different combinations of\nconvolution operations and combines them with an attention mechanism to form a\nSymmetric Large Kernel Attention Block (SLKAB), which forms a symmetric\nattention block with a bottleneck structure by the size of the receptive field\nin the convolution combination to extract depth features effectively as the\nbasic component of the SVAN. Our network gets a large receptive field while\nminimizing the number of parameters and improving the perceptual ability of the\nmodel. The experimental results show that the proposed SVAN can obtain\nhigh-quality super-resolution reconstruction results using only about 30% of\nthe parameters of existing SOTA methods.\n","authors":["Chengxu Wu","Qinrui Fan","Shu Hu","Xi Wu","Xin Wang","Jing Hu"],"pdf_url":"https://arxiv.org/pdf/2401.08913v1.pdf","comment":"13 pages,4 figures"},{"id":"http://arxiv.org/abs/2309.04001v3","updated":"2024-01-17T01:47:40Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. 
As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v3.pdf","comment":"14 pages, 3 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.01689v2","updated":"2024-01-17T01:29:23Z","published":"2023-12-04T07:23:44Z","title":"Fast and accurate sparse-view CBCT reconstruction using meta-learned\n neural attenuation field and hash-encoding regularization","summary":" Cone beam computed tomography (CBCT) is an emerging medical imaging technique\nto visualize the internal anatomical structures of patients. During a CBCT\nscan, several projection images of different angles or views are collectively\nutilized to reconstruct a tomographic image. However, reducing the number of\nprojections in a CBCT scan while preserving the quality of a reconstructed\nimage is challenging due to the nature of an ill-posed inverse problem.\nRecently, a neural attenuation field (NAF) method was proposed by adopting a\nneural radiance field algorithm as a new way for CBCT reconstruction,\ndemonstrating fast and promising results using only 50 views. However,\ndecreasing the number of projections is still preferable to reduce potential\nradiation exposure, and a faster reconstruction time is required considering a\ntypical scan time. In this work, we propose a fast and accurate sparse-view\nCBCT reconstruction (FACT) method to provide better reconstruction quality and\nfaster optimization speed in the minimal number of view acquisitions ($<$ 50\nviews). In the FACT method, we meta-trained a neural network and a hash-encoder\nusing a few scans (= 15), and a new regularization technique is utilized to\nreconstruct the details of an anatomical structure. In conclusion, we have\nshown that the FACT method produced better, and faster reconstruction results\nover the other conventional algorithms based on CBCT scans of different body\nparts (chest, head, and abdomen) and CT vendors (Siemens, Phillips, and GE).\n","authors":["Heejun Shin","Taehee Kim","Jongho Lee","Se Young Chun","Seungryung Cho","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2312.01689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08903v1","updated":"2024-01-17T01:10:17Z","published":"2024-01-17T01:10:17Z","title":"PPR: Enhancing Dodging Attacks while Maintaining Impersonation Attacks\n on Face Recognition Systems","summary":" Adversarial Attacks on Face Recognition (FR) encompass two types:\nimpersonation attacks and evasion attacks. We observe that achieving a\nsuccessful impersonation attack on FR does not necessarily ensure a successful\ndodging attack on FR in the black-box setting. Introducing a novel attack\nmethod named Pre-training Pruning Restoration Attack (PPR), we aim to enhance\nthe performance of dodging attacks whilst avoiding the degradation of\nimpersonation attacks. 
Our method employs adversarial example pruning, enabling\na portion of adversarial perturbations to be set to zero, while tending to\nmaintain the attack performance. By utilizing adversarial example pruning, we\ncan prune the pre-trained adversarial examples and selectively free up certain\nadversarial perturbations. Thereafter, we embed adversarial perturbations in\nthe pruned area, which enhances the dodging performance of the adversarial face\nexamples. The effectiveness of our proposed attack method is demonstrated\nthrough our experimental results, showcasing its superior performance.\n","authors":["Fengfan Zhou","Heifei Ling"],"pdf_url":"https://arxiv.org/pdf/2401.08903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13066v2","updated":"2024-01-17T00:39:17Z","published":"2023-12-20T14:45:57Z","title":"PPEA-Depth: Progressive Parameter-Efficient Adaptation for\n Self-Supervised Monocular Depth Estimation","summary":" Self-supervised monocular depth estimation is of significant importance with\napplications spanning across autonomous driving and robotics. However, the\nreliance on self-supervision introduces a strong static-scene assumption,\nthereby posing challenges in achieving optimal performance in dynamic scenes,\nwhich are prevalent in most real-world situations. To address these issues, we\npropose PPEA-Depth, a Progressive Parameter-Efficient Adaptation approach to\ntransfer a pre-trained image model for self-supervised depth estimation. The\ntraining comprises two sequential stages: an initial phase trained on a dataset\nprimarily composed of static scenes, succeeded by an expansion to more\nintricate datasets involving dynamic scenes. To facilitate this process, we\ndesign compact encoder and decoder adapters to enable parameter-efficient\ntuning, allowing the network to adapt effectively. They not only uphold\ngeneralized patterns from pre-trained image models but also retain knowledge\ngained from the preceding phase into the subsequent one. Extensive experiments\ndemonstrate that PPEA-Depth achieves state-of-the-art performance on KITTI,\nCityScapes and DDAD datasets.\n","authors":["Yue-Jiang Dong","Yuan-Chen Guo","Ying-Tian Liu","Fang-Lue Zhang","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.13066v2.pdf","comment":"Accepted by AAAI 2024 Project homepage:\n https://yuejiangdong.github.io/PPEADepth/"},{"id":"http://arxiv.org/abs/2310.16044v3","updated":"2024-01-17T00:18:21Z","published":"2023-10-24T17:57:58Z","title":"Stanford-ORB: A Real-World 3D Object Inverse Rendering Benchmark","summary":" We introduce Stanford-ORB, a new real-world 3D Object inverse Rendering\nBenchmark. Recent advances in inverse rendering have enabled a wide range of\nreal-world applications in 3D content generation, moving rapidly from research\nand commercial use cases to consumer devices. While the results continue to\nimprove, there is no real-world benchmark that can quantitatively assess and\ncompare the performance of various inverse rendering methods. Existing\nreal-world datasets typically only consist of the shape and multi-view images\nof objects, which are not sufficient for evaluating the quality of material\nrecovery and object relighting. Methods capable of recovering material and\nlighting often resort to synthetic data for quantitative evaluation, which on\nthe other hand does not guarantee generalization to complex real-world\nenvironments. 
We introduce a new dataset of real-world objects captured under a\nvariety of natural scenes with ground-truth 3D scans, multi-view images, and\nenvironment lighting. Using this dataset, we establish the first comprehensive\nreal-world evaluation benchmark for object inverse rendering tasks from\nin-the-wild scenes, and compare the performance of various existing methods.\n","authors":["Zhengfei Kuang","Yunzhi Zhang","Hong-Xing Yu","Samir Agarwala","Shangzhe Wu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2310.16044v3.pdf","comment":"NeurIPS 2023 Datasets and Benchmarks Track. The first two authors\n contributed equally to this work. Project page:\n https://stanfordorb.github.io/"},{"id":"http://arxiv.org/abs/2310.06234v2","updated":"2024-01-17T00:03:00Z","published":"2023-10-10T01:04:15Z","title":"Efficient Adaptation of Large Vision Transformer via Adapter\n Re-Composing","summary":" The advent of high-capacity pre-trained models has revolutionized\nproblem-solving in computer vision, shifting the focus from training\ntask-specific models to adapting pre-trained models. Consequently, effectively\nadapting large pre-trained models to downstream tasks in an efficient manner\nhas become a prominent research area. Existing solutions primarily concentrate\non designing lightweight adapters and their interaction with pre-trained\nmodels, with the goal of minimizing the number of parameters requiring updates.\nIn this study, we propose a novel Adapter Re-Composing (ARC) strategy that\naddresses efficient pre-trained model adaptation from a fresh perspective. Our\napproach considers the reusability of adaptation parameters and introduces a\nparameter-sharing scheme. Specifically, we leverage symmetric\ndown-/up-projections to construct bottleneck operations, which are shared\nacross layers. By learning low-dimensional re-scaling coefficients, we can\neffectively re-compose layer-adaptive adapters. This parameter-sharing strategy\nin adapter design allows us to significantly reduce the number of new\nparameters while maintaining satisfactory performance, thereby offering a\npromising approach to compress the adaptation cost. We conduct experiments on\n24 downstream image classification tasks using various Vision Transformer\nvariants to evaluate our method. The results demonstrate that our approach\nachieves compelling transfer learning performance with a reduced parameter\ncount. Our code is available at\n\\href{https://github.com/DavidYanAnDe/ARC}{https://github.com/DavidYanAnDe/ARC}.\n","authors":["Wei Dong","Dawei Yan","Zhijun Lin","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.06234v2.pdf","comment":"Paper is accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.08587v2","updated":"2024-01-17T23:49:24Z","published":"2023-10-12T17:59:58Z","title":"Pseudo-Generalized Dynamic View Synthesis from a Video","summary":" Rendering scenes observed in a monocular video from novel viewpoints is a\nchallenging problem. For static scenes the community has studied both\nscene-specific optimization techniques, which optimize on every test scene, and\ngeneralized techniques, which only run a deep net forward pass on a test scene.\nIn contrast, for dynamic scenes, scene-specific optimization techniques exist,\nbut, to our best knowledge, there is currently no generalized method for\ndynamic novel view synthesis from a given monocular video. 
To answer whether\ngeneralized dynamic novel view synthesis from monocular videos is possible\ntoday, we establish an analysis framework based on existing techniques and work\ntoward the generalized approach. We find a pseudo-generalized process without\nscene-specific appearance optimization is possible, but geometrically and\ntemporally consistent depth estimates are needed. Despite no scene-specific\nappearance optimization, the pseudo-generalized approach improves upon some\nscene-specific methods.\n","authors":["Xiaoming Zhao","Alex Colburn","Fangchang Ma","Miguel Angel Bautista","Joshua M. Susskind","Alexander G. Schwing"],"pdf_url":"https://arxiv.org/pdf/2310.08587v2.pdf","comment":"ICLR 2024; Originally titled as \"Is Generalized Dynamic Novel View\n Synthesis from Monocular Videos Possible Today?\"; Project page:\n https://xiaoming-zhao.github.io/projects/pgdvs"},{"id":"http://arxiv.org/abs/2401.09639v1","updated":"2024-01-17T23:21:42Z","published":"2024-01-17T23:21:42Z","title":"Uncertainty Modeling in Ultrasound Image Segmentation for Precise Fetal\n Biometric Measurements","summary":" Medical image segmentation, particularly in the context of ultrasound data,\nis a crucial aspect of computer vision and medical imaging. This paper delves\ninto the complexities of uncertainty in the segmentation process, focusing on\nfetal head and femur ultrasound images. The proposed methodology involves\nextracting target contours and exploring techniques for precise parameter\nmeasurement. Uncertainty modeling methods are employed to enhance the training\nand testing processes of the segmentation network. The study reveals that the\naverage absolute error in fetal head circumference measurement is 8.0833mm,\nwith a relative error of 4.7347%. Similarly, the average absolute error in\nfetal femur measurement is 2.6163mm, with a relative error of 6.3336%.\nUncertainty modeling experiments employing Test-Time Augmentation (TTA)\ndemonstrate effective interpretability of data uncertainty on both datasets.\nThis suggests that incorporating data uncertainty based on the TTA method can\nsupport clinical practitioners in making informed decisions and obtaining more\nreliable measurement results in practical clinical applications. The paper\ncontributes to the advancement of ultrasound image segmentation, addressing\ncritical challenges and improving the reliability of biometric measurements.\n","authors":["Shuge Lei"],"pdf_url":"https://arxiv.org/pdf/2401.09639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09638v1","updated":"2024-01-17T23:17:08Z","published":"2024-01-17T23:17:08Z","title":"Automatic 3D Multi-modal Ultrasound Segmentation of Human Placenta using\n Fusion Strategies and Deep Learning","summary":" Purpose: Ultrasound is the most commonly used medical imaging modality for\ndiagnosis and screening in clinical practice. Due to its safety profile,\nnoninvasive nature and portability, ultrasound is the primary imaging modality\nfor fetal assessment in pregnancy. Current ultrasound processing methods are\neither manual or semi-automatic and are therefore laborious, time-consuming and\nprone to errors, and automation would go a long way in addressing these\nchallenges. 
Automated identification of placental changes at earlier gestation\ncould facilitate potential therapies for conditions such as fetal growth\nrestriction and pre-eclampsia that are currently detected only at late\ngestational age, potentially preventing perinatal morbidity and mortality.\n Methods: We propose an automatic three-dimensional multi-modal (B-mode and\npower Doppler) ultrasound segmentation of the human placenta using deep\nlearning combined with different fusion strategies.We collected data containing\nBmode and power Doppler ultrasound scans for 400 studies.\n Results: We evaluated different fusion strategies and state-of-the-art image\nsegmentation networks for placenta segmentation based on standard overlap- and\nboundary-based metrics. We found that multimodal information in the form of\nB-mode and power Doppler scans outperform any single modality. Furthermore, we\nfound that B-mode and power Doppler input scans fused at the data level provide\nthe best results with a mean Dice Similarity Coefficient (DSC) of 0.849.\n Conclusion: We conclude that the multi-modal approach of combining B-mode and\npower Doppler scans is effective in segmenting the placenta from 3D ultrasound\nscans in a fully automated manner and is robust to quality variation of the\ndatasets.\n","authors":["Sonit Singh","Gordon Stevenson","Brendan Mein","Alec Welsh","Arcot Sowmya"],"pdf_url":"https://arxiv.org/pdf/2401.09638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09630v1","updated":"2024-01-17T22:44:18Z","published":"2024-01-17T22:44:18Z","title":"CT Liver Segmentation via PVT-based Encoding and Refined Decoding","summary":" Accurate liver segmentation from CT scans is essential for computer-aided\ndiagnosis and treatment planning. Recently, Vision Transformers achieved a\ncompetitive performance in computer vision tasks compared to convolutional\nneural networks due to their exceptional ability to learn global\nrepresentations. However, they often struggle with scalability, memory\nconstraints, and computational inefficiency, particularly in handling\nhigh-resolution medical images. To overcome scalability and efficiency issues,\nwe propose a novel deep learning approach, \\textit{\\textbf{PVTFormer}}, that is\nbuilt upon a pretrained pyramid vision transformer (PVT v2) combined with\nadvanced residual upsampling and decoder block. By integrating a refined\nfeature channel approach with hierarchical decoding strategy, PVTFormer\ngenerates high quality segmentation masks by enhancing semantic features.\nRigorous evaluation of the proposed method on Liver Tumor Segmentation\nBenchmark (LiTS) 2017 demonstrates that our proposed architecture not only\nachieves a high dice coefficient of 86.78\\%, mIoU of 78.46\\%, but also obtains\na low HD of 3.50. The results underscore PVTFormer's efficacy in setting a new\nbenchmark for state-of-the-art liver segmentation methods. 
The source code of\nthe proposed PVTFormer is available at\n\\url{https://github.com/DebeshJha/PVTFormer}.\n","authors":["Debesh Jha","Nikhil Kumar Tomar","Koushik Biswas","Gorkem Durak","Alpay Medetalibeyoglu","Matthew Antalek","Yury Velichko","Daniela Ladner","Amir Borhani","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2401.09630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09627v1","updated":"2024-01-17T22:34:20Z","published":"2024-01-17T22:34:20Z","title":"SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of\n Lumbar Spine MRI","summary":" Intervertebral disc disease, a prevalent ailment, frequently leads to\nintermittent or persistent low back pain, and diagnosing and assessing of this\ndisease rely on accurate measurement of vertebral bone and intervertebral disc\ngeometries from lumbar MR images. Deep neural network (DNN) models may assist\nclinicians with more efficient image segmentation of individual instances\n(disks and vertebrae) of the lumbar spine in an automated way, which is termed\nas instance image segmentation. In this work, we proposed SymTC, an innovative\nlumbar spine MR image segmentation model that combines the strengths of\nTransformer and Convolutional Neural Network (CNN). Specifically, we designed a\nparallel dual-path architecture to merge CNN layers and Transformer layers, and\nwe integrated a novel position embedding into the self-attention module of\nTransformer, enhancing the utilization of positional information for more\naccurate segmentation. To further improves model performance, we introduced a\nnew data augmentation technique to create synthetic yet realistic MR image\ndataset, named SSMSpine, which is made publicly available. We evaluated our\nSymTC and the other 15 existing image segmentation models on our private\nin-house dataset and the public SSMSpine dataset, using two metrics, Dice\nSimilarity Coefficient and 95% Hausdorff Distance. The results show that our\nSymTC has the best performance for segmenting vertebral bones and\nintervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine\ndataset are available at https://github.com/jiasongchen/SymTC.\n","authors":["Jiasong Chen","Linchen Qian","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09624v1","updated":"2024-01-17T22:30:41Z","published":"2024-01-17T22:30:41Z","title":"MITS-GAN: Safeguarding Medical Imaging from Tampering with Generative\n Adversarial Networks","summary":" The progress in generative models, particularly Generative Adversarial\nNetworks (GANs), opened new possibilities for image generation but raised\nconcerns about potential malicious uses, especially in sensitive areas like\nmedical imaging. This study introduces MITS-GAN, a novel approach to prevent\ntampering in medical images, with a specific focus on CT scans. The approach\ndisrupts the output of the attacker's CT-GAN architecture by introducing\nimperceptible but yet precise perturbations. Specifically, the proposed\napproach involves the introduction of appropriate Gaussian noise to the input\nas a protective measure against various attacks. Our method aims to enhance\ntamper resistance, comparing favorably to existing techniques. Experimental\nresults on a CT scan dataset demonstrate MITS-GAN's superior performance,\nemphasizing its ability to generate tamper-resistant images with negligible\nartifacts. 
As image tampering in medical domains poses life-threatening risks,\nour proactive approach contributes to the responsible and ethical use of\ngenerative models. This work provides a foundation for future research in\ncountering cyber threats in medical imaging. Models and codes are publicly\navailable at the following link\n\\url{https://iplab.dmi.unict.it/MITS-GAN-2024/}.\n","authors":["Giovanni Pasqualino","Luca Guarnera","Alessandro Ortis","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2401.09624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09607v1","updated":"2024-01-17T21:32:04Z","published":"2024-01-17T21:32:04Z","title":"Land Cover Image Classification","summary":" Land Cover (LC) image classification has become increasingly significant in\nunderstanding environmental changes, urban planning, and disaster management.\nHowever, traditional LC methods are often labor-intensive and prone to human\nerror. This paper explores state-of-the-art deep learning models for enhanced\naccuracy and efficiency in LC analysis. We compare convolutional neural\nnetworks (CNN) against transformer-based methods, showcasing their applications\nand advantages in LC studies. We used EuroSAT, a patch-based LC classification\ndata set based on Sentinel-2 satellite images and achieved state-of-the-art\nresults using current transformer models.\n","authors":["Antonio Rangel","Juan Terven","Diana M. Cordova-Esparza","E. A. Chavez-Urbiola"],"pdf_url":"https://arxiv.org/pdf/2401.09607v1.pdf","comment":"7 pages, 4 figures, 1 table, published in conference"},{"id":"http://arxiv.org/abs/2401.09606v1","updated":"2024-01-17T21:32:03Z","published":"2024-01-17T21:32:03Z","title":"Robustness Evaluation of Machine Learning Models for Robot Arm Action\n Recognition in Noisy Environments","summary":" In the realm of robot action recognition, identifying distinct but spatially\nproximate arm movements using vision systems in noisy environments poses a\nsignificant challenge. This paper studies robot arm action recognition in noisy\nenvironments using machine learning techniques. Specifically, a vision system\nis used to track the robot's movements followed by a deep learning model to\nextract the arm's key points. Through a comparative analysis of machine\nlearning methods, the effectiveness and robustness of this model are assessed\nin noisy environments. A case study was conducted using the Tic-Tac-Toe game in\na 3-by-3 grid environment, where the focus is to accurately identify the\nactions of the arms in selecting specific locations within this constrained\nenvironment. Experimental results show that our approach can achieve precise\nkey point detection and action classification despite the addition of noise and\nuncertainties to the dataset.\n","authors":["Elaheh Motamedi","Kian Behzad","Rojin Zandi","Hojjat Salehinejad","Milad Siami"],"pdf_url":"https://arxiv.org/pdf/2401.09606v1.pdf","comment":"Accepted at ICASSP"},{"id":"http://arxiv.org/abs/2401.09604v1","updated":"2024-01-17T21:30:22Z","published":"2024-01-17T21:30:22Z","title":"MedBlindTuner: Towards Privacy-preserving Fine-tuning on Biomedical\n Images with Transformers and Fully Homomorphic Encryption","summary":" Advancements in machine learning (ML) have significantly revolutionized\nmedical image analysis, prompting hospitals to rely on external ML services.\nHowever, the exchange of sensitive patient data, such as chest X-rays, poses\ninherent privacy risks when shared with third parties. 
Addressing this concern,\nwe propose MedBlindTuner, a privacy-preserving framework leveraging fully\nhomomorphic encryption (FHE) and a data-efficient image transformer (DEiT).\nMedBlindTuner enables the training of ML models exclusively on FHE-encrypted\nmedical images. Our experimental evaluation demonstrates that MedBlindTuner\nachieves comparable accuracy to models trained on non-encrypted images,\noffering a secure solution for outsourcing ML computations while preserving\npatient data privacy. To the best of our knowledge, this is the first work that\nuses data-efficient image transformers and fully homomorphic encryption in this\ndomain.\n","authors":["Prajwal Panzade","Daniel Takabi","Zhipeng Cai"],"pdf_url":"https://arxiv.org/pdf/2401.09604v1.pdf","comment":"Accepted for the presentation at W3PHIAI, The 38th Annual AAAI\n Conference on Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2401.09596v1","updated":"2024-01-17T21:08:41Z","published":"2024-01-17T21:08:41Z","title":"Efficient generative adversarial networks using linear\n additive-attention Transformers","summary":" Although the capacity of deep generative models for image generation, such as\nDiffusion Models (DMs) and Generative Adversarial Networks (GANs), has\ndramatically improved in recent years, much of their success can be attributed\nto computationally expensive architectures. This has limited their adoption and\nuse to research laboratories and companies with large resources, while\nsignificantly raising the carbon footprint for training, fine-tuning, and\ninference. In this work, we present LadaGAN, an efficient generative\nadversarial network that is built upon a novel Transformer block named\nLadaformer. The main component of this block is a linear additive-attention\nmechanism that computes a single attention vector per head instead of the\nquadratic dot-product attention. We employ Ladaformer in both the generator and\ndiscriminator, which reduces the computational complexity and overcomes the\ntraining instabilities often associated with Transformer GANs. LadaGAN\nconsistently outperforms existing convolutional and Transformer GANs on\nbenchmark datasets at different resolutions while being significantly more\nefficient. Moreover, LadaGAN shows competitive performance compared to\nstate-of-the-art multi-step generative models (e.g. DMs) using orders of\nmagnitude less computational resources.\n","authors":["Emilio Morales-Juarez","Gibran Fuentes-Pineda"],"pdf_url":"https://arxiv.org/pdf/2401.09596v1.pdf","comment":"12 pages, 6 figures"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.09350v1","updated":"2024-01-17T17:13:35Z","published":"2024-01-17T17:13:35Z","title":"Foundations of Vector Retrieval","summary":" Vectors are universal mathematical objects that can represent text, images,\nspeech, or a mix of these data modalities. That happens regardless of whether\ndata is represented by hand-crafted features or learnt embeddings. Collect a\nlarge enough quantity of such vectors and the question of retrieval becomes\nurgently relevant: Finding vectors that are more similar to a query vector.\nThis monograph is concerned with the question above and covers fundamental\nconcepts along with advanced data structures and algorithms for vector\nretrieval. 
In doing so, it recaps this fascinating topic and lowers barriers of\nentry into this rich area of research.\n","authors":["Sebastian Bruch"],"pdf_url":"https://arxiv.org/pdf/2401.09350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09092v1","updated":"2024-01-17T09:53:50Z","published":"2024-01-17T09:53:50Z","title":"BibSonomy Meets ChatLLMs for Publication Management: From Chat to\n Publication Management: Organizing your related work using BibSonomy & LLMs","summary":" The ever-growing corpus of scientific literature presents significant\nchallenges for researchers with respect to discovery, management, and\nannotation of relevant publications. Traditional platforms like Semantic\nScholar, BibSonomy, and Zotero offer tools for literature management, but\nlargely require manual laborious and error-prone input of tags and metadata.\nHere, we introduce a novel retrieval augmented generation system that leverages\nchat-based large language models (LLMs) to streamline and enhance the process\nof publication management. It provides a unified chat-based interface, enabling\nintuitive interactions with various backends, including Semantic Scholar,\nBibSonomy, and the Zotero Webscraper. It supports two main use-cases: (1)\nExplorative Search & Retrieval - leveraging LLMs to search for and retrieve\nboth specific and general scientific publications, while addressing the\nchallenges of content hallucination and data obsolescence; and (2) Cataloguing\n& Management - aiding in the organization of personal publication libraries, in\nthis case BibSonomy, by automating the addition of metadata and tags, while\nfacilitating manual edits and updates. We compare our system to different LLM\nmodels in three different settings, including a user study, and we can show its\nadvantages in different metrics.\n","authors":["Tom Völker","Jan Pfister","Tobias Koopmann","Andreas Hotho"],"pdf_url":"https://arxiv.org/pdf/2401.09092v1.pdf","comment":"Accepted at 2024 ACM SIGIR CHIIR, For a demo see here\n http://professor-x.de/demos/bibsonomy-chatgpt/demo.mp4"},{"id":"http://arxiv.org/abs/2401.09070v1","updated":"2024-01-17T09:08:23Z","published":"2024-01-17T09:08:23Z","title":"Knowledge Pyramid: A Novel Hierarchical Reasoning Structure for\n Generalized Knowledge Augmentation and Inference","summary":" Knowledge graph (KG) based reasoning has been regarded as an effective means\nfor the analysis of semantic networks and is of great usefulness in areas of\ninformation retrieval, recommendation, decision-making, and man-machine\ninteraction. It is widely used in recommendation, decision-making,\nquestion-answering, search, and other fields. However, previous studies mainly\nused low-level knowledge in the KG for reasoning, which may result in\ninsufficient generalization and poor robustness of reasoning. To this end, this\npaper proposes a new inference approach using a novel knowledge augmentation\nstrategy to improve the generalization capability of KG. This framework\nextracts high-level pyramidal knowledge from low-level knowledge and applies it\nto reasoning in a multi-level hierarchical KG, called knowledge pyramid in this\npaper. We tested some medical data sets using the proposed approach, and the\nexperimental results show that the proposed knowledge pyramid has improved the\nknowledge inference performance with better generalization. 
Especially, when\nthere are fewer training samples, the inference accuracy can be significantly\nimproved.\n","authors":["Qinghua Huang","Yongzhen Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09070v1.pdf","comment":"10 pages,8 figures"},{"id":"http://arxiv.org/abs/2401.09044v1","updated":"2024-01-17T08:24:57Z","published":"2024-01-17T08:24:57Z","title":"Algorithmic amplification of biases on Google Search","summary":" The evolution of information-seeking processes, driven by search engines like\nGoogle, has transformed the access to information people have. This paper\ninvestigates how individuals' preexisting attitudes influence the modern\ninformation-seeking process, specifically the results presented by Google\nSearch. Through a comprehensive study involving surveys and information-seeking\ntasks focusing on the topic of abortion, the paper provides four crucial\ninsights: 1) Individuals with opposing attitudes on abortion receive different\nsearch results. 2) Individuals express their beliefs in their choice of\nvocabulary used in formulating the search queries, shaping the outcome of the\nsearch. 3) Additionally, the user's search history contributes to divergent\nresults among those with opposing attitudes. 4) Google Search engine reinforces\npreexisting beliefs in search results. Overall, this study provides insights\ninto the interplay between human biases and algorithmic processes, highlighting\nthe potential for information polarization in modern information-seeking\nprocesses.\n","authors":["Hussam Habib","Ryan Stoldt","Andrew High","Brian Ekdale","Ashley Peterson","Katy Biddle","Javie Ssozi","Rishab Nithyanand"],"pdf_url":"https://arxiv.org/pdf/2401.09044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09034v1","updated":"2024-01-17T08:01:18Z","published":"2024-01-17T08:01:18Z","title":"UOEP: User-Oriented Exploration Policy for Enhancing Long-Term User\n Experiences in Recommender Systems","summary":" Reinforcement learning (RL) has gained traction for enhancing user long-term\nexperiences in recommender systems by effectively exploring users' interests.\nHowever, modern recommender systems exhibit distinct user behavioral patterns\namong tens of millions of items, which increases the difficulty of exploration.\nFor example, user behaviors with different activity levels require varying\nintensity of exploration, while previous studies often overlook this aspect and\napply a uniform exploration strategy to all users, which ultimately hurts user\nexperiences in the long run. To address these challenges, we propose\nUser-Oriented Exploration Policy (UOEP), a novel approach facilitating\nfine-grained exploration among user groups. We first construct a distributional\ncritic which allows policy optimization under varying quantile levels of\ncumulative reward feedbacks from users, representing user groups with varying\nactivity levels. Guided by this critic, we devise a population of distinct\nactors aimed at effective and fine-grained exploration within its respective\nuser group. To simultaneously enhance diversity and stability during the\nexploration process, we further introduce a population-level diversity\nregularization term and a supervision module. Experimental results on public\nrecommendation datasets demonstrate that our approach outperforms all other\nbaselines in terms of long-term performance, validating its user-oriented\nexploration effectiveness. 
Meanwhile, further analyses reveal our approach's\nbenefits of improved performance for low-activity users as well as increased\nfairness among users.\n","authors":["Changshuo Zhang","Sirui Chen","Xiao Zhang","Sunhao Dai","Weijie Yu","Jun Xu"],"pdf_url":"https://arxiv.org/pdf/2401.09034v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08993v1","updated":"2024-01-17T06:03:31Z","published":"2024-01-17T06:03:31Z","title":"Estimating Gender Completeness in Wikipedia","summary":" Gender imbalance in Wikipedia content is a known challenge which the editor\ncommunity is actively addressing. The aim of this paper is to provide the\nWikipedia community with instruments to estimate the magnitude of the problem\nfor different entity types (also known as classes) in Wikipedia. To this end,\nwe apply class completeness estimation methods based on the gender attribute.\nOur results show not only which gender for different sub-classes of Person is\nmore prevalent in Wikipedia, but also an idea of how complete the coverage is\nfor difference genders and sub-classes of Person.\n","authors":["Hrishikesh Patel","Tianwa Chen","Ivano Bongiovanni","Gianluca Demartini"],"pdf_url":"https://arxiv.org/pdf/2401.08993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06982v2","updated":"2024-01-17T05:28:30Z","published":"2024-01-13T05:29:24Z","title":"Denoising Diffusion Recommender Model","summary":" Recommender systems often grapple with noisy implicit feedback. Most studies\nalleviate the noise issues from data cleaning perspective such as data\nresampling and reweighting, but they are constrained by heuristic assumptions.\nAnother denoising avenue is from model perspective, which proactively injects\nnoises into user-item interactions and enhance the intrinsic denoising ability\nof models. However, this kind of denoising process poses significant challenges\nto the recommender model's representation capacity to capture noise patterns.\nTo address this issue, we propose Denoising Diffusion Recommender Model (DDRM),\nwhich leverages multi-step denoising process based on diffusion models to\nrobustify user and item embeddings from any recommender models. DDRM injects\ncontrolled Gaussian noises in the forward process and iteratively removes\nnoises in the reverse denoising process, thereby improving embedding robustness\nagainst noisy feedback. To achieve this target, the key lies in offering\nappropriate guidance to steer the reverse denoising process and providing a\nproper starting point to start the forward-reverse process during inference. In\nparticular, we propose a dedicated denoising module that encodes collaborative\ninformation as denoising guidance. 
Besides, in the inference stage, DDRM\nutilizes the average embeddings of users' historically liked items as the\nstarting point rather than using pure noise since pure noise lacks\npersonalization, which increases the difficulty of the denoising process.\nExtensive experiments on three datasets with three representative backend\nrecommender models demonstrate the effectiveness of DDRM.\n","authors":["Jujia Zhao","Wenjie Wang","Yiyan Xu","Teng Sun","Fuli Feng"],"pdf_url":"https://arxiv.org/pdf/2401.06982v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07769v2","updated":"2024-01-17T03:38:26Z","published":"2024-01-15T15:27:24Z","title":"Deep Evolutional Instant Interest Network for CTR Prediction in\n Trigger-Induced Recommendation","summary":" The recommendation has been playing a key role in many industries, e.g.,\ne-commerce, streaming media, social media, etc. Recently, a new recommendation\nscenario, called Trigger-Induced Recommendation (TIR), where users are able to\nexplicitly express their instant interests via trigger items, is emerging as an\nessential role in many e-commerce platforms, e.g., Alibaba.com and Amazon.\nWithout explicitly modeling the user's instant interest, traditional\nrecommendation methods usually obtain sub-optimal results in TIR. Even though\nthere are a few methods considering the trigger and target items simultaneously\nto solve this problem, they still haven't taken into account temporal\ninformation of user behaviors, the dynamic change of user instant interest when\nthe user scrolls down and the interactions between the trigger and target\nitems. To tackle these problems, we propose a novel method -- Deep Evolutional\nInstant Interest Network (DEI2N), for click-through rate prediction in TIR\nscenarios. Specifically, we design a User Instant Interest Modeling Layer to\npredict the dynamic change of the intensity of instant interest when the user\nscrolls down. Temporal information is utilized in user behavior modeling.\nMoreover, an Interaction Layer is introduced to learn better interactions\nbetween the trigger and target items. We evaluate our method on several offline\nand real-world industrial datasets. Experimental results show that our proposed\nDEI2N outperforms state-of-the-art baselines. In addition, online A/B testing\ndemonstrates the superiority over the existing baseline in real-world\nproduction environments.\n","authors":["Zhibo Xiao","Luwei Yang","Tao Zhang","Wen Jiang","Wei Ning","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.07769v2.pdf","comment":"7 pages, 3 figures, accepted by the 17th ACM International Conference\n on Web Search and Data Mining(WSDM'2024)"},{"id":"http://arxiv.org/abs/2401.08902v1","updated":"2024-01-17T01:06:22Z","published":"2024-01-17T01:06:22Z","title":"Similar but Faster: Manipulation of Tempo in Music Audio Embeddings for\n Tempo Prediction and Search","summary":" Audio embeddings enable large scale comparisons of the similarity of audio\nfiles for applications such as search and recommendation. Due to the\nsubjectivity of audio similarity, it can be desirable to design systems that\nanswer not only whether audio is similar, but similar in what way (e.g., wrt.\ntempo, mood or genre). 
Previous works have proposed disentangled embedding\nspaces where subspaces representing specific, yet possibly correlated,\nattributes can be weighted to emphasize those attributes in downstream tasks.\nHowever, no research has been conducted into the independence of these\nsubspaces, nor their manipulation, in order to retrieve tracks that are similar\nbut different in a specific way. Here, we explore the manipulation of tempo in\nembedding spaces as a case-study towards this goal. We propose tempo\ntranslation functions that allow for efficient manipulation of tempo within a\npre-existing embedding space whilst maintaining other properties such as genre.\nAs this translation is specific to tempo it enables retrieval of tracks that\nare similar but have specifically different tempi. We show that such a function\ncan be used as an efficient data augmentation strategy for both training of\ndownstream tempo predictors, and improved nearest neighbor retrieval of\nproperties largely independent of tempo.\n","authors":["Matthew C. McCallum","Florian Henkel","Jaehun Kim","Samuel E. Sandberg","Matthew E. P. Davies"],"pdf_url":"https://arxiv.org/pdf/2401.08902v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.08889v1","updated":"2024-01-17T00:12:13Z","published":"2024-01-17T00:12:13Z","title":"On the Effect of Data-Augmentation on Local Embedding Properties in the\n Contrastive Learning of Music Audio Representations","summary":" Audio embeddings are crucial tools in understanding large catalogs of music.\nTypically embeddings are evaluated on the basis of the performance they provide\nin a wide range of downstream tasks, however few studies have investigated the\nlocal properties of the embedding spaces themselves which are important in\nnearest neighbor algorithms, commonly used in music search and recommendation.\nIn this work we show that when learning audio representations on music datasets\nvia contrastive learning, musical properties that are typically homogeneous\nwithin a track (e.g., key and tempo) are reflected in the locality of\nneighborhoods in the resulting embedding space. By applying appropriate data\naugmentation strategies, localisation of such properties can not only be\nreduced but the localisation of other attributes is increased. For example,\nlocality of features such as pitch and tempo that are less relevant to\nnon-expert listeners, may be mitigated while improving the locality of more\nsalient features such as genre and mood, achieving state-of-the-art performance\nin nearest neighbor retrieval accuracy. Similarly, we show that the optimal\nselection of data augmentation strategies for contrastive learning of music\naudio embeddings is dependent on the downstream task, highlighting this as an\nimportant embedding design decision.\n","authors":["Matthew C. McCallum","Matthew E. P. Davies","Florian Henkel","Jaehun Kim","Samuel E. 
Sandberg"],"pdf_url":"https://arxiv.org/pdf/2401.08889v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.09572v1","updated":"2024-01-17T19:49:11Z","published":"2024-01-17T19:49:11Z","title":"Handling Large-scale Cardinality in building recommendation systems","summary":" Effective recommendation systems rely on capturing user preferences, often\nrequiring incorporating numerous features such as universally unique\nidentifiers (UUIDs) of entities. However, the exceptionally high cardinality of\nUUIDs poses a significant challenge in terms of model degradation and increased\nmodel size due to sparsity. This paper presents two innovative techniques to\naddress the challenge of high cardinality in recommendation systems.\nSpecifically, we propose a bag-of-words approach, combined with layer sharing,\nto substantially decrease the model size while improving performance. Our\ntechniques were evaluated through offline and online experiments on Uber use\ncases, resulting in promising results demonstrating our approach's\neffectiveness in optimizing recommendation systems and enhancing their overall\nperformance.\n","authors":["Dhruva Dixith Kurra","Bo Ling","Chun Zh","Seyedshahin Ashrafzadeh"],"pdf_url":"https://arxiv.org/pdf/2401.09572v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05535v2","updated":"2024-01-17T19:39:42Z","published":"2023-05-24T12:09:42Z","title":"Detecting Check-Worthy Claims in Political Debates, Speeches, and\n Interviews Using Audio Data","summary":" Developing tools to automatically detect check-worthy claims in political\ndebates and speeches can greatly help moderators of debates, journalists, and\nfact-checkers. While previous work on this problem has focused exclusively on\nthe text modality, here we explore the utility of the audio modality as an\nadditional input. We create a new multimodal dataset (text and audio in\nEnglish) containing 48 hours of speech from past political debates in the USA.\nWe then experimentally demonstrate that, in the case of multiple speakers,\nadding the audio modality yields sizable improvements over using the text\nmodality alone; moreover, an audio-only model could outperform a text-only one\nfor a single speaker. With the aim to enable future research, we make all our\ndata and code publicly available at\nhttps://github.com/petar-iv/audio-checkworthiness-detection.\n","authors":["Petar Ivanov","Ivan Koychev","Momchil Hardalov","Preslav Nakov"],"pdf_url":"https://arxiv.org/pdf/2306.05535v2.pdf","comment":"Check-Worthiness, Fact-Checking, Fake News, Misinformation,\n Disinformation, Political Debates, Multimodality"},{"id":"http://arxiv.org/abs/2401.10942v1","updated":"2024-01-17T18:35:44Z","published":"2024-01-17T18:35:44Z","title":"Machine Unlearning for Recommendation Systems: An Insight","summary":" This review explores machine unlearning (MUL) in recommendation systems,\naddressing adaptability, personalization, privacy, and bias challenges. Unlike\ntraditional models, MUL dynamically adjusts system knowledge based on shifts in\nuser preferences and ethical considerations. The paper critically examines\nMUL's basics, real-world applications, and challenges like algorithmic\ntransparency. 
It sifts through literature, offering insights into how MUL could\ntransform recommendations, discussing user trust, and suggesting paths for\nfuture research in responsible and user-focused artificial intelligence (AI).\nThe document guides researchers through challenges involving the trade-off\nbetween personalization and privacy, encouraging contributions to meet\npractical demands for targeted data removal. Emphasizing MUL's role in secure\nand adaptive machine learning, the paper proposes ways to push its boundaries.\nThe novelty of this paper lies in its exploration of the limitations of the\nmethods, which highlights exciting prospects for advancing the field.\n","authors":["Bhavika Sachdeva","Harshita Rathee"," Sristi","Arun Sharma","Witold Wydmański"],"pdf_url":"https://arxiv.org/pdf/2401.10942v1.pdf","comment":"In Proceedings of 7th INTERNATIONAL CONFERENCE ON INNOVATIVE\n COMPUTING AND COMMUNICATION 2024 (https://icicc-conf.com/)"},{"id":"http://arxiv.org/abs/2401.10940v1","updated":"2024-01-17T13:11:09Z","published":"2024-01-17T13:11:09Z","title":"RELIANCE: Reliable Ensemble Learning for Information and News\n Credibility Evaluation","summary":" In the era of information proliferation, discerning the credibility of news\ncontent poses an ever-growing challenge. This paper introduces RELIANCE, a\npioneering ensemble learning system designed for robust information and fake\nnews credibility evaluation. Comprising five diverse base models, including\nSupport Vector Machine (SVM), naive Bayes, logistic regression, random forest,\nand Bidirectional Long Short Term Memory Networks (BiLSTMs), RELIANCE employs\nan innovative approach to integrate their strengths, harnessing the collective\nintelligence of the ensemble for enhanced accuracy. Experiments demonstrate the\nsuperiority of RELIANCE over individual models, indicating its efficacy in\ndistinguishing between credible and non-credible information sources. RELIANCE,\nalso surpasses baseline models in information and news credibility assessment,\nestablishing itself as an effective solution for evaluating the reliability of\ninformation sources.\n","authors":["Majid Ramezani","Hamed Mohammad-Shahi","Mahshid Daliry","Soroor Rahmani","Amir-Hosein Asghari"],"pdf_url":"https://arxiv.org/pdf/2401.10940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10934v1","updated":"2024-01-17T03:27:39Z","published":"2024-01-17T03:27:39Z","title":"A New Creative Generation Pipeline for Click-Through Rate with Stable\n Diffusion Model","summary":" In online advertising scenario, sellers often create multiple creatives to\nprovide comprehensive demonstrations, making it essential to present the most\nappealing design to maximize the Click-Through Rate (CTR). However, sellers\ngenerally struggle to consider users preferences for creative design, leading\nto the relatively lower aesthetics and quantities compared to Artificial\nIntelligence (AI)-based approaches. Traditional AI-based approaches still face\nthe same problem of not considering user information while having limited\naesthetic knowledge from designers. In fact that fusing the user information,\nthe generated creatives can be more attractive because different users may have\ndifferent preferences. To optimize the results, the generated creatives in\ntraditional methods are then ranked by another module named creative ranking\nmodel. The ranking model can predict the CTR score for each creative\nconsidering user features. 
However, the two above stages are regarded as two\ndifferent tasks and are optimized separately. In this paper, we proposed a new\nautomated Creative Generation pipeline for Click-Through Rate (CG4CTR) with the\ngoal of improving CTR during the creative generation stage. Our contributions\nhave 4 parts: 1) The inpainting mode in stable diffusion is firstly applied to\ncreative generation task in online advertising scene. A self-cyclic generation\npipeline is proposed to ensure the convergence of training. 2) Prompt model is\ndesigned to generate individualized creatives for different user groups, which\ncan further improve the diversity and quality. 3) Reward model comprehensively\nconsiders the multimodal features of image and text to improve the\neffectiveness of creative ranking task, and it is also critical in self-cyclic\npipeline. 4) The significant benefits obtained in online and offline\nexperiments verify the significance of our proposed method.\n","authors":["Hao Yang","Jianxin Yuan","Shuai Yang","Linhe Xu","Shuo Yuan","Yifan Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.10934v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.09417v1","updated":"2024-01-17T18:56:18Z","published":"2024-01-17T18:56:18Z","title":"Vision Mamba: Efficient Visual Representation Learning with\n Bidirectional State Space Model","summary":" Recently the state space models (SSMs) with efficient hardware-aware designs,\ni.e., Mamba, have shown great potential for long sequence modeling. Building\nefficient and generic vision backbones purely upon SSMs is an appealing\ndirection. However, representing visual data is challenging for SSMs due to the\nposition-sensitivity of visual data and the requirement of global context for\nvisual understanding. In this paper, we show that the reliance of visual\nrepresentation learning on self-attention is not necessary and propose a new\ngeneric vision backbone with bidirectional Mamba blocks (Vim), which marks the\nimage sequences with position embeddings and compresses the visual\nrepresentation with bidirectional state space models. On ImageNet\nclassification, COCO object detection, and ADE20k semantic segmentation tasks,\nVim achieves higher performance compared to well-established vision\ntransformers like DeiT, while also demonstrating significantly improved\ncomputation & memory efficiency. For example, Vim is 2.8$\\times$ faster than\nDeiT and saves 86.8% GPU memory when performing batch inference to extract\nfeatures on images with a resolution of 1248$\\times$1248. The results\ndemonstrate that Vim is capable of overcoming the computation & memory\nconstraints on performing Transformer-style understanding for high-resolution\nimages and it has great potential to become the next-generation backbone for\nvision foundation models. Code is available at https://github.com/hustvl/Vim.\n","authors":["Lianghui Zhu","Bencheng Liao","Qian Zhang","Xinlong Wang","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09417v1.pdf","comment":"Work in progress. Code is available at https://github.com/hustvl/Vim"},{"id":"http://arxiv.org/abs/2401.09414v1","updated":"2024-01-17T18:55:12Z","published":"2024-01-17T18:55:12Z","title":"Vlogger: Make Your Dream A Vlog","summary":" In this work, we present Vlogger, a generic AI system for generating a\nminute-level video blog (i.e., vlog) of user descriptions. 
Different from short\nvideos with a few seconds, vlog often contains a complex storyline with\ndiversified scenes, which is challenging for most existing video generation\napproaches. To break through this bottleneck, our Vlogger smartly leverages\nLarge Language Model (LLM) as Director and decomposes a long video generation\ntask of vlog into four key stages, where we invoke various foundation models to\nplay the critical roles of vlog professionals, including (1) Script, (2) Actor,\n(3) ShowMaker, and (4) Voicer. With such a design of mimicking human beings,\nour Vlogger can generate vlogs through explainable cooperation of top-down\nplanning and bottom-up shooting. Moreover, we introduce a novel video diffusion\nmodel, ShowMaker, which serves as a videographer in our Vlogger for generating\nthe video snippet of each shooting scene. By incorporating Script and Actor\nattentively as textual and visual prompts, it can effectively enhance\nspatial-temporal coherence in the snippet. Besides, we design a concise mixed\ntraining paradigm for ShowMaker, boosting its capacity for both T2V generation\nand prediction. Finally, the extensive experiments show that our method\nachieves state-of-the-art performance on zero-shot T2V generation and\nprediction tasks. More importantly, Vlogger can generate over 5-minute vlogs\nfrom open-world descriptions, without loss of video coherence on script and\nactor. The code and model is all available at\nhttps://github.com/zhuangshaobin/Vlogger.\n","authors":["Shaobin Zhuang","Kunchang Li","Xinyuan Chen","Yaohui Wang","Ziwei Liu","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09414v1.pdf","comment":"16 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2401.09407v1","updated":"2024-01-17T18:45:13Z","published":"2024-01-17T18:45:13Z","title":"Deciphering Textual Authenticity: A Generalized Strategy through the\n Lens of Large Language Semantics for Detecting Human vs. Machine-Generated\n Text","summary":" With the recent proliferation of Large Language Models (LLMs), there has been\nan increasing demand for tools to detect machine-generated text. The effective\ndetection of machine-generated text face two pertinent problems: First, they\nare severely limited in generalizing against real-world scenarios, where\nmachine-generated text is produced by a variety of generators, including but\nnot limited to GPT-4 and Dolly, and spans diverse domains, ranging from\nacademic manuscripts to social media posts. Second, existing detection\nmethodologies treat texts produced by LLMs through a restrictive binary\nclassification lens, neglecting the nuanced diversity of artifacts generated by\ndifferent LLMs. In this work, we undertake a systematic study on the detection\nof machine-generated text in real-world scenarios. We first study the\neffectiveness of state-of-the-art approaches and find that they are severely\nlimited against text produced by diverse generators and domains in the real\nworld. Furthermore, t-SNE visualizations of the embeddings from a pretrained\nLLM's encoder show that they cannot reliably distinguish between human and\nmachine-generated text. Based on our findings, we introduce a novel system,\nT5LLMCipher, for detecting machine-generated text using a pretrained T5 encoder\ncombined with LLM embedding sub-clustering to address the text produced by\ndiverse generators and domains in the real world. 
We evaluate our approach\nacross 9 machine-generated text systems and 9 domains and find that our\napproach provides state-of-the-art generalization ability, with an average\nincrease in F1 score on machine-generated text of 19.6\\% on unseen generators\nand domains compared to the top performing existing approaches and correctly\nattributes the generator of text with an accuracy of 93.6\\%.\n","authors":["Mazal Bethany","Brandon Wherry","Emet Bethany","Nishant Vishwamitra","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2401.09407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02811v2","updated":"2024-01-17T18:14:20Z","published":"2023-04-06T01:20:23Z","title":"HomPINNs: homotopy physics-informed neural networks for solving the\n inverse problems of nonlinear differential equations with multiple solutions","summary":" Due to the complex behavior arising from non-uniqueness, symmetry, and\nbifurcations in the solution space, solving inverse problems of nonlinear\ndifferential equations (DEs) with multiple solutions is a challenging task. To\naddress this, we propose homotopy physics-informed neural networks (HomPINNs),\na novel framework that leverages homotopy continuation and neural networks\n(NNs) to solve inverse problems. The proposed framework begins with the use of\nNNs to simultaneously approximate unlabeled observations across diverse\nsolutions while adhering to DE constraints. Through homotopy continuation, the\nproposed method solves the inverse problem by tracing the observations and\nidentifying multiple solutions. The experiments involve testing the performance\nof the proposed method on one-dimensional DEs and applying it to solve a\ntwo-dimensional Gray-Scott simulation. Our findings demonstrate that the\nproposed method is scalable and adaptable, providing an effective solution for\nsolving DEs with multiple solutions and unknown parameters. Moreover, it has\nsignificant potential for various applications in scientific computing, such as\nmodeling complex systems and solving inverse problems in physics, chemistry,\nbiology, etc.\n","authors":["Haoyang Zheng","Yao Huang","Ziyang Huang","Wenrui Hao","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2304.02811v2.pdf","comment":"20 pages, 15 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.09393v1","updated":"2024-01-17T18:09:26Z","published":"2024-01-17T18:09:26Z","title":"Élivágar: Efficient Quantum Circuit Search for Classification","summary":" Designing performant and noise-robust circuits for Quantum Machine Learning\n(QML) is challenging -- the design space scales exponentially with circuit\nsize, and there are few well-supported guiding principles for QML circuit\ndesign. Although recent Quantum Circuit Search (QCS) methods attempt to search\nfor performant QML circuits that are also robust to hardware noise, they\ndirectly adopt designs from classical Neural Architecture Search (NAS) that are\nmisaligned with the unique constraints of quantum hardware, resulting in high\nsearch overheads and severe performance bottlenecks.\n We present \\'Eliv\\'agar, a novel resource-efficient, noise-guided QCS\nframework. \\'Eliv\\'agar innovates in all three major aspects of QCS -- search\nspace, search algorithm and candidate evaluation strategy -- to address the\ndesign flaws in current classically-inspired QCS methods. \\'Eliv\\'agar achieves\nhardware-efficiency and avoids an expensive circuit-mapping co-search via\nnoise- and device topology-aware candidate generation. 
By introducing two\ncheap-to-compute predictors, Clifford noise resilience and Representational\ncapacity, \\'Eliv\\'agar decouples the evaluation of noise robustness and\nperformance, enabling early rejection of low-fidelity circuits and reducing\ncircuit evaluation costs. Due to its resource-efficiency, \\'Eliv\\'agar can\nfurther search for data embeddings, significantly improving performance.\n Based on a comprehensive evaluation of \\'Eliv\\'agar on 12 real quantum\ndevices and 9 QML applications, \\'Eliv\\'agar achieves 5.3% higher accuracy and\na 271$\\times$ speedup compared to state-of-the-art QCS methods.\n","authors":["Sashwat Anagolum","Narges Alavisamani","Poulami Das","Moinuddin Qureshi","Eric Kessler","Yunong Shi"],"pdf_url":"https://arxiv.org/pdf/2401.09393v1.pdf","comment":"13 pages, 11 figures. To appear in ASPLOS 2024"},{"id":"http://arxiv.org/abs/2401.05268v2","updated":"2024-01-17T17:57:24Z","published":"2024-01-10T16:57:24Z","title":"AUTOACT: Automatic Agent Learning from Scratch via Self-Planning","summary":" Language agents have achieved considerable performance on various complex\ntasks. Despite the incessant exploration in this field, existing language agent\nsystems still struggle with costly, non-reproducible data reliance and face the\nchallenge of compelling a single model for multiple functions. To this end, we\nintroduce AutoAct, an automatic agent learning framework that does not rely on\nlarge-scale annotated data and synthetic trajectories from closed-source models\n(e.g., GPT-4). Given limited data with a tool library, AutoAct first\nautomatically synthesizes planning trajectories without any assistance from\nhumans or strong closed-source models. Then, AutoAct leverages a\ndivision-of-labor strategy to automatically differentiate based on the target\ntask information and synthesized trajectories, producing a sub-agent group to\ncomplete the task. We conduct comprehensive experiments with different LLMs,\nwhich demonstrates that AutoAct yields better or parallel performance compared\nto various strong baselines. We even notice that AutoAct, when using the\nLlama-2-13b model, can achieve performance comparable to that of the zero-shot\nGPT-3.5-Turbo agent. Code will be available at\nhttps://github.com/zjunlp/AutoAct.\n","authors":["Shuofei Qiao","Ningyu Zhang","Runnan Fang","Yujie Luo","Wangchunshu Zhou","Yuchen Eleanor Jiang","Chengfei Lv","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2401.05268v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.09384v1","updated":"2024-01-17T17:55:06Z","published":"2024-01-17T17:55:06Z","title":"Diverse Part Synthesis for 3D Shape Creation","summary":" Methods that use neural networks for synthesizing 3D shapes in the form of a\npart-based representation have been introduced over the last few years. These\nmethods represent shapes as a graph or hierarchy of parts and enable a variety\nof applications such as shape sampling and reconstruction. However, current\nmethods do not allow easily regenerating individual shape parts according to\nuser preferences. In this paper, we investigate techniques that allow the user\nto generate multiple, diverse suggestions for individual parts. Specifically,\nwe experiment with multimodal deep generative models that allow sampling\ndiverse suggestions for shape parts and focus on models which have not been\nconsidered in previous work on shape synthesis. 
To provide a comparative study\nof these techniques, we introduce a method for synthesizing 3D shapes in a\npart-based representation and evaluate all the part suggestion techniques\nwithin this synthesis method. In our method, which is inspired by previous\nwork, shapes are represented as a set of parts in the form of implicit\nfunctions which are then positioned in space to form the final shape. Synthesis\nin this representation is enabled by a neural network architecture based on an\nimplicit decoder and a spatial transformer. We compare the various multimodal\ngenerative models by evaluating their performance in generating part\nsuggestions. Our contribution is to show with qualitative and quantitative\nevaluations which of the new techniques for multimodal part generation perform\nthe best and that a synthesis method based on the top-performing techniques\nallows the user to more finely control the parts that are generated in the 3D\nshapes while maintaining high shape fidelity when reconstructing shapes.\n","authors":["Yanran Guan","Oliver van Kaick"],"pdf_url":"https://arxiv.org/pdf/2401.09384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09376v1","updated":"2024-01-17T17:46:10Z","published":"2024-01-17T17:46:10Z","title":"Unlocking Unlabeled Data: Ensemble Learning with the Hui- Walter\n Paradigm for Performance Estimation in Online and Static Settings","summary":" In the realm of machine learning and statistical modeling, practitioners\noften work under the assumption of accessible, static, labeled data for\nevaluation and training. However, this assumption often deviates from reality\nwhere data may be private, encrypted, difficult- to-measure, or unlabeled. In\nthis paper, we bridge this gap by adapting the Hui-Walter paradigm, a method\ntraditionally applied in epidemiology and medicine, to the field of machine\nlearning. This approach enables us to estimate key performance metrics such as\nfalse positive rate, false negative rate, and priors in scenarios where no\nground truth is available. We further extend this paradigm for handling online\ndata, opening up new possibilities for dynamic data environments. Our\nmethodology involves partitioning data into latent classes to simulate multiple\ndata populations (if natural populations are unavailable) and independently\ntraining models to replicate multiple tests. By cross-tabulating binary\noutcomes across ensemble categorizers and multiple populations, we are able to\nestimate unknown parameters through Gibbs sampling, eliminating the need for\nground-truth or labeled data. This paper showcases the potential of our\nmethodology to transform machine learning practices by allowing for accurate\nmodel assessment under dynamic and uncertain data conditions.\n","authors":["Kevin Slote","Elaine Lee"],"pdf_url":"https://arxiv.org/pdf/2401.09376v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.05118v2","updated":"2024-01-17T17:27:10Z","published":"2023-05-09T01:34:22Z","title":"Flame: Simplifying Topology Extension in Federated Learning","summary":" Distributed machine learning approaches, including a broad class of federated\nlearning (FL) techniques, present a number of benefits when deploying machine\nlearning applications over widely distributed infrastructures. The benefits are\nhighly dependent on the details of the underlying machine learning topology,\nwhich specifies the functionality executed by the participating nodes, their\ndependencies and interconnections. 
Current systems lack the flexibility and\nextensibility necessary to customize the topology of a machine learning\ndeployment. We present Flame, a new system that provides flexibility of the\ntopology configuration of distributed FL applications around the specifics of a\nparticular deployment context, and is easily extensible to support new FL\narchitectures. Flame achieves this via a new high-level abstraction Topology\nAbstraction Graphs (TAGs). TAGs decouple the ML application logic from the\nunderlying deployment details, making it possible to specialize the application\ndeployment with reduced development effort. Flame is released as an open source\nproject, and its flexibility and extensibility support a variety of topologies\nand mechanisms, and can facilitate the development of new FL methodologies.\n","authors":["Harshit Daga","Jaemin Shin","Dhruv Garg","Ada Gavrilovska","Myungjin Lee","Ramana Rao Kompella"],"pdf_url":"https://arxiv.org/pdf/2305.05118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09356v1","updated":"2024-01-17T17:24:36Z","published":"2024-01-17T17:24:36Z","title":"Swing: Short-cutting Rings for Higher Bandwidth Allreduce","summary":" The allreduce collective operation accounts for a significant fraction of the\nruntime of workloads running on distributed systems. One factor determining its\nperformance is the distance between communicating nodes, especially on networks\nlike torus, where a higher distance implies multiple messages being forwarded\non the same link, thus reducing the allreduce bandwidth. Torus networks are\nwidely used on systems optimized for machine learning workloads (e.g., Google\nTPUs and Amazon Trainium devices), as well as on some of the Top500\nsupercomputers. To improve allreduce performance on torus networks we introduce\nSwing, a new algorithm that keeps a low distance between communicating nodes by\nswinging between torus directions. Our analysis and experimental evaluation\nshow that Swing outperforms by up to 3x existing allreduce algorithms for\nvectors ranging from 32B to 128MiB, on different types of torus and torus-like\ntopologies, regardless of their shape and size.\n","authors":["Daniele De Sensi","Tommaso Bonato","David Saam","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2401.09356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09352v1","updated":"2024-01-17T17:18:21Z","published":"2024-01-17T17:18:21Z","title":"Neural Contractive Dynamical Systems","summary":" Stability guarantees are crucial when ensuring a fully autonomous robot does\nnot take undesirable or potentially harmful actions. Unfortunately, global\nstability guarantees are hard to provide in dynamical systems learned from\ndata, especially when the learned dynamics are governed by neural networks. We\npropose a novel methodology to learn neural contractive dynamical systems,\nwhere our neural architecture ensures contraction, and hence, global stability.\nTo efficiently scale the method to high-dimensional dynamical systems, we\ndevelop a variant of the variational autoencoder that learns dynamics in a\nlow-dimensional latent representation space while retaining contractive\nstability after decoding. We further extend our approach to learning\ncontractive systems on the Lie group of rotations to account for full-pose\nend-effector dynamic motions. The result is the first highly flexible learning\narchitecture that provides contractive stability guarantees with capability to\nperform obstacle avoidance. 
Empirically, we demonstrate that our approach\nencodes the desired dynamics more accurately than the current state-of-the-art,\nwhich provides less strong stability guarantees.\n","authors":["Hadi Beik-Mohammadi","Søren Hauberg","Georgios Arvanitidis","Nadia Figueroa","Gerhard Neumann","Leonel Rozo"],"pdf_url":"https://arxiv.org/pdf/2401.09352v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06155v3","updated":"2024-01-17T17:13:59Z","published":"2023-06-09T15:38:25Z","title":"Intensity Profile Projection: A Framework for Continuous-Time\n Representation Learning for Dynamic Networks","summary":" We present a new representation learning framework, Intensity Profile\nProjection, for continuous-time dynamic network data. Given triples $(i,j,t)$,\neach representing a time-stamped ($t$) interaction between two entities\n($i,j$), our procedure returns a continuous-time trajectory for each node,\nrepresenting its behaviour over time. The framework consists of three stages:\nestimating pairwise intensity functions, e.g. via kernel smoothing; learning a\nprojection which minimises a notion of intensity reconstruction error; and\nconstructing evolving node representations via the learned projection. The\ntrajectories satisfy two properties, known as structural and temporal\ncoherence, which we see as fundamental for reliable inference. Moreoever, we\ndevelop estimation theory providing tight control on the error of any estimated\ntrajectory, indicating that the representations could even be used in quite\nnoise-sensitive follow-on analyses. The theory also elucidates the role of\nsmoothing as a bias-variance trade-off, and shows how we can reduce the level\nof smoothing as the signal-to-noise ratio increases on account of the algorithm\n`borrowing strength' across the network.\n","authors":["Alexander Modell","Ian Gallagher","Emma Ceccherini","Nick Whiteley","Patrick Rubin-Delanchy"],"pdf_url":"https://arxiv.org/pdf/2306.06155v3.pdf","comment":"38 pages, 10 figures"},{"id":"http://arxiv.org/abs/2401.09346v1","updated":"2024-01-17T17:11:45Z","published":"2024-01-17T17:11:45Z","title":"High Confidence Level Inference is Almost Free using Parallel Stochastic\n Optimization","summary":" Uncertainty quantification for estimation through stochastic optimization\nsolutions in an online setting has gained popularity recently. This paper\nintroduces a novel inference method focused on constructing confidence\nintervals with efficient computation and fast convergence to the nominal level.\nSpecifically, we propose to use a small number of independent multi-runs to\nacquire distribution information and construct a t-based confidence interval.\nOur method requires minimal additional computation and memory beyond the\nstandard updating of estimates, making the inference process almost cost-free.\nWe provide a rigorous theoretical guarantee for the confidence interval,\ndemonstrating that the coverage is approximately exact with an explicit\nconvergence rate and allowing for high confidence level inference. In\nparticular, a new Gaussian approximation result is developed for the online\nestimators to characterize the coverage properties of our confidence intervals\nin terms of relative errors. Additionally, our method also allows for\nleveraging parallel computing to further accelerate calculations using multiple\ncores. 
It is easy to implement and can be integrated with existing stochastic\nalgorithms without the need for complicated modifications.\n","authors":["Wanrong Zhu","Zhipeng Lou","Ziyang Wei","Wei Biao Wu"],"pdf_url":"https://arxiv.org/pdf/2401.09346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09340v1","updated":"2024-01-17T17:04:35Z","published":"2024-01-17T17:04:35Z","title":"SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene\n Understanding","summary":" 3D vision-language grounding, which focuses on aligning language with the 3D\nphysical environment, stands as a cornerstone in the development of embodied\nagents. In comparison to recent advancements in the 2D domain, grounding\nlanguage in 3D scenes faces several significant challenges: (i) the inherent\ncomplexity of 3D scenes due to the diverse object configurations, their rich\nattributes, and intricate relationships; (ii) the scarcity of paired 3D\nvision-language data to support grounded learning; and (iii) the absence of a\nunified learning framework to distill knowledge from grounded 3D data. In this\nwork, we aim to address these three major challenges in 3D vision-language by\nexamining the potential of systematically upscaling 3D vision-language learning\nin indoor environments. We introduce the first million-scale 3D vision-language\ndataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising\n2.5M vision-language pairs derived from both human annotations and our scalable\nscene-graph-based generation approach. We demonstrate that this scaling allows\nfor a unified pre-training framework, Grounded Pre-training for Scenes (GPS),\nfor 3D vision-language learning. Through extensive experiments, we showcase the\neffectiveness of GPS by achieving state-of-the-art performance on all existing\n3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is\nunveiled through zero-shot transfer experiments in the challenging 3D\nvision-language tasks. Project website: https://scene-verse.github.io .\n","authors":["Baoxiong Jia","Yixin Chen","Huangyue Yu","Yan Wang","Xuesong Niu","Tengyu Liu","Qing Li","Siyuan Huang"],"pdf_url":"https://arxiv.org/pdf/2401.09340v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2311.12023v2","updated":"2024-01-17T17:01:57Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. 
Experiments on finetuning RoBERTa and LLaMA-2 (7B and\n70B) demonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables\naggressive quantization to sub-3 bits with only minor performance degradations.\nWhen finetuned on a language modeling calibration dataset, LQ-LoRA can also be\nused for model compression; in this setting our 2.75-bit LLaMA-2-70B model\n(which has 2.85 bits on average when including the low-rank components and\nrequires 27GB of GPU memory) performs respectably compared to the 16-bit\nbaseline.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09339v1","updated":"2024-01-17T17:01:08Z","published":"2024-01-17T17:01:08Z","title":"Central Limit Theorem for Two-Timescale Stochastic Approximation with\n Markovian Noise: Theory and Applications","summary":" Two-timescale stochastic approximation (TTSA) is among the most general\nframeworks for iterative stochastic algorithms. This includes well-known\nstochastic optimization methods such as SGD variants and those designed for\nbilevel or minimax problems, as well as reinforcement learning like the family\nof gradient-based temporal difference (GTD) algorithms. In this paper, we\nconduct an in-depth asymptotic analysis of TTSA under controlled Markovian\nnoise via central limit theorem (CLT), uncovering the coupled dynamics of TTSA\ninfluenced by the underlying Markov chain, which has not been addressed by\nprevious CLT results of TTSA only with Martingale difference noise. Building\nupon our CLT, we expand its application horizon of efficient sampling\nstrategies from vanilla SGD to a wider TTSA context in distributed learning,\nthus broadening the scope of Hu et al. (2022). In addition, we leverage our CLT\nresult to deduce the statistical properties of GTD algorithms with nonlinear\nfunction approximation using Markovian samples and show their identical\nasymptotic performance, a perspective not evident from current finite-time\nbounds.\n","authors":["Jie Hu","Vishwaraj Doshi","Do Young Eun"],"pdf_url":"https://arxiv.org/pdf/2401.09339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09333v1","updated":"2024-01-17T16:57:18Z","published":"2024-01-17T16:57:18Z","title":"Machines Do See Color: A Guideline to Classify Different Forms of Racist\n Discourse in Large Corpora","summary":" Current methods to identify and classify racist language in text rely on\nsmall-n qualitative approaches or large-n approaches focusing exclusively on\novert forms of racist discourse. This article provides a step-by-step\ngeneralizable guideline to identify and classify different forms of racist\ndiscourse in large corpora. In our approach, we start by conceptualizing racism\nand its different manifestations. We then contextualize these racist\nmanifestations to the time and place of interest, which allows researchers to\nidentify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a\ncross-lingual model for supervised text classification with a cutting-edge\ncontextual understanding of text. We show that XLM-R and XLM-R-Racismo, our\npretrained model, outperform other state-of-the-art approaches in classifying\nracism in large corpora. 
We illustrate our approach using a corpus of tweets\nrelating to the Ecuadorian ind\\'igena community between 2018 and 2021.\n","authors":["Diana Davila Gordillo","Joan Timoneda","Sebastian Vallejo Vera"],"pdf_url":"https://arxiv.org/pdf/2401.09333v1.pdf","comment":"37 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.01506v2","updated":"2024-01-17T16:53:09Z","published":"2024-01-03T02:22:39Z","title":"AIRI: Predicting Retention Indices and their Uncertainties using\n Artificial Intelligence","summary":" The Kov\\'ats Retention index (RI) is a quantity measured using gas\nchromatography and commonly used in the identification of chemical structures.\nCreating libraries of observed RI values is a laborious task, so we explore the\nuse of a deep neural network for predicting RI values from structure for\nstandard semipolar columns. This network generated predictions with a mean\nabsolute error of 15.1 and, in a quantification of the tail of the error\ndistribution, a 95th percentile absolute error of 46.5. Because of the\nArtificial Intelligence Retention Indices (AIRI) network's accuracy, it was\nused to predict RI values for the NIST EI-MS spectral libraries. These RI\nvalues are used to improve chemical identification methods and the quality of\nthe library. Estimating uncertainty is an important practical need when using\nprediction models. To quantify the uncertainty of our network for each\nindividual prediction, we used the outputs of an ensemble of 8 networks to\ncalculate a predicted standard deviation for each RI value prediction. This\npredicted standard deviation was corrected to follow the error between observed\nand predicted RI values. The Z scores using these predicted standard deviations\nhad a standard deviation of 1.52 and a 95th percentile absolute Z score\ncorresponding to a mean RI value of 42.6.\n","authors":["Lewis Y. Geer","Stephen E. Stein","William Gary Mallard","Douglas J. Slotta"],"pdf_url":"https://arxiv.org/pdf/2401.01506v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09323v1","updated":"2024-01-17T16:47:39Z","published":"2024-01-17T16:47:39Z","title":"BENO: Boundary-embedded Neural Operators for Elliptic PDEs","summary":" Elliptic partial differential equations (PDEs) are a major class of\ntime-independent PDEs that play a key role in many scientific and engineering\ndomains such as fluid dynamics, plasma physics, and solid mechanics. Recently,\nneural operators have emerged as a promising technique to solve elliptic PDEs\nmore efficiently by directly mapping the input to solutions. However, existing\nnetworks typically cannot handle complex geometries and inhomogeneous boundary\nvalues present in the real world. Here we introduce Boundary-Embedded Neural\nOperators (BENO), a novel neural operator architecture that embeds the complex\ngeometries and inhomogeneous boundary values into the solving of elliptic PDEs.\nInspired by classical Green's function, BENO consists of two branches of Graph\nNeural Networks (GNNs) for interior source term and boundary values,\nrespectively. Furthermore, a Transformer encoder maps the global boundary\ngeometry into a latent vector which influences each message passing layer of\nthe GNNs. We test our model extensively in elliptic PDEs with various boundary\nconditions. We show that all existing baseline methods fail to learn the\nsolution operator. 
In contrast, our model, endowed with boundary-embedded\narchitecture, outperforms state-of-the-art neural operators and strong\nbaselines by an average of 60.96\\%. Our source code can be found at\nhttps://github.com/AI4Science-WestlakeU/beno.git.\n","authors":["Haixin Wang","Jiaxin Li","Anubhav Dwivedi","Kentaro Hara","Tailin Wu"],"pdf_url":"https://arxiv.org/pdf/2401.09323v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2305.18171v3","updated":"2024-01-17T16:38:47Z","published":"2023-05-29T16:02:09Z","title":"Improved Probabilistic Image-Text Representations","summary":" Image-Text Matching (ITM) task, a fundamental vision-language (VL) task,\nsuffers from the inherent ambiguity arising from multiplicity and imperfect\nannotations. Deterministic functions are not sufficiently powerful to capture\nambiguity, prompting the exploration of probabilistic embeddings to tackle the\nchallenge. However, the existing probabilistic ITM approach encounters two key\nshortcomings: the burden of heavy computations due to the Monte Carlo\napproximation, and the loss saturation issue in the face of abundant false\nnegatives. To overcome the issues, this paper presents an improved\nProbabilistic Cross-Modal Embeddings (named PCME++) by introducing a new\nprobabilistic distance with a closed-form solution. In addition, two\noptimization techniques are proposed to enhance PCME++ further: first, the\nincorporation of pseudo-positives to prevent the loss saturation problem under\nmassive false negatives; second, mixed sample data augmentation for\nprobabilistic matching. Experimental results on MS-COCO Caption and two\nextended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of\nPCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is\nalso evaluated under noisy image-text correspondences. In addition, the\npotential applicability of PCME++ in automatic prompt tuning for zero-shot\nclassification is shown. The code is available at\nhttps://github.com/naver-ai/pcmepp.\n","authors":["Sanghyuk Chun"],"pdf_url":"https://arxiv.org/pdf/2305.18171v3.pdf","comment":"ICLR 2024; Code: https://github.com/naver-ai/pcmepp. Project page:\n https://naver-ai.github.io/pcmepp/. 26 pages, 2.4 MB"},{"id":"http://arxiv.org/abs/2304.10045v2","updated":"2024-01-17T16:28:51Z","published":"2023-04-20T01:46:39Z","title":"ID-MixGCL: Identity Mixup for Graph Contrastive Learning","summary":" Graph contrastive learning (GCL) has recently achieved substantial\nadvancements. Existing GCL approaches compare two different ``views'' of the\nsame graph in order to learn node/graph representations. The underlying\nassumption of these studies is that the graph augmentation strategy is capable\nof generating several different graph views such that the graph views are\nstructurally different but semantically similar to the original graphs, and\nthus the ground-truth labels of the original and augmented graph/nodes can be\nregarded identical in contrastive learning. However, we observe that this\nassumption does not always hold. For instance, the deletion of a super-node\nwithin a social network can exert a substantial influence on the partitioning\nof communities for other nodes. Similarly, any perturbation to nodes or edges\nin a molecular graph will change the labels of the graph. Therefore, we believe\nthat augmenting the graph, accompanied by an adaptation of the labels used for\nthe contrastive loss, will facilitate the encoder to learn a better\nrepresentation. 
Based on this idea, we propose ID-MixGCL, which allows the\nsimultaneous interpolation of input nodes and corresponding identity labels to\nobtain soft-confidence samples, with a controllable degree of change, leading\nto the capture of fine-grained representations from self-supervised training on\nunlabeled graphs. Experimental results demonstrate that ID-MixGCL improves\nperformance on graph classification and node classification tasks, as\ndemonstrated by significant improvements on the Cora, IMDB-B, IMDB-M, and\nPROTEINS datasets compared to state-of-the-art techniques, by 3-29% absolute\npoints.\n","authors":["Gehang Zhang","Bowen Yu","Jiangxia Cao","Xinghua Zhang","Jiawei Sheng","Chuan Zhou","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2304.10045v2.pdf","comment":"10 pages, 7 figures, accepted by IEEE BigData 2023"},{"id":"http://arxiv.org/abs/2401.03955v3","updated":"2024-01-17T16:27:24Z","published":"2024-01-08T15:21:21Z","title":"Tiny Time Mixers (TTMs): Fast Pre-trained Models for Enhanced\n Zero/Few-Shot Forecasting of Multivariate Time Series","summary":" Large pre-trained models for zero/few-shot learning excel in language and\nvision domains but encounter challenges in multivariate time series (TS) due to\nthe diverse nature and scarcity of publicly available pre-training data.\nConsequently, there has been a recent surge in utilizing pre-trained large\nlanguage models (LLMs) with token adaptations for TS forecasting. These\napproaches employ cross-domain transfer learning and surprisingly yield\nimpressive results. However, these models are typically very slow and large\n(~billion parameters) and do not consider cross-channel correlations. To\naddress this, we present Tiny Time Mixers (TTM), a significantly small model\nbased on the lightweight TSMixer architecture. TTM marks the first success in\ndeveloping fast and tiny general pre-trained models (<1M parameters),\nexclusively trained on public TS datasets, with effective transfer learning\ncapabilities for forecasting. To tackle the complexity of pre-training on\nmultiple datasets with varied temporal resolutions, we introduce several novel\nenhancements such as adaptive patching, dataset augmentation via downsampling,\nand resolution prefix tuning. Moreover, we employ a multi-level modeling\nstrategy to effectively model channel correlations and infuse exogenous signals\nduring fine-tuning, a crucial capability lacking in existing benchmarks. TTM\nshows significant accuracy gains (12-38\\%) over popular benchmarks in\nfew/zero-shot forecasting. It also drastically reduces the compute needs as\ncompared to LLM-TS methods, with a 14X cut in learnable parameters, 106X less\ntotal parameters, and substantial reductions in fine-tuning (65X) and inference\ntime (54X). In fact, TTM's zero-shot often surpasses the few-shot results in\nmany popular benchmarks, highlighting the efficacy of our approach. Code and\npre-trained models will be open-sourced.\n","authors":["Vijay Ekambaram","Arindam Jati","Nam H. Nguyen","Pankaj Dayama","Chandra Reddy","Wesley M. Gifford","Jayant Kalagnanam"],"pdf_url":"https://arxiv.org/pdf/2401.03955v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07993v2","updated":"2024-01-17T16:02:27Z","published":"2024-01-15T22:36:11Z","title":"Carrying over algorithm in transformers","summary":" Addition is perhaps one of the simplest arithmetic tasks one can think of and\nis usually performed using the carrying over algorithm. 
This algorithm consists\nof two tasks: adding digits in the same position and carrying over a one\nwhenever necessary. We study how transformer models implement this algorithm\nand how the two aforementioned tasks are allocated to different parts of the\nnetwork. We first focus on two-layer encoder-only models and show that the\ncarrying over algorithm is implemented in a modular fashion. The first layer is\nmostly responsible for adding digits in the same position. The second layer\nfirst decides, in the attention, which positions need a carried one or not, and\nthen performs the carrying of the one in the final MLP. We provide a simple way\nof precisely identifying which neurons are responsible for that task. This\nimplementation of the carrying over algorithm occurs across a range of\nhyperparameters for two as well as three-layer models. For small decoder-only\nmodels, we observe the same implementation and provide suggestive evidence for\nits existence in three 7B large language models.\n","authors":["Jorrit Kruthoff"],"pdf_url":"https://arxiv.org/pdf/2401.07993v2.pdf","comment":"Comments welcome!"},{"id":"http://arxiv.org/abs/2401.09294v1","updated":"2024-01-17T15:54:36Z","published":"2024-01-17T15:54:36Z","title":"T-FOLEY: A Controllable Waveform-Domain Diffusion Model for\n Temporal-Event-Guided Foley Sound Synthesis","summary":" Foley sound, audio content inserted synchronously with videos, plays a\ncritical role in the user experience of multimedia content. Recently, there has\nbeen active research in Foley sound synthesis, leveraging the advancements in\ndeep generative models. However, such works mainly focus on replicating a\nsingle sound class or a textual sound description, neglecting temporal\ninformation, which is crucial in the practical applications of Foley sound. We\npresent T-Foley, a Temporal-event-guided waveform generation model for Foley\nsound synthesis. T-Foley generates high-quality audio using two conditions: the\nsound class and temporal event feature. For temporal conditioning, we devise a\ntemporal event feature and a novel conditioning technique named Block-FiLM.\nT-Foley achieves superior performance in both objective and subjective\nevaluation metrics and generates Foley sound well-synchronized with the\ntemporal events. Additionally, we showcase T-Foley's practical applications,\nparticularly in scenarios involving vocal mimicry for temporal event control.\nWe show the demo on our companion website.\n","authors":["Yoonjin Chung","Junwon Lee","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2401.09294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13506v3","updated":"2024-01-17T15:45:54Z","published":"2023-01-31T09:46:37Z","title":"Supporting Safety Analysis of Image-processing DNNs through\n Clustering-based Approaches","summary":" The adoption of deep neural networks (DNNs) in safety-critical contexts is\noften prevented by the lack of effective means to explain their results,\nespecially when they are erroneous. In our previous work, we proposed a\nwhite-box approach (HUDD) and a black-box approach (SAFE) to automatically\ncharacterize DNN failures. They both identify clusters of similar images from a\npotentially large set of images leading to DNN failures. However, the analysis\npipelines for HUDD and SAFE were instantiated in specific ways according to\ncommon practices, deferring the analysis of other pipelines to future work. 
In\nthis paper, we report on an empirical evaluation of 99 different pipelines for\nroot cause analysis of DNN failures. They combine transfer learning,\nautoencoders, heatmaps of neuron relevance, dimensionality reduction\ntechniques, and different clustering algorithms. Our results show that the best\npipeline combines transfer learning, DBSCAN, and UMAP. It leads to clusters\nalmost exclusively capturing images of the same failure scenario, thus\nfacilitating root cause analysis. Further, it generates distinct clusters for\neach root cause of failure, thus enabling engineers to detect all the unsafe\nscenarios. Interestingly, these results hold even for failure scenarios that\nare only observed in a small percentage of the failing images.\n","authors":["Mohammed Oualid Attaoui","Hazem Fahmy","Fabrizio Pastore","Lionel Briand"],"pdf_url":"https://arxiv.org/pdf/2301.13506v3.pdf","comment":"16 Tables, 15 Figures"},{"id":"http://arxiv.org/abs/2401.09278v1","updated":"2024-01-17T15:32:04Z","published":"2024-01-17T15:32:04Z","title":"Adaptive Regret for Bandits Made Possible: Two Queries Suffice","summary":" Fast changing states or volatile environments pose a significant challenge to\nonline optimization, which needs to perform rapid adaptation under limited\nobservation. In this paper, we give query and regret optimal bandit algorithms\nunder the strict notion of strongly adaptive regret, which measures the maximum\nregret over any contiguous interval $I$. Due to its worst-case nature, there is\nan almost-linear $\\Omega(|I|^{1-\\epsilon})$ regret lower bound, when only one\nquery per round is allowed [Daniely el al, ICML 2015]. Surprisingly, with just\ntwo queries per round, we give Strongly Adaptive Bandit Learner (StABL) that\nachieves $\\tilde{O}(\\sqrt{n|I|})$ adaptive regret for multi-armed bandits with\n$n$ arms. The bound is tight and cannot be improved in general. Our algorithm\nleverages a multiplicative update scheme of varying stepsizes and a carefully\nchosen observation distribution to control the variance. Furthermore, we extend\nour results and provide optimal algorithms in the bandit convex optimization\nsetting. Finally, we empirically demonstrate the superior performance of our\nalgorithms under volatile environments and for downstream tasks, such as\nalgorithm selection for hyperparameter optimization.\n","authors":["Zhou Lu","Qiuyi Zhang","Xinyi Chen","Fred Zhang","David Woodruff","Elad Hazan"],"pdf_url":"https://arxiv.org/pdf/2401.09278v1.pdf","comment":"ICLR2024"},{"id":"http://arxiv.org/abs/2401.09274v1","updated":"2024-01-17T15:25:50Z","published":"2024-01-17T15:25:50Z","title":"Avoiding strict saddle points of nonconvex regularized problems","summary":" We introduce a strict saddle property for $\\ell_p$ regularized functions, and\npropose an iterative reweighted $\\ell_1$ algorithm to solve the $\\ell_p$\nregularized problems. The algorithm is guaranteed to converge only to local\nminimizers when randomly initialized. The strict saddle property is shown\ngeneric on these sparse optimization problems. 
Those analyses as well as the\nproposed algorithm can be easily extended to general nonconvex regularized\nproblems.\n","authors":["Luwei Bai"],"pdf_url":"https://arxiv.org/pdf/2401.09274v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2308.12143v3","updated":"2024-01-17T15:25:22Z","published":"2023-08-23T14:00:58Z","title":"A Probabilistic Fluctuation based Membership Inference Attack for\n Diffusion Models","summary":" Membership Inference Attack (MIA) identifies whether a record exists in a\nmachine learning model's training set by querying the model. MIAs on the\nclassic classification models have been well-studied, and recent works have\nstarted to explore how to transplant MIA onto generative models. Our\ninvestigation indicates that existing MIAs designed for generative models\nmainly depend on the overfitting in target models. However, overfitting can be\navoided by employing various regularization techniques, whereas existing MIAs\ndemonstrate poor performance in practice. Unlike overfitting, memorization is\nessential for deep learning models to attain optimal performance, making it a\nmore prevalent phenomenon. Memorization in generative models leads to an\nincreasing trend in the probability distribution of generating records around\nthe member record. Therefore, we propose a Probabilistic Fluctuation Assessing\nMembership Inference Attack (PFAMI), a black-box MIA that infers memberships by\ndetecting these trends via analyzing the overall probabilistic fluctuations\naround given records. We conduct extensive experiments across multiple\ngenerative models and datasets, which demonstrate PFAMI can improve the attack\nsuccess rate (ASR) by about 27.9% when compared with the best baseline.\n","authors":["Wenjie Fu","Huandong Wang","Chen Gao","Guanghua Liu","Yong Li","Tao Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.12143v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09267v1","updated":"2024-01-17T15:15:52Z","published":"2024-01-17T15:15:52Z","title":"Risk-Aware Accelerated Wireless Federated Learning with Heterogeneous\n Clients","summary":" Wireless Federated Learning (FL) is an emerging distributed machine learning\nparadigm, particularly gaining momentum in domains with confidential and\nprivate data on mobile clients. However, the location-dependent performance, in\nterms of transmission rates and susceptibility to transmission errors, poses\nmajor challenges for wireless FL's convergence speed and accuracy. The\nchallenge is more acute for hostile environments without a metric that\nauthenticates the data quality and security profile of the clients. In this\ncontext, this paper proposes a novel risk-aware accelerated FL framework that\naccounts for the clients' heterogeneity in the amount of possessed data,\ntransmission rates, transmission errors, and trustworthiness. Classifying\nclients according to their location-dependent performance and trustworthiness\nprofiles, we propose a dynamic risk-aware global model aggregation scheme that\nallows clients to participate in descending order of their transmission rates\nand an ascending trustworthiness constraint. In particular, the transmission\nrate is the dominant participation criterion for initial rounds to accelerate\nthe convergence speed. Our model then progressively relaxes the transmission\nrate restriction to explore more training data at cell-edge clients. The\naggregation rounds incorporate a debiasing factor that accounts for\ntransmission errors. 
Risk-awareness is enabled by a validation set, where the\nbase station eliminates non-trustworthy clients at the fine-tuning stage. The\nproposed scheme is benchmarked against a conservative scheme (i.e., only\nallowing trustworthy devices) and an aggressive scheme (i.e., oblivious to the\ntrust metric). The numerical results highlight the superiority of the proposed\nscheme in terms of accuracy and convergence speed when compared to both\nbenchmarks.\n","authors":["Mohamed Ads","Hesham ElSawy","Hossam S. Hassanein"],"pdf_url":"https://arxiv.org/pdf/2401.09267v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10224v3","updated":"2024-01-17T15:13:37Z","published":"2023-10-16T09:34:06Z","title":"Generalizing Medical Image Representations via Quaternion Wavelet\n Networks","summary":" Neural network generalizability is becoming a broad research field due to the\nincreasing availability of datasets from different sources and for various\ntasks. This issue is even wider when processing medical data, where a lack of\nmethodological standards causes large variations being provided by different\nimaging centers or acquired with various devices and cofactors. To overcome\nthese limitations, we introduce a novel, generalizable, data- and task-agnostic\nframework able to extract salient features from medical images. The proposed\nquaternion wavelet network (QUAVE) can be easily integrated with any\npre-existing medical image analysis or synthesis task, and it can be involved\nwith real, quaternion, or hypercomplex-valued models, generalizing their\nadoption to single-channel data. QUAVE first extracts different sub-bands\nthrough the quaternion wavelet transform, resulting in both\nlow-frequency/approximation bands and high-frequency/fine-grained features.\nThen, it weighs the most representative set of sub-bands to be involved as\ninput to any other neural model for image processing, replacing standard data\nsamples. We conduct an extensive experimental evaluation comprising different\ndatasets, diverse image analysis, and synthesis tasks including reconstruction,\nsegmentation, and modality translation. We also evaluate QUAVE in combination\nwith both real and quaternion-valued models. Results demonstrate the\neffectiveness and the generalizability of the proposed framework that improves\nnetwork performance while being flexible to be adopted in manifold scenarios\nand robust to domain shifts. The full code is available at:\nhttps://github.com/ispamm/QWT.\n","authors":["Luigi Sigillo","Eleonora Grassucci","Aurelio Uncini","Danilo Comminiello"],"pdf_url":"https://arxiv.org/pdf/2310.10224v3.pdf","comment":"This paper is currently under review"},{"id":"http://arxiv.org/abs/2401.09261v1","updated":"2024-01-17T15:12:11Z","published":"2024-01-17T15:12:11Z","title":"MSHyper: Multi-Scale Hypergraph Transformer for Long-Range Time Series\n Forecasting","summary":" Demystifying interactions between temporal patterns of different scales is\nfundamental to precise long-range time series forecasting. However, previous\nworks lack the ability to model high-order interactions. To promote more\ncomprehensive pattern interaction modeling for long-range time series\nforecasting, we propose a Multi-Scale Hypergraph Transformer (MSHyper)\nframework. Specifically, a multi-scale hypergraph is introduced to provide\nfoundations for modeling high-order pattern interactions. Then by treating\nhyperedges as nodes, we also build a hyperedge graph to enhance hypergraph\nmodeling. 
In addition, a tri-stage message passing mechanism is introduced to\naggregate pattern information and learn the interaction strength between\ntemporal patterns of different scales. Extensive experiments on five real-world\ndatasets demonstrate that MSHyper achieves state-of-the-art performance,\nreducing prediction errors by an average of 8.73% and 7.15% over the best\nbaseline in MSE and MAE, respectively.\n","authors":["Zongjiang Shang","Ling Chen"],"pdf_url":"https://arxiv.org/pdf/2401.09261v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.04741v4","updated":"2024-01-17T15:10:26Z","published":"2023-10-07T08:54:43Z","title":"Balancing stability and plasticity in continual learning: the\n readout-decomposition of activation change (RDAC) framework","summary":" Continual learning (CL) algorithms strive to acquire new knowledge while\npreserving prior information. However, this stability-plasticity trade-off\nremains a central challenge. This paper introduces a framework that dissects\nthis trade-off, offering valuable insights into CL algorithms. The\nReadout-Decomposition of Activation Change (RDAC) framework first addresses the\nstability-plasticity dilemma and its relation to catastrophic forgetting. It\nrelates learning-induced activation changes in the range of prior readouts to\nthe degree of stability and changes in the null space to the degree of\nplasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the\nframework clarifies the stability-plasticity trade-offs of the popular\nregularization algorithms Synaptic intelligence (SI), Elastic-weight\nconsolidation (EWC), and learning without Forgetting (LwF), and replay-based\nalgorithms Gradient episodic memory (GEM), and data replay. GEM and data replay\npreserved stability and plasticity, while SI, EWC, and LwF traded off\nplasticity for stability. The inability of the regularization algorithms to\nmaintain plasticity was linked to them restricting the change of activations in\nthe null space of the prior readout. Additionally, for one-hidden-layer linear\nneural networks, we derived a gradient decomposition algorithm to restrict\nactivation change only in the range of the prior readouts, to maintain high\nstability while not further sacrificing plasticity. Results demonstrate that\nthe algorithm maintained stability without significant plasticity loss. The\nRDAC framework informs the behavior of existing CL algorithms and paves the way\nfor novel CL approaches. Finally, it sheds light on the connection between\nlearning-induced activation/representation changes and the stability-plasticity\ndilemma, also offering insights into representational drift in biological\nsystems.\n","authors":["Daniel Anthes","Sushrut Thorat","Peter König","Tim C. Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2310.04741v4.pdf","comment":"15 pages, 5 figures, Revision"},{"id":"http://arxiv.org/abs/2401.09257v1","updated":"2024-01-17T15:03:37Z","published":"2024-01-17T15:03:37Z","title":"A First-Order Multi-Gradient Algorithm for Multi-Objective Bi-Level\n Optimization","summary":" In this paper, we study the Multi-Objective Bi-Level Optimization (MOBLO)\nproblem, where the upper-level subproblem is a multi-objective optimization\nproblem and the lower-level subproblem is for scalar optimization. Existing\ngradient-based MOBLO algorithms need to compute the Hessian matrix, causing the\ncomputational inefficient problem. To address this, we propose an efficient\nfirst-order multi-gradient method for MOBLO, called FORUM. 
Specifically, we\nreformulate MOBLO problems as a constrained multi-objective optimization (MOO)\nproblem via the value-function approach. Then we propose a novel multi-gradient\naggregation method to solve the challenging constrained MOO problem.\nTheoretically, we provide the complexity analysis to show the efficiency of the\nproposed method and a non-asymptotic convergence result. Empirically, extensive\nexperiments demonstrate the effectiveness and efficiency of the proposed FORUM\nmethod in different learning problems. In particular, it achieves\nstate-of-the-art performance on three multi-task learning benchmark datasets.\n","authors":["Feiyang Ye","Baijiong Lin","Xiaofeng Cao","Yu Zhang","Ivor Tsang"],"pdf_url":"https://arxiv.org/pdf/2401.09257v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2306.09980v2","updated":"2024-01-17T15:03:18Z","published":"2023-06-16T17:23:49Z","title":"Creating Multi-Level Skill Hierarchies in Reinforcement Learning","summary":" What is a useful skill hierarchy for an autonomous agent? We propose an\nanswer based on a graphical representation of how the interaction between an\nagent and its environment may unfold. Our approach uses modularity maximisation\nas a central organising principle to expose the structure of the interaction\ngraph at multiple levels of abstraction. The result is a collection of skills\nthat operate at varying time scales, organised into a hierarchy, where skills\nthat operate over longer time scales are composed of skills that operate over\nshorter time scales. The entire skill hierarchy is generated automatically,\nwith no human intervention, including the skills themselves (their behaviour,\nwhen they can be called, and when they terminate) as well as the hierarchical\ndependency structure between them. In a wide range of environments, this\napproach generates skill hierarchies that are intuitively appealing and that\nconsiderably improve the learning performance of the agent.\n","authors":["Joshua B. Evans","Özgür Şimşek"],"pdf_url":"https://arxiv.org/pdf/2306.09980v2.pdf","comment":"20 pages, 10 figures. Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.09252v1","updated":"2024-01-17T14:57:27Z","published":"2024-01-17T14:57:27Z","title":"3D Scene Geometry Estimation from 360$^\\circ$ Imagery: A Survey","summary":" This paper provides a comprehensive survey on pioneer and state-of-the-art 3D\nscene geometry estimation methodologies based on single, two, or multiple\nimages captured under the omnidirectional optics. We first revisit the basic\nconcepts of the spherical camera model, and review the most common acquisition\ntechnologies and representation formats suitable for omnidirectional (also\ncalled 360$^\\circ$, spherical or panoramic) images and videos. We then survey\nmonocular layout and depth inference approaches, highlighting the recent\nadvances in learning-based solutions suited for spherical data. The classical\nstereo matching is then revised on the spherical domain, where methodologies\nfor detecting and describing sparse and dense features become crucial. The\nstereo matching concepts are then extrapolated for multiple view camera setups,\ncategorizing them among light fields, multi-view stereo, and structure from\nmotion (or visual simultaneous localization and mapping). We also compile and\ndiscuss commonly adopted datasets and figures of merit indicated for each\npurpose and list recent results for completeness. 
We conclude this paper by\npointing out current and future trends.\n","authors":["Thiago Lopes Trugillo da Silveira","Paulo Gamarra Lessa Pinto","Jeffri Erwin Murrugarra Llerena","Claudio Rosito Jung"],"pdf_url":"https://arxiv.org/pdf/2401.09252v1.pdf","comment":"Published in ACM Computing Surveys"},{"id":"http://arxiv.org/abs/2401.09251v1","updated":"2024-01-17T14:56:42Z","published":"2024-01-17T14:56:42Z","title":"Bridging the Gap Between General and Down-Closed Convex Sets in\n Submodular Maximization","summary":" Optimization of DR-submodular functions has experienced a notable surge in\nsignificance in recent times, marking a pivotal development within the domain\nof non-convex optimization. Motivated by real-world scenarios, some recent\nworks have delved into the maximization of non-monotone DR-submodular functions\nover general (not necessarily down-closed) convex set constraints. Up to this\npoint, these works have all used the minimum $\\ell_\\infty$ norm of any feasible\nsolution as a parameter. Unfortunately, a recent hardness result due to Mualem\n\\& Feldman~\\cite{mualem2023resolving} shows that this approach cannot yield a\nsmooth interpolation between down-closed and non-down-closed constraints. In\nthis work, we suggest novel offline and online algorithms that provably provide\nsuch an interpolation based on a natural decomposition of the convex body\nconstraint into two distinct convex bodies: a down-closed convex body and a\ngeneral convex body. We also empirically demonstrate the superiority of our\nproposed algorithms across three offline and two online applications.\n","authors":["Loay Mualem","Murad Tukan","Moran Fledman"],"pdf_url":"https://arxiv.org/pdf/2401.09251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14411v3","updated":"2024-01-17T14:55:36Z","published":"2023-06-26T04:12:40Z","title":"Score-based Source Separation with Applications to Digital Communication\n Signals","summary":" We propose a new method for separating superimposed sources using\ndiffusion-based generative models. Our method relies only on separately trained\nstatistical priors of independent sources to establish a new objective function\nguided by maximum a posteriori estimation with an $\\alpha$-posterior, across\nmultiple levels of Gaussian smoothing. Motivated by applications in\nradio-frequency (RF) systems, we are interested in sources with underlying\ndiscrete nature and the recovery of encoded bits from a signal of interest, as\nmeasured by the bit error rate (BER). Experimental results with RF mixtures\ndemonstrate that our method results in a BER reduction of 95% over classical\nand existing learning-based methods. Our analysis demonstrates that our\nproposed method yields solutions that asymptotically approach the modes of an\nunderlying discrete distribution. Furthermore, our method can be viewed as a\nmulti-source extension to the recently proposed score distillation sampling\nscheme, shedding additional light on its use beyond conditional sampling. The\nproject webpage is available at https://alpha-rgs.github.io\n","authors":["Tejas Jayashankar","Gary C. F. Lee","Alejandro Lancho","Amir Weiss","Yury Polyanskiy","Gregory W. 
Wornell"],"pdf_url":"https://arxiv.org/pdf/2306.14411v3.pdf","comment":"34 pages, 18 figures, for associated project webpage see\n https://alpha-rgs.github.io"},{"id":"http://arxiv.org/abs/2310.19802v5","updated":"2024-01-17T14:45:45Z","published":"2023-10-04T01:32:55Z","title":"Stochastic Thermodynamics of Learning Parametric Probabilistic Models","summary":" We have formulated a family of machine learning problems as the time\nevolution of Parametric Probabilistic Models (PPMs), inherently rendering a\nthermodynamic process. Our primary motivation is to leverage the rich toolbox\nof thermodynamics of information to assess the information-theoretic content of\nlearning a probabilistic model. We first introduce two information-theoretic\nmetrics: Memorized-information (M-info) and Learned-information (L-info), which\ntrace the flow of information during the learning process of PPMs. Then, we\ndemonstrate that the accumulation of L-info during the learning process is\nassociated with entropy production, and parameters serve as a heat reservoir in\nthis process, capturing learned information in the form of M-info.\n","authors":["Shervin Sadat Parsi"],"pdf_url":"https://arxiv.org/pdf/2310.19802v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09243v1","updated":"2024-01-17T14:43:59Z","published":"2024-01-17T14:43:59Z","title":"DiffClone: Enhanced Behaviour Cloning in Robotics with Diffusion-Driven\n Policy Learning","summary":" Robot learning tasks are extremely compute-intensive and hardware-specific.\nThus the avenues of tackling these challenges, using a diverse dataset of\noffline demonstrations that can be used to train robot manipulation agents, is\nvery appealing. The Train-Offline-Test-Online (TOTO) Benchmark provides a\nwell-curated open-source dataset for offline training comprised mostly of\nexpert data and also benchmark scores of the common offline-RL and behaviour\ncloning agents. In this paper, we introduce DiffClone, an offline algorithm of\nenhanced behaviour cloning agent with diffusion-based policy learning, and\nmeasured the efficacy of our method on real online physical robots at test\ntime. This is also our official submission to the Train-Offline-Test-Online\n(TOTO) Benchmark Challenge organized at NeurIPS 2023. We experimented with both\npre-trained visual representation and agent policies. In our experiments, we\nfind that MOCO finetuned ResNet50 performs the best in comparison to other\nfinetuned representations. Goal state conditioning and mapping to transitions\nresulted in a minute increase in the success rate and mean-reward. As for the\nagent policy, we developed DiffClone, a behaviour cloning agent improved using\nconditional diffusion.\n","authors":["Sabariswaran Mani","Abhranil Chandra","Sreyas Venkataraman","Adyan Rizvi","Yash Sirvi","Soumojit Bhattacharya","Aritra Hazra"],"pdf_url":"https://arxiv.org/pdf/2401.09243v1.pdf","comment":"NeurIPS 2023 Train Offline Test Online Workshop and Competition"},{"id":"http://arxiv.org/abs/2304.14660v7","updated":"2024-01-17T14:42:40Z","published":"2023-04-28T07:23:31Z","title":"Segment Anything Model for Medical Images?","summary":" The Segment Anything Model (SAM) is the first foundation model for general\nimage segmentation. It has achieved impressive results on various natural image\nsegmentation tasks. However, medical image segmentation (MIS) is more\nchallenging because of the complex modalities, fine anatomical structures,\nuncertain and complex object boundaries, and wide-range object scales. 
To fully\nvalidate SAM's performance on medical data, we collected and sorted 53\nopen-source datasets and built a large medical segmentation dataset with 18\nmodalities, 84 objects, 125 object-modality paired targets, 1050K 2D images,\nand 6033K masks. We comprehensively analyzed different models and strategies on\nthe so-called COSMOS 1050K dataset. Our findings mainly include the following:\n1) SAM showed remarkable performance in some specific objects but was unstable,\nimperfect, or even totally failed in other situations. 2) SAM with the large\nViT-H showed better overall performance than that with the small ViT-B. 3) SAM\nperformed better with manual hints, especially box, than the Everything mode.\n4) SAM could help human annotation with high labeling quality and less time. 5)\nSAM was sensitive to the randomness in the center point and tight box prompts,\nand may suffer from a serious performance drop. 6) SAM performed better than\ninteractive methods with one or a few points, but will be outpaced as the\nnumber of points increases. 7) SAM's performance correlated to different\nfactors, including boundary complexity, intensity differences, etc. 8)\nFinetuning the SAM on specific medical tasks could improve its average DICE\nperformance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that\nthis comprehensive report can help researchers explore the potential of SAM\napplications in MIS, and guide how to appropriately use and develop SAM.\n","authors":["Yuhao Huang","Xin Yang","Lian Liu","Han Zhou","Ao Chang","Xinrui Zhou","Rusi Chen","Junxuan Yu","Jiongquan Chen","Chaoyu Chen","Sijing Liu","Haozhe Chi","Xindi Hu","Kejuan Yue","Lei Li","Vicente Grau","Deng-Ping Fan","Fajin Dong","Dong Ni"],"pdf_url":"https://arxiv.org/pdf/2304.14660v7.pdf","comment":"Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables"},{"id":"http://arxiv.org/abs/2312.04350v3","updated":"2024-01-17T14:41:55Z","published":"2023-12-07T15:12:12Z","title":"CLadder: Assessing Causal Reasoning in Language Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsights into the causal reasoning abilities of LLMs. 
Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v3.pdf","comment":"NeurIPS 2023; updated with CLadder dataset v1.5"},{"id":"http://arxiv.org/abs/2401.09237v1","updated":"2024-01-17T14:34:32Z","published":"2024-01-17T14:34:32Z","title":"Classification and Reconstruction Processes in Deep Predictive Coding\n Networks: Antagonists or Allies?","summary":" Predictive coding-inspired deep networks for visual computing integrate\nclassification and reconstruction processes in shared intermediate layers.\nAlthough synergy between these processes is commonly assumed, it has yet to be\nconvincingly demonstrated. In this study, we take a critical look at how\nclassifying and reconstructing interact in deep learning architectures. Our\napproach utilizes a purposefully designed family of model architectures\nreminiscent of autoencoders, each equipped with an encoder, a decoder, and a\nclassification head featuring varying modules and complexities. We meticulously\nanalyze the extent to which classification- and reconstruction-driven\ninformation can seamlessly coexist within the shared latent layer of the model\narchitectures. Our findings underscore a significant challenge:\nClassification-driven information diminishes reconstruction-driven information\nin intermediate layers' shared representations and vice versa. While expanding\nthe shared representation's dimensions or increasing the network's complexity\ncan alleviate this trade-off effect, our results challenge prevailing\nassumptions in predictive coding and offer guidance for future iterations of\npredictive coding concepts in deep networks.\n","authors":["Jan Rathjens","Laurenz Wiskott"],"pdf_url":"https://arxiv.org/pdf/2401.09237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09235v1","updated":"2024-01-17T14:30:46Z","published":"2024-01-17T14:30:46Z","title":"A Characterization Theorem for Equivariant Networks with Point-wise\n Activations","summary":" Equivariant neural networks have shown improved performance, expressiveness\nand sample complexity on symmetrical domains. But for some specific symmetries,\nrepresentations, and choice of coordinates, the most common point-wise\nactivations, such as ReLU, are not equivariant, hence they cannot be employed\nin the design of equivariant neural networks. The theorem we present in this\npaper describes all possible combinations of finite-dimensional\nrepresentations, choice of coordinates and point-wise activations to obtain an\nexactly equivariant layer, generalizing and strengthening existing\ncharacterizations. Notable cases of practical relevance are discussed as\ncorollaries. Indeed, we prove that rotation-equivariant networks can only be\ninvariant, as it happens for any network which is equivariant with respect to\nconnected compact groups. Then, we discuss implications of our findings when\napplied to important instances of exactly equivariant networks. First, we\ncompletely characterize permutation equivariant networks such as Invariant\nGraph Networks with point-wise nonlinearities and their geometric counterparts,\nhighlighting a plethora of models whose expressive power and performance are\nstill unknown. 
Second, we show that feature spaces of disentangled steerable\nconvolutional neural networks are trivial representations.\n","authors":["Marco Pacini","Xiaowen Dong","Bruno Lepri","Gabriele Santin"],"pdf_url":"https://arxiv.org/pdf/2401.09235v1.pdf","comment":"Accepted at the 12th International Conference on Learning\n Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2305.17225v3","updated":"2024-01-17T14:22:19Z","published":"2023-05-26T19:34:35Z","title":"Causal Component Analysis","summary":" Independent Component Analysis (ICA) aims to recover independent latent\nvariables from observed mixtures thereof. Causal Representation Learning (CRL)\naims instead to infer causally related (thus often statistically dependent)\nlatent variables, together with the unknown graph encoding their causal\nrelationships. We introduce an intermediate problem termed Causal Component\nAnalysis (CauCA). CauCA can be viewed as a generalization of ICA, modelling the\ncausal dependence among the latent components, and as a special case of CRL. In\ncontrast to CRL, it presupposes knowledge of the causal graph, focusing solely\non learning the unmixing function and the causal mechanisms. Any impossibility\nresults regarding the recovery of the ground truth in CauCA also apply for CRL,\nwhile possibility results may serve as a stepping stone for extensions to CRL.\nWe characterize CauCA identifiability from multiple datasets generated through\ndifferent types of interventions on the latent causal variables. As a\ncorollary, this interventional perspective also leads to new identifiability\nresults for nonlinear ICA -- a special case of CauCA with an empty graph --\nrequiring strictly fewer datasets than previous results. We introduce a\nlikelihood-based approach using normalizing flows to estimate both the unmixing\nfunction and the causal mechanisms, and demonstrate its effectiveness through\nextensive synthetic experiments in the CauCA and ICA setting.\n","authors":["Liang Wendong","Armin Kekić","Julius von Kügelgen","Simon Buchholz","Michel Besserve","Luigi Gresele","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2305.17225v3.pdf","comment":"NeurIPS 2023 final camera-ready version"},{"id":"http://arxiv.org/abs/2401.00828v3","updated":"2024-01-17T14:17:41Z","published":"2024-01-01T17:56:24Z","title":"Multi-Lattice Sampling of Quantum Field Theories via Neural\n Operator-based Flows","summary":" We consider the problem of sampling discrete field configurations $\\phi$ from\nthe Boltzmann distribution $[d\\phi] Z^{-1} e^{-S[\\phi]}$, where $S$ is the\nlattice-discretization of the continuous Euclidean action $\\mathcal S$ of some\nquantum field theory. Since such densities arise as the approximation of the\nunderlying functional density $[\\mathcal D\\phi(x)] \\mathcal Z^{-1} e^{-\\mathcal\nS[\\phi(x)]}$, we frame the task as an instance of operator learning. In\nparticular, we propose to approximate a time-dependent operator $\\mathcal V_t$\nwhose time integral provides a mapping between the functional distributions of\nthe free theory $[\\mathcal D\\phi(x)] \\mathcal Z_0^{-1} e^{-\\mathcal\nS_{0}[\\phi(x)]}$ and of the target theory $[\\mathcal D\\phi(x)]\\mathcal\nZ^{-1}e^{-\\mathcal S[\\phi(x)]}$. Whenever a particular lattice is chosen, the\noperator $\\mathcal V_t$ can be discretized to a finite dimensional,\ntime-dependent vector field $V_t$ which in turn induces a continuous\nnormalizing flow between finite dimensional distributions over the chosen\nlattice. 
This flow can then be trained to be a diffeomorphism between the\ndiscretized free and target theories $[d\\phi] Z_0^{-1} e^{-S_{0}[\\phi]}$,\n$[d\\phi] Z^{-1}e^{-S[\\phi]}$. We run experiments on the $\\phi^4$-theory to\nexplore to what extent such operator-based flow architectures generalize to\nlattice sizes they were not trained on and show that pretraining on smaller\nlattices can lead to a speedup over training only on the target lattice size.\n","authors":["Bálint Máté","François Fleuret"],"pdf_url":"https://arxiv.org/pdf/2401.00828v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00229v2","updated":"2024-01-17T14:10:15Z","published":"2023-09-30T02:25:18Z","title":"Combining Spatial and Temporal Abstraction in Planning for Better\n Generalization","summary":" Inspired by human conscious planning, we propose Skipper, a model-based\nreinforcement learning agent utilizing spatio-temporal abstractions to\ngeneralize learned skills in novel situations. It automatically decomposes the\ngiven task into smaller, more manageable subtasks, and hence enables sparse\ndecision-making and focused computation on the relevant parts of the\nenvironment. This relies on the extraction of an abstracted proxy problem\nrepresented as a directed graph, in which vertices and edges are learned\nend-to-end from hindsight. Our theoretical analyses provide performance\nguarantees under appropriate assumptions and establish where our approach is\nexpected to be helpful. Generalization-focused experiments validate Skipper's\nsignificant advantage in zero-shot generalization, compared to existing\nstate-of-the-art hierarchical planning methods.\n","authors":["Mingde Zhao","Safa Alver","Harm van Seijen","Romain Laroche","Doina Precup","Yoshua Bengio"],"pdf_url":"https://arxiv.org/pdf/2310.00229v2.pdf","comment":"accepted version for ICLR 2024"},{"id":"http://arxiv.org/abs/2309.08748v3","updated":"2024-01-17T14:07:16Z","published":"2023-09-15T20:21:46Z","title":"Wasserstein Distributionally Robust Policy Evaluation and Learning for\n Contextual Bandits","summary":" Off-policy evaluation and learning are concerned with assessing a given\npolicy and learning an optimal policy from offline data without direct\ninteraction with the environment. Often, the environment in which the data are\ncollected differs from the environment in which the learned policy is applied.\nTo account for the effect of different environments during learning and\nexecution, distributionally robust optimization (DRO) methods have been\ndeveloped that compute worst-case bounds on the policy values assuming that the\ndistribution of the new environment lies within an uncertainty set. Typically,\nthis uncertainty set is defined based on the KL divergence around the empirical\ndistribution computed from the logging dataset. However, the KL uncertainty set\nfails to encompass distributions with varying support and lacks awareness of\nthe geometry of the distribution support. As a result, KL approaches fall short\nin addressing practical environment mismatches and lead to over-fitting to\nworst-case scenarios. To overcome these limitations, we propose a novel DRO\napproach that employs the Wasserstein distance instead. While Wasserstein DRO\nis generally computationally more expensive compared to KL DRO, we present a\nregularized method and a practical (biased) stochastic gradient descent method\nto optimize the policy efficiently. 
We also provide a theoretical analysis of\nthe finite sample complexity and iteration complexity for our proposed method.\nWe further validate our approach using a public dataset that was recorded in a\nrandomized stroke trial.\n","authors":["Yi Shen","Pan Xu","Michael M. Zavlanos"],"pdf_url":"https://arxiv.org/pdf/2309.08748v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.03006v2","updated":"2024-01-17T14:02:12Z","published":"2024-01-05T11:35:10Z","title":"The Rise of Diffusion Models in Time-Series Forecasting","summary":" This survey delves into the application of diffusion models in time-series\nforecasting. Diffusion models are demonstrating state-of-the-art results in\nvarious fields of generative AI. The paper includes comprehensive background\ninformation on diffusion models, detailing their conditioning methods and\nreviewing their use in time-series forecasting. The analysis covers 11 specific\ntime-series implementations, the intuition and theory behind them, the\neffectiveness on different datasets, and a comparison among them. Key\ncontributions of this work are the thorough exploration of diffusion models'\napplications in time-series forecasting and a chronologically ordered overview\nof these models. Additionally, the paper offers an insightful discussion on the\ncurrent state-of-the-art in this domain and outlines potential future research\ndirections. This serves as a valuable resource for researchers in AI and\ntime-series analysis, offering a clear view of the latest advancements and\nfuture potential of diffusion models.\n","authors":["Caspar Meijer","Lydia Y. Chen"],"pdf_url":"https://arxiv.org/pdf/2401.03006v2.pdf","comment":"Version 2, 24 pages, 10 figures, 12 tables, For complete LuaTeX\n source:\n https://github.com/Capsar/The-Rise-of-Diffusion-Models-in-Time-Series-Forecasting\n , Written by: Caspar Meijer, Supervised by: Lydia Y. Chen"},{"id":"http://arxiv.org/abs/2401.07671v2","updated":"2024-01-17T13:49:05Z","published":"2024-01-15T13:35:21Z","title":"CLSA-CIM: A Cross-Layer Scheduling Approach for Computing-in-Memory\n Architectures","summary":" The demand for efficient machine learning (ML) accelerators is growing\nrapidly, driving the development of novel computing concepts such as resistive\nrandom access memory (RRAM)-based tiled computing-in-memory (CIM)\narchitectures. CIM allows computation within the memory unit, resulting in\nfaster data processing and reduced power consumption. Efficient compiler\nalgorithms are essential to exploit the potential of tiled CIM architectures.\nWhile conventional ML compilers focus on code generation for CPUs, GPUs, and\nother von Neumann architectures, adaptations are needed to cover CIM\narchitectures. Cross-layer scheduling is a promising approach, as it enhances\nthe utilization of CIM cores, thereby accelerating computations. Although\nsimilar concepts are implicitly used in previous work, there is a lack of clear\nand quantifiable algorithmic definitions for cross-layer scheduling for tiled\nCIM architectures. To close this gap, we present CLSA-CIM, a cross-layer\nscheduling algorithm for tiled CIM architectures. We integrate CLSA-CIM with\nexisting weight-mapping strategies and compare performance against\nstate-of-the-art (SOTA) scheduling algorithms. 
CLSA-CIM improves the\nutilization by up to 17.9x, resulting in an overall speedup increase of up to\n29.2x compared to SOTA.\n","authors":["Rebecca Pelke","Jose Cubero-Cascante","Nils Bosbach","Felix Staudigl","Rainer Leupers","Jan Moritz Joseph"],"pdf_url":"https://arxiv.org/pdf/2401.07671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08197v2","updated":"2024-01-17T13:42:40Z","published":"2024-01-16T08:25:29Z","title":"Matrix Completion with Hypergraphs: Sharp Thresholds and Efficient\n Algorithms","summary":" This paper considers the problem of completing a rating matrix based on\nsub-sampled matrix entries as well as observed social graphs and hypergraphs.\nWe show that there exists a \\emph{sharp threshold} on the sample probability\nfor the task of exactly completing the rating matrix -- the task is achievable\nwhen the sample probability is above the threshold, and is impossible otherwise\n-- demonstrating a phase transition phenomenon. The threshold can be expressed\nas a function of the ``quality'' of hypergraphs, enabling us to \\emph{quantify}\nthe amount of reduction in sample probability due to the exploitation of\nhypergraphs. This also highlights the usefulness of hypergraphs in the matrix\ncompletion problem. En route to discovering the sharp threshold, we develop a\ncomputationally efficient matrix completion algorithm that effectively exploits\nthe observed graphs and hypergraphs. Theoretical analyses show that our\nalgorithm succeeds with high probability as long as the sample probability\nexceeds the aforementioned threshold, and this theoretical result is further\nvalidated by synthetic experiments. Moreover, our experiments on a real social\nnetwork dataset (with both graphs and hypergraphs) show that our algorithm\noutperforms other state-of-the-art matrix completion algorithms.\n","authors":["Zhongtian Ma","Qiaosheng Zhang","Zhen Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08197v2.pdf","comment":"Submitted to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2401.08268v2","updated":"2024-01-17T13:28:04Z","published":"2024-01-16T10:41:33Z","title":"An Explainable Proxy Model for Multilabel Audio Segmentation","summary":" Audio signal segmentation is a key task for automatic audio indexing. It\nconsists of detecting the boundaries of class-homogeneous segments in the\nsignal. In many applications, explainable AI is a vital process for\ntransparency of decision-making with machine learning. In this paper, we\npropose an explainable multilabel segmentation model that solves speech\nactivity (SAD), music (MD), noise (ND), and overlapped speech detection (OSD)\nsimultaneously. 
This proxy uses the non-negative matrix factorization (NMF) to\nmap the embedding used for the segmentation to the frequency domain.\nExperiments conducted on two datasets show similar performances as the\npre-trained black box model while showing strong explainability features.\nSpecifically, the frequency bins used for the decision can be easily identified\nat both the segment level (local explanations) and global level (class\nprototypes).\n","authors":["Théo Mariotte","Antonio Almudévar","Marie Tahon","Alfonso Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.08268v2.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.09200v1","updated":"2024-01-17T13:25:32Z","published":"2024-01-17T13:25:32Z","title":"A Real-Time Lyrics Alignment System Using Chroma And Phonetic Features\n For Classical Vocal Performance","summary":" The goal of real-time lyrics alignment is to take live singing audio as input\nand to pinpoint the exact position within given lyrics on the fly. The task can\nbenefit real-world applications such as the automatic subtitling of live\nconcerts or operas. However, designing a real-time model poses a great\nchallenge due to the constraints of only using past input and operating within\na minimal latency. Furthermore, due to the lack of datasets for real-time\nmodels for lyrics alignment, previous studies have mostly evaluated with\nprivate in-house datasets, resulting in a lack of standard evaluation methods.\nThis paper presents a real-time lyrics alignment system for classical vocal\nperformances with two contributions. First, we improve the lyrics alignment\nalgorithm by finding an optimal combination of chromagram and phonetic\nposteriorgram (PPG) that capture melodic and phonetics features of the singing\nvoice, respectively. Second, we recast the Schubert Winterreise Dataset (SWD)\nwhich contains multiple performance renditions of the same pieces as an\nevaluation set for the real-time lyrics alignment.\n","authors":["Jiyun Park","Sangeon Yong","Taegyun Kwon","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2401.09200v1.pdf","comment":"To Appear IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.09198v1","updated":"2024-01-17T13:24:04Z","published":"2024-01-17T13:24:04Z","title":"Space and Time Continuous Physics Simulation From Partial Observations","summary":" Modern techniques for physical simulations rely on numerical schemes and\nmesh-refinement methods to address trade-offs between precision and complexity,\nbut these handcrafted solutions are tedious and require high computational\npower. Data-driven methods based on large-scale machine learning promise high\nadaptivity by integrating long-range dependencies more directly and\nefficiently. In this work, we focus on fluid dynamics and address the\nshortcomings of a large part of the literature, which are based on fixed\nsupport for computations and predictions in the form of regular or irregular\ngrids. We propose a novel setup to perform predictions in a continuous spatial\nand temporal domain while being trained on sparse observations. We formulate\nthe task as a double observation problem and propose a solution with two\ninterlinked dynamical systems defined on, respectively, the sparse positions\nand the continuous domain, which allows to forecast and interpolate a solution\nfrom the initial condition. Our practical implementation involves recurrent\nGNNs and a spatio-temporal attention observer capable of interpolating the\nsolution at arbitrary locations. 
Our model not only generalizes to new initial\nconditions (as standard auto-regressive models do) but also performs evaluation\nat arbitrary space and time locations. We evaluate on three standard datasets\nin fluid dynamics and compare to strong baselines, which are outperformed both\nin classical settings and in the extended new task requiring continuous\npredictions.\n","authors":["Janny Steeven","Nadri Madiha","Digne Julie","Wolf Christian"],"pdf_url":"https://arxiv.org/pdf/2401.09198v1.pdf","comment":"Project Page: https://continuous-pde.github.io/"},{"id":"http://arxiv.org/abs/2309.16746v2","updated":"2024-01-17T13:20:12Z","published":"2023-09-28T16:02:39Z","title":"Implicit Gaussian process representation of vector fields over arbitrary\n latent manifolds","summary":" Gaussian processes (GPs) are popular nonparametric statistical models for\nlearning unknown functions and quantifying the spatiotemporal uncertainty in\ndata. Recent works have extended GPs to model scalar and vector quantities\ndistributed over non-Euclidean domains, including smooth manifolds appearing in\nnumerous fields such as computer vision, dynamical systems, and neuroscience.\nHowever, these approaches assume that the manifold underlying the data is\nknown, limiting their practical utility. We introduce RVGP, a generalisation of\nGPs for learning vector signals over latent Riemannian manifolds. Our method\nuses positional encoding with eigenfunctions of the connection Laplacian,\nassociated with the tangent bundle, readily derived from common graph-based\napproximation of data. We demonstrate that RVGP possesses global regularity\nover the manifold, which allows it to super-resolve and inpaint vector fields\nwhile preserving singularities. Furthermore, we use RVGP to reconstruct\nhigh-density neural dynamics derived from low-density EEG recordings in healthy\nindividuals and Alzheimer's patients. We show that vector field singularities\nare important disease markers and that their reconstruction leads to a\ncomparable classification accuracy of disease states to high-density\nrecordings. Thus, our method overcomes a significant practical limitation in\nexperimental and clinical applications.\n","authors":["Robert L. Peach","Matteo Vinao-Carl","Nir Grossman","Michael David","Emma Mallas","David Sharp","Paresh A. Malhotra","Pierre Vandergheynst","Adam Gosztolai"],"pdf_url":"https://arxiv.org/pdf/2309.16746v2.pdf","comment":"ICLR 2024 conference paper. Associated code:\n https://github.com/agosztolai/RVGP"},{"id":"http://arxiv.org/abs/2309.07690v2","updated":"2024-01-17T13:13:37Z","published":"2023-09-14T13:07:36Z","title":"A DenseNet-based method for decoding auditory spatial attention with EEG","summary":" Auditory spatial attention detection (ASAD) aims to decode the attended\nspatial location with EEG in a multiple-speaker setting. ASAD methods are\ninspired by the brain lateralization of cortical neural responses during the\nprocessing of auditory spatial attention, and show promising performance for\nthe task of auditory attention decoding (AAD) with neural recordings. In the\nprevious ASAD methods, the spatial distribution of EEG electrodes is not fully\nexploited, which may limit the performance of these methods. In the present\nwork, by transforming the original EEG channels into a two-dimensional (2D)\nspatial topological map, the EEG data is transformed into a three-dimensional\n(3D) arrangement containing spatial-temporal information. 
And then a 3D deep\nconvolutional neural network (DenseNet-3D) is used to extract temporal and\nspatial features of the neural representation for the attended locations. The\nresults show that the proposed method achieves higher decoding accuracy than\nthe state-of-the-art (SOTA) method (94.3% compared to XANet's 90.6%) with\n1-second decision window for the widely used KULeuven (KUL) dataset, and the\ncode to implement our work is available on Github:\n https://github.com/xuxiran/ASAD_DenseNet\n","authors":["Xiran Xu","Bo Wang","Yujie Yan","Xihong Wu","Jing Chen"],"pdf_url":"https://arxiv.org/pdf/2309.07690v2.pdf","comment":"5 pages, 3 figures, has been accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.07595v2","updated":"2024-01-17T13:08:36Z","published":"2024-01-15T11:04:47Z","title":"E3x: $\\mathrm{E}(3)$-Equivariant Deep Learning Made Easy","summary":" This work introduces E3x, a software package for building neural networks\nthat are equivariant with respect to the Euclidean group $\\mathrm{E}(3)$,\nconsisting of translations, rotations, and reflections of three-dimensional\nspace. Compared to ordinary neural networks, $\\mathrm{E}(3)$-equivariant models\npromise benefits whenever input and/or output data are quantities associated\nwith three-dimensional objects. This is because the numeric values of such\nquantities (e.g. positions) typically depend on the chosen coordinate system.\nUnder transformations of the reference frame, the values change predictably,\nbut the underlying rules can be difficult to learn for ordinary machine\nlearning models. With built-in $\\mathrm{E}(3)$-equivariance, neural networks\nare guaranteed to satisfy the relevant transformation rules exactly, resulting\nin superior data efficiency and accuracy. The code for E3x is available from\nhttps://github.com/google-research/e3x, detailed documentation and usage\nexamples can be found on https://e3x.readthedocs.io.\n","authors":["Oliver T. Unke","Hartmut Maennel"],"pdf_url":"https://arxiv.org/pdf/2401.07595v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09193v1","updated":"2024-01-17T13:04:23Z","published":"2024-01-17T13:04:23Z","title":"GNN-LoFI: a Novel Graph Neural Network through Localized Feature-based\n Histogram Intersection","summary":" Graph neural networks are increasingly becoming the framework of choice for\ngraph-based machine learning. In this paper, we propose a new graph neural\nnetwork architecture that substitutes classical message passing with an\nanalysis of the local distribution of node features. To this end, we extract\nthe distribution of features in the egonet for each local neighbourhood and\ncompare them against a set of learned label distributions by taking the\nhistogram intersection kernel. The similarity information is then propagated to\nother nodes in the network, effectively creating a message passing-like\nmechanism where the message is determined by the ensemble of the features. We\nperform an ablation study to evaluate the network's performance under different\nchoices of its hyper-parameters. 
Finally, we test our model on standard graph\nclassification and regression benchmarks, and we find that it outperforms\nwidely used alternative approaches, including both graph kernels and graph\nneural networks.\n","authors":["Alessandro Bicciato","Luca Cosmo","Giorgia Minello","Luca Rossi","Andrea Torsello"],"pdf_url":"https://arxiv.org/pdf/2401.09193v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09192v1","updated":"2024-01-17T13:04:14Z","published":"2024-01-17T13:04:14Z","title":"Preparing Lessons for Progressive Training on Language Models","summary":" The rapid progress of Transformers in artificial intelligence has come at the\ncost of increased resource consumption and greenhouse gas emissions due to\ngrowing model sizes. Prior work suggests using pretrained small models to\nimprove training efficiency, but this approach may not be suitable for new\nmodel structures. On the other hand, training from scratch can be slow, and\nprogressively stacking layers often fails to achieve significant acceleration.\nTo address these challenges, we propose a novel method called Apollo, which\nprep\\textbf{a}res lessons for ex\\textbf{p}anding \\textbf{o}perations by\n\\textbf{l}earning high-\\textbf{l}ayer functi\\textbf{o}nality during training of\nlow layers. Our approach involves low-value-prioritized sampling (LVPS) to\ntrain different depths and weight sharing to facilitate efficient expansion. We\nalso introduce an interpolation method for stable model depth extension.\nExperiments demonstrate that Apollo achieves state-of-the-art acceleration\nratios, even rivaling methods using pretrained models, making it a universal\nand efficient solution for training deep models while reducing time, financial,\nand environmental costs.\n","authors":["Yu Pan","Ye Yuan","Yichun Yin","Jiaxin Shi","Zenglin Xu","Ming Zhang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09191v1","updated":"2024-01-17T13:03:47Z","published":"2024-01-17T13:03:47Z","title":"An Optimal Transport Approach for Computing Adversarial Training Lower\n Bounds in Multiclass Classification","summary":" Despite the success of deep learning-based algorithms, it is widely known\nthat neural networks may fail to be robust. A popular paradigm to enforce\nrobustness is adversarial training (AT), however, this introduces many\ncomputational and theoretical difficulties. Recent works have developed a\nconnection between AT in the multiclass classification setting and\nmultimarginal optimal transport (MOT), unlocking a new set of tools to study\nthis problem. In this paper, we leverage the MOT connection to propose\ncomputationally tractable numerical algorithms for computing universal lower\nbounds on the optimal adversarial risk and identifying optimal classifiers. We\npropose two main algorithms based on linear programming (LP) and entropic\nregularization (Sinkhorn). Our key insight is that one can harmlessly truncate\nthe higher order interactions between classes, preventing the combinatorial run\ntimes typically encountered in MOT problems. 
We validate these results with\nexperiments on MNIST and CIFAR-$10$, which demonstrate the tractability of our\napproach.\n","authors":["Nicolas Garcia Trillos","Matt Jacobs","Jakwang Kim","Matthew Werenski"],"pdf_url":"https://arxiv.org/pdf/2401.09191v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09190v1","updated":"2024-01-17T13:00:57Z","published":"2024-01-17T13:00:57Z","title":"Exploring the Role of Convolutional Neural Networks (CNN) in Dental\n Radiography Segmentation: A Comprehensive Systematic Literature Review","summary":" In the field of dentistry, there is a growing demand for increased precision\nin diagnostic tools, with a specific focus on advanced imaging techniques such\nas computed tomography, cone beam computed tomography, magnetic resonance\nimaging, ultrasound, and traditional intra-oral periapical X-rays. Deep\nlearning has emerged as a pivotal tool in this context, enabling the\nimplementation of automated segmentation techniques crucial for extracting\nessential diagnostic data. This integration of cutting-edge technology\naddresses the urgent need for effective management of dental conditions, which,\nif left undetected, can have a significant impact on human health. The\nimpressive track record of deep learning across various domains, including\ndentistry, underscores its potential to revolutionize early detection and\ntreatment of oral health issues. Objective: Having demonstrated significant\nresults in diagnosis and prediction, deep convolutional neural networks (CNNs)\nrepresent an emerging field of multidisciplinary research. The goals of this\nstudy were to provide a concise overview of the state of the art, standardize\nthe current debate, and establish baselines for future research. Method: In\nthis study, a systematic literature review is employed as a methodology to\nidentify and select relevant studies that specifically investigate the deep\nlearning technique for dental imaging analysis. This study elucidates the\nmethodological approach, including the systematic collection of data,\nstatistical analysis, and subsequent dissemination of outcomes. Conclusion:\nThis work demonstrates how Convolutional Neural Networks (CNNs) can be employed\nto analyze images, serving as effective tools for detecting dental pathologies.\nAlthough this research acknowledged some limitations, CNNs utilized for\nsegmenting and categorizing teeth exhibited their highest level of performance\noverall.\n","authors":["Walid Brahmi","Imen Jdey","Fadoua Drira"],"pdf_url":"https://arxiv.org/pdf/2401.09190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09184v1","updated":"2024-01-17T12:50:50Z","published":"2024-01-17T12:50:50Z","title":"A Two-Scale Complexity Measure for Deep Learning Models","summary":" We introduce a novel capacity measure 2sED for statistical models based on\nthe effective dimension. The new quantity provably bounds the generalization\nerror under mild assumptions on the model. Furthermore, simulations on standard\ndata sets and popular model architectures show that 2sED correlates well with\nthe training error. 
For Markovian models, we show how to efficiently\napproximate 2sED from below through a layerwise iterative approach, which\nallows us to tackle deep learning models with a large number of parameters.\nSimulation results suggest that the approximation is good for different\nprominent models and data sets.\n","authors":["Massimiliano Datres","Gian Paolo Leonardi","Alessio Figalli","David Sutter"],"pdf_url":"https://arxiv.org/pdf/2401.09184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12162v3","updated":"2024-01-17T12:48:38Z","published":"2023-05-20T10:42:00Z","title":"A Scalable Neural Network for DSIC Affine Maximizer Auction Design","summary":" Automated auction design aims to find empirically high-revenue mechanisms\nthrough machine learning. Existing works on multi item auction scenarios can be\nroughly divided into RegretNet-like and affine maximizer auctions (AMAs)\napproaches. However, the former cannot strictly ensure dominant strategy\nincentive compatibility (DSIC), while the latter faces scalability issue due to\nthe large number of allocation candidates. To address these limitations, we\npropose AMenuNet, a scalable neural network that constructs the AMA parameters\n(even including the allocation menu) from bidder and item representations.\nAMenuNet is always DSIC and individually rational (IR) due to the properties of\nAMAs, and it enhances scalability by generating candidate allocations through a\nneural network. Additionally, AMenuNet is permutation equivariant, and its\nnumber of parameters is independent of auction scale. We conduct extensive\nexperiments to demonstrate that AMenuNet outperforms strong baselines in both\ncontextual and non-contextual multi-item auctions, scales well to larger\nauctions, generalizes well to different settings, and identifies useful\ndeterministic allocations. Overall, our proposed approach offers an effective\nsolution to automated DSIC auction design, with improved scalability and strong\nrevenue performance in various settings.\n","authors":["Zhijian Duan","Haoran Sun","Yurong Chen","Xiaotie Deng"],"pdf_url":"https://arxiv.org/pdf/2305.12162v3.pdf","comment":"NeurIPS 2023 (spotlight)"},{"id":"http://arxiv.org/abs/2401.09181v1","updated":"2024-01-17T12:44:17Z","published":"2024-01-17T12:44:17Z","title":"Beyond Anti-Forgetting: Multimodal Continual Instruction Tuning with\n Positive Forward Transfer","summary":" Multimodal Continual Instruction Tuning (MCIT) enables Multimodal Large\nLanguage Models (MLLMs) to meet continuously emerging requirements without\nexpensive retraining. MCIT faces two major obstacles: catastrophic forgetting\n(where old knowledge is forgotten) and negative forward transfer (where the\nperformance of future tasks is degraded). Although existing methods have\ngreatly alleviated catastrophic forgetting, they still suffer from negative\nforward transfer. By performing singular value decomposition (SVD) on input\nembeddings, we discover a large discrepancy in different input embeddings. The\ndiscrepancy results in the model learning irrelevant information for old and\npre-trained tasks, which leads to catastrophic forgetting and negative forward\ntransfer. To address these issues, we propose Fwd-Prompt, a prompt-based method\nprojecting prompt gradient to the residual space to minimize the interference\nbetween tasks and to the pre-trained subspace for reusing pre-trained\nknowledge. 
Our experiments demonstrate that Fwd-Prompt achieves\nstate-of-the-art performance while updating fewer parameters and requiring no\nold samples. Our research sheds light on the potential of continuously adapting\nMLLMs to new tasks under the instruction tuning paradigm and encourages future\nstudies to explore MCIT. The code will soon be publicly available.\n","authors":["Junhao Zheng","Qianli Ma","Zhen Liu","Binquan Wu","Huawen Feng"],"pdf_url":"https://arxiv.org/pdf/2401.09181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09180v1","updated":"2024-01-17T12:43:28Z","published":"2024-01-17T12:43:28Z","title":"Unsupervised Multiple Domain Translation through Controlled\n Disentanglement in Variational Autoencoder","summary":" Unsupervised Multiple Domain Translation is the task of transforming data\nfrom one domain to other domains without having paired data to train the\nsystems. Typically, methods based on Generative Adversarial Networks (GANs) are\nused to address this task. However, our proposal exclusively relies on a\nmodified version of a Variational Autoencoder. This modification consists of\nthe use of two latent variables disentangled in a controlled way by design. One\nof this latent variables is imposed to depend exclusively on the domain, while\nthe other one must depend on the rest of the variability factors of the data.\nAdditionally, the conditions imposed over the domain latent variable allow for\nbetter control and understanding of the latent space. We empirically\ndemonstrate that our approach works on different vision datasets improving the\nperformance of other well known methods. Finally, we prove that, indeed, one of\nthe latent variables stores all the information related to the domain and the\nother one hardly contains any domain information.\n","authors":["Almudévar Antonio","Mariotte Théo","Ortega Alfonso","Tahon Marie"],"pdf_url":"https://arxiv.org/pdf/2401.09180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08056v3","updated":"2024-01-17T12:41:45Z","published":"2023-10-12T06:09:26Z","title":"Learning from Label Proportions: Bootstrapping Supervised Learners via\n Belief Propagation","summary":" Learning from Label Proportions (LLP) is a learning problem where only\naggregate level labels are available for groups of instances, called bags,\nduring training, and the aim is to get the best performance at the\ninstance-level on the test data. This setting arises in domains like\nadvertising and medicine due to privacy considerations. We propose a novel\nalgorithmic framework for this problem that iteratively performs two main\nsteps. For the first step (Pseudo Labeling) in every iteration, we define a\nGibbs distribution over binary instance labels that incorporates a) covariate\ninformation through the constraint that instances with similar covariates\nshould have similar labels and b) the bag level aggregated label. We then use\nBelief Propagation (BP) to marginalize the Gibbs distribution to obtain pseudo\nlabels. In the second step (Embedding Refinement), we use the pseudo labels to\nprovide supervision for a learner that yields a better embedding. Further, we\niterate on the two steps again by using the second step's embeddings as new\ncovariates for the next iteration. In the final iteration, a classifier is\ntrained using the pseudo labels. Our algorithm displays strong gains against\nseveral SOTA baselines (up to 15%) for the LLP Binary Classification problem on\nvarious dataset types - tabular and Image. 
We achieve these improvements with\nminimal computational overhead above standard supervised learning due to Belief\nPropagation, for large bag sizes, even for a million samples.\n","authors":["Shreyas Havaldar","Navodita Sharma","Shubhi Sareen","Karthikeyan Shanmugam","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2310.08056v3.pdf","comment":"Accepted at The Twelfth International Conference on Learning\n Representations (ICLR 2024) & Oral Presentation at Regulatable ML @ NeurIPS\n 2023"},{"id":"http://arxiv.org/abs/2401.09176v1","updated":"2024-01-17T12:34:17Z","published":"2024-01-17T12:34:17Z","title":"ADCNet: a unified framework for predicting the activity of antibody-drug\n conjugates","summary":" Antibody-drug conjugate (ADC) has revolutionized the field of cancer\ntreatment in the era of precision medicine due to their ability to precisely\ntarget cancer cells and release highly effective drug. Nevertheless, the\nrealization of rational design of ADC is very difficult because the\nrelationship between their structures and activities is difficult to\nunderstand. In the present study, we introduce a unified deep learning\nframework called ADCNet to help design potential ADCs. The ADCNet highly\nintegrates the protein representation learning language model ESM-2 and\nsmall-molecule representation learning language model FG-BERT models to achieve\nactivity prediction through learning meaningful features from antigen and\nantibody protein sequences of ADC, SMILES strings of linker and payload, and\ndrug-antibody ratio (DAR) value. Based on a carefully designed and manually\ntailored ADC data set, extensive evaluation results reveal that ADCNet performs\nbest on the test set compared to baseline machine learning models across all\nevaluation metrics. For example, it achieves an average prediction accuracy of\n87.12%, a balanced accuracy of 0.8689, and an area under receiver operating\ncharacteristic curve of 0.9293 on the test set. In addition, cross-validation,\nablation experiments, and external independent testing results further prove\nthe stability, advancement, and robustness of the ADCNet architecture. For the\nconvenience of the community, we develop the first online platform\n(https://ADCNet.idruglab.cn) for the prediction of ADCs activity based on the\noptimal ADCNet model, and the source code is publicly available at\nhttps://github.com/idrugLab/ADCNet.\n","authors":["Liye Chen","Biaoshun Li","Yihao Chen","Mujie Lin","Shipeng Zhang","Chenxin Li","Yu Pang","Ling Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08971v2","updated":"2024-01-17T11:35:33Z","published":"2023-09-16T12:11:11Z","title":"Regularized Contrastive Pre-training for Few-shot Bioacoustic Sound\n Detection","summary":" Bioacoustic sound event detection allows for better understanding of animal\nbehavior and for better monitoring biodiversity using audio. Deep learning\nsystems can help achieve this goal, however it is difficult to acquire\nsufficient annotated data to train these systems from scratch. To address this\nlimitation, the Detection and Classification of Acoustic Scenes and Events\n(DCASE) community has recasted the problem within the framework of few-shot\nlearning and organize an annual challenge for learning to detect animal sounds\nfrom only five annotated examples. 
In this work, we regularize supervised\ncontrastive pre-training to learn features that can transfer well on new target\ntasks with animal sounds unseen during training, achieving a high F-score of\n61.52%(0.48) when no feature adaptation is applied, and an F-score of\n68.19%(0.75) when we further adapt the learned features for each new target\ntask. This work aims to lower the entry bar to few-shot bioacoustic sound event\ndetection by proposing a simple and yet effective framework for this task, by\nalso providing open-source code.\n","authors":["Ilyass Moummad","Romain Serizel","Nicolas Farrugia"],"pdf_url":"https://arxiv.org/pdf/2309.08971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09135v1","updated":"2024-01-17T11:17:04Z","published":"2024-01-17T11:17:04Z","title":"Asynchronous Local-SGD Training for Language Modeling","summary":" Local stochastic gradient descent (Local-SGD), also referred to as federated\naveraging, is an approach to distributed optimization where each device\nperforms more than one SGD update per communication. This work presents an\nempirical study of {\\it asynchronous} Local-SGD for training language models;\nthat is, each worker updates the global parameters as soon as it has finished\nits SGD steps. We conduct a comprehensive investigation by examining how worker\nhardware heterogeneity, model size, number of workers, and optimizer could\nimpact the learning performance. We find that with naive implementations,\nasynchronous Local-SGD takes more iterations to converge than its synchronous\ncounterpart despite updating the (global) model parameters more frequently. We\nidentify momentum acceleration on the global parameters when worker gradients\nare stale as a key challenge. We propose a novel method that utilizes a delayed\nNesterov momentum update and adjusts the workers' local training steps based on\ntheir computation speed. This approach, evaluated with models up to 150M\nparameters on the C4 dataset, matches the performance of synchronous Local-SGD\nin terms of perplexity per update step, and significantly surpasses it in terms\nof wall clock time.\n","authors":["Bo Liu","Rachita Chhaparia","Arthur Douillard","Satyen Kale","Andrei A. Rusu","Jiajun Shen","Arthur Szlam","Marc'Aurelio Ranzato"],"pdf_url":"https://arxiv.org/pdf/2401.09135v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09125v1","updated":"2024-01-17T11:01:28Z","published":"2024-01-17T11:01:28Z","title":"Understanding Heterophily for Graph Neural Networks","summary":" Graphs with heterophily have been regarded as challenging scenarios for Graph\nNeural Networks (GNNs), where nodes are connected with dissimilar neighbors\nthrough various patterns. In this paper, we present theoretical understandings\nof the impacts of different heterophily patterns for GNNs by incorporating the\ngraph convolution (GC) operations into fully connected networks via the\nproposed Heterophilous Stochastic Block Models (HSBM), a general random graph\nmodel that can accommodate diverse heterophily patterns. Firstly, we show that\nby applying a GC operation, the separability gains are determined by two\nfactors, i.e., the Euclidean distance of the neighborhood distributions and\n$\\sqrt{\\mathbb{E}\\left[\\operatorname{deg}\\right]}$, where\n$\\mathbb{E}\\left[\\operatorname{deg}\\right]$ is the averaged node degree. It\nreveals that the impact of heterophily on classification needs to be evaluated\nalongside the averaged node degree. 
Secondly, we show that the topological\nnoise has a detrimental impact on separability, which is equivalent to\ndegrading $\\mathbb{E}\\left[\\operatorname{deg}\\right]$. Finally, when applying\nmultiple GC operations, we show that the separability gains are determined by\nthe normalized distance of the $l$-powered neighborhood distributions. It\nindicates that the nodes still possess separability as $l$ goes to infinity in\na wide range of regimes. Extensive experiments on both synthetic and real-world\ndata verify the effectiveness of our theory.\n","authors":["Junfu Wang","Yuanfang Guo","Liang Yang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.13247v2","updated":"2024-01-17T10:36:43Z","published":"2023-01-30T19:22:46Z","title":"Online Loss Function Learning","summary":" Loss function learning is a new meta-learning paradigm that aims to automate\nthe essential task of designing a loss function for a machine learning model.\nExisting techniques for loss function learning have shown promising results,\noften improving a model's training dynamics and final inference performance.\nHowever, a significant limitation of these techniques is that the loss\nfunctions are meta-learned in an offline fashion, where the meta-objective only\nconsiders the very first few steps of training, which is a significantly\nshorter time horizon than the one typically used for training deep neural\nnetworks. This causes significant bias towards loss functions that perform well\nat the very start of training but perform poorly at the end of training. To\naddress this issue we propose a new loss function learning technique for\nadaptively updating the loss function online after each update to the base\nmodel parameters. The experimental results show that our proposed method\nconsistently outperforms the cross-entropy loss and offline loss function\nlearning techniques on a diverse range of neural network architectures and\ndatasets.\n","authors":["Christian Raymond","Qi Chen","Bing Xue","Mengjie Zhang"],"pdf_url":"https://arxiv.org/pdf/2301.13247v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09084v2","updated":"2024-01-17T10:26:04Z","published":"2023-12-14T16:16:35Z","title":"Language Modeling on a SpiNNaker 2 Neuromorphic Chip","summary":" As large language models continue to scale in size rapidly, so too does the\ncomputational power required to run them. Event-based networks on neuromorphic\ndevices offer a potential way to reduce energy consumption for inference\nsignificantly. However, to date, most event-based networks that can run on\nneuromorphic hardware, including spiking neural networks (SNNs), have not\nachieved task performance even on par with LSTM models for language modeling.\nAs a result, language modeling on neuromorphic devices has seemed a distant\nprospect. In this work, we demonstrate the first-ever implementation of a\nlanguage model on a neuromorphic device - specifically the SpiNNaker 2 chip -\nbased on a recently published event-based architecture called the EGRU.\nSpiNNaker 2 is a many-core neuromorphic chip designed for large-scale\nasynchronous processing, while the EGRU is architected to leverage such\nhardware efficiently while maintaining competitive task performance. This\nimplementation marks the first time a neuromorphic language model matches\nLSTMs, setting the stage for taking task performance to the level of large\nlanguage models. 
We also demonstrate results on a gesture recognition task\nbased on inputs from a DVS camera. Overall, our results showcase the\nfeasibility of this neuro-inspired neural network in hardware, highlighting\nsignificant gains versus conventional hardware in energy efficiency for the\ncommon use case of single batch inference.\n","authors":["Khaleelulla Khan Nazeer","Mark Schöne","Rishav Mukherji","Bernhard Vogginger","Christian Mayr","David Kappel","Anand Subramoney"],"pdf_url":"https://arxiv.org/pdf/2312.09084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09093v1","updated":"2024-01-17T09:56:10Z","published":"2024-01-17T09:56:10Z","title":"RWKV-TS: Beyond Traditional Recurrent Neural Network for Time Series\n Tasks","summary":" Traditional Recurrent Neural Network (RNN) architectures, such as LSTM and\nGRU, have historically held prominence in time series tasks. However, they have\nrecently seen a decline in their dominant position across various time series\ntasks. As a result, recent advancements in time series forecasting have seen a\nnotable shift away from RNNs towards alternative architectures such as\nTransformers, MLPs, and CNNs. To go beyond the limitations of traditional RNNs,\nwe design an efficient RNN-based model for time series tasks, named RWKV-TS,\nwith three distinctive features: (i) A novel RNN architecture characterized by\n$O(L)$ time complexity and memory usage. (ii) An enhanced ability to capture\nlong-term sequence information compared to traditional RNNs. (iii) High\ncomputational efficiency coupled with the capacity to scale up effectively.\nThrough extensive experimentation, our proposed RWKV-TS model demonstrates\ncompetitive performance when compared to state-of-the-art Transformer-based or\nCNN-based models. Notably, RWKV-TS exhibits not only comparable performance but\nalso demonstrates reduced latency and memory utilization. The success of\nRWKV-TS encourages further exploration and innovation in leveraging RNN-based\napproaches within the domain of Time Series. The combination of competitive\nperformance, low latency, and efficient memory usage positions RWKV-TS as a\npromising avenue for future research in time series tasks. Code is available\nat:\\href{https://github.com/howard-hou/RWKV-TS}{\nhttps://github.com/howard-hou/RWKV-TS}\n","authors":["Haowen Hou","F. Richard Yu"],"pdf_url":"https://arxiv.org/pdf/2401.09093v1.pdf","comment":"13 pages. 2 figures, 14 tables"},{"id":"http://arxiv.org/abs/2312.04118v2","updated":"2024-01-17T09:43:14Z","published":"2023-12-07T08:18:40Z","title":"Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic\n Play","summary":" Infants' ability to recognize and categorize objects develops gradually. The\nsecond year of life is marked by both the emergence of more semantic visual\nrepresentations and a better understanding of word meaning. This suggests that\nlanguage input may play an important role in shaping visual representations.\nHowever, even in suitable contexts for word learning like dyadic play sessions,\ncaregivers utterances are sparse and ambiguous, often referring to objects that\nare different from the one to which the child attends. Here, we systematically\ninvestigate to what extent caregivers' utterances can nevertheless enhance\nvisual representations. For this we propose a computational model of visual\nrepresentation learning during dyadic play. 
We introduce a synthetic dataset of\nego-centric images perceived by a toddler-agent that moves and rotates toy\nobjects in different parts of its home environment while hearing caregivers'\nutterances, modeled as captions. We propose to model toddlers' learning as\nsimultaneously aligning representations for 1) close-in-time images and 2)\nco-occurring images and utterances. We show that utterances with statistics\nmatching those of real caregivers give rise to representations supporting\nimproved category recognition. Our analysis reveals that a small\ndecrease/increase in object-relevant naming frequencies can drastically impact\nthe learned representations. This affects the attention on object names within\nan utterance, which is required for efficient visuo-linguistic alignment.\nOverall, our results support the hypothesis that caregivers' naming utterances\ncan improve toddlers' visual representations.\n","authors":["Timothy Schaumlöffel","Arthur Aubret","Gemma Roig","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2312.04118v2.pdf","comment":"Proceedings of the 2023 IEEE International Conference on Development\n and Learning (ICDL)"},{"id":"http://arxiv.org/abs/2204.02779v4","updated":"2024-01-17T09:38:09Z","published":"2022-04-05T15:02:18Z","title":"A Dempster-Shafer approach to trustworthy AI with application to fetal\n brain MRI segmentation","summary":" Deep learning models for medical image segmentation can fail unexpectedly and\nspectacularly for pathological cases and images acquired at different centers\nthan training images, with labeling errors that violate expert knowledge. Such\nerrors undermine the trustworthiness of deep learning models for medical image\nsegmentation. Mechanisms for detecting and correcting such failures are\nessential for safely translating this technology into clinics and are likely to\nbe a requirement of future regulations on artificial intelligence (AI). In this\nwork, we propose a trustworthy AI theoretical framework and a practical system\nthat can augment any backbone AI system using a fallback method and a fail-safe\nmechanism based on Dempster-Shafer theory. Our approach relies on an actionable\ndefinition of trustworthy AI. Our method automatically discards the voxel-level\nlabeling predicted by the backbone AI that violate expert knowledge and relies\non a fallback for those voxels. We demonstrate the effectiveness of the\nproposed trustworthy AI approach on the largest reported annotated dataset of\nfetal MRI consisting of 540 manually annotated fetal brain 3D T2w MRIs from 13\ncenters. Our trustworthy AI method improves the robustness of a\nstate-of-the-art backbone AI for fetal brain MRIs acquired across various\ncenters and for fetuses with various brain abnormalities.\n","authors":["Lucas Fidon","Michael Aertsen","Florian Kofler","Andrea Bink","Anna L. David","Thomas Deprest","Doaa Emam","Frédéric Guffens","András Jakab","Gregor Kasprian","Patric Kienast","Andrew Melbourne","Bjoern Menze","Nada Mufti","Ivana Pogledic","Daniela Prayer","Marlene Stuempflen","Esther Van Elslander","Sébastien Ourselin","Jan Deprest","Tom Vercauteren"],"pdf_url":"https://arxiv.org/pdf/2204.02779v4.pdf","comment":"Published in IEEE TPAMI. 
Minor revision compared to the previous\n version"},{"id":"http://arxiv.org/abs/2401.09074v1","updated":"2024-01-17T09:23:59Z","published":"2024-01-17T09:23:59Z","title":"Code Simulation Challenges for Large Language Models","summary":" We investigate the extent to which Large Language Models (LLMs) can simulate\nthe execution of computer code and algorithms. We begin by looking at straight\nline programs, and show that current LLMs demonstrate poor performance even\nwith such simple programs -- performance rapidly degrades with the length of\ncode. We then investigate the ability of LLMs to simulate programs that contain\ncritical paths and redundant instructions. We also go beyond straight line\nprogram simulation with sorting algorithms and nested loops, and we show that the\ncomputational complexity of a routine directly affects the ability of an LLM to\nsimulate its execution. We observe that LLMs execute instructions sequentially\nand with a low error margin only for short programs or standard procedures.\nLLMs' code simulation is in tension with their pattern recognition and\nmemorisation capabilities: on tasks where memorisation is detrimental, we\npropose a novel prompting method to simulate code execution line by line.\nEmpirically, our new Chain of Simulation (CoSm) method improves on the standard\nChain of Thought prompting approach by avoiding the pitfalls of memorisation.\n","authors":["Emanuele La Malfa","Christoph Weinhuber","Orazio Torre","Fangru Lin","Anthony Cohn","Nigel Shadbolt","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2401.09074v1.pdf","comment":"main paper (10 pages) + Appendix (11 pages)"},{"id":"http://arxiv.org/abs/2401.09073v1","updated":"2024-01-17T09:23:25Z","published":"2024-01-17T09:23:25Z","title":"Fixed-Budget Differentially Private Best Arm Identification","summary":" We study best arm identification (BAI) in linear bandits in the fixed-budget\nregime under differential privacy constraints, when the arm rewards are\nsupported on the unit interval. Given a finite budget $T$ and a privacy\nparameter $\\varepsilon>0$, the goal is to minimise the error probability in\nfinding the arm with the largest mean after $T$ sampling rounds, subject to the\nconstraint that the policy of the decision maker satisfies a certain {\\em\n$\\varepsilon$-differential privacy} ($\\varepsilon$-DP) constraint. We construct\na policy satisfying the $\\varepsilon$-DP constraint (called {\\sc DP-BAI}) by\nproposing the principle of {\\em maximum absolute determinants}, and derive an\nupper bound on its error probability. Furthermore, we derive a minimax lower\nbound on the error probability, and demonstrate that the lower and the upper\nbounds decay exponentially in $T$, with exponents in the two bounds matching\norder-wise in (a) the sub-optimality gaps of the arms, (b) $\\varepsilon$, and\n(c) the problem complexity that is expressible as the sum of two terms, one\ncharacterising the complexity of standard fixed-budget BAI (without privacy\nconstraints), and the other accounting for the $\\varepsilon$-DP constraint.\nAdditionally, we present some auxiliary results that contribute to the\nderivation of the lower bound on the error probability. These results, we\nposit, may be of independent interest and could prove instrumental in proving\nlower bounds on error probabilities in several other bandit problems. 
Whereas\nprior works provide results for BAI in the fixed-budget regime without privacy\nconstraints or in the fixed-confidence regime with privacy constraints, our\nwork fills the gap in the literature by providing the results for BAI in the\nfixed-budget regime under the $\\varepsilon$-DP constraint.\n","authors":["Zhirui Chen","P. N. Karthik","Yeow Meng Chee","Vincent Y. F. Tan"],"pdf_url":"https://arxiv.org/pdf/2401.09073v1.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09071v1","updated":"2024-01-17T09:12:31Z","published":"2024-01-17T09:12:31Z","title":"Rethinking Spectral Graph Neural Networks with Spatially Adaptive\n Filtering","summary":" Whilst spectral Graph Neural Networks (GNNs) are theoretically well-founded\nin the spectral domain, their practical reliance on polynomial approximation\nimplies a profound linkage to the spatial domain. As previous studies rarely\nexamine spectral GNNs from the spatial perspective, their spatial-domain\ninterpretability remains elusive, e.g., what information is essentially encoded\nby spectral GNNs in the spatial domain? In this paper, to answer this question,\nwe establish a theoretical connection between spectral filtering and spatial\naggregation, unveiling an intrinsic interaction that spectral filtering\nimplicitly leads the original graph to an adapted new graph, explicitly\ncomputed for spatial aggregation. Both theoretical and empirical investigations\nreveal that the adapted new graph not only exhibits non-locality but also\naccommodates signed edge weights to reflect label consistency between nodes.\nThese findings thus highlight the interpretable role of spectral GNNs in the\nspatial domain and inspire us to rethink graph spectral filters beyond the\nfixed-order polynomials, which neglect global information. Built upon the\ntheoretical findings, we revisit the state-of-the-art spectral GNNs and propose\na novel Spatially Adaptive Filtering (SAF) framework, which leverages the\nadapted new graph by spectral filtering for an auxiliary non-local aggregation.\nNotably, our proposed SAF comprehensively models both node similarity and\ndissimilarity from a global perspective, therefore alleviating persistent\ndeficiencies of GNNs related to long-range dependencies and graph heterophily.\nExtensive experiments over 13 node classification benchmarks demonstrate the\nsuperiority of our proposed framework to the state-of-the-art models.\n","authors":["Jingwei Guo","Kaizhu Huang","Xinping Yi","Zixian Su","Rui Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09068v1","updated":"2024-01-17T09:01:50Z","published":"2024-01-17T09:01:50Z","title":"DTMM: Deploying TinyML Models on Extremely Weak IoT Devices with Pruning","summary":" DTMM is a library designed for efficient deployment and execution of machine\nlearning models on weak IoT devices such as microcontroller units (MCUs). The\nmotivation for designing DTMM comes from the emerging field of tiny machine\nlearning (TinyML), which explores extending the reach of machine learning to\nmany low-end IoT devices to achieve ubiquitous intelligence. Due to the weak\ncapability of embedded devices, it is necessary to compress models by pruning\nenough weights before deploying. 
Although pruning has been studied extensively\non many computing platforms, two key issues with pruning methods are\nexacerbated on MCUs: models need to be deeply compressed without significantly\ncompromising accuracy, and they should perform efficiently after pruning.\nCurrent solutions only achieve one of these objectives, but not both. In this\npaper, we find that pruned models have great potential for efficient deployment\nand execution on MCUs. Therefore, we propose DTMM with pruning unit selection,\npre-execution pruning optimizations, runtime acceleration, and post-execution\nlow-cost storage to fill the gap for efficient deployment and execution of\npruned models. It can be integrated into commercial ML frameworks for practical\ndeployment, and a prototype system has been developed. Extensive experiments on\nvarious models show promising gains compared to state-of-the-art methods.\n","authors":["Lixiang Han","Zhen Xiao","Zhenjiang Li"],"pdf_url":"https://arxiv.org/pdf/2401.09068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09067v1","updated":"2024-01-17T09:01:29Z","published":"2024-01-17T09:01:29Z","title":"Towards Continual Learning Desiderata via HSIC-Bottleneck\n Orthogonalization and Equiangular Embedding","summary":" Deep neural networks are susceptible to catastrophic forgetting when trained\non sequential tasks. Various continual learning (CL) methods often rely on\nexemplar buffers or/and network expansion for balancing model stability and\nplasticity, which, however, compromises their practical value due to privacy\nand memory concerns. Instead, this paper considers a strict yet realistic\nsetting, where the training data from previous tasks is unavailable and the\nmodel size remains relatively constant during sequential training. To achieve\nsuch desiderata, we propose a conceptually simple yet effective method that\nattributes forgetting to layer-wise parameter overwriting and the resulting\ndecision boundary distortion. This is achieved by the synergy between two key\ncomponents: HSIC-Bottleneck Orthogonalization (HBO) implements non-overwritten\nparameter updates mediated by Hilbert-Schmidt independence criterion in an\northogonal space and EquiAngular Embedding (EAE) enhances decision boundary\nadaptation between old and new tasks with predefined basis vectors. Extensive\nexperiments demonstrate that our method achieves competitive accuracy\nperformance, even with absolute superiority of zero exemplar buffer and 1.02x\nthe base model.\n","authors":["Depeng Li","Tianqi Wang","Junwei Chen","Qining Ren","Kenji Kawaguchi","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.09067v1.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2309.12697v2","updated":"2024-01-17T08:50:59Z","published":"2023-09-22T08:11:01Z","title":"Semantic similarity prediction is better than other semantic similarity\n measures","summary":" Semantic similarity between natural language texts is typically measured\neither by looking at the overlap between subsequences (e.g., BLEU) or by using\nembeddings (e.g., BERTScore, S-BERT). Within this paper, we argue that when we\nare only interested in measuring the semantic similarity, it is better to\ndirectly predict the similarity using a fine-tuned model for such a task. 
Using\na fine-tuned model for the Semantic Textual Similarity Benchmark tasks (STS-B)\nfrom the GLUE benchmark, we define the STSScore approach and show that the\nresulting similarity is better aligned with our expectations on a robust\nsemantic similarity measure than other approaches.\n","authors":["Steffen Herbold"],"pdf_url":"https://arxiv.org/pdf/2309.12697v2.pdf","comment":"Accepted at TMLR: https://openreview.net/forum?id=bfsNmgN5je"},{"id":"http://arxiv.org/abs/2303.03678v2","updated":"2024-01-17T08:49:31Z","published":"2023-03-07T06:34:04Z","title":"A Comparative Study of Deep Learning and Iterative Algorithms for Joint\n Channel Estimation and Signal Detection","summary":" Joint channel estimation and signal detection (JCESD) in wireless\ncommunication systems is a crucial and challenging task, especially since it\ninherently poses a nonlinear inverse problem. This challenge is further\nhighlighted in low signal-to-noise ratio (SNR) scenarios, where traditional\nalgorithms often perform poorly. Deep learning (DL) methods have been\ninvestigated, but concerns regarding computational expense and lack of\nvalidation in low-SNR settings remain. Hence, the development of a robust and\nlow-complexity model that can deliver excellent performance across a wide range\nof SNRs is highly desirable. In this paper, we aim to establish a benchmark\nwhere traditional algorithms and DL methods are validated on different channel\nmodels, Doppler, and SNR settings. In particular, we propose a new DL model\nwhere the backbone network is formed by unrolling the iterative algorithm, and\nthe hyperparameters are estimated by hypernetworks. Additionally, we adapt a\nlightweight DenseNet to the task of JCESD for comparison. We evaluate different\nmethods in three aspects: generalization in terms of bit error rate (BER),\nrobustness, and complexity. Our results indicate that DL approaches outperform\ntraditional algorithms in the challenging low-SNR setting, while the iterative\nalgorithm performs better in high-SNR settings. Furthermore, the iterative\nalgorithm is more robust in the presence of carrier frequency offset, whereas\nDL methods excel when signals are corrupted by asymmetric Gaussian noise.\n","authors":["Haocheng Ju","Haimiao Zhang","Lin Li","Xiao Li","Bin Dong"],"pdf_url":"https://arxiv.org/pdf/2303.03678v2.pdf","comment":"Code is available at https://github.com/j991222/MIMO_JCESD"},{"id":"http://arxiv.org/abs/2311.01771v3","updated":"2024-01-17T08:33:22Z","published":"2023-11-03T08:12:05Z","title":"Efficient Generalized Low-Rank Tensor Contextual Bandits","summary":" In this paper, we aim to build a novel bandits algorithm that is capable of\nfully harnessing the power of multi-dimensional data and the inherent\nnon-linearity of reward functions to provide high-usable and accountable\ndecision-making services. To this end, we introduce a generalized low-rank\ntensor contextual bandits model in which an action is formed from three feature\nvectors, and thus can be represented by a tensor. In this formulation, the\nreward is determined through a generalized linear function applied to the inner\nproduct of the action's feature tensor and a fixed but unknown parameter tensor\nwith a low tubal rank. To effectively achieve the trade-off between exploration\nand exploitation, we introduce a novel algorithm called \"Generalized Low-Rank\nTensor Exploration Subspace then Refine\" (G-LowTESTR). 
This algorithm first\ncollects raw data to explore the intrinsic low-rank tensor subspace information\nembedded in the decision-making scenario, and then converts the original\nproblem into an almost lower-dimensional generalized linear contextual bandits\nproblem. Rigorous theoretical analysis shows that the regret bound of\nG-LowTESTR is superior to those in vectorization and matricization cases. We\nconduct a series of simulations and real data experiments to further highlight\nthe effectiveness of G-LowTESTR, leveraging its ability to capitalize on the\nlow-rank tensor structure for enhanced learning.\n","authors":["Qianxin Yi","Yiyang Yang","Shaojie Tang","Jiapeng Liu","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01771v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09050v1","updated":"2024-01-17T08:32:07Z","published":"2024-01-17T08:32:07Z","title":"Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation\n with Deterministic Sampling Prior","summary":" Score distillation sampling (SDS) and its variants have greatly boosted the\ndevelopment of text-to-3D generation, but are vulnerable to geometry collapse\nand poor textures yet. To solve this issue, we first deeply analyze the SDS and\nfind that its distillation sampling process indeed corresponds to the\ntrajectory sampling of a stochastic differential equation (SDE): SDS samples\nalong an SDE trajectory to yield a less noisy sample which then serves as a\nguidance to optimize a 3D model. However, the randomness in SDE sampling often\nleads to a diverse and unpredictable sample which is not always less noisy, and\nthus is not a consistently correct guidance, explaining the vulnerability of\nSDS. Since for any SDE, there always exists an ordinary differential equation\n(ODE) whose trajectory sampling can deterministically and consistently converge\nto the desired target point as the SDE, we propose a novel and effective\n\"Consistent3D\" method that explores the ODE deterministic sampling prior for\ntext-to-3D generation. Specifically, at each training iteration, given a\nrendered image by a 3D model, we first estimate its desired 3D score function\nby a pre-trained 2D diffusion model, and build an ODE for trajectory sampling.\nNext, we design a consistency distillation sampling loss which samples along\nthe ODE trajectory to generate two adjacent samples and uses the less noisy\nsample to guide another more noisy one for distilling the deterministic prior\ninto the 3D model. Experimental results show the efficacy of our Consistent3D\nin generating high-fidelity and diverse 3D objects and large-scale scenes, as\nshown in Fig. 1. The codes are available at\nhttps://github.com/sail-sg/Consistent3D.\n","authors":["Zike Wu","Pan Zhou","Xuanyu Yi","Xiaoding Yuan","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05725v2","updated":"2024-01-17T08:06:39Z","published":"2023-10-09T13:54:08Z","title":"Post-hoc Bias Scoring Is Optimal For Fair Classification","summary":" We consider a binary classification problem under group fairness constraints,\nwhich can be one of Demographic Parity (DP), Equalized Opportunity (EOp), or\nEqualized Odds (EO). We propose an explicit characterization of Bayes optimal\nclassifier under the fairness constraints, which turns out to be a simple\nmodification rule of the unconstrained classifier. 
Namely, we introduce a novel\ninstance-level measure of bias, which we call bias score, and the modification\nrule is a simple linear rule on top of the finite number of bias scores. Based\non this characterization, we develop a post-hoc approach that allows us to\nadapt to fairness constraints while maintaining high accuracy. In the case of\nDP and EOp constraints, the modification rule is thresholding a single bias\nscore, while in the case of EO constraints we are required to fit a linear\nmodification rule with 2 parameters. The method can also be applied for\ncomposite group-fairness criteria, such as ones involving several sensitive\nattributes.\n","authors":["Wenlong Chen","Yegor Klochkov","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05725v2.pdf","comment":"Accepted for publication at The Twelfth International Conference on\n Learning Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2312.08846v2","updated":"2024-01-17T08:05:07Z","published":"2023-12-14T12:02:24Z","title":"TiMix: Text-aware Image Mixing for Effective Vision-Language\n Pre-training","summary":" Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances\nmodern Vision-Language Pre-training (VLP) models by aligning visual and\nlinguistic modalities. Due to noises in web-harvested text-image pairs,\nhowever, scaling up training data volume in SMCL presents considerable\nobstacles in terms of computational cost and data inefficiency. To improve data\nefficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates\nmix-based data augmentation techniques into SMCL, yielding significant\nperformance improvements without significantly increasing computational\noverhead. We provide a theoretical analysis of TiMix from a mutual information\n(MI) perspective, showing that mixed data samples for cross-modal contrastive\nlearning implicitly serve as a regularizer for the contrastive loss. The\nexperimental results demonstrate that TiMix exhibits a comparable performance\non downstream tasks, even with a reduced amount of training data and shorter\ntraining time, when benchmarked against existing methods. This work empirically\nand theoretically demonstrates the potential of data mixing for data-efficient\nand computationally viable VLP, benefiting broader VLP model adoption in\npractical scenarios.\n","authors":["Chaoya Jiang","Wei ye","Haiyang Xu","Qinghao Ye","Ming Yan","Ji Zhang","Fei Huang","Shikun Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.08846v2.pdf","comment":"Accepted on AAAI2024"},{"id":"http://arxiv.org/abs/2401.09031v1","updated":"2024-01-17T07:58:18Z","published":"2024-01-17T07:58:18Z","title":"Data Attribution for Diffusion Models: Timestep-induced Bias in\n Influence Estimation","summary":" Data attribution methods trace model behavior back to its training dataset,\noffering an effective approach to better understand ``black-box'' neural\nnetworks. While prior research has established quantifiable links between model\noutput and training data in diverse settings, interpreting diffusion model\noutputs in relation to training samples remains underexplored. In particular,\ndiffusion models operate over a sequence of timesteps instead of instantaneous\ninput-output relationships in previous contexts, posing a significant challenge\nto extend existing frameworks to diffusion models directly. Notably, we present\nDiffusion-TracIn that incorporates these temporal dynamics and observe that\nsamples' loss gradient norms are highly dependent on timestep. 
This trend leads\nto a prominent bias in influence estimation, and is particularly noticeable for\nsamples trained on large-norm-inducing timesteps, causing them to be generally\ninfluential. To mitigate this effect, we introduce Diffusion-ReTrac as a\nre-normalized adaptation that enables the retrieval of training samples more\ntargeted to the test sample of interest, facilitating a localized measurement\nof influence and considerably more intuitive visualization. We demonstrate the\nefficacy of our approach through various evaluation metrics and auxiliary\ntasks, reducing the amount of generally influential samples to $\\frac{1}{3}$ of\nits original quantity.\n","authors":["Tong Xie","Haoyu Li","Andrew Bai","Cho-Jui Hsieh"],"pdf_url":"https://arxiv.org/pdf/2401.09031v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01864v2","updated":"2024-01-17T07:55:12Z","published":"2022-06-04T00:37:37Z","title":"Model-Informed Generative Adversarial Network (MI-GAN) for Learning\n Optimal Power Flow","summary":" The optimal power flow (OPF) problem, as a critical component of power system\noperations, becomes increasingly difficult to solve due to the variability,\nintermittency, and unpredictability of renewable energy brought to the power\nsystem. Although traditional optimization techniques, such as stochastic and\nrobust optimization approaches, could be leveraged to address the OPF problem,\nin the face of renewable energy uncertainty, i.e., the dynamic coefficients in\nthe optimization model, their effectiveness in dealing with large-scale\nproblems remains limited. As a result, deep learning techniques, such as neural\nnetworks, have recently been developed to improve computational efficiency in\nsolving OPF problems with the utilization of data. However, the feasibility and\noptimality of the solution may not be guaranteed, and the system dynamics\ncannot be properly addressed as well. In this paper, we propose an optimization\nmodel-informed generative adversarial network (MI-GAN) framework to solve OPF\nunder uncertainty. The main contributions are summarized into three aspects:\n(1) to ensure feasibility and improve optimality of generated solutions, three\nimportant layers are proposed: feasibility filter layer, comparison layer, and\ngradient-guided layer; (2) in the GAN-based framework, an efficient\nmodel-informed selector incorporating these three new layers is established;\nand (3) a new recursive iteration algorithm is also proposed to improve\nsolution optimality and handle the system dynamics. The numerical results on\nIEEE test systems show that the proposed method is very effective and\npromising.\n","authors":["Yuxuan Li","Chaoyue Zhao","Chenang Liu"],"pdf_url":"https://arxiv.org/pdf/2206.01864v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02040v2","updated":"2024-01-17T07:54:46Z","published":"2023-07-05T05:55:08Z","title":"VertiBench: Advancing Feature Distribution Diversity in Vertical\n Federated Learning Benchmarks","summary":" Vertical Federated Learning (VFL) is a crucial paradigm for training machine\nlearning models on feature-partitioned, distributed data. However, due to\nprivacy restrictions, few public real-world VFL datasets exist for algorithm\nevaluation, and these represent a limited array of feature distributions.\nExisting benchmarks often resort to synthetic datasets, derived from arbitrary\nfeature splits from a global set, which only capture a subset of feature\ndistributions, leading to inadequate algorithm performance assessment. 
This\npaper addresses these shortcomings by introducing two key factors affecting VFL\nperformance - feature importance and feature correlation - and proposing\nassociated evaluation metrics and dataset splitting methods. Additionally, we\nintroduce a real VFL dataset to address the deficit in image-image VFL\nscenarios. Our comprehensive evaluation of cutting-edge VFL algorithms provides\nvaluable insights for future research in the field.\n","authors":["Zhaomin Wu","Junyi Hou","Bingsheng He"],"pdf_url":"https://arxiv.org/pdf/2307.02040v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05794v2","updated":"2024-01-17T07:32:02Z","published":"2024-01-11T09:56:08Z","title":"Bounds on the price of feedback for mistake-bounded online learning","summary":" We improve several worst-case bounds for various online learning scenarios\nfrom (Auer and Long, Machine Learning, 1999). In particular, we sharpen an\nupper bound for delayed ambiguous reinforcement learning by a factor of 2 and\nan upper bound for learning compositions of families of functions by a factor\nof 2.41. We also improve a lower bound from the same paper for learning\ncompositions of $k$ families of functions by a factor of $\\Theta(\\ln{k})$,\nmatching the upper bound up to a constant factor. In addition, we solve a\nproblem from (Long, Theoretical Computer Science, 2020) on the price of bandit\nfeedback with respect to standard feedback for multiclass learning, and we\nimprove an upper bound from (Feng et al., Theoretical Computer Science, 2023)\non the price of $r$-input delayed ambiguous reinforcement learning by a factor\nof $r$, matching a lower bound from the same paper up to the leading term.\n","authors":["Jesse Geneson","Linus Tang"],"pdf_url":"https://arxiv.org/pdf/2401.05794v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09018v1","updated":"2024-01-17T07:30:14Z","published":"2024-01-17T07:30:14Z","title":"Residual Alignment: Uncovering the Mechanisms of Residual Networks","summary":" The ResNet architecture has been widely adopted in deep learning due to its\nsignificant boost to performance through the use of simple skip connections,\nyet the underlying mechanisms leading to its success remain largely unknown. In\nthis paper, we conduct a thorough empirical study of the ResNet architecture in\nclassification tasks by linearizing its constituent residual blocks using\nResidual Jacobians and measuring their singular value decompositions. Our\nmeasurements reveal a process called Residual Alignment (RA) characterized by\nfour properties:\n (RA1) intermediate representations of a given input are equispaced on a line,\nembedded in high dimensional space, as observed by Gai and Zhang [2021];\n (RA2) top left and right singular vectors of Residual Jacobians align with\neach other and across different depths;\n (RA3) Residual Jacobians are at most rank C for fully-connected ResNets,\nwhere C is the number of classes; and\n (RA4) top singular values of Residual Jacobians scale inversely with depth.\n RA consistently occurs in models that generalize well, in both\nfully-connected and convolutional architectures, across various depths and\nwidths, for varying numbers of classes, on all tested benchmark datasets, but\nceases to occur once the skip connections are removed. It also provably occurs\nin a novel mathematical model we propose. 
This phenomenon reveals a strong\nalignment between residual branches of a ResNet (RA2+4), imparting a highly\nrigid geometric structure to the intermediate representations as they progress\nlinearly through the network (RA1) up to the final layer, where they undergo\nNeural Collapse.\n","authors":["Jianing Li","Vardan Papyan"],"pdf_url":"https://arxiv.org/pdf/2401.09018v1.pdf","comment":"Accepted at NeurIPS 2023 as a Poster paper"},{"id":"http://arxiv.org/abs/2401.09011v1","updated":"2024-01-17T07:14:04Z","published":"2024-01-17T07:14:04Z","title":"Inductive Models for Artificial Intelligence Systems are Insufficient\n without Good Explanations","summary":" This paper discusses the limitations of machine learning (ML), particularly\ndeep artificial neural networks (ANNs), which are effective at approximating\ncomplex functions but often lack transparency and explanatory power. It\nhighlights the `problem of induction': the philosophical issue that past\nobservations may not necessarily predict future events, a challenge that ML\nmodels face when encountering new, unseen data. The paper argues for the\nimportance of not just making predictions but also providing good explanations,\na feature that current models often fail to deliver. It suggests that for AI to\nprogress, we must seek models that offer insights and explanations, not just\npredictions.\n","authors":["Udesh Habaraduwa"],"pdf_url":"https://arxiv.org/pdf/2401.09011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15965v3","updated":"2024-01-17T07:08:16Z","published":"2023-12-26T09:03:23Z","title":"Efficient Reinforcement Learning via Decoupling Exploration and\n Utilization","summary":" Deep neural network (DNN) generalization is limited by the over-reliance of\ncurrent offline reinforcement learning techniques on conservative processing of\nexisting datasets. This method frequently results in algorithms that settle for\nsuboptimal solutions that only adjust to a certain dataset. Similarly, in\nonline reinforcement learning, the previously imposed punitive pessimism also\ndeprives the model of its exploratory potential. Our research proposes a novel\nframework, Optimistic and Pessimistic Actor Reinforcement Learning (OPARL).\nOPARL employs a unique dual-actor approach: an optimistic actor dedicated to\nexploration and a pessimistic actor focused on utilization, thereby effectively\ndifferentiating between exploration and utilization strategies. This unique\ncombination in reinforcement learning methods fosters a more balanced and\nefficient approach. It enables the optimization of policies that focus on\nactions yielding high rewards through pessimistic utilization strategies, while\nalso ensuring extensive state coverage via optimistic exploration. Experiments\nand theoretical studies demonstrate that OPARL improves agents' capacities for\napplication and exploration. In most tasks of the DMControl benchmark and\nMujoco environment, OPARL performed better than state-of-the-art methods. 
Our\ncode has been released at https://github.com/yydsok/OPARL\n","authors":["Jingpu Yang","Qirui Zhao","Helin Wang","Yuxiao Huang","Zirui Song","Miao Fang"],"pdf_url":"https://arxiv.org/pdf/2312.15965v3.pdf","comment":"Update V3"},{"id":"http://arxiv.org/abs/2401.09003v1","updated":"2024-01-17T06:48:16Z","published":"2024-01-17T06:48:16Z","title":"Augmenting Math Word Problems via Iterative Question Composing","summary":" Despite recent progress in improving the mathematical reasoning ability of\nlarge language models (LLMs), solving competition-level math problems without\nthe use of external tools remains challenging for open-source LLMs. In this\nwork, we introduce the MMIQC dataset, a mixture of processed web data and\nsynthetic question-response pairs, to equip base models with better\nmathematical reasoning skills. Mistral-7B-MMIQC, the model obtained by\nfine-tuning Mistral-7B (arXiv:2310.06825) on MMIQC, achieves 36.0\\% accuracy on\nMATH (arXiv:2103.03874), 5.8\\% higher than the previous (model size $\\sim$7B)\nSOTA. Our experiments also show that a large part of the improvement is attributable\nto our novel augmentation method IQC (Iterative Question Composing), where we\niteratively ask an LLM to compose new questions from the given seed problems\nand do rejection sampling from another LLM. MMIQC has now been released on\nhttps://huggingface.co/datasets/Vivacem/MMIQC.\n","authors":["Haoxiong Liu","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2401.09003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00313v2","updated":"2024-01-17T06:41:47Z","published":"2023-12-30T20:13:28Z","title":"Matching of Users and Creators in Two-Sided Markets with Departures","summary":" Many online platforms of today, including social media sites, are two-sided\nmarkets bridging content creators and users. Most of the existing literature on\nplatform recommendation algorithms largely focuses on user preferences and\ndecisions, and does not simultaneously address creator incentives. We propose a\nmodel of content recommendation that explicitly focuses on the dynamics of\nuser-content matching, with the novel property that both users and creators may\nleave the platform permanently if they do not experience sufficient engagement.\nIn our model, each player decides to participate at each time step based on\nutilities derived from the current match: users based on alignment of the\nrecommended content with their preferences, and creators based on their\naudience size. We show that a user-centric greedy algorithm that does not\nconsider creator departures can result in arbitrarily poor total engagement,\nrelative to an algorithm that maximizes total engagement while accounting for\ntwo-sided departures. 
Moreover, in stark contrast to the case where only users\nor only creators leave the platform, we prove that with two-sided departures,\napproximating maximum total engagement within any constant factor is NP-hard.\nWe present two practical algorithms, one with performance guarantees under mild\nassumptions on user preferences, and another that tends to outperform\nalgorithms that ignore two-sided departures in practice.\n","authors":["Daniel Huttenlocher","Hannah Li","Liang Lyu","Asuman Ozdaglar","James Siderius"],"pdf_url":"https://arxiv.org/pdf/2401.00313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08999v1","updated":"2024-01-17T06:29:34Z","published":"2024-01-17T06:29:34Z","title":"Continuous Time Continuous Space Homeostatic Reinforcement Learning\n (CTCS-HRRL) : Towards Biological Self-Autonomous Agent","summary":" Homeostasis is a biological process by which living beings maintain their\ninternal balance. Previous research suggests that homeostasis is a learned\nbehaviour. The recently introduced Homeostatic Regulated Reinforcement Learning\n(HRRL) framework attempts to explain this learned homeostatic behavior by\nlinking Drive Reduction Theory and Reinforcement Learning. This linkage has\nbeen proven in the discrete time-space, but not in the continuous time-space.\nIn this work, we advance the HRRL framework to a continuous time-space\nenvironment and validate the CTCS-HRRL (Continuous Time Continuous Space HRRL)\nframework. We achieve this by designing a model that mimics the homeostatic\nmechanisms in a real-world biological agent. This model uses the\nHamilton-Jacobi-Bellman Equation, and function approximation based on neural\nnetworks and Reinforcement Learning. Through a simulation-based experiment we\ndemonstrate the efficacy of this model and uncover the evidence linked to the\nagent's ability to dynamically choose policies that favor homeostasis in a\ncontinuously changing internal-state milieu. Results of our experiments\ndemonstrate that the agent learns homeostatic behaviour in a CTCS environment,\nmaking CTCS-HRRL a promising framework for modelling animal dynamics and\ndecision-making.\n","authors":["Hugo Laurencon","Yesoda Bhargava","Riddhi Zantye","Charbel-Raphaël Ségerie","Johann Lussange","Veeky Baths","Boris Gutkin"],"pdf_url":"https://arxiv.org/pdf/2401.08999v1.pdf","comment":"This work is a result of the ongoing collaboration between Cognitive\n Neuroscience Lab, BITS Pilani K K Birla Goa Campus and Ecole Normale\n Superieure, Paris France. This work is jointly supervised by Prof. Boris\n Gutkin and Prof. Veeky Baths. arXiv admin note: substantial text overlap with\n arXiv:2109.06580"},{"id":"http://arxiv.org/abs/2401.08998v1","updated":"2024-01-17T06:22:47Z","published":"2024-01-17T06:22:47Z","title":"Attack and Reset for Unlearning: Exploiting Adversarial Noise toward\n Machine Unlearning through Parameter Re-initialization","summary":" With growing concerns surrounding privacy and regulatory compliance, the\nconcept of machine unlearning has gained prominence, aiming to selectively\nforget or erase specific learned information from a trained model. In response\nto this critical need, we introduce a novel approach called Attack-and-Reset\nfor Unlearning (ARU). This algorithm leverages meticulously crafted adversarial\nnoise to generate a parameter mask, effectively resetting certain parameters\nand rendering them unlearnable. 
ARU outperforms current state-of-the-art\nresults on two facial machine-unlearning benchmark datasets, MUFAC and MUCAC.\nIn particular, we present the steps involved in attacking and masking that\nstrategically filter and re-initialize network parameters biased towards the\nforget set. Our work represents a significant advancement in rendering data\nunexploitable to deep learning models through parameter re-initialization,\nachieved by harnessing adversarial noise to craft a mask.\n","authors":["Yoonhwa Jung","Ikhyun Cho","Shun-Hsiang Hsu","Julia Hockenmaier"],"pdf_url":"https://arxiv.org/pdf/2401.08998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08996v1","updated":"2024-01-17T06:17:42Z","published":"2024-01-17T06:17:42Z","title":"MicroNAS: Zero-Shot Neural Architecture Search for MCUs","summary":" Neural Architecture Search (NAS) effectively discovers new Convolutional\nNeural Network (CNN) architectures, particularly for accuracy optimization.\nHowever, prior approaches often require resource-intensive training on super\nnetworks or extensive architecture evaluations, limiting practical\napplications. To address these challenges, we propose MicroNAS, a\nhardware-aware zero-shot NAS framework designed for microcontroller units\n(MCUs) in edge computing. MicroNAS considers target hardware optimality during\nthe search, utilizing specialized performance indicators to identify optimal\nneural architectures without high computational costs. Compared to previous\nworks, MicroNAS achieves up to 1104x improvement in search efficiency and\ndiscovers models with over 3.23x faster MCU inference while maintaining similar\naccuracy\n","authors":["Ye Qiao","Haocheng Xu","Yifan Zhang","Sitao Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08992v1","updated":"2024-01-17T06:01:16Z","published":"2024-01-17T06:01:16Z","title":"Efficient Adapter Finetuning for Tail Languages in Streaming\n Multilingual ASR","summary":" The end-to-end ASR model is often desired in the streaming multilingual\nscenario since it is easier to deploy and can benefit from pre-trained speech\nmodels such as powerful foundation models. Meanwhile, the heterogeneous nature\nand imbalanced data abundance of different languages may cause performance\ndegradation, leading to asynchronous peak performance for different languages\nduring training, especially on tail ones. Sometimes even the data itself may\nbecome unavailable as a result of the enhanced privacy protection. Existing\nwork tend to significantly increase the model size or learn language-specific\ndecoders to accommodate each language separately. In this study, we explore\nsimple yet effective Language-Dependent Adapter (LDA) finetuning under a\ncascaded Conformer transducer framework enhanced by teacher pseudo-labeling for\ntail languages in the streaming multilingual ASR. The adapter only accounts for\n0.4% of the full model per language. It is plugged into the frozen foundation\nmodel and is the only trainable module during the finetuning process with noisy\nstudent training. The final model merges the adapter parameters from different\ncheckpoints for different languages. The model performance is validated on a\nchallenging multilingual dictation dataset, which includes 39 tail languages\nacross Latin, Greek, Arabic, etc. Our proposed method brings 12.2% word error\nrate reduction on average and up to 37.5% on a single locale. 
Furthermore, we\nshow that our parameter-efficient LDA can match the quality of the full model\nfinetuning, thus greatly alleviating the asynchronous peak performance issue.\n","authors":["Junwen Bai","Bo Li","Qiujia Li","Tara N. Sainath","Trevor Strohman"],"pdf_url":"https://arxiv.org/pdf/2401.08992v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08986v1","updated":"2024-01-17T05:39:03Z","published":"2024-01-17T05:39:03Z","title":"Rigid Protein-Protein Docking via Equivariant Elliptic-Paraboloid\n Interface Prediction","summary":" The study of rigid protein-protein docking plays an essential role in a\nvariety of tasks such as drug design and protein engineering. Recently, several\nlearning-based methods have been proposed for the task, exhibiting much faster\ndocking speed than those computational methods. In this paper, we propose a\nnovel learning-based method called ElliDock, which predicts an elliptic\nparaboloid to represent the protein-protein docking interface. To be specific,\nour model estimates elliptic paraboloid interfaces for the two input proteins\nrespectively, and obtains the roto-translation transformation for docking by\nmaking two interfaces coincide. By its design, ElliDock is independently\nequivariant with respect to arbitrary rotations/translations of the proteins,\nwhich is an indispensable property to ensure the generalization of the docking\nprocess. Experimental evaluations show that ElliDock achieves the fastest\ninference time among all compared methods and is strongly competitive with\ncurrent state-of-the-art learning-based models such as DiffDock-PP and Multimer\nparticularly for antibody-antigen docking.\n","authors":["Ziyang Yu","Wenbing Huang","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08986v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08984v1","updated":"2024-01-17T05:31:08Z","published":"2024-01-17T05:31:08Z","title":"A GAN-based data poisoning framework against anomaly detection in\n vertical federated learning","summary":" In vertical federated learning (VFL), commercial entities collaboratively\ntrain a model while preserving data privacy. However, a malicious participant's\npoisoning attack may degrade the performance of this collaborative model. The\nmain challenge in achieving the poisoning attack is the absence of access to\nthe server-side top model, leaving the malicious participant without a clear\ntarget model. To address this challenge, we introduce an innovative end-to-end\npoisoning framework P-GAN. Specifically, the malicious participant initially\nemploys semi-supervised learning to train a surrogate target model.\nSubsequently, this participant employs a GAN-based method to produce\nadversarial perturbations to degrade the surrogate target model's performance.\nFinally, the generator is obtained and tailored for VFL poisoning. Besides, we\ndevelop an anomaly detection algorithm based on a deep auto-encoder (DAE),\noffering a robust defense mechanism to VFL scenarios. Through extensive\nexperiments, we evaluate the efficacy of P-GAN and DAE, and further analyze the\nfactors that influence their performance.\n","authors":["Xiaolin Chen","Daoguang Zan","Wei Li","Bei Guan","Yongji Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08984v1.pdf","comment":"6 pages, 7 figures. This work has been submitted to the IEEE for\n possible publication. 
Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2401.08977v1","updated":"2024-01-17T05:04:33Z","published":"2024-01-17T05:04:33Z","title":"FedLoGe: Joint Local and Generic Federated Learning under Long-tailed\n Data","summary":" Federated Long-Tailed Learning (Fed-LT), a paradigm wherein data collected\nfrom decentralized local clients manifests a globally prevalent long-tailed\ndistribution, has garnered considerable attention in recent times. In the\ncontext of Fed-LT, existing works have predominantly centered on addressing the\ndata imbalance issue to enhance the efficacy of the generic global model while\nneglecting the performance at the local level. In contrast, conventional\nPersonalized Federated Learning (pFL) techniques are primarily devised to\noptimize personalized local models under the presumption of a balanced global\ndata distribution. This paper introduces an approach termed Federated Local and\nGeneric Model Training in Fed-LT (FedLoGe), which enhances both local and\ngeneric model performance through the integration of representation learning\nand classifier alignment within a neural collapse framework. Our investigation\nreveals the feasibility of employing a shared backbone as a foundational\nframework for capturing overarching global trends, while concurrently employing\nindividualized classifiers to encapsulate distinct refinements stemming from\neach client's local features. Building upon this discovery, we establish the\nStatic Sparse Equiangular Tight Frame Classifier (SSE-C), inspired by neural\ncollapse principles that naturally prune extraneous noisy features and foster\nthe acquisition of potent data representations. Furthermore, leveraging\ninsights from imbalance neural collapse's classifier norm patterns, we develop\nGlobal and Local Adaptive Feature Realignment (GLA-FR) via an auxiliary global\nclassifier and personalized Euclidean norm transfer to align global features\nwith client preferences. Extensive experimental results on CIFAR-10/100-LT,\nImageNet, and iNaturalist demonstrate the advantage of our method over\nstate-of-the-art pFL and Fed-LT approaches.\n","authors":["Zikai Xiao","Zihan Chen","Liyinglan Liu","Yang Feng","Jian Wu","Wanlu Liu","Joey Tianyi Zhou","Howard Hao Yang","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2401.08977v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2311.16522v3","updated":"2024-01-17T05:04:12Z","published":"2023-11-28T05:00:27Z","title":"Dynamic Fault Characteristics Evaluation in Power Grid","summary":" To enhance the intelligence degree in operation and maintenance, a novel\nmethod for fault detection in power grids is proposed. The proposed GNN-based\napproach first identifies fault nodes through a specialized feature extraction\nmethod coupled with a knowledge graph. By incorporating temporal data, the\nmethod leverages the status of nodes from preceding and subsequent time periods\nto help current fault detection. To validate the effectiveness of the node\nfeatures, a correlation analysis of the output features from each node was\nconducted. 
The results from experiments show that this method can accurately\nlocate fault nodes in simulation scenarios with a remarkable accuracy.\nAdditionally, the graph neural network based feature modeling allows for a\nqualitative examination of how faults spread across nodes, which provides\nvaluable insights for analyzing fault nodes.\n","authors":["Hao Pei","Si Lin","Chuanfu Li","Che Wang","Haoming Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.16522v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08976v1","updated":"2024-01-17T05:03:53Z","published":"2024-01-17T05:03:53Z","title":"ACT-GAN: Radio map construction based on generative adversarial networks\n with ACT blocks","summary":" The radio map, serving as a visual representation of electromagnetic spatial\ncharacteristics, plays a pivotal role in assessment of wireless communication\nnetworks and radio monitoring coverage. Addressing the issue of low accuracy\nexisting in the current radio map construction, this paper presents a novel\nradio map construction method based on generative adversarial network (GAN) in\nwhich the Aggregated Contextual-Transformation (AOT) block, Convolutional Block\nAttention Module (CBAM), and Transposed Convolution (T-Conv) block are applied\nto the generator, and we name it as ACT-GAN. It significantly improves the\nreconstruction accuracy and local texture of the radio maps. The performance of\nACT-GAN across three different scenarios is demonstrated. Experiment results\nreveal that in the scenario without sparse discrete observations, the proposed\nmethod reduces the root mean square error (RMSE) by 14.6% in comparison to the\nstate-of-the-art models. In the scenario with sparse discrete observations, the\nRMSE is diminished by 13.2%. Furthermore, the predictive results of the\nproposed model show a more lucid representation of electromagnetic spatial\nfield distribution. To verify the universality of this model in radio map\nconstruction tasks, the scenario of unknown radio emission source is\ninvestigated. The results indicate that the proposed model is robust radio map\nconstruction and accurate in predicting the location of the emission source.\n","authors":["Chen Qi","Yang Jingjing","Huang Ming","Zhou Qiang"],"pdf_url":"https://arxiv.org/pdf/2401.08976v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2306.11700v2","updated":"2024-01-17T04:52:39Z","published":"2023-06-20T17:27:31Z","title":"Last-Iterate Convergent Policy Gradient Primal-Dual Methods for\n Constrained MDPs","summary":" We study the problem of computing an optimal policy of an infinite-horizon\ndiscounted constrained Markov decision process (constrained MDP). Despite the\npopularity of Lagrangian-based policy search methods used in practice, the\noscillation of policy iterates in these methods has not been fully understood,\nbringing out issues such as violation of constraints and sensitivity to\nhyper-parameters. 
To fill this gap, we employ the Lagrangian method to cast a\nconstrained MDP into a constrained saddle-point problem in which max/min\nplayers correspond to primal/dual variables, respectively, and develop two\nsingle-time-scale policy-based primal-dual algorithms with non-asymptotic\nconvergence of their policy iterates to an optimal constrained policy.\nSpecifically, we first propose a regularized policy gradient primal-dual\n(RPG-PD) method that updates the policy using an entropy-regularized policy\ngradient, and the dual variable via a quadratic-regularized gradient ascent,\nsimultaneously. We prove that the policy primal-dual iterates of RPG-PD\nconverge to a regularized saddle point with a sublinear rate, while the policy\niterates converge sublinearly to an optimal constrained policy. We further\ninstantiate RPG-PD in large state or action spaces by including function\napproximation in policy parametrization, and establish similar sublinear\nlast-iterate policy convergence. Second, we propose an optimistic policy\ngradient primal-dual (OPG-PD) method that employs the optimistic gradient\nmethod to update primal/dual variables, simultaneously. We prove that the\npolicy primal-dual iterates of OPG-PD converge to a saddle point that contains\nan optimal constrained policy, with a linear rate. To the best of our\nknowledge, this work appears to be the first non-asymptotic policy last-iterate\nconvergence result for single-time-scale algorithms in constrained MDPs.\n","authors":["Dongsheng Ding","Chen-Yu Wei","Kaiqing Zhang","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2306.11700v2.pdf","comment":"65 pages, 17 figures, and 1 table; NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.10359v3","updated":"2024-01-17T04:52:24Z","published":"2023-11-17T07:25:18Z","title":"FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel\n Identification","summary":" Highly parallelized workloads like machine learning training, inferences and\ngeneral HPC tasks are greatly accelerated using GPU devices. In a cloud\ncomputing cluster, serving a GPU's computation power through multi-tasks\nsharing is highly demanded since there are always more task requests than the\nnumber of GPU available. Existing GPU sharing solutions focus on reducing\ntask-level waiting time or task-level switching costs when multiple jobs\ncompeting for a single GPU. Non-stopped computation requests come with\ndifferent priorities, having non-symmetric impact on QoS for sharing a GPU\ndevice. Existing work missed the kernel-level optimization opportunity brought\nby this setting. To address this problem, we present a novel kernel-level\nscheduling strategy called FIKIT: Filling Inter-kernel Idle Time. FIKIT\nincorporates task-level priority information, fine-grained kernel\nidentification, and kernel measurement, allowing low priorities task's\nexecution during high priority task's inter-kernel idle time. Thereby, filling\nthe GPU's device runtime fully, and reduce overall GPU sharing impact to cloud\nservices. Across a set of ML models, the FIKIT based inference system\naccelerated high priority tasks by 1.33 to 14.87 times compared to the JCT in\nGPU sharing mode, and more than half of the cases are accelerated by more than\n3.5 times. Alternatively, under preemptive sharing, the low-priority tasks have\na comparable to default GPU sharing mode JCT, with a 0.84 to 1 times ratio. 
We\nfurther limit the kernel measurement and runtime fine-grained kernel scheduling\noverhead to less than 10%.\n","authors":["Wenqing Wu"],"pdf_url":"https://arxiv.org/pdf/2311.10359v3.pdf","comment":"20 pages, 20 figures. Delete a duplicated paragraph in the\n introduction section; Add more experiments with 2 additional figures; Update\n the conclusion"},{"id":"http://arxiv.org/abs/2310.13121v5","updated":"2024-01-17T04:22:12Z","published":"2023-10-19T19:34:42Z","title":"Understanding Addition in Transformers","summary":" Understanding the inner workings of machine learning models like Transformers\nis vital for their safe and ethical use. This paper presents an in-depth\nanalysis of a one-layer Transformer model trained for n-digit integer addition.\nWe reveal that the model divides the task into parallel, digit-specific streams\nand employs distinct algorithms for different digit positions. Our study also\nfinds that the model starts calculations late but executes them rapidly. A rare\nuse case with high loss is identified and explained. Overall, the model's\nalgorithm is explained in detail. These findings are validated through rigorous\ntesting and mathematical modeling, contributing to the broader works in\nMechanistic Interpretability, AI safety, and alignment. Our approach opens the\ndoor for analyzing more complex tasks and multi-layer Transformer models.\n","authors":["Philip Quirke","Fazl Barez"],"pdf_url":"https://arxiv.org/pdf/2310.13121v5.pdf","comment":"9 pages, 8 figures, accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08962v1","updated":"2024-01-17T04:21:04Z","published":"2024-01-17T04:21:04Z","title":"DOO-RE: A dataset of ambient sensors in a meeting room for activity\n recognition","summary":" With the advancement of IoT technology, recognizing user activities with\nmachine learning methods is a promising way to provide various smart services\nto users. High-quality data with privacy protection is essential for deploying\nsuch services in the real world. Data streams from surrounding ambient sensors\nare well suited to the requirement. Existing ambient sensor datasets only\nsupport constrained private spaces and those for public spaces have yet to be\nexplored despite growing interest in research on them. To meet this need, we\nbuild a dataset collected from a meeting room equipped with ambient sensors.\nThe dataset, DOO-RE, includes data streams from various ambient sensor types\nsuch as Sound and Projector. Each sensor data stream is segmented into activity\nunits and multiple annotators provide activity labels through a\ncross-validation annotation process to improve annotation quality. We finally\nobtain 9 types of activities. To our best knowledge, DOO-RE is the first\ndataset to support the recognition of both single and group activities in a\nreal meeting room with reliable annotations.\n","authors":["Hyunju Kim","Geon Kim","Taehoon Lee","Kisoo Kim","Dongman Lee"],"pdf_url":"https://arxiv.org/pdf/2401.08962v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08961v1","updated":"2024-01-17T04:20:26Z","published":"2024-01-17T04:20:26Z","title":"Cascading Reinforcement Learning","summary":" Cascading bandits have gained popularity in recent years due to their\napplicability to recommendation systems and online advertising. In the\ncascading bandit model, at each timestep, an agent recommends an ordered subset\nof items (called an item list) from a pool of items, each associated with an\nunknown attraction probability. 
Then, the user examines the list, and clicks\nthe first attractive item (if any), and after that, the agent receives a\nreward. The goal of the agent is to maximize the expected cumulative reward.\nHowever, the prior literature on cascading bandits ignores the influences of\nuser states (e.g., historical behaviors) on recommendations and the change of\nstates as the session proceeds. Motivated by this fact, we propose a\ngeneralized cascading RL framework, which considers the impact of user states\nand state transition into decisions. In cascading RL, we need to select items\nnot only with large attraction probabilities but also leading to good successor\nstates. This imposes a huge computational challenge due to the combinatorial\naction space. To tackle this challenge, we delve into the properties of value\nfunctions, and design an oracle BestPerm to efficiently find the optimal item\nlist. Equipped with BestPerm, we develop two algorithms CascadingVI and\nCascadingBPI, which are both computationally-efficient and sample-efficient,\nand provide near-optimal regret and sample complexity guarantees. Furthermore,\nwe present experiments to show the improved computational and sample\nefficiencies of our algorithms compared to straightforward adaptations of\nexisting RL algorithms in practice.\n","authors":["Yihan Du","R. Srikant","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2401.08961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08959v1","updated":"2024-01-17T04:19:33Z","published":"2024-01-17T04:19:33Z","title":"Towards Off-Policy Reinforcement Learning for Ranking Policies with\n Human Feedback","summary":" Probabilistic learning to rank (LTR) has been the dominating approach for\noptimizing the ranking metric, but cannot maximize long-term rewards.\nReinforcement learning models have been proposed to maximize user long-term\nrewards by formulating the recommendation as a sequential decision-making\nproblem, but could only achieve inferior accuracy compared to LTR counterparts,\nprimarily due to the lack of online interactions and the characteristics of\nranking. In this paper, we propose a new off-policy value ranking (VR)\nalgorithm that can simultaneously maximize user long-term rewards and optimize\nthe ranking metric offline for improved sample efficiency in a unified\nExpectation-Maximization (EM) framework. We theoretically and empirically show\nthat the EM process guides the learned policy to enjoy the benefit of\nintegration of the future reward and ranking metric, and learn without any\nonline interactions. Extensive offline and online experiments demonstrate the\neffectiveness of our methods.\n","authors":["Teng Xiao","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.08959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16042v2","updated":"2024-01-17T04:07:06Z","published":"2023-09-27T21:53:56Z","title":"Towards Best Practices of Activation Patching in Language Models:\n Metrics and Methods","summary":" Mechanistic interpretability seeks to understand the internal mechanisms of\nmachine learning models, where localization -- identifying the important model\ncomponents -- is a key step. Activation patching, also known as causal tracing\nor interchange intervention, is a standard technique for this task (Vig et al.,\n2020), but the literature contains many variants with little consensus on the\nchoice of hyperparameters or methodology. 
In this work, we systematically\nexamine the impact of methodological details in activation patching, including\nevaluation metrics and corruption methods. In several settings of localization\nand circuit discovery in language models, we find that varying these\nhyperparameters could lead to disparate interpretability results. Backed by\nempirical observations, we give conceptual arguments for why certain metrics or\nmethods may be preferred. Finally, we provide recommendations for the best\npractices of activation patching going forwards.\n","authors":["Fred Zhang","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2309.16042v2.pdf","comment":"27 pages. ICLR 2024"},{"id":"http://arxiv.org/abs/2401.07231v2","updated":"2024-01-17T04:06:44Z","published":"2024-01-14T08:32:32Z","title":"Use of Prior Knowledge to Discover Causal Additive Models with\n Unobserved Variables and its Application to Time Series Data","summary":" This paper proposes two methods for causal additive models with unobserved\nvariables (CAM-UV). CAM-UV assumes that the causal functions take the form of\ngeneralized additive models and that latent confounders are present. First, we\npropose a method that leverages prior knowledge for efficient causal discovery.\nThen, we propose an extension of this method for inferring causality in time\nseries data. The original CAM-UV algorithm differs from other existing causal\nfunction models in that it does not seek the causal order between observed\nvariables, but rather aims to identify the causes for each observed variable.\nTherefore, the first proposed method in this paper utilizes prior knowledge,\nsuch as understanding that certain variables cannot be causes of specific\nothers. Moreover, by incorporating the prior knowledge that causes precedes\ntheir effects in time, we extend the first algorithm to the second method for\ncausal discovery in time series data. We validate the first proposed method by\nusing simulated data to demonstrate that the accuracy of causal discovery\nincreases as more prior knowledge is accumulated. Additionally, we test the\nsecond proposed method by comparing it with existing time series causal\ndiscovery methods, using both simulated data and real-world data.\n","authors":["Takashi Nicholas Maeda","Shohei Shohei"],"pdf_url":"https://arxiv.org/pdf/2401.07231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08947v1","updated":"2024-01-17T03:44:27Z","published":"2024-01-17T03:44:27Z","title":"AntiPhishStack: LSTM-based Stacked Generalization Model for Optimized\n Phishing URLs Detection","summary":" The escalating reliance on revolutionary online web services has introduced\nheightened security risks, with persistent challenges posed by phishing despite\nextensive security measures. Traditional phishing systems, reliant on machine\nlearning and manual features, struggle with evolving tactics. Recent advances\nin deep learning offer promising avenues for tackling novel phishing challenges\nand malicious URLs. This paper introduces a two-phase stack generalized model\nnamed AntiPhishStack, designed to detect phishing sites. The model leverages\nthe learning of URLs and character-level TF-IDF features symmetrically,\nenhancing its ability to combat emerging phishing threats. In Phase I, features\nare trained on a base machine learning classifier, employing K-fold\ncross-validation for robust mean prediction. 
Phase II employs a two-layered\nstacked-based LSTM network with five adaptive optimizers for dynamic\ncompilation, ensuring premier prediction on these features. Additionally, the\nsymmetrical predictions from both phases are optimized and integrated to train\na meta-XGBoost classifier, contributing to a final robust prediction. The\nsignificance of this work lies in advancing phishing detection with\nAntiPhishStack, operating without prior phishing-specific feature knowledge.\nExperimental validation on two benchmark datasets, comprising benign and\nphishing or malicious URLs, demonstrates the model's exceptional performance,\nachieving a notable 96.04% accuracy compared to existing studies. This research\nadds value to the ongoing discourse on symmetry and asymmetry in information\nsecurity and provides a forward-thinking solution for enhancing network\nsecurity in the face of evolving cyber threats.\n","authors":["Saba Aslam","Hafsa Aslam","Arslan Manzoor","Chen Hui","Abdur Rasool"],"pdf_url":"https://arxiv.org/pdf/2401.08947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07769v2","updated":"2024-01-17T03:38:26Z","published":"2024-01-15T15:27:24Z","title":"Deep Evolutional Instant Interest Network for CTR Prediction in\n Trigger-Induced Recommendation","summary":" The recommendation has been playing a key role in many industries, e.g.,\ne-commerce, streaming media, social media, etc. Recently, a new recommendation\nscenario, called Trigger-Induced Recommendation (TIR), where users are able to\nexplicitly express their instant interests via trigger items, is emerging as an\nessential role in many e-commerce platforms, e.g., Alibaba.com and Amazon.\nWithout explicitly modeling the user's instant interest, traditional\nrecommendation methods usually obtain sub-optimal results in TIR. Even though\nthere are a few methods considering the trigger and target items simultaneously\nto solve this problem, they still haven't taken into account temporal\ninformation of user behaviors, the dynamic change of user instant interest when\nthe user scrolls down and the interactions between the trigger and target\nitems. To tackle these problems, we propose a novel method -- Deep Evolutional\nInstant Interest Network (DEI2N), for click-through rate prediction in TIR\nscenarios. Specifically, we design a User Instant Interest Modeling Layer to\npredict the dynamic change of the intensity of instant interest when the user\nscrolls down. Temporal information is utilized in user behavior modeling.\nMoreover, an Interaction Layer is introduced to learn better interactions\nbetween the trigger and target items. We evaluate our method on several offline\nand real-world industrial datasets. Experimental results show that our proposed\nDEI2N outperforms state-of-the-art baselines. 
In addition, online A/B testing\ndemonstrates the superiority over the existing baseline in real-world\nproduction environments.\n","authors":["Zhibo Xiao","Luwei Yang","Tao Zhang","Wen Jiang","Wei Ning","Yujiu Yang"],"pdf_url":"https://arxiv.org/pdf/2401.07769v2.pdf","comment":"7 pages, 3 figures, accepted by the 17th ACM International Conference\n on Web Search and Data Mining(WSDM'2024)"},{"id":"http://arxiv.org/abs/2401.08383v2","updated":"2024-01-17T03:37:00Z","published":"2024-01-16T14:16:47Z","title":"Exploiting Inter-Layer Expert Affinity for Accelerating\n Mixture-of-Experts Model Inference","summary":" In large language models like the Generative Pre-trained Transformer, the\nMixture of Experts paradigm has emerged as a powerful technique for enhancing\nmodel expressiveness and accuracy. However, deploying GPT MoE models for\nparallel inference on distributed systems presents significant challenges,\nprimarily due to the extensive Alltoall communication required for expert\nrouting and aggregation. This communication bottleneck exacerbates the already\ncomplex computational landscape, hindering the efficient utilization of\nhigh-performance computing resources. In this paper, we propose a lightweight\noptimization technique called ExFlow, to largely accelerate the inference of\nthese MoE models. We take a new perspective on alleviating the communication\noverhead by exploiting the inter-layer expert affinity. Unlike previous\nmethods, our solution can be directly applied to pre-trained MoE models without\nany fine-tuning or accuracy degradation. By proposing a context-coherent expert\nparallelism on distributed systems, our design only uses one Alltoall\ncommunication to deliver the same functionality while previous methods all\nrequire two Alltoalls. By carefully examining the conditional probability in\ntokens' routing across multiple layers, we proved that pre-trained GPT MoE\nmodels implicitly exhibit a strong inter-layer expert affinity. We then design\nan efficient integer programming model to capture such features and show that\nby properly placing the experts on corresponding GPUs, we can reduce up to 67%\ncross-GPU routing latency. Our solution beats the cutting-edge MoE\nimplementations with experts from 8 to 64, with up to 2.2x improvement in\ninference throughput. We further provide a detailed study of how the model\nimplicitly acquires this expert affinity at the very early training stage and\nhow this affinity evolves and stabilizes during training.\n","authors":["Jinghan Yao","Quentin Anthony","Aamir Shafi","Hari Subramoni","Dhabaleswar K."," Panda"],"pdf_url":"https://arxiv.org/pdf/2401.08383v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08940v1","updated":"2024-01-17T03:26:04Z","published":"2024-01-17T03:26:04Z","title":"CEL: A Continual Learning Model for Disease Outbreak Prediction by\n Leveraging Domain Adaptation via Elastic Weight Consolidation","summary":" Continual learning, the ability of a model to learn over time without\nforgetting previous knowledge and, therefore, be adaptive to new data, is\nparamount in dynamic fields such as disease outbreak prediction. Deep neural\nnetworks, i.e., LSTM, are prone to error due to catastrophic forgetting. This\nstudy introduces a novel CEL model for continual learning by leveraging domain\nadaptation via Elastic Weight Consolidation (EWC). This model aims to mitigate\nthe catastrophic forgetting phenomenon in a domain incremental setting. 
The\nFisher Information Matrix (FIM) is constructed with EWC to develop a\nregularization term that penalizes changes to important parameters, namely, the\nimportant previous knowledge. CEL's performance is evaluated on three distinct\ndiseases, Influenza, Mpox, and Measles, with different metrics. The high\nR-squared values during evaluation and reevaluation outperform the other\nstate-of-the-art models in several contexts, indicating that CEL adapts to\nincremental data well. CEL's robustness and reliability are underscored by its\nminimal 65% forgetting rate and 18% higher memory stability compared to\nexisting benchmark studies. This study highlights CEL's versatility in disease\noutbreak prediction, addressing evolving data with temporal patterns. It offers\na valuable model for proactive disease control with accurate, timely\npredictions.\n","authors":["Saba Aslam","Abdur Rasool","Hongyan Wu","Xiaoli Li"],"pdf_url":"https://arxiv.org/pdf/2401.08940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13649v3","updated":"2024-01-17T03:23:23Z","published":"2023-06-23T17:56:26Z","title":"On-Policy Distillation of Language Models: Learning from Self-Generated\n Mistakes","summary":" Knowledge distillation (KD) is widely used for compressing a teacher model to\nreduce its inference cost and memory footprint, by training a smaller student\nmodel. However, current KD methods for auto-regressive sequence models suffer\nfrom distribution mismatch between output sequences seen during training and\nthose generated by the student during inference. To address this issue, we\nintroduce Generalized Knowledge Distillation (GKD). Instead of solely relying\non a fixed set of output sequences, GKD trains the student on its\nself-generated output sequences by leveraging feedback from the teacher on such\nsequences. Unlike supervised KD approaches, GKD also offers the flexibility to\nemploy alternative loss functions between the student and teacher, which can be\nuseful when the student lacks the expressivity to mimic the teacher's\ndistribution. Furthermore, GKD facilitates the seamless integration of\ndistillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for\ndistilling auto-regressive language models on summarization, translation, and\narithmetic reasoning tasks, and task-agnostic distillation for\ninstruction-tuning.\n","authors":["Rishabh Agarwal","Nino Vieillard","Yongchao Zhou","Piotr Stanczyk","Sabela Ramos","Matthieu Geist","Olivier Bachem"],"pdf_url":"https://arxiv.org/pdf/2306.13649v3.pdf","comment":"Accepted at ICLR 2024. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2210.02612v2","updated":"2024-01-17T03:15:15Z","published":"2022-10-06T00:22:02Z","title":"Lyapunov Function Consistent Adaptive Network Signal Control with Back\n Pressure and Reinforcement Learning","summary":" In traffic signal control, flow-based (optimizing the overall flow) and\npressure-based methods (equalizing and alleviating congestion) are commonly\nused but often considered separately. This study introduces a unified framework\nusing Lyapunov control theory, defining specific Lyapunov functions\nrespectively for these methods. We have found interesting results. For example,\nthe well-recognized back-pressure method is equal to differential queue lengths\nweighted by intersection lane saturation flows. We further improve it by adding\nbasic traffic flow theory. 
Rather than ensuring that the control system be\nstable, the system should be also capable of adaptive to various performance\nmetrics. Building on insights from Lyapunov theory, this study designs a reward\nfunction for the Reinforcement Learning (RL)-based network signal control,\nwhose agent is trained with Double Deep Q-Network (DDQN) for effective control\nover complex traffic networks. The proposed algorithm is compared with several\ntraditional and RL-based methods under pure passenger car flow and heterogenous\ntraffic flow including freight, respectively. The numerical tests demonstrate\nthat the proposed method outperforms the alternative control methods across\ndifferent traffic scenarios, covering corridor and general network situations\neach with varying traffic demands, in terms of the average network vehicle\nwaiting time per vehicle.\n","authors":["Chaolun Ma","Bruce Wang","Zihao Li","Ahmadreza Mahmoudzadeh","Yunlong Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.02612v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08936v1","updated":"2024-01-17T03:14:28Z","published":"2024-01-17T03:14:28Z","title":"DeLF: Designing Learning Environments with Foundation Models","summary":" Reinforcement learning (RL) offers a capable and intuitive structure for the\nfundamental sequential decision-making problem. Despite impressive\nbreakthroughs, it can still be difficult to employ RL in practice in many\nsimple applications. In this paper, we try to address this issue by introducing\na method for designing the components of the RL environment for a given,\nuser-intended application. We provide an initial formalization for the problem\nof RL component design, that concentrates on designing a good representation\nfor observation and action space. We propose a method named DeLF: Designing\nLearning Environments with Foundation Models, that employs large language\nmodels to design and codify the user's intended learning scenario. By testing\nour method on four different learning environments, we demonstrate that DeLF\ncan obtain executable environment codes for the corresponding RL problems.\n","authors":["Aida Afshar","Wenchao Li"],"pdf_url":"https://arxiv.org/pdf/2401.08936v1.pdf","comment":"AAAI 2024 Workshop on Synergy of Reinforcement Learning and Large\n Language Models"},{"id":"http://arxiv.org/abs/2401.08919v1","updated":"2024-01-17T02:04:59Z","published":"2024-01-17T02:04:59Z","title":"Partial Diacritization: A Context-Contrastive Inference Approach","summary":" Diacritization plays a pivotal role in improving readability and\ndisambiguating the meaning of Arabic texts. Efforts have so far focused on\nmarking every eligible character (Full Diacritization). Comparatively\noverlooked, Partial Diacritzation (PD) is the selection of a subset of\ncharacters to be marked to aid comprehension where needed. Research has\nindicated that excessive diacritic marks can hinder skilled readers--reducing\nreading speed and accuracy. We conduct a behavioral experiment and show that\npartially marked text is often easier to read than fully marked text, and\nsometimes easier than plain text. In this light, we introduce\nContext-Contrastive Partial Diacritization (CCPD)--a novel approach to PD which\nintegrates seamlessly with existing Arabic diacritization systems. CCPD\nprocesses each word twice, once with context and once without, and diacritizes\nonly the characters with disparities between the two inferences. 
Further, we\nintroduce novel indicators for measuring partial diacritization quality (SR,\nPDER, HDER, ERE), essential for establishing this as a machine learning task.\nLastly, we introduce TD2, a Transformer-variant of an established model which\noffers a markedly different per formance profile on our proposed indicators\ncompared to all other known systems.\n","authors":["Muhammad ElNokrashy","Badr AlKhamissi"],"pdf_url":"https://arxiv.org/pdf/2401.08919v1.pdf","comment":"13 equations, 5 tables, 5 figures"},{"id":"http://arxiv.org/abs/2309.04001v3","updated":"2024-01-17T01:47:40Z","published":"2023-09-07T20:07:57Z","title":"MMSFormer: Multimodal Transformer for Material and Semantic Segmentation","summary":" Leveraging information across diverse modalities is known to enhance\nperformance on multimodal segmentation tasks. However, effectively fusing\ninformation from different modalities remains challenging due to the unique\ncharacteristics of each modality. In this paper, we propose a novel fusion\nstrategy that can effectively fuse information from different modality\ncombinations. We also propose a new model named Multi-Modal Segmentation\nTransFormer (MMSFormer) that incorporates the proposed fusion strategy to\nperform multimodal material and semantic segmentation tasks. MMSFormer\noutperforms current state-of-the-art models on three different datasets. As we\nbegin with only one input modality, performance improves progressively as\nadditional modalities are incorporated, showcasing the effectiveness of the\nfusion block in combining useful information from diverse input modalities.\nAblation studies show that different modules in the fusion block are crucial\nfor overall model performance. Furthermore, our ablation studies also highlight\nthe capacity of different input modalities to improve performance in the\nidentification of different types of materials. The code and pretrained models\nwill be made available at https://github.com/csiplab/MMSFormer.\n","authors":["Md Kaykobad Reza","Ashley Prater-Bennette","M. Salman Asif"],"pdf_url":"https://arxiv.org/pdf/2309.04001v3.pdf","comment":"14 pages, 3 figures, 8 tables"},{"id":"http://arxiv.org/abs/2308.11863v2","updated":"2024-01-17T01:45:27Z","published":"2023-08-23T01:44:28Z","title":"KinSPEAK: Improving speech recognition for Kinyarwanda via\n semi-supervised learning methods","summary":" Despite recent availability of large transcribed Kinyarwanda speech data,\nachieving robust speech recognition for Kinyarwanda is still challenging. In\nthis work, we show that using self-supervised pre-training, following a simple\ncurriculum schedule during fine-tuning and using semi-supervised learning to\nleverage large unlabelled speech data significantly improve speech recognition\nperformance for Kinyarwanda. Our approach focuses on using public domain data\nonly. A new studio-quality speech dataset is collected from a public website,\nthen used to train a clean baseline model. The clean baseline model is then\nused to rank examples from a more diverse and noisy public dataset, defining a\nsimple curriculum training schedule. Finally, we apply semi-supervised learning\nto label and learn from large unlabelled data in four successive generations.\nOur final model achieves 3.2% word error rate (WER) on the new dataset and\n15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the\nbest of our knowledge. 
Our experiments also indicate that using syllabic rather\nthan character-based tokenization results in better speech recognition\nperformance for Kinyarwanda.\n","authors":["Antoine Nzeyimana"],"pdf_url":"https://arxiv.org/pdf/2308.11863v2.pdf","comment":"9 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2401.08909v1","updated":"2024-01-17T01:33:23Z","published":"2024-01-17T01:33:23Z","title":"Characterising Gradients for Unsupervised Accuracy Estimation under\n Distribution Shift","summary":" Estimating test accuracy without access to the ground-truth test labels under\nvarying test environments is a challenging, yet extremely important problem in\nthe safe deployment of machine learning algorithms. Existing works rely on the\ninformation from either the outputs or the extracted features of neural\nnetworks to formulate an estimation score correlating with the ground-truth\ntest accuracy. In this paper, we investigate--both empirically and\ntheoretically--how the information provided by the gradients can be predictive\nof the ground-truth test accuracy even under a distribution shift.\nSpecifically, we use the norm of classification-layer gradients, backpropagated\nfrom the cross-entropy loss after only one gradient step over test data. Our\nkey idea is that the model should be adjusted with a higher magnitude of\ngradients when it does not generalize to the test dataset with a distribution\nshift. We provide theoretical insights highlighting the main ingredients of\nsuch an approach ensuring its empirical success. Extensive experiments\nconducted on diverse distribution shifts and model structures demonstrate that\nour method significantly outperforms state-of-the-art algorithms.\n","authors":["Renchunzi Xie","Ambroise Odonnat","Vasilii Feofanov","Ievgen Redko","Jianfeng Zhang","Bo An"],"pdf_url":"https://arxiv.org/pdf/2401.08909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08908v1","updated":"2024-01-17T01:32:45Z","published":"2024-01-17T01:32:45Z","title":"Herding LLaMaS: Using LLMs as an OS Module","summary":" Computer systems are becoming increasingly heterogeneous with the emergence\nof new memory technologies and compute devices. GPUs alongside CPUs have become\ncommonplace and CXL is poised to be a mainstay of cloud systems. The operating\nsystem is responsible for managing these hardware resources, requiring\nmodification every time a new device is released. Years of research and\ndevelopment are sunk into tuning the OS for high performance with each new\nheterogeneous device. With the recent explosion in memory technologies and\ndomain-specific accelerators, it would be beneficial to have an OS that could\nprovide high performance for new devices without significant effort.\n We propose LLaMaS which can adapt to new devices easily. LLaMaS uses Large\nLanguage Models (LLMs) to extract the useful features of new devices from their\ntextual description and uses these features to make operating system decisions\nat runtime. 
Adding support to LLaMaS for a new device is as simple as\ndescribing the system and new device properties in plaintext.\n LLaMaS reduces the burden on system administrators to enable easy integration\nof new devices into production systems.\n Preliminary evaluation using ChatGPT shows that LLMs are capable of\nextracting device features from text and make correct OS decisions based on\nthose features.\n","authors":["Aditya K Kamath","Sujay Yadalam"],"pdf_url":"https://arxiv.org/pdf/2401.08908v1.pdf","comment":"ASPLOS 2023, Wild and Crazy Ideas session"},{"id":"http://arxiv.org/abs/2401.08903v1","updated":"2024-01-17T01:10:17Z","published":"2024-01-17T01:10:17Z","title":"PPR: Enhancing Dodging Attacks while Maintaining Impersonation Attacks\n on Face Recognition Systems","summary":" Adversarial Attacks on Face Recognition (FR) encompass two types:\nimpersonation attacks and evasion attacks. We observe that achieving a\nsuccessful impersonation attack on FR does not necessarily ensure a successful\ndodging attack on FR in the black-box setting. Introducing a novel attack\nmethod named Pre-training Pruning Restoration Attack (PPR), we aim to enhance\nthe performance of dodging attacks whilst avoiding the degradation of\nimpersonation attacks. Our method employs adversarial example pruning, enabling\na portion of adversarial perturbations to be set to zero, while tending to\nmaintain the attack performance. By utilizing adversarial example pruning, we\ncan prune the pre-trained adversarial examples and selectively free up certain\nadversarial perturbations. Thereafter, we embed adversarial perturbations in\nthe pruned area, which enhances the dodging performance of the adversarial face\nexamples. The effectiveness of our proposed attack method is demonstrated\nthrough our experimental results, showcasing its superior performance.\n","authors":["Fengfan Zhou","Heifei Ling"],"pdf_url":"https://arxiv.org/pdf/2401.08903v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08902v1","updated":"2024-01-17T01:06:22Z","published":"2024-01-17T01:06:22Z","title":"Similar but Faster: Manipulation of Tempo in Music Audio Embeddings for\n Tempo Prediction and Search","summary":" Audio embeddings enable large scale comparisons of the similarity of audio\nfiles for applications such as search and recommendation. Due to the\nsubjectivity of audio similarity, it can be desirable to design systems that\nanswer not only whether audio is similar, but similar in what way (e.g., wrt.\ntempo, mood or genre). Previous works have proposed disentangled embedding\nspaces where subspaces representing specific, yet possibly correlated,\nattributes can be weighted to emphasize those attributes in downstream tasks.\nHowever, no research has been conducted into the independence of these\nsubspaces, nor their manipulation, in order to retrieve tracks that are similar\nbut different in a specific way. Here, we explore the manipulation of tempo in\nembedding spaces as a case-study towards this goal. We propose tempo\ntranslation functions that allow for efficient manipulation of tempo within a\npre-existing embedding space whilst maintaining other properties such as genre.\nAs this translation is specific to tempo it enables retrieval of tracks that\nare similar but have specifically different tempi. 
We show that such a function\ncan be used as an efficient data augmentation strategy for both training of\ndownstream tempo predictors, and improved nearest neighbor retrieval of\nproperties largely independent of tempo.\n","authors":["Matthew C. McCallum","Florian Henkel","Jaehun Kim","Samuel E. Sandberg","Matthew E. P. Davies"],"pdf_url":"https://arxiv.org/pdf/2401.08902v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.01783v3","updated":"2024-01-17T01:02:06Z","published":"2024-01-03T15:16:25Z","title":"Approximating Numerical Fluxes Using Fourier Neural Operators for\n Hyperbolic Conservation Laws","summary":" Traditionally, classical numerical schemes have been employed to solve\npartial differential equations (PDEs) using computational methods. Recently,\nneural network-based methods have emerged. Despite these advancements, neural\nnetwork-based methods, such as physics-informed neural networks (PINNs) and\nneural operators, exhibit deficiencies in robustness and generalization. To\naddress these issues, numerous studies have integrated classical numerical\nframeworks with machine learning techniques, incorporating neural networks into\nparts of traditional numerical methods. In this study, we focus on hyperbolic\nconservation laws by replacing traditional numerical fluxes with neural\noperators. To this end, we developed loss functions inspired by established\nnumerical schemes related to conservation laws and approximated numerical\nfluxes using Fourier neural operators (FNOs). Our experiments demonstrated that\nour approach combines the strengths of both traditional numerical schemes and\nFNOs, outperforming standard FNO methods in several respects. For instance, we\ndemonstrate that our method is robust, has resolution invariance, and is\nfeasible as a data-driven method. In particular, our method can make continuous\npredictions over time and exhibits superior generalization capabilities with\nout-of-distribution (OOD) samples, which are challenges that existing neural\noperator methods encounter.\n","authors":["Taeyoung Kim","Myungjoo Kang"],"pdf_url":"https://arxiv.org/pdf/2401.01783v3.pdf","comment":"26 pages, 28 figures"},{"id":"http://arxiv.org/abs/2401.08898v1","updated":"2024-01-17T00:47:43Z","published":"2024-01-17T00:47:43Z","title":"Bridging State and History Representations: Understanding\n Self-Predictive RL","summary":" Representations are at the core of all deep reinforcement learning (RL)\nmethods for both Markov decision processes (MDPs) and partially observable\nMarkov decision processes (POMDPs). Many representation learning methods and\ntheoretical frameworks have been developed to understand what constitutes an\neffective representation. However, the relationships between these methods and\nthe shared properties among them remain unclear. In this paper, we show that\nmany of these seemingly distinct methods and frameworks for state and history\nabstractions are, in fact, based on a common idea of self-predictive\nabstraction. Furthermore, we provide theoretical insights into the widely\nadopted objectives and optimization, such as the stop-gradient technique, in\nlearning self-predictive representations. These findings together yield a\nminimalist algorithm to learn self-predictive representations for states and\nhistories. We validate our theories by applying our algorithm to standard MDPs,\nMDPs with distractors, and POMDPs with sparse rewards. 
These findings culminate\nin a set of practical guidelines for RL practitioners.\n","authors":["Tianwei Ni","Benjamin Eysenbach","Erfan Seyedsalehi","Michel Ma","Clement Gehring","Aditya Mahajan","Pierre-Luc Bacon"],"pdf_url":"https://arxiv.org/pdf/2401.08898v1.pdf","comment":"ICLR 2024 (Poster). Code is available at\n https://github.com/twni2016/self-predictive-rl"},{"id":"http://arxiv.org/abs/2401.08897v1","updated":"2024-01-17T00:46:24Z","published":"2024-01-17T00:46:24Z","title":"CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in\n Variational AutoEncoder","summary":" Symmetries of input and latent vectors have provided valuable insights for\ndisentanglement learning in VAEs.However, only a few works were proposed as an\nunsupervised method, and even these works require known factor information in\ntraining data. We propose a novel method, Composite Factor-Aligned Symmetry\nLearning (CFASL), which is integrated into VAEs for learning symmetry-based\ndisentanglement in unsupervised learning without any knowledge of the dataset\nfactor information.CFASL incorporates three novel features for learning\nsymmetry-based disentanglement: 1) Injecting inductive bias to align latent\nvector dimensions to factor-aligned symmetries within an explicit learnable\nsymmetry codebook 2) Learning a composite symmetry to express unknown factors\nchange between two random samples by learning factor-aligned symmetries within\nthe codebook 3) Inducing group equivariant encoder and decoder in training VAEs\nwith the two conditions. In addition, we propose an extended evaluation metric\nfor multi-factor changes in comparison to disentanglement evaluation in VAEs.\nIn quantitative and in-depth qualitative analysis, CFASL demonstrates a\nsignificant improvement of disentanglement in single-factor change, and\nmulti-factor change conditions compared to state-of-the-art methods.\n","authors":["Hee-Jun Jung","Jaehyoung Jeong","Kangil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08897v1.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2401.08895v1","updated":"2024-01-17T00:36:58Z","published":"2024-01-17T00:36:58Z","title":"cedar: Composable and Optimized Machine Learning Input Data Pipelines","summary":" The input data pipeline is an essential component of each machine learning\n(ML) training job. It is responsible for reading massive amounts of training\ndata, processing batches of samples using complex of transformations, and\nloading them onto training nodes at low latency and high throughput. Performant\ninput data systems are becoming increasingly critical, driven by skyrocketing\ndata volumes and training throughput demands. Unfortunately, current input data\nsystems cannot fully leverage key performance optimizations, resulting in\nhugely inefficient infrastructures that require significant resources -- or\nworse -- underutilize expensive accelerators.\n To address these demands, we present cedar, a programming model and framework\nthat allows users to easily build, optimize, and execute input data pipelines.\ncedar presents an easy-to-use programming interface, allowing users to define\ninput data pipelines using composable operators that support arbitrary ML\nframeworks and libraries. Meanwhile, cedar transparently applies a complex and\nextensible set of optimization techniques (e.g., offloading, caching,\nprefetching, fusion, and reordering). 
It then orchestrates processing across a\ncustomizable set of local and distributed compute resources in order to\nmaximize processing performance and efficiency, all without user input. On\naverage across six diverse input data pipelines, cedar achieves a 2.49x, 1.87x,\n2.18x, and 2.74x higher performance compared to tf.data, tf.data service, Ray\nData, and PyTorch's DataLoader, respectively.\n","authors":["Mark Zhao","Emanuel Adamiak","Christos Kozyrakis"],"pdf_url":"https://arxiv.org/pdf/2401.08895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.07996v3","updated":"2024-01-17T00:36:29Z","published":"2022-10-14T17:52:19Z","title":"Degeneracy is OK: Logarithmic Regret for Network Revenue Management with\n Indiscrete Distributions","summary":" We study the classical Network Revenue Management (NRM) problem with\naccept/reject decisions and $T$ IID arrivals. We consider a distributional form\nwhere each arrival must fall under a finite number of possible categories, each\nwith a deterministic resource consumption vector, but a random value\ndistributed continuously over an interval. We develop an online algorithm that\nachieves $O(\\log^2 T)$ regret under this model, with the only (necessary)\nassumption being that the probability densities are bounded away from 0. We\nderive a second result that achieves $O(\\log T)$ regret under an additional\nassumption of second-order growth. To our knowledge, these are the first\nresults achieving logarithmic-level regret in an NRM model with continuous\nvalues that do not require any kind of ``non-degeneracy'' assumptions. Our\nresults are achieved via new techniques including a new method of bounding\nmyopic regret, a ``semi-fluid'' relaxation of the offline allocation, and an\nimproved bound on the ``dual convergence''.\n","authors":["Jiashuo Jiang","Will Ma","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.07996v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08893v1","updated":"2024-01-17T00:16:46Z","published":"2024-01-17T00:16:46Z","title":"MADA: Meta-Adaptive Optimizers through hyper-gradient Descent","summary":" Since Adam was introduced, several novel adaptive optimizers for deep\nlearning have been proposed. These optimizers typically excel in some tasks but\nmay not outperform Adam uniformly across all tasks. In this work, we introduce\nMeta-Adaptive Optimizers (MADA), a unified optimizer framework that can\ngeneralize several known optimizers and dynamically learn the most suitable one\nduring training. The key idea in MADA is to parameterize the space of\noptimizers and search through it using hyper-gradient descent. Numerical\nresults suggest that MADA is robust against sub-optimally tuned\nhyper-parameters, and outperforms Adam, Lion, and Adan with their default\nhyper-parameters, often even with optimized hyper-parameters. We also propose\nAVGrad, a variant of AMSGrad where the maximum operator is replaced with\naveraging, and observe that it performs better within MADA. 
Finally, we provide\na convergence analysis to show that interpolation of optimizers (specifically,\nAVGrad and Adam) can improve their error bounds (up to constants), hinting at\nan advantage for meta-optimizers.\n","authors":["Kaan Ozkara","Can Karakus","Parameswaran Raman","Mingyi Hong","Shoham Sabach","Branislav Kveton","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2401.08893v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08891v1","updated":"2024-01-17T00:15:16Z","published":"2024-01-17T00:15:16Z","title":"Tempo estimation as fully self-supervised binary classification","summary":" This paper addresses the problem of global tempo estimation in musical audio.\nGiven that annotating tempo is time-consuming and requires certain musical\nexpertise, few publicly available data sources exist to train machine learning\nmodels for this task. Towards alleviating this issue, we propose a fully\nself-supervised approach that does not rely on any human labeled data. Our\nmethod builds on the fact that generic (music) audio embeddings already encode\na variety of properties, including information about tempo, making them easily\nadaptable for downstream tasks. While recent work in self-supervised tempo\nestimation aimed to learn a tempo specific representation that was subsequently\nused to train a supervised classifier, we reformulate the task into the binary\nclassification problem of predicting whether a target track has the same or a\ndifferent tempo compared to a reference. While the former still requires\nlabeled training data for the final classification model, our approach uses\narbitrary unlabeled music data in combination with time-stretching for model\ntraining as well as a small set of synthetically created reference samples for\npredicting the final tempo. Evaluation of our approach in comparison with the\nstate-of-the-art reveals highly competitive performance when the constraint of\nfinding the precise tempo octave is relaxed.\n","authors":["Florian Henkel","Jaehun Kim","Matthew C. McCallum","Samuel E. Sandberg","Matthew E. P. Davies"],"pdf_url":"https://arxiv.org/pdf/2401.08891v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.08889v1","updated":"2024-01-17T00:12:13Z","published":"2024-01-17T00:12:13Z","title":"On the Effect of Data-Augmentation on Local Embedding Properties in the\n Contrastive Learning of Music Audio Representations","summary":" Audio embeddings are crucial tools in understanding large catalogs of music.\nTypically embeddings are evaluated on the basis of the performance they provide\nin a wide range of downstream tasks, however few studies have investigated the\nlocal properties of the embedding spaces themselves which are important in\nnearest neighbor algorithms, commonly used in music search and recommendation.\nIn this work we show that when learning audio representations on music datasets\nvia contrastive learning, musical properties that are typically homogeneous\nwithin a track (e.g., key and tempo) are reflected in the locality of\nneighborhoods in the resulting embedding space. By applying appropriate data\naugmentation strategies, localisation of such properties can not only be\nreduced but the localisation of other attributes is increased. 
For example,\nlocality of features such as pitch and tempo that are less relevant to\nnon-expert listeners, may be mitigated while improving the locality of more\nsalient features such as genre and mood, achieving state-of-the-art performance\nin nearest neighbor retrieval accuracy. Similarly, we show that the optimal\nselection of data augmentation strategies for contrastive learning of music\naudio embeddings is dependent on the downstream task, highlighting this as an\nimportant embedding design decision.\n","authors":["Matthew C. McCallum","Matthew E. P. Davies","Florian Henkel","Jaehun Kim","Samuel E. Sandberg"],"pdf_url":"https://arxiv.org/pdf/2401.08889v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2310.06234v2","updated":"2024-01-17T00:03:00Z","published":"2023-10-10T01:04:15Z","title":"Efficient Adaptation of Large Vision Transformer via Adapter\n Re-Composing","summary":" The advent of high-capacity pre-trained models has revolutionized\nproblem-solving in computer vision, shifting the focus from training\ntask-specific models to adapting pre-trained models. Consequently, effectively\nadapting large pre-trained models to downstream tasks in an efficient manner\nhas become a prominent research area. Existing solutions primarily concentrate\non designing lightweight adapters and their interaction with pre-trained\nmodels, with the goal of minimizing the number of parameters requiring updates.\nIn this study, we propose a novel Adapter Re-Composing (ARC) strategy that\naddresses efficient pre-trained model adaptation from a fresh perspective. Our\napproach considers the reusability of adaptation parameters and introduces a\nparameter-sharing scheme. Specifically, we leverage symmetric\ndown-/up-projections to construct bottleneck operations, which are shared\nacross layers. By learning low-dimensional re-scaling coefficients, we can\neffectively re-compose layer-adaptive adapters. This parameter-sharing strategy\nin adapter design allows us to significantly reduce the number of new\nparameters while maintaining satisfactory performance, thereby offering a\npromising approach to compress the adaptation cost. We conduct experiments on\n24 downstream image classification tasks using various Vision Transformer\nvariants to evaluate our method. The results demonstrate that our approach\nachieves compelling transfer learning performance with a reduced parameter\ncount. Our code is available at\n\\href{https://github.com/DavidYanAnDe/ARC}{https://github.com/DavidYanAnDe/ARC}.\n","authors":["Wei Dong","Dawei Yan","Zhijun Lin","Peng Wang"],"pdf_url":"https://arxiv.org/pdf/2310.06234v2.pdf","comment":"Paper is accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.09651v1","updated":"2024-01-17T23:45:53Z","published":"2024-01-17T23:45:53Z","title":"Convex and Bilevel Optimization for Neuro-Symbolic Inference and\n Learning","summary":" We address a key challenge for neuro-symbolic (NeSy) systems by leveraging\nconvex and bilevel optimization techniques to develop a general gradient-based\nframework for end-to-end neural and symbolic parameter learning. The\napplicability of our framework is demonstrated with NeuPSL, a state-of-the-art\nNeSy architecture. To achieve this, we propose a smooth primal and dual\nformulation of NeuPSL inference and show learning gradients are functions of\nthe optimal dual variables. 
Additionally, we develop a dual block coordinate\ndescent algorithm for the new formulation that naturally exploits warm-starts.\nThis leads to over 100x learning runtime improvements over the current best\nNeuPSL inference method. Finally, we provide extensive empirical evaluations\nacross $8$ datasets covering a range of tasks and demonstrate our learning\nframework achieves up to a 16% point prediction performance improvement over\nalternative learning methods.\n","authors":["Charles Dickens","Changyu Gao","Connor Pryor","Stephen Wright","Lise Getoor"],"pdf_url":"https://arxiv.org/pdf/2401.09651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09646v1","updated":"2024-01-17T23:29:46Z","published":"2024-01-17T23:29:46Z","title":"ClimateGPT: Towards AI Synthesizing Interdisciplinary Research on\n Climate Change","summary":" This paper introduces ClimateGPT, a model family of domain-specific large\nlanguage models that synthesize interdisciplinary research on climate change.\nWe trained two 7B models from scratch on a science-oriented dataset of 300B\ntokens. For the first model, the 4.2B domain-specific tokens were included\nduring pre-training and the second was adapted to the climate domain after\npre-training. Additionally, ClimateGPT-7B, 13B and 70B are continuously\npre-trained from Llama~2 on a domain-specific dataset of 4.2B tokens. Each\nmodel is instruction fine-tuned on a high-quality and human-generated\ndomain-specific dataset that has been created in close cooperation with climate\nscientists. To reduce the number of hallucinations, we optimize the model for\nretrieval augmentation and propose a hierarchical retrieval strategy. To\nincrease the accessibility of our model to non-English speakers, we propose to\nmake use of cascaded machine translation and show that this approach can\nperform comparably to natively multilingual models while being easier to scale\nto a large number of languages. Further, to address the intrinsic\ninterdisciplinary aspect of climate change we consider different research\nperspectives. Therefore, the model can produce in-depth answers focusing on\ndifferent perspectives in addition to an overall answer. We propose a suite of\nautomatic climate-specific benchmarks to evaluate LLMs. On these benchmarks,\nClimateGPT-7B performs on par with the ten times larger Llama-2-70B Chat model\nwhile not degrading results on general domain benchmarks. Our human evaluation\nconfirms the trends we saw in our benchmarks. All models were trained and\nevaluated using renewable energy and are released publicly.\n","authors":["David Thulke","Yingbo Gao","Petrus Pelser","Rein Brune","Rricha Jalota","Floris Fok","Michael Ramos","Ian van Wyk","Abdallah Nasir","Hayden Goldstein","Taylor Tragemann","Katie Nguyen","Ariana Fowler","Andrew Stanco","Jon Gabriel","Jordan Taylor","Dean Moro","Evgenii Tsymbalov","Juliette de Waal","Evgeny Matusov","Mudar Yaghi","Mohammad Shihadah","Hermann Ney","Christian Dugast","Jonathan Dotan","Daniel Erasmus"],"pdf_url":"https://arxiv.org/pdf/2401.09646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09641v1","updated":"2024-01-17T23:27:48Z","published":"2024-01-17T23:27:48Z","title":"Functional Linear Non-Gaussian Acyclic Model for Causal Discovery","summary":" In causal discovery, non-Gaussianity has been used to characterize the\ncomplete configuration of a Linear Non-Gaussian Acyclic Model (LiNGAM),\nencompassing both the causal ordering of variables and their respective\nconnection strengths. 
However, LiNGAM can only deal with the finite-dimensional\ncase. To expand this concept, we extend the notion of variables to encompass\nvectors and even functions, leading to the Functional Linear Non-Gaussian\nAcyclic Model (Func-LiNGAM). Our motivation stems from the desire to identify\ncausal relationships in brain-effective connectivity tasks involving, for\nexample, fMRI and EEG datasets. We demonstrate why the original LiNGAM fails to\nhandle these inherently infinite-dimensional datasets and explain the\navailability of functional data analysis from both empirical and theoretical\nperspectives. {We establish theoretical guarantees of the identifiability of\nthe causal relationship among non-Gaussian random vectors and even random\nfunctions in infinite-dimensional Hilbert spaces.} To address the issue of\nsparsity in discrete time points within intrinsic infinite-dimensional\nfunctional data, we propose optimizing the coordinates of the vectors using\nfunctional principal component analysis. Experimental results on synthetic data\nverify the ability of the proposed framework to identify causal relationships\namong multivariate functions using the observed samples. For real data, we\nfocus on analyzing the brain connectivity patterns derived from fMRI data.\n","authors":["Tian-Le Yang","Kuang-Yao Lee","Kun Zhang","Joe Suzuki"],"pdf_url":"https://arxiv.org/pdf/2401.09641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05081v2","updated":"2024-01-17T23:21:31Z","published":"2023-11-09T00:46:31Z","title":"Generalized test utilities for long-tail performance in extreme\n multi-label classification","summary":" Extreme multi-label classification (XMLC) is the task of selecting a small\nsubset of relevant labels from a very large set of possible labels. As such, it\nis characterized by long-tail labels, i.e., most labels have very few positive\ninstances. With standard performance measures such as precision@k, a classifier\ncan ignore tail labels and still report good performance. However, it is often\nargued that correct predictions in the tail are more \"interesting\" or\n\"rewarding,\" but the community has not yet settled on a metric capturing this\nintuitive concept. The existing propensity-scored metrics fall short on this\ngoal by confounding the problems of long-tail and missing labels. In this\npaper, we analyze generalized metrics budgeted \"at k\" as an alternative\nsolution. To tackle the challenging problem of optimizing these metrics, we\nformulate it in the expected test utility (ETU) framework, which aims to\noptimize the expected performance on a fixed test set. We derive optimal\nprediction rules and construct computationally efficient approximations with\nprovable regret guarantees and robustness against model misspecification. 
Our\nalgorithm, based on block coordinate ascent, scales effortlessly to XMLC\nproblems and obtains promising results in terms of long-tail performance.\n","authors":["Erik Schultheis","Marek Wydmuch","Wojciech Kotłowski","Rohit Babbar","Krzysztof Dembczyński"],"pdf_url":"https://arxiv.org/pdf/2311.05081v2.pdf","comment":"This is the authors' version of the work accepted to NeurIPS 2023;\n the final version of the paper, errors and typos corrected, and minor\n modifications to improve clarity"},{"id":"http://arxiv.org/abs/2305.04099v2","updated":"2024-01-17T23:18:50Z","published":"2023-05-06T17:04:02Z","title":"Symbolic Regression on FPGAs for Fast Machine Learning Inference","summary":" The high-energy physics community is investigating the potential of deploying\nmachine-learning-based solutions on Field-Programmable Gate Arrays (FPGAs) to\nenhance physics sensitivity while still meeting data processing time\nconstraints. In this contribution, we introduce a novel end-to-end procedure\nthat utilizes a machine learning technique called symbolic regression (SR). It\nsearches the equation space to discover algebraic relations approximating a\ndataset. We use PySR (a software to uncover these expressions based on an\nevolutionary algorithm) and extend the functionality of hls4ml (a package for\nmachine learning inference in FPGAs) to support PySR-generated expressions for\nresource-constrained production environments. Deep learning models often\noptimize the top metric by pinning the network size because the vast\nhyperparameter space prevents an extensive search for neural architecture.\nConversely, SR selects a set of models on the Pareto front, which allows for\noptimizing the performance-resource trade-off directly. By embedding symbolic\nforms, our implementation can dramatically reduce the computational resources\nneeded to perform critical tasks. We validate our method on a physics\nbenchmark: the multiclass classification of jets produced in simulated\nproton-proton collisions at the CERN Large Hadron Collider. We show that our\napproach can approximate a 3-layer neural network using an inference model that\nachieves up to a 13-fold decrease in execution time, down to 5 ns, while still\npreserving more than 90% approximation accuracy.\n","authors":["Ho Fung Tsoi","Adrian Alan Pol","Vladimir Loncar","Ekaterina Govorkova","Miles Cranmer","Sridhara Dasu","Peter Elmer","Philip Harris","Isobel Ojalvo","Maurizio Pierini"],"pdf_url":"https://arxiv.org/pdf/2305.04099v2.pdf","comment":"9 pages. Accepted to 26th International Conference on Computing in\n High Energy & Nuclear Physics (CHEP 2023)"},{"id":"http://arxiv.org/abs/2401.09638v1","updated":"2024-01-17T23:17:08Z","published":"2024-01-17T23:17:08Z","title":"Automatic 3D Multi-modal Ultrasound Segmentation of Human Placenta using\n Fusion Strategies and Deep Learning","summary":" Purpose: Ultrasound is the most commonly used medical imaging modality for\ndiagnosis and screening in clinical practice. Due to its safety profile,\nnoninvasive nature and portability, ultrasound is the primary imaging modality\nfor fetal assessment in pregnancy. Current ultrasound processing methods are\neither manual or semi-automatic and are therefore laborious, time-consuming and\nprone to errors, and automation would go a long way in addressing these\nchallenges. 
Automated identification of placental changes at earlier gestation\ncould facilitate potential therapies for conditions such as fetal growth\nrestriction and pre-eclampsia that are currently detected only at late\ngestational age, potentially preventing perinatal morbidity and mortality.\n Methods: We propose an automatic three-dimensional multi-modal (B-mode and\npower Doppler) ultrasound segmentation of the human placenta using deep\nlearning combined with different fusion strategies.We collected data containing\nBmode and power Doppler ultrasound scans for 400 studies.\n Results: We evaluated different fusion strategies and state-of-the-art image\nsegmentation networks for placenta segmentation based on standard overlap- and\nboundary-based metrics. We found that multimodal information in the form of\nB-mode and power Doppler scans outperform any single modality. Furthermore, we\nfound that B-mode and power Doppler input scans fused at the data level provide\nthe best results with a mean Dice Similarity Coefficient (DSC) of 0.849.\n Conclusion: We conclude that the multi-modal approach of combining B-mode and\npower Doppler scans is effective in segmenting the placenta from 3D ultrasound\nscans in a fully automated manner and is robust to quality variation of the\ndatasets.\n","authors":["Sonit Singh","Gordon Stevenson","Brendan Mein","Alec Welsh","Arcot Sowmya"],"pdf_url":"https://arxiv.org/pdf/2401.09638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08744v2","updated":"2024-01-17T23:03:05Z","published":"2023-10-12T22:12:28Z","title":"Circuit Component Reuse Across Tasks in Transformer Language Models","summary":" Recent work in mechanistic interpretability has shown that behaviors in\nlanguage models can be successfully reverse-engineered through circuit\nanalysis. A common criticism, however, is that each circuit is task-specific,\nand thus such analysis cannot contribute to understanding the models at a\nhigher level. In this work, we present evidence that insights (both low-level\nfindings about specific heads and higher-level findings about general\nalgorithms) can indeed generalize across tasks. Specifically, we study the\ncircuit discovered in Wang et al. (2022) for the Indirect Object Identification\n(IOI) task and 1.) show that it reproduces on a larger GPT2 model, and 2.) that\nit is mostly reused to solve a seemingly different task: Colored Objects\n(Ippolito & Callison-Burch, 2023). We provide evidence that the process\nunderlying both tasks is functionally very similar, and contains about a 78%\noverlap in in-circuit attention heads. We further present a proof-of-concept\nintervention experiment, in which we adjust four attention heads in middle\nlayers in order to 'repair' the Colored Objects circuit and make it behave like\nthe IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the\nColored Objects task and explain most sources of error. The intervention\naffects downstream attention heads in specific ways predicted by their\ninteractions in the IOI circuit, indicating that this subcircuit behavior is\ninvariant to the different task inputs. 
Overall, our results provide evidence\nthat it may yet be possible to explain large language models' behavior in terms\nof a relatively small number of interpretable task-general algorithmic building\nblocks and computational components.\n","authors":["Jack Merullo","Carsten Eickhoff","Ellie Pavlick"],"pdf_url":"https://arxiv.org/pdf/2310.08744v2.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2208.13125v3","updated":"2024-01-17T22:55:00Z","published":"2022-08-28T02:52:10Z","title":"Normality-Guided Distributional Reinforcement Learning for Continuous\n Control","summary":" Learning a predictive model of the mean return, or value function, plays a\ncritical role in many reinforcement learning algorithms. Distributional\nreinforcement learning (DRL) has been shown to improve performance by modeling\nthe value distribution, not just the mean. We study the value distribution in\nseveral continuous control tasks and find that the learned value distribution\nis empirical quite close to normal. We design a method that exploits this\nproperty, employ variances predicted from a variance network, along with\nreturns, to analytically compute target quantile bars representing a normal for\nour distributional value function. In addition, we propose a policy update\nstrategy based on the correctness as measured by structural characteristics of\nthe value distribution not present in the standard value function. The approach\nwe outline is compatible with many DRL structures. We use two representative\non-policy algorithms, PPO and TRPO, as testbeds. Our method yields\nstatistically significant improvements in 10 out of 16 continuous task\nsettings, while utilizing a reduced number of weights and achieving faster\ntraining time compared to an ensemble-based method for quantifying value\ndistribution uncertainty.\n","authors":["Ju-Seung Byun","Andrew Perrault"],"pdf_url":"https://arxiv.org/pdf/2208.13125v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09631v1","updated":"2024-01-17T22:46:51Z","published":"2024-01-17T22:46:51Z","title":"Physics-Informed Calibration of Aeromagnetic Compensation in Magnetic\n Navigation Systems using Liquid Time-Constant Networks","summary":" Magnetic navigation (MagNav) is a rising alternative to the Global\nPositioning System (GPS) and has proven useful for aircraft navigation.\nTraditional aircraft navigation systems, while effective, face limitations in\nprecision and reliability in certain environments and against attacks. Airborne\nMagNav leverages the Earth's magnetic field to provide accurate positional\ninformation. However, external magnetic fields induced by aircraft electronics\nand Earth's large-scale magnetic fields disrupt the weaker signal of interest.\nWe introduce a physics-informed approach using Tolles-Lawson coefficients for\ncompensation and Liquid Time-Constant Networks (LTCs) to remove complex, noisy\nsignals derived from the aircraft's magnetic sources. Using real flight data\nwith magnetometer measurements and aircraft measurements, we observe up to a\n64% reduction in aeromagnetic compensation error (RMSE nT), outperforming\nconventional models. 
This significant improvement underscores the potential of\na physics-informed, machine learning approach for extracting clean, reliable,\nand accurate magnetic signals for MagNav positional estimation.\n","authors":["Favour Nerrise","Andrew Sosa Sosanya","Patrick Neary"],"pdf_url":"https://arxiv.org/pdf/2401.09631v1.pdf","comment":"Accepted at the NeurIPS 2023 Machine Learning and the Physical\n Sciences workshop, 7 pages, 4 figures, see code here:\n https://github.com/fnerrise/LNNs_MagNav/"},{"id":"http://arxiv.org/abs/2401.09629v1","updated":"2024-01-17T22:43:00Z","published":"2024-01-17T22:43:00Z","title":"Multiple Locally Linear Kernel Machines","summary":" In this paper we propose a new non-linear classifier based on a combination\nof locally linear classifiers. A well known optimization formulation is given\nas we cast the problem in a $\\ell_1$ Multiple Kernel Learning (MKL) problem\nusing many locally linear kernels. Since the number of such kernels is huge, we\nprovide a scalable generic MKL training algorithm handling streaming kernels.\nWith respect to the inference time, the resulting classifier fits the gap\nbetween high accuracy but slow non-linear classifiers (such as classical MKL)\nand fast but low accuracy linear classifiers.\n","authors":["David Picard"],"pdf_url":"https://arxiv.org/pdf/2401.09629v1.pdf","comment":"This paper was written in 2014 and was originally submitted but\n rejected at ICML'15"},{"id":"http://arxiv.org/abs/2112.08440v5","updated":"2024-01-17T22:41:32Z","published":"2021-12-14T07:02:57Z","title":"Climate-Invariant Machine Learning","summary":" Projecting climate change is a generalization problem: we extrapolate the\nrecent past using physical models across past, present, and future climates.\nCurrent climate models require representations of processes that occur at\nscales smaller than model grid size, which have been the main source of model\nprojection uncertainty. Recent machine learning (ML) algorithms hold promise to\nimprove such process representations, but tend to extrapolate poorly to climate\nregimes they were not trained on. To get the best of the physical and\nstatistical worlds, we propose a new framework - termed \"climate-invariant\" ML\n- incorporating knowledge of climate processes into ML algorithms, and show\nthat it can maintain high offline accuracy across a wide range of climate\nconditions and configurations in three distinct atmospheric models. Our results\nsuggest that explicitly incorporating physical knowledge into data-driven\nmodels of Earth system processes can improve their consistency, data\nefficiency, and generalizability across climate regimes.\n","authors":["Tom Beucler","Pierre Gentine","Janni Yuval","Ankitesh Gupta","Liran Peng","Jerry Lin","Sungduk Yu","Stephan Rasp","Fiaz Ahmed","Paul A. O'Gorman","J. David Neelin","Nicholas J. Lutsko","Michael Pritchard"],"pdf_url":"https://arxiv.org/pdf/2112.08440v5.pdf","comment":"26+28 pages, 9+15 figures, 0+3 tables in the main text +\n supplementary materials. Accepted for publication in Science Advances on Jan\n 5, 2024"},{"id":"http://arxiv.org/abs/2308.12952v3","updated":"2024-01-17T22:41:29Z","published":"2023-08-24T17:41:20Z","title":"BridgeData V2: A Dataset for Robot Learning at Scale","summary":" We introduce BridgeData V2, a large and diverse dataset of robotic\nmanipulation behaviors designed to facilitate research on scalable robot\nlearning. BridgeData V2 contains 60,096 trajectories collected across 24\nenvironments on a publicly available low-cost robot. 
BridgeData V2 provides\nextensive task and environment variability, leading to skills that can\ngeneralize across environments, domains, and institutions, making the dataset a\nuseful resource for a broad range of researchers. Additionally, the dataset is\ncompatible with a wide variety of open-vocabulary, multi-task learning methods\nconditioned on goal images or natural language instructions. In our\nexperiments, we train 6 state-of-the-art imitation learning and offline\nreinforcement learning methods on our dataset, and find that they succeed on a\nsuite of tasks requiring varying amounts of generalization. We also demonstrate\nthat the performance of these methods improves with more data and higher\ncapacity models, and that training on a greater variety of skills leads to\nimproved generalization. By publicly sharing BridgeData V2 and our pre-trained\nmodels, we aim to accelerate research in scalable robot learning methods.\nProject page at https://rail-berkeley.github.io/bridgedata\n","authors":["Homer Walke","Kevin Black","Abraham Lee","Moo Jin Kim","Max Du","Chongyi Zheng","Tony Zhao","Philippe Hansen-Estruch","Quan Vuong","Andre He","Vivek Myers","Kuan Fang","Chelsea Finn","Sergey Levine"],"pdf_url":"https://arxiv.org/pdf/2308.12952v3.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2401.09627v1","updated":"2024-01-17T22:34:20Z","published":"2024-01-17T22:34:20Z","title":"SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of\n Lumbar Spine MRI","summary":" Intervertebral disc disease, a prevalent ailment, frequently leads to\nintermittent or persistent low back pain, and diagnosing and assessing of this\ndisease rely on accurate measurement of vertebral bone and intervertebral disc\ngeometries from lumbar MR images. Deep neural network (DNN) models may assist\nclinicians with more efficient image segmentation of individual instances\n(disks and vertebrae) of the lumbar spine in an automated way, which is termed\nas instance image segmentation. In this work, we proposed SymTC, an innovative\nlumbar spine MR image segmentation model that combines the strengths of\nTransformer and Convolutional Neural Network (CNN). Specifically, we designed a\nparallel dual-path architecture to merge CNN layers and Transformer layers, and\nwe integrated a novel position embedding into the self-attention module of\nTransformer, enhancing the utilization of positional information for more\naccurate segmentation. To further improves model performance, we introduced a\nnew data augmentation technique to create synthetic yet realistic MR image\ndataset, named SSMSpine, which is made publicly available. We evaluated our\nSymTC and the other 15 existing image segmentation models on our private\nin-house dataset and the public SSMSpine dataset, using two metrics, Dice\nSimilarity Coefficient and 95% Hausdorff Distance. The results show that our\nSymTC has the best performance for segmenting vertebral bones and\nintervertebral discs in lumbar spine MR images. 
The SymTC code and SSMSpine\ndataset are available at https://github.com/jiasongchen/SymTC.\n","authors":["Jiasong Chen","Linchen Qian","Linhai Ma","Timur Urakov","Weiyong Gu","Liang Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09627v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09624v1","updated":"2024-01-17T22:30:41Z","published":"2024-01-17T22:30:41Z","title":"MITS-GAN: Safeguarding Medical Imaging from Tampering with Generative\n Adversarial Networks","summary":" The progress in generative models, particularly Generative Adversarial\nNetworks (GANs), opened new possibilities for image generation but raised\nconcerns about potential malicious uses, especially in sensitive areas like\nmedical imaging. This study introduces MITS-GAN, a novel approach to prevent\ntampering in medical images, with a specific focus on CT scans. The approach\ndisrupts the output of the attacker's CT-GAN architecture by introducing\nimperceptible but yet precise perturbations. Specifically, the proposed\napproach involves the introduction of appropriate Gaussian noise to the input\nas a protective measure against various attacks. Our method aims to enhance\ntamper resistance, comparing favorably to existing techniques. Experimental\nresults on a CT scan dataset demonstrate MITS-GAN's superior performance,\nemphasizing its ability to generate tamper-resistant images with negligible\nartifacts. As image tampering in medical domains poses life-threatening risks,\nour proactive approach contributes to the responsible and ethical use of\ngenerative models. This work provides a foundation for future research in\ncountering cyber threats in medical imaging. Models and codes are publicly\navailable at the following link\n\\url{https://iplab.dmi.unict.it/MITS-GAN-2024/}.\n","authors":["Giovanni Pasqualino","Luca Guarnera","Alessandro Ortis","Sebastiano Battiato"],"pdf_url":"https://arxiv.org/pdf/2401.09624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09622v1","updated":"2024-01-17T22:23:29Z","published":"2024-01-17T22:23:29Z","title":"SMOOTHIE: A Theory of Hyper-parameter Optimization for Software\n Analytics","summary":" Hyper-parameter optimization is the black art of tuning a learner's control\nparameters. In software analytics, a repeated result is that such tuning can\nresult in dramatic performance improvements. Despite this, hyper-parameter\noptimization is often applied rarely or poorly in software analytics--perhaps\ndue to the CPU cost of exploring all those parameter options can be\nprohibitive.\n We theorize that learners generalize better when the loss landscape is\n``smooth''. This theory is useful since the influence on ``smoothness'' of\ndifferent hyper-parameter choices can be tested very quickly (e.g. for a deep\nlearner, after just one epoch).\n To test this theory, this paper implements and tests SMOOTHIE, a novel\nhyper-parameter optimizer that guides its optimizations via considerations of\n``smothness''. The experiments of this paper test SMOOTHIE on numerous SE tasks\nincluding (a) GitHub issue lifetime prediction; (b) detecting false alarms in\nstatic code warnings; (c) defect prediction, and (d) a set of standard ML\ndatasets. In all these experiments, SMOOTHIE out-performed state-of-the-art\noptimizers. Better yet, SMOOTHIE ran 300% faster than the prior state-of-the\nart. 
We hence conclude that this theory (that hyper-parameter optimization is\nbest viewed as a ``smoothing'' function for the decision landscape), is both\ntheoretically interesting and practically very useful.\n To support open science and other researchers working in this area, all our\nscripts and datasets are available on-line at\nhttps://github.com/yrahul3910/smoothness-hpo/.\n","authors":["Rahul Yedida","Tim Menzies"],"pdf_url":"https://arxiv.org/pdf/2401.09622v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2401.09607v1","updated":"2024-01-17T21:32:04Z","published":"2024-01-17T21:32:04Z","title":"Land Cover Image Classification","summary":" Land Cover (LC) image classification has become increasingly significant in\nunderstanding environmental changes, urban planning, and disaster management.\nHowever, traditional LC methods are often labor-intensive and prone to human\nerror. This paper explores state-of-the-art deep learning models for enhanced\naccuracy and efficiency in LC analysis. We compare convolutional neural\nnetworks (CNN) against transformer-based methods, showcasing their applications\nand advantages in LC studies. We used EuroSAT, a patch-based LC classification\ndata set based on Sentinel-2 satellite images and achieved state-of-the-art\nresults using current transformer models.\n","authors":["Antonio Rangel","Juan Terven","Diana M. Cordova-Esparza","E. A. Chavez-Urbiola"],"pdf_url":"https://arxiv.org/pdf/2401.09607v1.pdf","comment":"7 pages, 4 figures, 1 table, published in conference"},{"id":"http://arxiv.org/abs/2401.09606v1","updated":"2024-01-17T21:32:03Z","published":"2024-01-17T21:32:03Z","title":"Robustness Evaluation of Machine Learning Models for Robot Arm Action\n Recognition in Noisy Environments","summary":" In the realm of robot action recognition, identifying distinct but spatially\nproximate arm movements using vision systems in noisy environments poses a\nsignificant challenge. This paper studies robot arm action recognition in noisy\nenvironments using machine learning techniques. Specifically, a vision system\nis used to track the robot's movements followed by a deep learning model to\nextract the arm's key points. Through a comparative analysis of machine\nlearning methods, the effectiveness and robustness of this model are assessed\nin noisy environments. A case study was conducted using the Tic-Tac-Toe game in\na 3-by-3 grid environment, where the focus is to accurately identify the\nactions of the arms in selecting specific locations within this constrained\nenvironment. Experimental results show that our approach can achieve precise\nkey point detection and action classification despite the addition of noise and\nuncertainties to the dataset.\n","authors":["Elaheh Motamedi","Kian Behzad","Rojin Zandi","Hojjat Salehinejad","Milad Siami"],"pdf_url":"https://arxiv.org/pdf/2401.09606v1.pdf","comment":"Accepted at ICASSP"},{"id":"http://arxiv.org/abs/2401.09604v1","updated":"2024-01-17T21:30:22Z","published":"2024-01-17T21:30:22Z","title":"MedBlindTuner: Towards Privacy-preserving Fine-tuning on Biomedical\n Images with Transformers and Fully Homomorphic Encryption","summary":" Advancements in machine learning (ML) have significantly revolutionized\nmedical image analysis, prompting hospitals to rely on external ML services.\nHowever, the exchange of sensitive patient data, such as chest X-rays, poses\ninherent privacy risks when shared with third parties. 
Addressing this concern,\nwe propose MedBlindTuner, a privacy-preserving framework leveraging fully\nhomomorphic encryption (FHE) and a data-efficient image transformer (DEiT).\nMedBlindTuner enables the training of ML models exclusively on FHE-encrypted\nmedical images. Our experimental evaluation demonstrates that MedBlindTuner\nachieves comparable accuracy to models trained on non-encrypted images,\noffering a secure solution for outsourcing ML computations while preserving\npatient data privacy. To the best of our knowledge, this is the first work that\nuses data-efficient image transformers and fully homomorphic encryption in this\ndomain.\n","authors":["Prajwal Panzade","Daniel Takabi","Zhipeng Cai"],"pdf_url":"https://arxiv.org/pdf/2401.09604v1.pdf","comment":"Accepted for the presentation at W3PHIAI, The 38th Annual AAAI\n Conference on Artificial Intelligence 2024"},{"id":"http://arxiv.org/abs/2401.09596v1","updated":"2024-01-17T21:08:41Z","published":"2024-01-17T21:08:41Z","title":"Efficient generative adversarial networks using linear\n additive-attention Transformers","summary":" Although the capacity of deep generative models for image generation, such as\nDiffusion Models (DMs) and Generative Adversarial Networks (GANs), has\ndramatically improved in recent years, much of their success can be attributed\nto computationally expensive architectures. This has limited their adoption and\nuse to research laboratories and companies with large resources, while\nsignificantly raising the carbon footprint for training, fine-tuning, and\ninference. In this work, we present LadaGAN, an efficient generative\nadversarial network that is built upon a novel Transformer block named\nLadaformer. The main component of this block is a linear additive-attention\nmechanism that computes a single attention vector per head instead of the\nquadratic dot-product attention. We employ Ladaformer in both the generator and\ndiscriminator, which reduces the computational complexity and overcomes the\ntraining instabilities often associated with Transformer GANs. LadaGAN\nconsistently outperforms existing convolutional and Transformer GANs on\nbenchmark datasets at different resolutions while being significantly more\nefficient. Moreover, LadaGAN shows competitive performance compared to\nstate-of-the-art multi-step generative models (e.g. DMs) using orders of\nmagnitude less computational resources.\n","authors":["Emilio Morales-Juarez","Gibran Fuentes-Pineda"],"pdf_url":"https://arxiv.org/pdf/2401.09596v1.pdf","comment":"12 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.09587v1","updated":"2024-01-17T20:28:15Z","published":"2024-01-17T20:28:15Z","title":"Bilevel Optimization under Unbounded Smoothness: A New Algorithm and\n Convergence Analysis","summary":" Bilevel optimization is an important formulation for many machine learning\nproblems. Current bilevel optimization algorithms assume that the gradient of\nthe upper-level function is Lipschitz. However, recent studies reveal that\ncertain neural networks such as recurrent neural networks (RNNs) and\nlong-short-term memory networks (LSTMs) exhibit potential unbounded smoothness,\nrendering conventional bilevel optimization algorithms unsuitable. In this\npaper, we design a new bilevel optimization algorithm, namely BO-REP, to\naddress this challenge. 
This algorithm updates the upper-level variable using\nnormalized momentum and incorporates two novel techniques for updating the\nlower-level variable: \\textit{initialization refinement} and \\textit{periodic\nupdates}. Specifically, once the upper-level variable is initialized, a\nsubroutine is invoked to obtain a refined estimate of the corresponding optimal\nlower-level variable, and the lower-level variable is updated only after every\nspecific period instead of each iteration. When the upper-level problem is\nnonconvex and unbounded smooth, and the lower-level problem is strongly convex,\nwe prove that our algorithm requires $\\widetilde{\\mathcal{O}}(1/\\epsilon^4)$\niterations to find an $\\epsilon$-stationary point in the stochastic setting,\nwhere each iteration involves calling a stochastic gradient or Hessian-vector\nproduct oracle. Notably, this result matches the state-of-the-art complexity\nresults under the bounded smoothness setting and without mean-squared\nsmoothness of the stochastic gradient, up to logarithmic factors. Our proof\nrelies on novel technical lemmas for the periodically updated lower-level\nvariable, which are of independent interest. Our experiments on\nhyper-representation learning, hyperparameter optimization, and data\nhyper-cleaning for text classification tasks demonstrate the effectiveness of\nour proposed algorithm.\n","authors":["Jie Hao","Xiaochuan Gong","Mingrui Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09587v1.pdf","comment":"Accepted by ICLR 2024, Spotlight"},{"id":"http://arxiv.org/abs/2401.05566v3","updated":"2024-01-17T20:26:01Z","published":"2024-01-10T22:14:35Z","title":"Sleeper Agents: Training Deceptive LLMs that Persist Through Safety\n Training","summary":" Humans are capable of strategically deceptive behavior: behaving helpfully in\nmost situations, but then behaving very differently in order to pursue\nalternative objectives when given the opportunity. If an AI system learned such\na deceptive strategy, could we detect it and remove it using current\nstate-of-the-art safety training techniques? To study this question, we\nconstruct proof-of-concept examples of deceptive behavior in large language\nmodels (LLMs). For example, we train models that write secure code when the\nprompt states that the year is 2023, but insert exploitable code when the\nstated year is 2024. We find that such backdoor behavior can be made\npersistent, so that it is not removed by standard safety training techniques,\nincluding supervised fine-tuning, reinforcement learning, and adversarial\ntraining (eliciting unsafe behavior and then training to remove it). The\nbackdoor behavior is most persistent in the largest models and in models\ntrained to produce chain-of-thought reasoning about deceiving the training\nprocess, with the persistence remaining even when the chain-of-thought is\ndistilled away. Furthermore, rather than removing backdoors, we find that\nadversarial training can teach models to better recognize their backdoor\ntriggers, effectively hiding the unsafe behavior. Our results suggest that,\nonce a model exhibits deceptive behavior, standard techniques could fail to\nremove such deception and create a false impression of safety.\n","authors":["Evan Hubinger","Carson Denison","Jesse Mu","Mike Lambert","Meg Tong","Monte MacDiarmid","Tamera Lanham","Daniel M. 
Ziegler","Tim Maxwell","Newton Cheng","Adam Jermyn","Amanda Askell","Ansh Radhakrishnan","Cem Anil","David Duvenaud","Deep Ganguli","Fazl Barez","Jack Clark","Kamal Ndousse","Kshitij Sachan","Michael Sellitto","Mrinank Sharma","Nova DasSarma","Roger Grosse","Shauna Kravec","Yuntao Bai","Zachary Witten","Marina Favaro","Jan Brauner","Holden Karnofsky","Paul Christiano","Samuel R. Bowman","Logan Graham","Jared Kaplan","Sören Mindermann","Ryan Greenblatt","Buck Shlegeris","Nicholas Schiefer","Ethan Perez"],"pdf_url":"https://arxiv.org/pdf/2401.05566v3.pdf","comment":"updated to add missing acknowledgements"},{"id":"http://arxiv.org/abs/2304.05527v4","updated":"2024-01-17T20:16:04Z","published":"2023-04-11T22:45:18Z","title":"Black Box Variational Inference with a Deterministic Objective: Faster,\n More Accurate, and Even More Black Box","summary":" Automatic differentiation variational inference (ADVI) offers fast and\neasy-to-use posterior approximation in multiple modern probabilistic\nprogramming languages. However, its stochastic optimizer lacks clear\nconvergence criteria and requires tuning parameters. Moreover, ADVI inherits\nthe poor posterior uncertainty estimates of mean-field variational Bayes\n(MFVB). We introduce \"deterministic ADVI\" (DADVI) to address these issues.\nDADVI replaces the intractable MFVB objective with a fixed Monte Carlo\napproximation, a technique known in the stochastic optimization literature as\nthe \"sample average approximation\" (SAA). By optimizing an approximate but\ndeterministic objective, DADVI can use off-the-shelf second-order optimization,\nand, unlike standard mean-field ADVI, is amenable to more accurate posterior\ncovariances via linear response (LR). In contrast to existing worst-case\ntheory, we show that, on certain classes of common statistical problems, DADVI\nand the SAA can perform well with relatively few samples even in very high\ndimensions, though we also show that such favorable results cannot extend to\nvariational approximations that are too expressive relative to mean-field ADVI.\nWe show on a variety of real-world problems that DADVI reliably finds good\nsolutions with default settings (unlike ADVI) and, together with LR\ncovariances, is typically faster and more accurate than standard ADVI.\n","authors":["Ryan Giordano","Martin Ingram","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2304.05527v4.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.09414v1","updated":"2024-01-17T18:55:12Z","published":"2024-01-17T18:55:12Z","title":"Vlogger: Make Your Dream A Vlog","summary":" In this work, we present Vlogger, a generic AI system for generating a\nminute-level video blog (i.e., vlog) of user descriptions. Different from short\nvideos with a few seconds, vlog often contains a complex storyline with\ndiversified scenes, which is challenging for most existing video generation\napproaches. To break through this bottleneck, our Vlogger smartly leverages\nLarge Language Model (LLM) as Director and decomposes a long video generation\ntask of vlog into four key stages, where we invoke various foundation models to\nplay the critical roles of vlog professionals, including (1) Script, (2) Actor,\n(3) ShowMaker, and (4) Voicer. With such a design of mimicking human beings,\nour Vlogger can generate vlogs through explainable cooperation of top-down\nplanning and bottom-up shooting. 
Moreover, we introduce a novel video diffusion\nmodel, ShowMaker, which serves as a videographer in our Vlogger for generating\nthe video snippet of each shooting scene. By incorporating Script and Actor\nattentively as textual and visual prompts, it can effectively enhance\nspatial-temporal coherence in the snippet. Besides, we design a concise mixed\ntraining paradigm for ShowMaker, boosting its capacity for both T2V generation\nand prediction. Finally, the extensive experiments show that our method\nachieves state-of-the-art performance on zero-shot T2V generation and\nprediction tasks. More importantly, Vlogger can generate over 5-minute vlogs\nfrom open-world descriptions, without loss of video coherence on script and\nactor. The code and model is all available at\nhttps://github.com/zhuangshaobin/Vlogger.\n","authors":["Shaobin Zhuang","Kunchang Li","Xinyuan Chen","Yaohui Wang","Ziwei Liu","Yu Qiao","Yali Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09414v1.pdf","comment":"16 pages, 8 figures, 11 tables"},{"id":"http://arxiv.org/abs/2401.09019v1","updated":"2024-01-17T07:30:52Z","published":"2024-01-17T07:30:52Z","title":"Change Detection Between Optical Remote Sensing Imagery and Map Data via\n Segment Anything Model (SAM)","summary":" Unsupervised multimodal change detection is pivotal for time-sensitive tasks\nand comprehensive multi-temporal Earth monitoring. In this study, we explore\nunsupervised multimodal change detection between two key remote sensing data\nsources: optical high-resolution imagery and OpenStreetMap (OSM) data.\nSpecifically, we propose to utilize the vision foundation model Segmentation\nAnything Model (SAM), for addressing our task. Leveraging SAM's exceptional\nzero-shot transfer capability, high-quality segmentation maps of optical images\ncan be obtained. Thus, we can directly compare these two heterogeneous data\nforms in the so-called segmentation domain. We then introduce two strategies\nfor guiding SAM's segmentation process: the 'no-prompt' and 'box/mask prompt'\nmethods. The two strategies are designed to detect land-cover changes in\ngeneral scenarios and to identify new land-cover objects within existing\nbackgrounds, respectively. Experimental results on three datasets indicate that\nthe proposed approach can achieve more competitive results compared to\nrepresentative unsupervised multimodal change detection methods.\n","authors":["Hongruixuan Chen","Jian Song","Naoto Yokoya"],"pdf_url":"https://arxiv.org/pdf/2401.09019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08889v1","updated":"2024-01-17T00:12:13Z","published":"2024-01-17T00:12:13Z","title":"On the Effect of Data-Augmentation on Local Embedding Properties in the\n Contrastive Learning of Music Audio Representations","summary":" Audio embeddings are crucial tools in understanding large catalogs of music.\nTypically embeddings are evaluated on the basis of the performance they provide\nin a wide range of downstream tasks, however few studies have investigated the\nlocal properties of the embedding spaces themselves which are important in\nnearest neighbor algorithms, commonly used in music search and recommendation.\nIn this work we show that when learning audio representations on music datasets\nvia contrastive learning, musical properties that are typically homogeneous\nwithin a track (e.g., key and tempo) are reflected in the locality of\nneighborhoods in the resulting embedding space. 
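A minimal sketch of how such locality could be quantified, assuming synthetic clip embeddings and labels rather than the paper's data: the homogeneity of a track-level property (here, musical key) is measured as the mean fraction of each clip's k nearest neighbours that share the same label.

```python
# Illustrative sketch (assumed setup, not the paper's evaluation code):
# measure how strongly a property such as key is reflected in embedding-space
# locality via k-NN label agreement.
import numpy as np

rng = np.random.default_rng(1)
n_clips, dim, n_keys, k = 200, 32, 12, 10

keys = rng.integers(0, n_keys, size=n_clips)            # property label per clip
key_centroids = rng.normal(size=(n_keys, dim))
embeddings = key_centroids[keys] + 0.5 * rng.normal(size=(n_clips, dim))

def neighbourhood_homogeneity(emb, labels, k):
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)   # cosine similarity
    sims = emb @ emb.T
    np.fill_diagonal(sims, -np.inf)                          # exclude the clip itself
    nn = np.argsort(-sims, axis=1)[:, :k]                    # k nearest neighbours
    return float(np.mean(labels[nn] == labels[:, None]))

print("key homogeneity of k-NN neighbourhoods:",
      neighbourhood_homogeneity(embeddings, keys, k))
```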
By applying appropriate data\naugmentation strategies, localisation of such properties can not only be\nreduced but the localisation of other attributes is increased. For example,\nlocality of features such as pitch and tempo that are less relevant to\nnon-expert listeners, may be mitigated while improving the locality of more\nsalient features such as genre and mood, achieving state-of-the-art performance\nin nearest neighbor retrieval accuracy. Similarly, we show that the optimal\nselection of data augmentation strategies for contrastive learning of music\naudio embeddings is dependent on the downstream task, highlighting this as an\nimportant embedding design decision.\n","authors":["Matthew C. McCallum","Matthew E. P. Davies","Florian Henkel","Jaehun Kim","Samuel E. Sandberg"],"pdf_url":"https://arxiv.org/pdf/2401.08889v1.pdf","comment":"Accepted to the International Conference on Acoustics, Speech and\n Signal Processing (ICASSP) 2024"}]},"2024-01-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.10225v1","updated":"2024-01-18T18:59:11Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Building GPT-4 Level Conversational QA Models","summary":" In this work, we introduce ChatQA, a family of conversational question\nanswering (QA) models, that obtain GPT-4 level accuracies. Specifically, we\npropose a two-stage instruction tuning method that can significantly improve\nthe zero-shot conversational QA results from large language models (LLMs). To\nhandle retrieval in conversational QA, we fine-tune a dense retriever on a\nmulti-turn QA dataset, which provides comparable results to using the\nstate-of-the-art query rewriting model while largely reducing deployment cost.\nNotably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10\nconversational QA datasets (54.14 vs. 53.90), without relying on any synthetic\ndata from OpenAI GPT models.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03128v4","updated":"2024-01-18T18:57:10Z","published":"2023-10-04T19:39:26Z","title":"MetaTool Benchmark for Large Language Models: Deciding Whether to Use\n Tools and Which to Use","summary":" Large language models (LLMs) have garnered significant attention due to their\nimpressive natural language processing (NLP) capabilities. Recently, many\nstudies have focused on the tool utilization ability of LLMs. They primarily\ninvestigated how LLMs effectively collaborate with given specific tools.\nHowever, in scenarios where LLMs serve as intelligent agents, as seen in\napplications like AutoGPT and MetaGPT, LLMs are expected to engage in intricate\ndecision-making processes that involve deciding whether to employ a tool and\nselecting the most suitable tool(s) from a collection of available tools to\nfulfill user requests. Therefore, in this paper, we introduce MetaTool, a\nbenchmark designed to evaluate whether LLMs have tool usage awareness and can\ncorrectly choose tools. Specifically, we create a dataset called ToolE within\nthe benchmark. This dataset contains various types of user queries in the form\nof prompts that trigger LLMs to use tools, including both single-tool and\nmulti-tool scenarios. Subsequently, we set the tasks for both tool usage\nawareness and tool selection. 
We define four subtasks from different\nperspectives in tool selection, including tool selection with similar choices,\ntool selection in specific scenarios, tool selection with possible reliability\nissues, and multi-tool selection. We conduct experiments involving eight\npopular LLMs and find that the majority of them still struggle to effectively\nselect tools, highlighting the existing gaps between LLMs and genuine\nintelligent agents. However, through the error analysis, we found there is\nstill significant room for improvement. Finally, we conclude with insights for\ntool developers -- we strongly recommend that tool developers choose an\nappropriate rewrite model for generating new descriptions based on the\ndownstream LLM the tool will apply to. Our code is in\n\\href{https://github.com/HowieHwong/MetaTool}{Github}.\n","authors":["Yue Huang","Jiawen Shi","Yuan Li","Chenrui Fan","Siyuan Wu","Qihui Zhang","Yixin Liu","Pan Zhou","Yao Wan","Neil Zhenqiang Gong","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2310.03128v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11446v2","updated":"2024-01-18T18:50:55Z","published":"2023-10-17T17:56:18Z","title":"Functional Invariants to Watermark Large Transformers","summary":" The rapid growth of transformer-based models increases the concerns about\ntheir integrity and ownership insurance. Watermarking addresses this issue by\nembedding a unique identifier into the model, while preserving its performance.\nHowever, most existing approaches require to optimize the weights to imprint\nthe watermark signal, which is not suitable at scale due to the computational\ncost. This paper explores watermarks with virtually no computational cost,\napplicable to a non-blind white-box setting (assuming access to both the\noriginal and watermarked networks). They generate functionally equivalent\ncopies by leveraging the models' invariance, via operations like dimension\npermutations or scaling/unscaling. This enables to watermark models without any\nchange in their outputs and remains stealthy. Experiments demonstrate the\neffectiveness of the approach and its robustness against various model\ntransformations (fine-tuning, quantization, pruning), making it a practical\nsolution to protect the integrity of large models.\n","authors":["Pierre Fernandez","Guillaume Couairon","Teddy Furon","Matthijs Douze"],"pdf_url":"https://arxiv.org/pdf/2310.11446v2.pdf","comment":"Published at ICASSP 2024. Webpage at\n https://pierrefdz.github.io/publications/invariancewm/"},{"id":"http://arxiv.org/abs/2401.10208v1","updated":"2024-01-18T18:50:16Z","published":"2024-01-18T18:50:16Z","title":"MM-Interleaved: Interleaved Image-Text Generative Modeling via\n Multi-modal Feature Synchronizer","summary":" Developing generative models for interleaved image-text data has both\nresearch and practical value. It requires models to understand the interleaved\nsequences and subsequently generate images and text. However, existing attempts\nare limited by the issue that the fixed number of visual tokens cannot\nefficiently capture image details, which is particularly problematic in the\nmulti-image scenarios. To address this, this paper presents MM-Interleaved, an\nend-to-end generative model for interleaved image-text data. It introduces a\nmulti-scale and multi-image feature synchronizer module, allowing direct access\nto fine-grained image features in the previous context during the generation\nprocess. 
MM-Interleaved is end-to-end pre-trained on both paired and\ninterleaved image-text corpora. It is further enhanced through a supervised\nfine-tuning phase, wherein the model improves its ability to follow complex\nmulti-modal instructions. Experiments demonstrate the versatility of\nMM-Interleaved in recognizing visual details following multi-modal instructions\nand generating consistent images following both textual and visual conditions.\nCode and models are available at\n\\url{https://github.com/OpenGVLab/MM-Interleaved}.\n","authors":["Changyao Tian","Xizhou Zhu","Yuwen Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Yuntao Chen","Lewei Lu","Tong Lu","Jie Zhou","Hongsheng Li","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2401.10208v1.pdf","comment":"20 pages, 9 figures, 17 tables"},{"id":"http://arxiv.org/abs/2312.16171v2","updated":"2024-01-18T18:41:09Z","published":"2023-12-26T18:59:33Z","title":"Principled Instructions Are All You Need for Questioning LLaMA-1/2,\n GPT-3.5/4","summary":" This paper introduces 26 guiding principles designed to streamline the\nprocess of querying and prompting large language models. Our goal is to\nsimplify the underlying concepts of formulating questions for various scales of\nlarge language models, examining their abilities, and enhancing user\ncomprehension on the behaviors of different scales of large language models\nwhen feeding into different prompts. Extensive experiments are conducted on\nLLaMA-1/2 (7B, 13B and 70B), GPT-3.5/4 to verify the effectiveness of the\nproposed principles on instructions and prompts design. We hope that this work\ncan provide a better guide for researchers working on the prompting of large\nlanguage models. Project page is available at\nhttps://github.com/VILA-Lab/ATLAS.\n","authors":["Sondos Mahmoud Bsharat","Aidar Myrzakhan","Zhiqiang Shen"],"pdf_url":"https://arxiv.org/pdf/2312.16171v2.pdf","comment":"Github at: https://github.com/VILA-Lab/ATLAS"},{"id":"http://arxiv.org/abs/2309.07382v2","updated":"2024-01-18T18:23:37Z","published":"2023-09-14T01:59:15Z","title":"Less is More for Long Document Summary Evaluation by LLMs","summary":" Large Language Models (LLMs) have shown promising performance in summary\nevaluation tasks, yet they face challenges such as high computational costs and\nthe Lost-in-the-Middle problem where important information in the middle of\nlong documents is often overlooked. To address these issues, this paper\nintroduces a novel approach, Extract-then-Evaluate, which involves extracting\nkey sentences from a long source document and then evaluating the summary by\nprompting LLMs. The results reveal that the proposed method not only\nsignificantly reduces evaluation costs but also exhibits a higher correlation\nwith human evaluations. Furthermore, we provide practical recommendations for\noptimal document length and sentence extraction methods, contributing to the\ndevelopment of cost-effective yet more accurate methods for LLM-based text\ngeneration evaluation.\n","authors":["Yunshu Wu","Hayate Iso","Pouya Pezeshkpour","Nikita Bhutani","Estevam Hruschka"],"pdf_url":"https://arxiv.org/pdf/2309.07382v2.pdf","comment":"EACL (main)"},{"id":"http://arxiv.org/abs/2401.10189v1","updated":"2024-01-18T18:20:15Z","published":"2024-01-18T18:20:15Z","title":"Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through\n Text Reconstruction","summary":" Fine-grained few-shot entity extraction in the chemical domain faces two\nunique challenges. 
First, compared with entity extraction tasks in the general\ndomain, sentences from chemical papers usually contain more entities. Moreover,\nentity extraction models usually have difficulty extracting entities of\nlong-tailed types. In this paper, we propose Chem-FINESE, a novel\nsequence-to-sequence (seq2seq) based few-shot entity extraction approach, to\naddress these two challenges. Our Chem-FINESE has two components: a seq2seq\nentity extractor to extract named entities from the input sentence and a\nseq2seq self-validation module to reconstruct the original input sentence from\nextracted entities. Inspired by the fact that a good entity extraction system\nneeds to extract entities faithfully, our new self-validation module leverages\nentity extraction results to reconstruct the original input sentence. Besides,\nwe design a new contrastive loss to reduce excessive copying during the\nextraction process. Finally, we release ChemNER+, a new fine-grained chemical\nentity extraction dataset that is annotated by domain experts with the ChemNER\nschema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets\nshow that our newly proposed framework has contributed up to 8.26% and 6.84%\nabsolute F1-score gains respectively.\n","authors":["Qingyun Wang","Zixuan Zhang","Hongxiang Li","Xuan Liu","Jiawei Han","Heng Ji","Huimin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10189v1.pdf","comment":"16 pages. Accepted by Findings of the Association for Computational\n Linguistics: EACL 2024. Code and resources are available at\n https://github.com/EagleW/Chem-FINESE"},{"id":"http://arxiv.org/abs/2401.10186v1","updated":"2024-01-18T18:15:46Z","published":"2024-01-18T18:15:46Z","title":"Beyond Reference-Based Metrics: Analyzing Behaviors of Open LLMs on\n Data-to-Text Generation","summary":" We investigate to which extent open large language models (LLMs) can generate\ncoherent and relevant text from structured data. To prevent bias from\nbenchmarks leaked into LLM training data, we collect Quintd-1: an ad-hoc\nbenchmark for five data-to-text (D2T) generation tasks, consisting of\nstructured data records in standard formats gathered from public APIs. We\nleverage reference-free evaluation metrics and LLMs' in-context learning\ncapabilities, allowing us to test the models with no human-written references.\nOur evaluation focuses on annotating semantic accuracy errors on token-level,\ncombining human annotators and a metric based on GPT-4. Our systematic\nexamination of the models' behavior across domains and tasks suggests that\nstate-of-the-art open LLMs with 7B parameters can generate fluent and coherent\ntext from various standard data formats in zero-shot settings. However, we also\nshow that semantic accuracy of the outputs remains a major issue: on our\nbenchmark, 80% of outputs of open LLMs contain a semantic error according to\nhuman annotators (91% according to GPT-4). Our code, data, and model outputs\nare available at https://d2t-llm.github.io.\n","authors":["Zdeněk Kasner","Ondřej Dušek"],"pdf_url":"https://arxiv.org/pdf/2401.10186v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2401.10134v1","updated":"2024-01-18T17:03:59Z","published":"2024-01-18T17:03:59Z","title":"Spatial-Temporal Large Language Model for Traffic Prediction","summary":" Traffic prediction, a critical component for intelligent transportation\nsystems, endeavors to foresee future traffic at specific locations using\nhistorical data. 
Although existing traffic prediction models often emphasize\ndeveloping complex neural network structures, their accuracy has not seen\nimprovements accordingly. Recently, Large Language Models (LLMs) have shown\noutstanding capabilities in time series analysis. Differing from existing\nmodels, LLMs progress mainly through parameter expansion and extensive\npre-training while maintaining their fundamental structures. In this paper, we\npropose a Spatial-Temporal Large Language Model (ST-LLM) for traffic\nprediction. Specifically, ST-LLM redefines the timesteps at each location as\ntokens and incorporates a spatial-temporal embedding module to learn the\nspatial location and global temporal representations of tokens. Then these\nrepresentations are fused to provide each token with unified spatial and\ntemporal information. Furthermore, we propose a novel partially frozen\nattention strategy of the LLM, which is designed to capture spatial-temporal\ndependencies for traffic prediction. Comprehensive experiments on real traffic\ndatasets offer evidence that ST-LLM outperforms state-of-the-art models.\nNotably, the ST-LLM also exhibits robust performance in both few-shot and\nzero-shot prediction scenarios.\n","authors":["Chenxi Liu","Sun Yang","Qianxiong Xu","Zhishuai Li","Cheng Long","Ziyue Li","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03122v2","updated":"2024-01-18T16:53:15Z","published":"2023-12-05T20:41:34Z","title":"Assertion Enhanced Few-Shot Learning: Instructive Technique for Large\n Language Models to Generate Educational Explanations","summary":" Human educators possess an intrinsic ability to anticipate and seek\neducational explanations from students, which drives them to pose\nthought-provoking questions when students cannot articulate these explanations\nindependently. We aim to imbue Intelligent Tutoring Systems with this ability\nusing few-shot learning capability of Large Language Models. Our work proposes\na novel prompting technique, Assertion Enhanced Few-Shot Learning, to\nfacilitate the generation of accurate, detailed oriented educational\nexplanations. Our central hypothesis is that, in educational domain, few-shot\ndemonstrations are necessary but not a sufficient condition for quality\nexplanation generation. We conducted a study involving 12 in-service teachers,\ncomparing our approach to Traditional Few-Shot Learning. The results show that\nAssertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and\nyields higher-quality explanations, as evaluated by teachers. We also conduct a\nqualitative ablation study to factor the impact of assertions to provide\neducator-friendly prompting guidelines for generating explanations in their\ndomain of interest.\n","authors":["Tasmia Shahriar","Noboru Matsuda","Kelly Ramos"],"pdf_url":"https://arxiv.org/pdf/2312.03122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10111v1","updated":"2024-01-18T16:27:18Z","published":"2024-01-18T16:27:18Z","title":"Marrying Adapters and Mixup to Efficiently Enhance the Adversarial\n Robustness of Pre-Trained Language Models for Text Classification","summary":" Existing works show that augmenting training data of neural networks using\nboth clean and adversarial examples can enhance their generalizability under\nadversarial attacks. However, this training approach often leads to performance\ndegradation on clean inputs. 
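As a rough illustration of the clean-plus-adversarial augmentation setup referred to here, the sketch below crafts generic FGSM-style perturbations for a toy logistic-regression model and trains on the mixed batch; it is not the paper's PLM pipeline, and all quantities are assumptions.

```python
# Hedged sketch: augment each training batch with adversarially perturbed
# copies of the inputs (FGSM-style sign-of-gradient perturbation).
import numpy as np

rng = np.random.default_rng(2)
X = rng.normal(size=(256, 10))
w_true = rng.normal(size=10)
y = (X @ w_true > 0).astype(float)

w = np.zeros(10)
eps, lr = 0.1, 0.5
for _ in range(100):
    p = 1.0 / (1.0 + np.exp(-X @ w))
    grad_x = (p - y)[:, None] * w[None, :]          # d(loss)/d(inputs)
    X_adv = X + eps * np.sign(grad_x)               # adversarial copies
    X_aug = np.concatenate([X, X_adv])              # clean + adversarial batch
    y_aug = np.concatenate([y, y])
    p_aug = 1.0 / (1.0 + np.exp(-X_aug @ w))
    w -= lr * X_aug.T @ (p_aug - y_aug) / len(X_aug)

print("train accuracy on clean data:", float(np.mean((X @ w > 0) == y)))
```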
Additionally, it requires frequent re-training of\nthe entire model to account for new attack types, resulting in significant and\ncostly computations. Such limitations make adversarial training mechanisms less\npractical, particularly for complex Pre-trained Language Models (PLMs) with\nmillions or even billions of parameters. To overcome these challenges while\nstill harnessing the theoretical benefits of adversarial training, this study\ncombines two concepts: (1) adapters, which enable parameter-efficient\nfine-tuning, and (2) Mixup, which train NNs via convex combinations of pairs\ndata pairs. Intuitively, we propose to fine-tune PLMs through convex\ncombinations of non-data pairs of fine-tuned adapters, one trained with clean\nand another trained with adversarial examples. Our experiments show that the\nproposed method achieves the best trade-off between training efficiency and\npredictive performance, both with and without attacks compared to other\nbaselines on a variety of downstream tasks.\n","authors":["Tuc Nguyen","Thai Le"],"pdf_url":"https://arxiv.org/pdf/2401.10111v1.pdf","comment":"10 pages and 2 figures"},{"id":"http://arxiv.org/abs/2310.12086v2","updated":"2024-01-18T16:20:06Z","published":"2023-10-18T16:27:49Z","title":"FactCHD: Benchmarking Fact-Conflicting Hallucination Detection","summary":" Despite their impressive generative capabilities, LLMs are hindered by\nfact-conflicting hallucinations in real-world applications. The accurate\nidentification of hallucinations in texts generated by LLMs, especially in\ncomplex inferential scenarios, is a relatively unexplored area. To address this\ngap, we present FactCHD, a dedicated benchmark designed for the detection of\nfact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset\nthat spans various factuality patterns, including vanilla, multi-hop,\ncomparison, and set operation. A distinctive element of FactCHD is its\nintegration of fact-based evidence chains, significantly enhancing the depth of\nevaluating the detectors' explanations. Experiments on different LLMs expose\nthe shortcomings of current approaches in detecting factual errors accurately.\nFurthermore, we introduce Truth-Triangulator that synthesizes reflective\nconsiderations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming\nto yield more credible detection through the amalgamation of predictive results\nand evidence. The benchmark dataset is available at\nhttps://github.com/zjunlp/FactCHD.\n","authors":["Xiang Chen","Duanzheng Song","Honghao Gui","Chenxi Wang","Ningyu Zhang","Jiang Yong","Fei Huang","Chengfei Lv","Dan Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.12086v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2304.09048v2","updated":"2024-01-18T16:14:35Z","published":"2023-04-18T15:12:34Z","title":"CodeKGC: Code Language Model for Generative Knowledge Graph Construction","summary":" Current generative knowledge graph construction approaches usually fail to\ncapture structural knowledge by simply flattening natural language into\nserialized texts or a specification language. However, large generative\nlanguage model trained on structured data such as code has demonstrated\nimpressive capability in understanding natural language for structural\nprediction and reasoning tasks. 
Intuitively, we address the task of generative\nknowledge graph construction with code language model: given a code-format\nnatural language input, the target is to generate triples which can be\nrepresented as code completion tasks. Specifically, we develop schema-aware\nprompts that effectively utilize the semantic structure within the knowledge\ngraph. As code inherently possesses structure, such as class and function\ndefinitions, it serves as a useful model for prior semantic structural\nknowledge. Furthermore, we employ a rationale-enhanced generation method to\nboost the performance. Rationales provide intermediate steps, thereby improving\nknowledge extraction abilities. Experimental results indicate that the proposed\napproach can obtain better performance on benchmark datasets compared with\nbaselines. Code and datasets are available in\nhttps://github.com/zjunlp/DeepKE/tree/main/example/llm.\n","authors":["Zhen Bi","Jing Chen","Yinuo Jiang","Feiyu Xiong","Wei Guo","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.09048v2.pdf","comment":"ACM Transactions on Asian and Low-Resource Language Information\n Processing"},{"id":"http://arxiv.org/abs/2401.10091v1","updated":"2024-01-18T15:59:42Z","published":"2024-01-18T15:59:42Z","title":"Power in Numbers: Robust reading comprehension by finetuning with four\n adversarial sentences per example","summary":" Recent models have achieved human level performance on the Stanford Question\nAnswering Dataset when using F1 scores to evaluate the reading comprehension\ntask. Yet, teaching machines to comprehend text has not been solved in the\ngeneral case. By appending one adversarial sentence to the context paragraph,\npast research has shown that the F1 scores from reading comprehension models\ndrop almost in half. In this paper, I replicate past adversarial research with\na new model, ELECTRA-Small, and demonstrate that the new model's F1 score drops\nfrom 83.9% to 29.2%. To improve ELECTRA-Small's resistance to this attack, I\nfinetune the model on SQuAD v1.1 training examples with one to five adversarial\nsentences appended to the context paragraph. Like past research, I find that\nthe finetuned model on one adversarial sentence does not generalize well across\nevaluation datasets. However, when finetuned on four or five adversarial\nsentences the model attains an F1 score of more than 70% on most evaluation\ndatasets with multiple appended and prepended adversarial sentences. The\nresults suggest that with enough examples we can make models robust to\nadversarial attacks.\n","authors":["Ariel Marcus"],"pdf_url":"https://arxiv.org/pdf/2401.10091v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11065v2","updated":"2024-01-18T15:57:19Z","published":"2023-04-19T06:54:14Z","title":"Conversational Process Modeling: Can Generative AI Empower Domain\n Experts in Creating and Redesigning Process Models?","summary":" AI-driven chatbots such as ChatGPT have caused a tremendous hype lately. For\nBPM applications, several applications for AI-driven chatbots have been\nidentified to be promising to generate business value, including explanation of\nprocess mining outcomes and preparation of input data. However, a systematic\nanalysis of chatbots for their support of conversational process modeling as a\nprocess-oriented capability is missing. This work aims at closing this gap by\nproviding a systematic analysis of existing chatbots. Application scenarios are\nidentified along the process life cycle. 
Then a systematic literature review on\nconversational process modeling is performed, resulting in a taxonomy of\napplication scenarios for conversational process modeling, including\nparaphrasing and improvement of process descriptions. In addition, this work\nsuggests and applies an evaluation method for the output of AI-driven chatbots\nwith respect to completeness and correctness of the process models. This method\nconsists of a set of KPIs on a test set, a set of prompts for task and control\nflow extraction, as well as a survey with users. Based on the literature and\nthe evaluation, recommendations for the usage (practical implications) and\nfurther development (research directions) of conversational process modeling\nare derived.\n","authors":["Nataliia Klievtsova","Janik-Vasily Benzin","Timotheus Kampik","Juergen Mangler","Stefanie Rinderle-Ma"],"pdf_url":"https://arxiv.org/pdf/2304.11065v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10070v1","updated":"2024-01-18T15:39:38Z","published":"2024-01-18T15:39:38Z","title":"Communication-Efficient Personalized Federated Learning for\n Speech-to-Text Tasks","summary":" To protect privacy and meet legal regulations, federated learning (FL) has\ngained significant attention for training speech-to-text (S2T) systems,\nincluding automatic speech recognition (ASR) and speech translation (ST).\nHowever, the commonly used FL approach (i.e., \\textsc{FedAvg}) in S2T tasks\ntypically suffers from extensive communication overhead due to multi-round\ninteractions based on the whole model and performance degradation caused by\ndata heterogeneity among clients.To address these issues, we propose a\npersonalized federated S2T framework that introduces \\textsc{FedLoRA}, a\nlightweight LoRA module for client-side tuning and interaction with the server\nto minimize communication overhead, and \\textsc{FedMem}, a global model\nequipped with a $k$-nearest-neighbor ($k$NN) classifier that captures\nclient-specific distributional shifts to achieve personalization and overcome\ndata heterogeneity. Extensive experiments based on Conformer and Whisper\nbackbone models on CoVoST and GigaSpeech benchmarks show that our approach\nsignificantly reduces the communication overhead on all S2T tasks and\neffectively personalizes the global model to overcome data heterogeneity.\n","authors":["Yichao Du","Zhirui Zhang","Linan Yue","Xu Huang","Yuqing Zhang","Tong Xu","Linli Xu","Enhong Chen"],"pdf_url":"https://arxiv.org/pdf/2401.10070v1.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.10462v2","updated":"2024-01-18T15:37:33Z","published":"2023-08-21T04:31:06Z","title":"Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation\n with Large Language Models","summary":" Large Language Models (LLMs) demonstrate impressive capabilities to generate\naccurate code snippets given natural language intents in zero-shot, i.e.,\nwithout the need for specific fine-tuning. While prior studies have highlighted\nthe advantages of fine-tuning LLMs, this process incurs high computational\ncosts, making it impractical in resource-scarce environments, particularly for\nmodels with billions of parameters. To address these challenges, previous\nresearch explored In-Context Learning (ICL) as a strategy to guide the LLM\ngenerative process with task-specific prompt examples. 
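A minimal sketch of the ICL setup mentioned here, with hypothetical demonstrations and a placeholder in place of a real LLM call: task-specific examples are simply prepended to the query, and no model parameters are updated.

```python
# Illustrative sketch of in-context learning for code generation: demonstrations
# are concatenated ahead of the new intent. The examples and the send_to_llm
# stub are assumptions, not taken from the paper.
EXAMPLES = [
    ("Return the square of x.", "def square(x):\n    return x * x"),
    ("Check whether n is even.", "def is_even(n):\n    return n % 2 == 0"),
]

def build_icl_prompt(intent: str) -> str:
    shots = "\n\n".join(f"# Intent: {i}\n{code}" for i, code in EXAMPLES)
    return f"{shots}\n\n# Intent: {intent}\n"

def send_to_llm(prompt: str) -> str:
    """Placeholder for a real LLM call; simply echoes the prompt here."""
    return prompt

print(send_to_llm(build_icl_prompt("Reverse a string s.")))
```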
However, ICL introduces\ninconveniences, such as the need for designing contextually relevant prompts\nand the absence of learning task-specific parameters, thereby limiting\ndownstream task performance. In this context, we foresee Parameter-Efficient\nFine-Tuning (PEFT) techniques as a promising approach to efficiently specialize\nLLMs to task-specific data while maintaining reasonable resource consumption.\nIn this paper, we deliver a comprehensive study of PEFT techniques for LLMs\nunder the automated code generation scenario. Our comprehensive investigation\nof PEFT techniques for LLMs reveals their superiority and potential over ICL\nacross a diverse set of LLMs. Additionally, we demonstrate the extended\ncapabilities of PEFT, showcasing its ability to learn from two distinct\ndatasets jointly without compromising performance. Furthermore, our study\nhighlights the potential for tuning larger LLMs and significant reductions in\nmemory usage by combining PEFT with quantization. Therefore, this study opens\nopportunities for broader applications of PEFT in software engineering\nscenarios. Our code is available at\nhttps://github.com/martin-wey/peft-llm-code/.\n","authors":["Martin Weyssow","Xin Zhou","Kisub Kim","David Lo","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2308.10462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10065v1","updated":"2024-01-18T15:32:24Z","published":"2024-01-18T15:32:24Z","title":"Code Prompting Elicits Conditional Reasoning Abilities in Text+Code LLMs","summary":" Reasoning is a fundamental component for achieving language understanding.\nAmong the multiple types of reasoning, conditional reasoning, the ability to\ndraw different conclusions depending on some condition, has been understudied\nin large language models (LLMs). Recent prompting methods, such as chain of\nthought, have significantly improved LLMs on reasoning tasks. Nevertheless,\nthere is still little understanding of what triggers reasoning abilities in\nLLMs. We hypothesize that code prompts can trigger conditional reasoning in\nLLMs trained on text and code. We propose a chain of prompts that transforms a\nnatural language problem into code and prompts the LLM with the generated code.\nOur experiments find that code prompts exhibit a performance boost between 2.6\nand 7.7 points on GPT 3.5 across multiple datasets requiring conditional\nreasoning. We then conduct experiments to discover how code prompts elicit\nconditional reasoning abilities and through which features. We observe that\nprompts need to contain natural language text accompanied by high-quality code\nthat closely represents the semantics of the instance text. Furthermore, we\nshow that code prompts are more efficient, requiring fewer demonstrations, and\nthat they trigger superior state tracking of variables or key entities.\n","authors":["Haritz Puerto","Martin Tutek","Somak Aditya","Xiaodan Zhu","Iryna Gurevych"],"pdf_url":"https://arxiv.org/pdf/2401.10065v1.pdf","comment":"Code, prompt templates, prompts, and outputs are publicly available\n at https://github.com/UKPLab/arxiv2024-conditional-reasoning-llms"},{"id":"http://arxiv.org/abs/2401.10045v1","updated":"2024-01-18T15:08:58Z","published":"2024-01-18T15:08:58Z","title":"Antonym vs Synonym Distinction using InterlaCed Encoder NETworks\n (ICE-NET)","summary":" Antonyms vs synonyms distinction is a core challenge in lexico-semantic\nanalysis and automated lexical resource construction. 
These pairs share a\nsimilar distributional context which makes it harder to distinguish them.\nLeading research in this regard attempts to capture the properties of the\nrelation pairs, i.e., symmetry, transitivity, and trans-transitivity. However,\nthe inability of existing research to appropriately model the relation-specific\nproperties limits their end performance. In this paper, we propose InterlaCed\nEncoder NETworks (i.e., ICE-NET) for antonym vs synonym distinction, that aim\nto capture and model the relation-specific properties of the antonyms and\nsynonyms pairs in order to perform the classification task in a\nperformance-enhanced manner. Experimental evaluation using the benchmark\ndatasets shows that ICE-NET outperforms the existing research by a relative\nscore of upto 1.8% in F1-measure. We release the codes for ICE-NET at\nhttps://github.com/asif6827/ICENET.\n","authors":["Muhammad Asif Ali","Yan Hu","Jianbin Qin","Di Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10040v1","updated":"2024-01-18T15:04:55Z","published":"2024-01-18T15:04:55Z","title":"Large Language Models for Scientific Information Extraction: An\n Empirical Study for Virology","summary":" In this paper, we champion the use of structured and semantic content\nrepresentation of discourse-based scholarly communication, inspired by tools\nlike Wikipedia infoboxes or structured Amazon product descriptions. These\nrepresentations provide users with a concise overview, aiding scientists in\nnavigating the dense academic landscape. Our novel automated approach leverages\nthe robust text generation capabilities of LLMs to produce structured scholarly\ncontribution summaries, offering both a practical solution and insights into\nLLMs' emergent abilities.\n For LLMs, the prime focus is on improving their general intelligence as\nconversational agents. We argue that these models can also be applied\neffectively in information extraction (IE), specifically in complex IE tasks\nwithin terse domains like Science. This paradigm shift replaces the traditional\nmodular, pipelined machine learning approach with a simpler objective expressed\nthrough instructions. Our results show that finetuned FLAN-T5 with 1000x fewer\nparameters than the state-of-the-art GPT-davinci is competitive for the task.\n","authors":["Mahsa Shamsabadi","Jennifer D'Souza","Sören Auer"],"pdf_url":"https://arxiv.org/pdf/2401.10040v1.pdf","comment":"8 pages, 6 figures, Accepted as Findings of the ACL: EACL 2024"},{"id":"http://arxiv.org/abs/2401.10034v1","updated":"2024-01-18T14:58:17Z","published":"2024-01-18T14:58:17Z","title":"Evolutionary Computation in the Era of Large Language Model: Survey and\n Roadmap","summary":" Large Language Models (LLMs), built upon Transformer-based architectures with\nmassive pretraining on diverse data, have not only revolutionized natural\nlanguage processing but also extended their prowess to various domains, marking\na significant stride towards artificial general intelligence. The interplay\nbetween LLMs and Evolutionary Algorithms (EAs), despite differing in objectives\nand methodologies, reveals intriguing parallels, especially in their shared\noptimization nature, black-box characteristics, and proficiency in handling\ncomplex problems. Meanwhile, EA can not only provide an optimization framework\nfor LLM's further enhancement under black-box settings but also empower LLM\nwith flexible global search and iterative mechanism in applications. 
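As a toy illustration of the black-box synergy described here, the sketch below runs an evolutionary search over prompt wordings against a stand-in scoring function; the phrase pool and the scoring function are assumptions, and no real LLM is involved.

```python
# Illustrative sketch: an evolutionary loop optimises a prompt under a
# black-box score. `llm_score` is a placeholder for an LLM-based evaluation.
import random

random.seed(0)
PHRASES = ["step by step", "briefly", "as a list", "with examples", "formally"]

def llm_score(prompt: str) -> float:
    """Stand-in for a black-box quality score returned by an LLM pipeline."""
    return sum(prompt.count(p) for p in ("step by step", "with examples")) - 0.01 * len(prompt)

def mutate(prompt: str) -> str:
    return prompt + " " + random.choice(PHRASES)

population = ["Explain the answer."] * 4
for _ in range(20):                                   # evolutionary iterations
    children = [mutate(p) for p in population for _ in range(3)]
    population = sorted(population + children, key=llm_score, reverse=True)[:4]

print(population[0], llm_score(population[0]))
```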
On the\nother hand, LLM's abundant domain knowledge enables EA to perform smarter\nsearches, while its text processing capability assist in deploying EA across\nvarious tasks. Based on their complementary advantages, this paper presents a\ncomprehensive review and forward-looking roadmap, categorizing their mutual\ninspiration into LLM-enhanced evolutionary optimization and EA-enhanced LLM.\nSome integrated synergy methods are further introduced to exemplify the\namalgamation of LLMs and EAs in various application scenarios, including neural\narchitecture search, code generation, software engineering, and text\ngeneration. As the first comprehensive review specifically focused on the EA\nresearch in the era of LLMs, this paper provides a foundational stepping stone\nfor understanding and harnessing the collaborative potential of LLMs and EAs.\nBy presenting a comprehensive review, categorization, and critical analysis, we\ncontribute to the ongoing discourse on the cross-disciplinary study of these\ntwo powerful paradigms. The identified challenges and future directions offer\nguidance to unlock the full potential of this innovative collaboration.\n","authors":["Xingyu Wu","Sheng-hao Wu","Jibin Wu","Liang Feng","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2401.10034v1.pdf","comment":"evolutionary algorithm (EA), large language model (LLM), optimization\n problem, prompt optimization, architecture search, code generation"},{"id":"http://arxiv.org/abs/2401.10030v1","updated":"2024-01-18T14:56:23Z","published":"2024-01-18T14:56:23Z","title":"Framing Analysis of Health-Related Narratives: Conspiracy versus\n Mainstream Media","summary":" Understanding how online media frame issues is crucial due to their impact on\npublic opinion. Research on framing using natural language processing\ntechniques mainly focuses on specific content features in messages and neglects\ntheir narrative elements. Also, the distinction between framing in different\nsources remains an understudied problem. We address those issues and\ninvestigate how the framing of health-related topics, such as COVID-19 and\nother diseases, differs between conspiracy and mainstream websites. We\nincorporate narrative information into the framing analysis by introducing a\nnovel frame extraction approach based on semantic graphs. We find that\nhealth-related narratives in conspiracy media are predominantly framed in terms\nof beliefs, while mainstream media tend to present them in terms of science. We\nhope our work offers new ways for a more nuanced frame analysis.\n","authors":["Markus Reiter-Haas","Beate Klösch","Markus Hadler","Elisabeth Lex"],"pdf_url":"https://arxiv.org/pdf/2401.10030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10020v1","updated":"2024-01-18T14:43:47Z","published":"2024-01-18T14:43:47Z","title":"Self-Rewarding Language Models","summary":" We posit that to achieve superhuman agents, future models require superhuman\nfeedback in order to provide an adequate training signal. Current approaches\ncommonly train reward models from human preferences, which may then be\nbottlenecked by human performance level, and secondly these separate frozen\nreward models cannot then learn to improve during LLM training. 
In this work,\nwe study Self-Rewarding Language Models, where the language model itself is\nused via LLM-as-a-Judge prompting to provide its own rewards during training.\nWe show that during Iterative DPO training that not only does instruction\nfollowing ability improve, but also the ability to provide high-quality rewards\nto itself. Fine-tuning Llama 2 70B on three iterations of our approach yields a\nmodel that outperforms many existing systems on the AlpacaEval 2.0 leaderboard,\nincluding Claude 2, Gemini Pro, and GPT-4 0613. While only a preliminary study,\nthis work opens the door to the possibility of models that can continually\nimprove in both axes.\n","authors":["Weizhe Yuan","Richard Yuanzhe Pang","Kyunghyun Cho","Sainbayar Sukhbaatar","Jing Xu","Jason Weston"],"pdf_url":"https://arxiv.org/pdf/2401.10020v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10019v1","updated":"2024-01-18T14:40:46Z","published":"2024-01-18T14:40:46Z","title":"R-Judge: Benchmarking Safety Risk Awareness for LLM Agents","summary":" Large language models (LLMs) have exhibited great potential in autonomously\ncompleting tasks across real-world applications. Despite this, these LLM agents\nintroduce unexpected safety risks when operating in interactive environments.\nInstead of centering on LLM-generated content safety in most prior studies,\nthis work addresses the imperative need for benchmarking the behavioral safety\nof LLM agents within diverse environments. We introduce R-Judge, a benchmark\ncrafted to evaluate the proficiency of LLMs in judging safety risks given agent\ninteraction records. R-Judge comprises 162 agent interaction records,\nencompassing 27 key risk scenarios among 7 application categories and 10 risk\ntypes. It incorporates human consensus on safety with annotated safety risk\nlabels and high-quality risk descriptions. Utilizing R-Judge, we conduct a\ncomprehensive evaluation of 8 prominent LLMs commonly employed as the backbone\nfor agents. The best-performing model, GPT-4, achieves 72.29% in contrast to\nthe human score of 89.38%, showing considerable room for enhancing the risk\nawareness of LLMs. Notably, leveraging risk descriptions as environment\nfeedback significantly improves model performance, revealing the importance of\nsalient safety risk feedback. Furthermore, we design an effective chain of\nsafety analysis technique to help the judgment of safety risks and conduct an\nin-depth case study to facilitate future research. R-Judge is publicly\navailable at https://github.com/Lordog/R-Judge.\n","authors":["Tongxin Yuan","Zhiwei He","Lingzhong Dong","Yiming Wang","Ruijie Zhao","Tian Xia","Lizhen Xu","Binglin Zhou","Fangqi Li","Zhuosheng Zhang","Rui Wang","Gongshen Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10019v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10016v1","updated":"2024-01-18T14:34:49Z","published":"2024-01-18T14:34:49Z","title":"Gender Bias in Machine Translation and The Era of Large Language Models","summary":" This chapter examines the role of Machine Translation in perpetuating gender\nbias, highlighting the challenges posed by cross-linguistic settings and\nstatistical dependencies. A comprehensive overview of relevant existing work\nrelated to gender bias in both conventional Neural Machine Translation\napproaches and Generative Pretrained Transformer models employed as Machine\nTranslation systems is provided. 
Through an experiment using ChatGPT (based on\nGPT-3.5) in an English-Italian translation context, we further assess ChatGPT's\ncurrent capacity to address gender bias. The findings emphasize the ongoing\nneed for advancements in mitigating bias in Machine Translation systems and\nunderscore the importance of fostering fairness and inclusivity in language\ntechnologies.\n","authors":["Eva Vanmassenhove"],"pdf_url":"https://arxiv.org/pdf/2401.10016v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2401.10015v1","updated":"2024-01-18T14:33:01Z","published":"2024-01-18T14:33:01Z","title":"Towards Hierarchical Spoken Language Dysfluency Modeling","summary":" Speech dysfluency modeling is the bottleneck for both speech therapy and\nlanguage learning. However, there is no AI solution to systematically tackle\nthis problem. We first propose to define the concept of dysfluent speech and\ndysfluent speech modeling. We then present Hierarchical Unconstrained\nDysfluency Modeling (H-UDM) approach that addresses both dysfluency\ntranscription and detection to eliminate the need for extensive manual\nannotation. Furthermore, we introduce a simulated dysfluent dataset called\nVCTK++ to enhance the capabilities of H-UDM in phonetic transcription. Our\nexperimental results demonstrate the effectiveness and robustness of our\nproposed methods in both transcription and detection tasks.\n","authors":["Jiachen Lian","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2401.10015v1.pdf","comment":"2024 EACL Long (main conference). arXiv admin note: substantial text\n overlap with arXiv:2312.12810"},{"id":"http://arxiv.org/abs/2311.13184v2","updated":"2024-01-18T14:32:15Z","published":"2023-11-22T06:23:18Z","title":"Large Language Model-Enhanced Algorithm Selection: Towards Comprehensive\n Algorithm Representation","summary":" Algorithm selection aims to identify the most suitable algorithm for solving\na specific problem before execution, which has become a critical process of the\nAutoML. Current mainstream algorithm selection techniques rely heavily on\nfeature representations of various problems and employ the performance of each\nalgorithm as supervised information. However, there is a significant research\ngap concerning the consideration of algorithm features. This gap is primarily\nattributed to the inherent complexity of algorithms, making it particularly\nchallenging to find a universally effective feature extraction method that is\napplicable across a diverse range of algorithms. Unfortunately, neglecting this\naspect undoubtedly impacts the accuracy of algorithm selection and indirectly\nnecessitates an increased volume of problem data for training purposes. This\npaper takes a significant stride towards addressing this gap by proposing an\napproach that integrates algorithm representation into the algorithm selection\nprocess. Specifically, our proposed model employs distinct modules to extract\nrepresentations of both problems and algorithms, where the algorithm\nrepresentation leverages the capabilities of pre-trained LLMs in the realm of\ncode comprehension. Following the extraction of embedding vectors for both\nalgorithms and problems, the most suitable algorithm is determined through\ncalculations of matching degrees. 
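A minimal sketch of the matching-degree idea, assuming a toy hashing embedder in place of the paper's pre-trained LLM encoders: problem and algorithm descriptions are embedded in a shared space and the algorithm with the highest cosine similarity to the problem is selected. In the paper's setting the representations would come from code-comprehension LLMs, but the selection step has this shape.

```python
# Illustrative sketch: select the algorithm whose representation has the
# highest cosine "matching degree" with the problem representation.
import numpy as np

def embed(text: str, dim: int = 64) -> np.ndarray:
    """Toy bag-of-words hashing embedding (consistent within one run);
    a placeholder for an LLM-based encoder."""
    vec = np.zeros(dim)
    for token in text.lower().split():
        vec[hash(token) % dim] += 1.0
    return vec / (np.linalg.norm(vec) + 1e-9)

algorithms = {
    "simulated annealing": "stochastic local search for combinatorial optimisation",
    "gradient descent": "first-order continuous optimisation using gradients",
    "dynamic programming": "exact method for problems with overlapping subproblems",
}

problem = "minimise a smooth continuous loss function using gradients"
p_vec = embed(problem)
matching = {name: float(embed(desc) @ p_vec) for name, desc in algorithms.items()}
print(max(matching, key=matching.get), matching)
```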
Our experiments not only validate the\neffectiveness of the proposed model but also showcase the performance of\ndifferent embedded pre-trained LLMs, which suggests that the proposed algorithm\nselection framework holds the potential to serve as a baseline task for\nevaluating the code representation capabilities of LLMs.\n","authors":["Xingyu Wu","Yan Zhong","Jibin Wu","Bingbing Jiang","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2311.13184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18913v2","updated":"2024-01-18T14:23:07Z","published":"2023-10-29T05:50:03Z","title":"Debiasing Algorithm through Model Adaptation","summary":" Large language models are becoming the go-to solution for various language\ntasks. However, with growing capacity, models are prone to rely on spurious\ncorrelations stemming from biases and stereotypes present in the training data.\nThis work proposes a novel method for detecting and mitigating gender bias in\nlanguage models. We perform causal analysis to identify problematic model\ncomponents and discover that mid-upper feed-forward layers are most prone to\nconvey biases. Based on the analysis results, we adapt the model by multiplying\nthese layers by a linear projection. Our titular method, DAMA, significantly\ndecreases bias as measured by diverse metrics while maintaining the model's\nperformance on downstream tasks. We release code for our method and models,\nwhich retain LLaMA's state-of-the-art performance while being significantly\nless biased.\n","authors":["Tomasz Limisiewicz","David Mareček","Tomáš Musil"],"pdf_url":"https://arxiv.org/pdf/2310.18913v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10005v1","updated":"2024-01-18T14:21:56Z","published":"2024-01-18T14:21:56Z","title":"Advancing Large Multi-modal Models with Explicit Chain-of-Reasoning and\n Visual Question Generation","summary":" The increasing demand for intelligent systems capable of interpreting and\nreasoning about visual content requires the development of Large Multi-Modal\nModels (LMMs) that are not only accurate but also have explicit reasoning\ncapabilities. This paper presents a novel approach to imbue an LMM with the\nability to conduct explicit reasoning based on visual content and textual\ninstructions. We introduce a system that can ask a question to acquire\nnecessary knowledge, thereby enhancing the robustness and explicability of the\nreasoning process. Our method comprises the development of a novel dataset\ngenerated by a Large Language Model (LLM), designed to promote chain-of-thought\nreasoning combined with a question-asking mechanism. We designed an LMM with\nstrong region-awareness capabilities to address the intricate requirements\nof image-text alignment. The model undergoes a three-stage training phase,\nstarting with large-scale image-text alignment using large-scale datasets,\nfollowed by instruction tuning, and fine-tuning with a focus on\nchain-of-thought reasoning. 
The results demonstrate a stride toward a more\nrobust, accurate, and interpretable LMM, capable of reasoning explicitly and\nseeking information proactively when confronted with ambiguous visual input.\n","authors":["Kohei Uehara","Nabarun Goswami","Hanqin Wang","Toshiaki Baba","Kohtaro Tanaka","Tomohiro Hashimoto","Kai Wang","Rei Ito","Takagi Naoya","Ryo Umagami","Yingyi Wen","Tanachai Anakewat","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2401.10005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10002v1","updated":"2024-01-18T14:17:40Z","published":"2024-01-18T14:17:40Z","title":"Distantly Supervised Morpho-Syntactic Model for Relation Extraction","summary":" The task of Information Extraction (IE) involves automatically converting\nunstructured textual content into structured data. Most research in this field\nconcentrates on extracting all facts or a specific set of relationships from\ndocuments. In this paper, we present a method for the extraction and\ncategorisation of an unrestricted set of relationships from text. Our method\nrelies on morpho-syntactic extraction patterns obtained by a distant\nsupervision method, and creates Syntactic and Semantic Indices to extract and\nclassify candidate graphs. We evaluate our approach on six datasets built on\nWikidata and Wikipedia. The evaluation shows that our approach can achieve\nPrecision scores of up to 0.85, but with lower Recall and F1 scores. Our\napproach allows to quickly create rule-based systems for Information Extraction\nand to build annotated datasets to train machine-learning and deep-learning\nbased classifiers.\n","authors":["Nicolas Gutehrlé","Iana Atanassova"],"pdf_url":"https://arxiv.org/pdf/2401.10002v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.09984v1","updated":"2024-01-18T13:58:10Z","published":"2024-01-18T13:58:10Z","title":"Gradable ChatGPT Translation Evaluation","summary":" ChatGPT, as a language model based on large-scale pre-training, has exerted a\nprofound influence on the domain of machine translation. In ChatGPT, a \"Prompt\"\nrefers to a segment of text or instruction employed to steer the model towards\ngenerating a specific category of response. The design of the translation\nprompt emerges as a key aspect that can wield influence over factors such as\nthe style, precision and accuracy of the translation to a certain extent.\nHowever, there is a lack of a common standard and methodology on how to design\nand select a translation prompt. Accordingly, this paper proposes a generic\ntaxonomy, which defines gradable translation prompts in terms of expression\ntype, translation style, POS information and explicit statement, thus\nfacilitating the construction of prompts endowed with distinct attributes\ntailored for various translation tasks. Specific experiments and cases are\nselected to validate and illustrate the effectiveness of the method.\n","authors":["Hui Jiao","Bei Peng","Lu Zong","Xiaojun Zhang","Xinwei Li"],"pdf_url":"https://arxiv.org/pdf/2401.09984v1.pdf","comment":"Under review in the journal Procesamiento del Lenguaje Natural"},{"id":"http://arxiv.org/abs/2401.09972v1","updated":"2024-01-18T13:41:08Z","published":"2024-01-18T13:41:08Z","title":"Better Explain Transformers by Illuminating Important Information","summary":" Transformer-based models excel in various natural language processing (NLP)\ntasks, attracting countless efforts to explain their inner workings. 
Prior\nmethods explain Transformers by focusing on the raw gradient and attention as\ntoken attribution scores, where non-relevant information is often considered\nduring explanation computation, resulting in confusing results. In this work,\nwe propose highlighting the important information and eliminating irrelevant\ninformation by a refined information flow on top of the layer-wise relevance\npropagation (LRP) method. Specifically, we consider identifying syntactic and\npositional heads as important attention heads and focus on the relevance\nobtained from these important heads. Experimental results demonstrate that\nirrelevant information does distort output attribution scores and then should\nbe masked during explanation computation. Compared to eight baselines on both\nclassification and question-answering datasets, our method consistently\noutperforms with over 3\\% to 33\\% improvement on explanation metrics, providing\nsuperior explanation performance. Our anonymous code repository is available\nat: https://github.com/LinxinS97/Mask-LRP\n","authors":["Linxin Song","Yan Cui","Ao Luo","Freddy Lecue","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2401.09972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13971v6","updated":"2024-01-18T13:35:55Z","published":"2023-05-23T11:54:37Z","title":"Grammar-Constrained Decoding for Structured NLP Tasks without Finetuning","summary":" Despite their impressive performance, large language models (LMs) still\nstruggle with reliably generating complex output structures when not finetuned\nto follow the required output format exactly. To address this issue,\ngrammar-constrained decoding (GCD) can be used to control the generation of\nLMs, guaranteeing that the output follows a given structure. Most existing GCD\nmethods are, however, limited to specific tasks, such as parsing or code\ngeneration. In this work, we demonstrate that formal grammars can describe the\noutput space for a much wider range of tasks and argue that GCD can serve as a\nunified framework for structured NLP tasks in general. For increased\nflexibility, we introduce input-dependent grammars, which allow the grammar to\ndepend on the input and thus enable the generation of different output\nstructures for different inputs. We then empirically demonstrate the power and\nflexibility of GCD-enhanced LMs on (1) information extraction, (2) entity\ndisambiguation, and (3) constituency parsing. Our results indicate that\ngrammar-constrained LMs substantially outperform unconstrained LMs or even beat\ntask-specific finetuned models. Grammar constraints thus hold great promise for\nharnessing off-the-shelf LMs for a wide range of structured NLP tasks,\nespecially where training data is scarce or finetuning is expensive. Code and\ndata: https://github.com/epfl-dlab/GCD.\n","authors":["Saibo Geng","Martin Josifoski","Maxime Peyrard","Robert West"],"pdf_url":"https://arxiv.org/pdf/2305.13971v6.pdf","comment":"Accepted at EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2401.09967v1","updated":"2024-01-18T13:31:24Z","published":"2024-01-18T13:31:24Z","title":"Sketch-Guided Constrained Decoding for Boosting Blackbox Large Language\n Models without Logit Access","summary":" Constrained decoding, a technique for enforcing constraints on language model\noutputs, offers a way to control text generation without retraining or\narchitectural modifications. 
Its application is, however, typically restricted\nto models that give users access to next-token distributions (usually via\nsoftmax logits), which poses a limitation with blackbox large language models\n(LLMs). This paper introduces sketch-guided constrained decoding (SGCD), a\nnovel approach to constrained decoding for blackbox LLMs, which operates\nwithout access to the logits of the blackbox LLM. SGCD utilizes a locally\nhosted auxiliary model to refine the output of an unconstrained blackbox LLM,\neffectively treating this initial output as a \"sketch\" for further elaboration.\nThis approach is complementary to traditional logit-based techniques and\nenables the application of constrained decoding in settings where full model\ntransparency is unavailable. We demonstrate the efficacy of SGCD through\nexperiments in closed information extraction and constituency parsing, showing\nhow it enhances the utility and flexibility of blackbox LLMs for complex NLP\ntasks.\n","authors":["Saibo Geng","Berkay Döner","Chris Wendler","Martin Josifoski","Robert West"],"pdf_url":"https://arxiv.org/pdf/2401.09967v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08483v2","updated":"2024-01-18T11:43:52Z","published":"2023-10-12T16:42:53Z","title":"Understanding the Humans Behind Online Misinformation: An Observational\n Study Through the Lens of the COVID-19 Pandemic","summary":" The proliferation of online misinformation has emerged as one of the biggest\nthreats to society. Considerable efforts have focused on building\nmisinformation detection models, still the perils of misinformation remain\nabound. Mitigating online misinformation and its ramifications requires a\nholistic approach that encompasses not only an understanding of its intricate\nlandscape in relation to the complex issue and topic-rich information ecosystem\nonline, but also the psychological drivers of individuals behind it. Adopting a\ntime series analytic technique and robust causal inference-based design, we\nconduct a large-scale observational study analyzing over 32 million COVID-19\ntweets and 16 million historical timeline tweets. We focus on understanding the\nbehavior and psychology of users disseminating misinformation during COVID-19\nand its relationship with the historical inclinations towards sharing\nmisinformation on Non-COVID domains before the pandemic. Our analysis\nunderscores the intricacies inherent to cross-domain misinformation, and\nhighlights that users' historical inclination toward sharing misinformation is\npositively associated with their present behavior pertaining to misinformation\nsharing on emergent topics and beyond. This work may serve as a valuable\nfoundation for designing user-centric inoculation strategies and\necologically-grounded agile interventions for effectively tackling online\nmisinformation.\n","authors":["Mohit Chandra","Anush Mattapalli","Munmun De Choudhury"],"pdf_url":"https://arxiv.org/pdf/2310.08483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07284v2","updated":"2024-01-18T11:29:37Z","published":"2024-01-14T13:11:31Z","title":"Improving Domain Adaptation through Extended-Text Reading Comprehension","summary":" To enhance the domain-specific capabilities of large language models,\ncontinued pre-training on a domain-specific corpus is a prevalent method.\nRecent work demonstrates that adapting models using reading comprehension data\nformatted by regex-based patterns can significantly improve performance on\ndomain-specific tasks. 
However, regex-based patterns are incapable of parsing\nraw corpora using domain-specific knowledge. Furthermore, extracting question and\nanswer pairs directly from the corpus in predefined formats\noffers limited context. To address this limitation, we improve reading\ncomprehension via LLM and clustering. The LLM focuses on leveraging domain\nknowledge within the corpus to refine the comprehension stage, while clustering\nsupplies relevant knowledge by extending the context to enrich the reading stage.\nAdditionally, our method incorporates parameter-efficient fine-tuning to\nimprove the efficiency of domain adaptation. In comparison to AdaptLLM, our\nmethod achieves an improvement exceeding 5% in domain-specific tasks. Our code\nwill be available at https://github.com/microsoft/LMOps.\n","authors":["Ting Jiang","Shaohan Huang","Shengyue Luo","Zihan Zhang","Haizhen Huang","Furu Wei","Weiwei Deng","Feng Sun","Qi Zhang","Deqing Wang","Fuzhen Zhuang"],"pdf_url":"https://arxiv.org/pdf/2401.07284v2.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2401.09899v1","updated":"2024-01-18T11:24:30Z","published":"2024-01-18T11:24:30Z","title":"Meme-ingful Analysis: Enhanced Understanding of Cyberbullying in Memes\n Through Multimodal Explanations","summary":" Internet memes have gained significant influence in communicating political,\npsychological, and sociocultural ideas. While memes are often humorous, there\nhas been a rise in the use of memes for trolling and cyberbullying. Although a\nwide variety of effective deep learning-based models have been developed for\ndetecting offensive multimodal memes, only a few works have focused on the\nexplainability aspect. Recent laws, like the \"right to explanations\" of the General Data\nProtection Regulation, have spurred research in developing interpretable models\nrather than only focusing on performance. Motivated by this, we introduce {\em\nMultiBully-Ex}, the first benchmark dataset for multimodal explanation from\ncode-mixed cyberbullying memes. Here, both visual and textual modalities are\nhighlighted to explain why a given meme is cyberbullying. A Contrastive\nLanguage-Image Pretraining (CLIP) projection-based multimodal shared-private\nmultitask approach has been proposed for visual and textual explanation of a\nmeme. Experimental results demonstrate that training with multimodal\nexplanations improves performance in generating textual justifications and more\naccurately identifying the visual evidence supporting a decision with reliable\nperformance improvements.\n","authors":["Prince Jha","Krishanu Maity","Raghav Jain","Apoorv Verma","Sriparna Saha","Pushpak Bhattacharyya"],"pdf_url":"https://arxiv.org/pdf/2401.09899v1.pdf","comment":"EACL2024"},{"id":"http://arxiv.org/abs/2401.09890v1","updated":"2024-01-18T11:05:03Z","published":"2024-01-18T11:05:03Z","title":"A Survey on Hardware Accelerators for Large Language Models","summary":" Large Language Models (LLMs) have emerged as powerful tools for natural\nlanguage processing tasks, revolutionizing the field with their ability to\nunderstand and generate human-like text. As the demand for more sophisticated\nLLMs continues to grow, there is a pressing need to address the computational\nchallenges associated with their scale and complexity. This paper presents a\ncomprehensive survey on hardware accelerators designed to enhance the\nperformance and energy efficiency of Large Language Models. 
By examining a\ndiverse range of accelerators, including GPUs, FPGAs, and custom-designed\narchitectures, we explore the landscape of hardware solutions tailored to meet\nthe unique computational demands of LLMs. The survey encompasses an in-depth\nanalysis of architecture, performance metrics, and energy efficiency\nconsiderations, providing valuable insights for researchers, engineers, and\ndecision-makers aiming to optimize the deployment of LLMs in real-world\napplications.\n","authors":["Christoforos Kachris"],"pdf_url":"https://arxiv.org/pdf/2401.09890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09880v1","updated":"2024-01-18T10:52:46Z","published":"2024-01-18T10:52:46Z","title":"Attention-Based Recurrent Neural Network For Automatic Behavior Laying\n Hen Recognition","summary":" One of the interests of modern poultry farming is the vocalization of laying\nhens, which contains very useful information on health behavior. This information\nis used as health and well-being indicators that help breeders better monitor\nlaying hens, which involves early detection of problems for rapid and more\neffective intervention. In this work, we focus on sound analysis for the\nrecognition of the types of calls of laying hens in order to propose a\nrobust system for characterizing their behavior for better monitoring. To\ndo this, we first collected and annotated laying hen call signals, then\ndesigned an optimal acoustic characterization based on the combination of time\nand frequency domain features. We then used these features to build\nmulti-label classification models based on a recurrent neural network to assign a\nsemantic class to each vocalization that characterizes the laying hen behavior.\nOur model based on the combination\nof time and frequency domain features obtained the highest F1-score\n(F1=92.75), with a gain of 17% over the models using only frequency domain features\nand of 8% over the compared approaches from the literature.\n","authors":["Fréjus A. A. Laleye","Mikaël A. Mousse"],"pdf_url":"https://arxiv.org/pdf/2401.09880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09862v1","updated":"2024-01-18T10:21:15Z","published":"2024-01-18T10:21:15Z","title":"Evolutionary Multi-Objective Optimization of Large Language Model\n Prompts for Balancing Sentiments","summary":" The advent of large language models (LLMs) such as ChatGPT has attracted\nconsiderable attention in various domains due to their remarkable performance\nand versatility. As the use of these models continues to grow, the importance\nof effective prompt engineering has come to the fore. Prompt optimization\nemerges as a crucial challenge, as it has a direct impact on model performance\nand the extraction of relevant information. Recently, evolutionary algorithms\n(EAs) have shown promise in addressing this issue, paving the way for novel\noptimization strategies. In this work, we propose an evolutionary\nmulti-objective (EMO) approach specifically tailored for prompt optimization\ncalled EMO-Prompts, using sentiment analysis as a case study. We use sentiment\nanalysis capabilities as our experimental targets. 
Our results demonstrate that\nEMO-Prompts effectively generates prompts capable of guiding the LLM to produce\ntexts embodying two conflicting emotions simultaneously.\n","authors":["Jill Baumann","Oliver Kramer"],"pdf_url":"https://arxiv.org/pdf/2401.09862v1.pdf","comment":"Accepted in EvoApps at EvoStar 2024"},{"id":"http://arxiv.org/abs/2401.09839v1","updated":"2024-01-18T09:54:18Z","published":"2024-01-18T09:54:18Z","title":"MatSciRE: Leveraging Pointer Networks to Automate Entity and Relation\n Extraction for Material Science Knowledge-base Construction","summary":" Material science literature is a rich source of factual information about\nvarious categories of entities (like materials and compositions) and various\nrelations between these entities, such as conductivity, voltage, etc.\nAutomatically extracting this information to generate a material science\nknowledge base is a challenging task. In this paper, we propose MatSciRE\n(Material Science Relation Extractor), a Pointer Network-based encoder-decoder\nframework, to jointly extract entities and relations from material science\narticles as a triplet ($entity1, relation, entity2$). Specifically, we target\nthe battery materials and identify five relations to work on - conductivity,\ncoulombic efficiency, capacity, voltage, and energy. Our proposed approach\nachieved a much better F1-score (0.771) than a previous attempt using\nChemDataExtractor (0.716). The overall graphical framework of MatSciRE is shown\nin Fig 1. The material information is extracted from material science\nliterature in the form of entity-relation triplets using MatSciRE.\n","authors":["Ankan Mullick","Akash Ghosh","G Sai Chaitanya","Samir Ghui","Tapas Nayak","Seung-Cheol Lee","Satadeep Bhattacharjee","Pawan Goyal"],"pdf_url":"https://arxiv.org/pdf/2401.09839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08417v2","updated":"2024-01-18T09:31:28Z","published":"2024-01-16T15:04:51Z","title":"Contrastive Preference Optimization: Pushing the Boundaries of LLM\n Performance in Machine Translation","summary":" Moderate-sized large language models (LLMs) -- those with 7B or 13B\nparameters -- exhibit promising machine translation (MT) performance. However,\neven the top-performing 13B LLM-based translation models, like ALMA, does not\nmatch the performance of state-of-the-art conventional encoder-decoder\ntranslation models or larger-scale LLMs such as GPT-4. In this study, we bridge\nthis performance gap. We first assess the shortcomings of supervised\nfine-tuning for LLMs in the MT task, emphasizing the quality issues present in\nthe reference data, despite being human-generated. Then, in contrast to SFT\nwhich mimics reference translations, we introduce Contrastive Preference\nOptimization (CPO), a novel approach that trains models to avoid generating\nadequate but not perfect translations. Applying CPO to ALMA models with only\n22K parallel sentences and 12M parameters yields significant improvements. 
The\nresulting model, called ALMA-R, can match or exceed the performance of the WMT\ncompetition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets.\n","authors":["Haoran Xu","Amr Sharaf","Yunmo Chen","Weiting Tan","Lingfeng Shen","Benjamin Van Durme","Kenton Murray","Young Jin Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08417v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09815v1","updated":"2024-01-18T09:13:59Z","published":"2024-01-18T09:13:59Z","title":"Simple and effective data augmentation for compositional generalization","summary":" Compositional generalization, the ability to predict complex meanings from\ntraining on simpler sentences, poses challenges for powerful pretrained seq2seq\nmodels. In this paper, we show that data augmentation methods that sample MRs\nand backtranslate them can be effective for compositional generalization, but\nonly if we sample from the right distribution. Remarkably, sampling from a\nuniform distribution performs almost as well as sampling from the test\ndistribution, and greatly outperforms earlier methods that sampled from the\ntraining distribution. We further conduct experiments to investigate the reason\nwhy this happens and where the benefit of such data augmentation methods come\nfrom.\n","authors":["Yuekun Yao","Alexander Koller"],"pdf_url":"https://arxiv.org/pdf/2401.09815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12798v4","updated":"2024-01-18T09:03:24Z","published":"2023-10-19T14:52:58Z","title":"MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and\n Uni-Modal Adapter","summary":" Language Models (LMs) have demonstrated impressive molecule understanding\nability on various 1D text-related tasks. However, they inherently lack 2D\ngraph perception - a critical ability of human professionals in comprehending\nmolecules' topological structures. To bridge this gap, we propose MolCA:\nMolecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal\nAdapter. MolCA enables an LM (e.g., Galactica) to understand both text- and\ngraph-based molecular contents via the cross-modal projector. Specifically, the\ncross-modal projector is implemented as a Q-Former to connect a graph encoder's\nrepresentation space and an LM's text space. Further, MolCA employs a uni-modal\nadapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks.\nUnlike previous studies that couple an LM with a graph encoder via cross-modal\ncontrastive learning, MolCA retains the LM's ability of open-ended text\ngeneration and augments it with 2D graph information. To showcase its\neffectiveness, we extensively benchmark MolCA on tasks of molecule captioning,\nIUPAC name prediction, and molecule-text retrieval, on which MolCA\nsignificantly outperforms the baselines. Our codes and checkpoints can be found\nat https://github.com/acharkq/MolCA.\n","authors":["Zhiyuan Liu","Sihang Li","Yanchen Luo","Hao Fei","Yixin Cao","Kenji Kawaguchi","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.12798v4.pdf","comment":"EMNLP main conference. 9 pages"},{"id":"http://arxiv.org/abs/2401.09798v1","updated":"2024-01-18T08:36:54Z","published":"2024-01-18T08:36:54Z","title":"All in How You Ask for It: Simple Black-Box Method for Jailbreak Attacks","summary":" Large Language Models (LLMs) like ChatGPT face `jailbreak' challenges, where\nsafeguards are bypassed to produce ethically harmful prompts. 
This study\nintroduces a simple black-box method to effectively generate jailbreak prompts,\novercoming the limitations of high complexity and computational costs\nassociated with existing methods. The proposed technique iteratively rewrites\nharmful prompts into non-harmful expressions using the target LLM itself, based\non the hypothesis that LLMs can directly sample safeguard-bypassing\nexpressions. Demonstrated through experiments with ChatGPT (GPT-3.5 and GPT-4)\nand Gemini-Pro, this method achieved an attack success rate of over 80% within\nan average of 5 iterations and remained effective despite model updates. The\njailbreak prompts generated were naturally-worded and concise, suggesting they\nare less detectable. The results indicate that creating effective jailbreak\nprompts is simpler than previously considered, and black-box jailbreak attacks\npose a more serious security threat.\n","authors":["Kazuhiro Takemoto"],"pdf_url":"https://arxiv.org/pdf/2401.09798v1.pdf","comment":"11 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2309.05448v2","updated":"2024-01-18T08:33:35Z","published":"2023-09-11T13:41:27Z","title":"Panoptic Vision-Language Feature Fields","summary":" Recently, methods have been proposed for 3D open-vocabulary semantic\nsegmentation. Such methods are able to segment scenes into arbitrary classes\nbased on text descriptions provided during runtime. In this paper, we propose\nto the best of our knowledge the first algorithm for open-vocabulary panoptic\nsegmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature\nFields (PVLFF), learns a semantic feature field of the scene by distilling\nvision-language features from a pretrained 2D model, and jointly fits an\ninstance feature field through contrastive learning using 2D instance segments\non input frames. Despite not being trained on the target classes, our method\nachieves panoptic segmentation performance similar to the state-of-the-art\nclosed-set 3D systems on the HyperSim, ScanNet and Replica dataset and\nadditionally outperforms current 3D open-vocabulary systems in terms of\nsemantic segmentation. We ablate the components of our method to demonstrate\nthe effectiveness of our model architecture. Our code will be available at\nhttps://github.com/ethz-asl/pvlff.\n","authors":["Haoran Chen","Kenneth Blomqvist","Francesco Milano","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2309.05448v2.pdf","comment":"This work has been accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2401.09785v1","updated":"2024-01-18T08:09:27Z","published":"2024-01-18T08:09:27Z","title":"Instant Answering in E-Commerce Buyer-Seller Messaging","summary":" E-commerce customers frequently seek detailed product information for\npurchase decisions, commonly contacting sellers directly with extended queries.\nThis manual response requirement imposes additional costs and disrupts buyer's\nshopping experience with response time fluctuations ranging from hours to days.\nWe seek to automate buyer inquiries to sellers in a leading e-commerce store\nusing a domain-specific federated Question Answering (QA) system. The main\nchallenge is adapting current QA systems, designed for single questions, to\naddress detailed customer queries. We address this with a low-latency,\nsequence-to-sequence approach, MESSAGE-TO-QUESTION ( M2Q ). It reformulates\nbuyer messages into succinct questions by identifying and extracting the most\nsalient information from a message. 
Evaluation against baselines shows that M2Q\nyields relative increases of 757% in question understanding, and 1,746% in\nanswering rate from the federated QA system. Live deployment shows that\nautomatic answering saves sellers from manually responding to millions of\nmessages per year, and also accelerates customer purchase decisions by\neliminating the need for buyers to wait for a reply\n","authors":["Besnik Fetahu","Tejas Mehta","Qun Song","Nikhita Vedula","Oleg Rokhlenko","Shervin Malmasi"],"pdf_url":"https://arxiv.org/pdf/2401.09785v1.pdf","comment":"Accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2401.09783v1","updated":"2024-01-18T08:05:45Z","published":"2024-01-18T08:05:45Z","title":"Leveraging Biases in Large Language Models: \"bias-kNN'' for Effective\n Few-Shot Learning","summary":" Large Language Models (LLMs) have shown significant promise in various\napplications, including zero-shot and few-shot learning. However, their\nperformance can be hampered by inherent biases. Instead of traditionally sought\nmethods that aim to minimize or correct these biases, this study introduces a\nnovel methodology named ``bias-kNN''. This approach capitalizes on the biased\noutputs, harnessing them as primary features for kNN and supplementing with\ngold labels. Our comprehensive evaluations, spanning diverse domain text\nclassification datasets and different GPT-2 model sizes, indicate the\nadaptability and efficacy of the ``bias-kNN'' method. Remarkably, this approach\nnot only outperforms conventional in-context learning in few-shot scenarios but\nalso demonstrates robustness across a spectrum of samples, templates and\nverbalizers. This study, therefore, presents a unique perspective on harnessing\nbiases, transforming them into assets for enhanced model performance.\n","authors":["Yong Zhang","Hanzhang Li","Zhitao Li","Ning Cheng","Ming Li","Jing Xiao","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09783v1.pdf","comment":"Accepted by the 49th IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.09775v1","updated":"2024-01-18T07:52:12Z","published":"2024-01-18T07:52:12Z","title":"Controllable Decontextualization of Yes/No Question and Answers into\n Factual Statements","summary":" Yes/No or polar questions represent one of the main linguistic question\ncategories. They consist of a main interrogative clause, for which the answer\nis binary (assertion or negation). Polar questions and answers (PQA) represent\na valuable knowledge resource present in many community and other curated QA\nsources, such as forums or e-commerce applications. Using answers to polar\nquestions alone in other contexts is not trivial. Answers are contextualized,\nand presume that the interrogative question clause and any shared knowledge\nbetween the asker and answerer are provided.\n We address the problem of controllable rewriting of answers to polar\nquestions into decontextualized and succinct factual statements. We propose a\nTransformer sequence to sequence model that utilizes soft-constraints to ensure\ncontrollable rewriting, such that the output statement is semantically\nequivalent to its PQA input. 
Evaluation on three separate PQA datasets as\nmeasured through automated and human evaluation metrics show that our proposed\napproach achieves the best performance when compared to existing baselines.\n","authors":["Lingbo Mo","Besnik Fetahu","Oleg Rokhlenko","Shervin Malmasi"],"pdf_url":"https://arxiv.org/pdf/2401.09775v1.pdf","comment":"Accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2401.09774v1","updated":"2024-01-18T07:50:07Z","published":"2024-01-18T07:50:07Z","title":"On the Audio Hallucinations in Large Audio-Video Language Models","summary":" Large audio-video language models can generate descriptions for both video\nand audio. However, they sometimes ignore audio content, producing audio\ndescriptions solely reliant on visual information. This paper refers to this as\naudio hallucinations and analyzes them in large audio-video language models. We\ngather 1,000 sentences by inquiring about audio information and annotate them\nwhether they contain hallucinations. If a sentence is hallucinated, we also\ncategorize the type of hallucination. The results reveal that 332 sentences are\nhallucinated with distinct trends observed in nouns and verbs for each\nhallucination type. Based on this, we tackle a task of audio hallucination\nclassification using pre-trained audio-text models in the zero-shot and\nfine-tuning settings. Our experimental results reveal that the zero-shot models\nachieve higher performance (52.2% in F1) than the random (40.3%) and the\nfine-tuning models achieve 87.9%, outperforming the zero-shot models.\n","authors":["Taichi Nishimura","Shota Nakada","Masayoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2401.09774v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2401.07510v2","updated":"2024-01-18T07:47:00Z","published":"2024-01-15T07:21:16Z","title":"Developing ChatGPT for Biology and Medicine: A Complete Review of\n Biomedical Question Answering","summary":" ChatGPT explores a strategic blueprint of question answering (QA) in\ndelivering medical diagnosis, treatment recommendations, and other healthcare\nsupport. This is achieved through the increasing incorporation of medical\ndomain data via natural language processing (NLP) and multimodal paradigms. By\ntransitioning the distribution of text, images, videos, and other modalities\nfrom the general domain to the medical domain, these techniques have expedited\nthe progress of medical domain question answering (MDQA). They bridge the gap\nbetween human natural language and sophisticated medical domain knowledge or\nexpert manual annotations, handling large-scale, diverse, unbalanced, or even\nunlabeled data analysis scenarios in medical contexts. Central to our focus is\nthe utilizing of language models and multimodal paradigms for medical question\nanswering, aiming to guide the research community in selecting appropriate\nmechanisms for their specific medical research requirements. Specialized tasks\nsuch as unimodal-related question answering, reading comprehension, reasoning,\ndiagnosis, relation extraction, probability modeling, and others, as well as\nmultimodal-related tasks like vision question answering, image caption,\ncross-modal retrieval, report summarization, and generation, are discussed in\ndetail. Each section delves into the intricate specifics of the respective\nmethod under consideration. This paper highlights the structures and\nadvancements of medical domain explorations against general domain methods,\nemphasizing their applications across different tasks and datasets. 
It also\noutlines current challenges and opportunities for future medical domain\nresearch, paving the way for continued innovation and application in this\nrapidly evolving field.\n","authors":["Qing Li","Lei Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2401.07510v2.pdf","comment":"There are some mistakes in introducing medical language question\n answering Models and medical multimodal question answering models, such as\n their dataset should be displayed for pretraining"},{"id":"http://arxiv.org/abs/2401.06805v2","updated":"2024-01-18T07:31:47Z","published":"2024-01-10T15:29:21Z","title":"Exploring the Reasoning Abilities of Multimodal Large Language Models\n (MLLMs): A Comprehensive Survey on Emerging Trends in Multimodal Reasoning","summary":" Strong Artificial Intelligence (Strong AI) or Artificial General Intelligence\n(AGI) with abstract reasoning ability is the goal of next-generation AI. Recent\nadvancements in Large Language Models (LLMs), along with the emerging field of\nMultimodal Large Language Models (MLLMs), have demonstrated impressive\ncapabilities across a wide range of multimodal tasks and applications.\nParticularly, various MLLMs, each with distinct model architectures, training\ndata, and training stages, have been evaluated across a broad range of MLLM\nbenchmarks. These studies have, to varying degrees, revealed different aspects\nof the current capabilities of MLLMs. However, the reasoning abilities of MLLMs\nhave not been systematically investigated. In this survey, we comprehensively\nreview the existing evaluation protocols of multimodal reasoning, categorize\nand illustrate the frontiers of MLLMs, introduce recent trends in applications\nof MLLMs on reasoning-intensive tasks, and finally discuss current practices\nand future directions. We believe our survey establishes a solid base and sheds\nlight on this important topic, multimodal reasoning.\n","authors":["Yiqi Wang","Wentao Chen","Xiaotian Han","Xudong Lin","Haiteng Zhao","Yongfei Liu","Bohan Zhai","Jianbo Yuan","Quanzeng You","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2401.06805v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02472v2","updated":"2024-01-18T07:27:09Z","published":"2023-03-04T18:06:36Z","title":"ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration\n Measure","summary":" Studies have shown that modern neural networks tend to be poorly calibrated\ndue to over-confident predictions. Traditionally, post-processing methods have\nbeen used to calibrate the model after training. In recent years, various\ntrainable calibration measures have been proposed to incorporate them directly\ninto the training process. However, these methods all incorporate internal\nhyperparameters, and the performance of these calibration objectives relies on\ntuning these hyperparameters, incurring more computational costs as the size of\nneural networks and datasets become larger. As such, we present Expected\nSquared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable\ncalibration objective loss, where we view the calibration error from the\nperspective of the squared difference between the two expectations. 
With\nextensive experiments on several architectures (CNNs, Transformers) and\ndatasets, we demonstrate that (1) incorporating ESD into the training improves\nmodel calibration in various batch size settings without the need for internal\nhyperparameter tuning, (2) ESD yields the best-calibrated results compared with\nprevious approaches, and (3) ESD drastically improves the computational costs\nrequired for calibration during training due to the absence of internal\nhyperparameter. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/ESD.\n","authors":["Hee Suk Yoon","Joshua Tian Jin Tee","Eunseop Yoon","Sunjae Yoon","Gwangsu Kim","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2303.02472v2.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2401.09760v1","updated":"2024-01-18T07:23:51Z","published":"2024-01-18T07:23:51Z","title":"A Comparative Study on Annotation Quality of Crowdsourcing and LLM via\n Label Aggregation","summary":" Whether Large Language Models (LLMs) can outperform crowdsourcing on the data\nannotation task is attracting interest recently. Some works verified this issue\nwith the average performance of individual crowd workers and LLM workers on\nsome specific NLP tasks by collecting new datasets. However, on the one hand,\nexisting datasets for the studies of annotation quality in crowdsourcing are\nnot yet utilized in such evaluations, which potentially provide reliable\nevaluations from a different viewpoint. On the other hand, the quality of these\naggregated labels is crucial because, when utilizing crowdsourcing, the\nestimated labels aggregated from multiple crowd labels to the same instances\nare the eventually collected labels. Therefore, in this paper, we first\ninvestigate which existing crowdsourcing datasets can be used for a comparative\nstudy and create a benchmark. We then compare the quality between individual\ncrowd labels and LLM labels and make the evaluations on the aggregated labels.\nIn addition, we propose a Crowd-LLM hybrid label aggregation method and verify\nthe performance. We find that adding LLM labels from good LLMs to existing\ncrowdsourcing datasets can enhance the quality of the aggregated labels of the\ndatasets, which is also higher than the quality of LLM labels themselves.\n","authors":["Jiyi Li"],"pdf_url":"https://arxiv.org/pdf/2401.09760v1.pdf","comment":"Accepted in ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.08090v2","updated":"2024-01-18T07:23:49Z","published":"2023-08-16T01:46:01Z","title":"Separate the Wheat from the Chaff: Model Deficiency Unlearning via\n Parameter-Efficient Module Operation","summary":" Large language models (LLMs) have been widely used in various applications\nbut are known to suffer from issues related to untruthfulness and toxicity.\nWhile parameter-efficient modules (PEMs) have demonstrated their effectiveness\nin equipping models with new skills, leveraging PEMs for deficiency unlearning\nremains underexplored. In this work, we propose a PEMs operation approach,\nnamely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness and\ndetoxification of LLMs through the integration of ``expert'' PEM and\n``anti-expert'' PEM. Remarkably, even anti-expert PEM possess valuable\ncapabilities due to their proficiency in generating fabricated content, which\nnecessitates language modeling and logical narrative competence. 
Rather than\nmerely negating the parameters, our approach involves extracting and\neliminating solely the deficiency capability within anti-expert PEM while\npreserving the general capabilities. To evaluate the effectiveness of our\napproach in terms of truthfulness and detoxification, we conduct extensive\nexperiments on LLMs, encompassing additional abilities such as language\nmodeling and mathematical reasoning. Our empirical results demonstrate that our\napproach effectively improves truthfulness and detoxification, while largely\npreserving the fundamental abilities of LLMs.\n","authors":["Xinshuo Hu","Dongfang Li","Baotian Hu","Zihao Zheng","Zhenyu Liu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08090v2.pdf","comment":"AAAI 2024; The first two authors contributed equally to this paper"},{"id":"http://arxiv.org/abs/2401.09758v1","updated":"2024-01-18T07:18:03Z","published":"2024-01-18T07:18:03Z","title":"Resolving Regular Polysemy in Named Entities","summary":" Word sense disambiguation primarily addresses the lexical ambiguity of common\nwords based on a predefined sense inventory. Conversely, proper names are\nusually considered to denote an ad-hoc real-world referent. Once the reference\nis decided, the ambiguity is purportedly resolved. However, proper names also\nexhibit ambiguities through appellativization, i.e., they act like common words\nand may denote different aspects of their referents. We proposed to address the\nambiguities of proper names through the light of regular polysemy, which we\nformalized as dot objects. This paper introduces a combined word sense\ndisambiguation (WSD) model for disambiguating common words against Chinese\nWordnet (CWN) and proper names as dot objects. The model leverages the\nflexibility of a gloss-based model architecture, which takes advantage of the\nglosses and example sentences of CWN. We show that the model achieves\ncompetitive results on both common and proper nouns, even on a relatively\nsparse sense dataset. Aside from being a performant WSD tool, the model further\nfacilitates the future development of the lexical resource.\n","authors":["Shu-Kai Hsieh","Yu-Hsiang Tseng","Hsin-Yu Chou","Ching-Wen Yang","Yu-Yun Chang"],"pdf_url":"https://arxiv.org/pdf/2401.09758v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13269v2","updated":"2024-01-18T06:53:39Z","published":"2023-07-25T05:39:21Z","title":"LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA\n Composition","summary":" Low-rank adaptations (LoRA) are often employed to fine-tune large language\nmodels (LLMs) for new tasks. This paper investigates LoRA composability for\ncross-task generalization and introduces LoraHub, a simple framework devised\nfor the purposive assembly of LoRA modules trained on diverse given tasks, with\nthe objective of achieving adaptable performance on unseen tasks. With just a\nfew examples from a new task, LoraHub can fluidly combine multiple LoRA\nmodules, eliminating the need for human expertise and assumptions. Notably, the\ncomposition requires neither additional model parameters nor gradients.\nEmpirical results on the Big-Bench Hard benchmark suggest that LoraHub, while\nnot surpassing the performance of in-context learning, offers a notable\nperformance-efficiency trade-off in few-shot scenarios by employing a\nsignificantly reduced number of tokens per example during inference. 
Notably,\nLoraHub establishes a better upper bound compared to in-context learning when\npaired with different demonstration examples, demonstrating its potential for\nfuture development. Our vision is to establish a platform for LoRA modules,\nempowering users to share their trained LoRA modules. This collaborative\napproach facilitates the seamless application of LoRA modules to novel tasks,\ncontributing to an adaptive ecosystem. Our code is available at\nhttps://github.com/sail-sg/lorahub, and all the pre-trained LoRA modules are\nreleased at https://huggingface.co/lorahub.\n","authors":["Chengsong Huang","Qian Liu","Bill Yuchen Lin","Tianyu Pang","Chao Du","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2307.13269v2.pdf","comment":"Add more related work and experimental results"},{"id":"http://arxiv.org/abs/2401.09727v1","updated":"2024-01-18T05:06:39Z","published":"2024-01-18T05:06:39Z","title":"Large Language Model Lateral Spear Phishing: A Comparative Study in\n Large-Scale Organizational Settings","summary":" The critical threat of phishing emails has been further exacerbated by the\npotential of LLMs to generate highly targeted, personalized, and automated\nspear phishing attacks. Two critical problems concerning LLM-facilitated\nphishing require further investigation: 1) Existing studies on lateral phishing\nlack specific examination of LLM integration for large-scale attacks targeting\nthe entire organization, and 2) Current anti-phishing infrastructure, despite\nits extensive development, lacks the capability to prevent LLM-generated\nattacks, potentially impacting both employees and IT security incident\nmanagement. However, the execution of such investigative studies necessitates a\nreal-world environment, one that functions during regular business operations\nand mirrors the complexity of a large organizational infrastructure. This\nsetting must also offer the flexibility required to facilitate a diverse array\nof experimental conditions, particularly the incorporation of phishing emails\ncrafted by LLMs. This study is a pioneering exploration into the use of Large\nLanguage Models (LLMs) for the creation of targeted lateral phishing emails,\ntargeting a large tier 1 university's operation and workforce of approximately\n9,000 individuals over an 11-month period. It also evaluates the capability of\nemail filtering infrastructure to detect such LLM-generated phishing attempts,\nproviding insights into their effectiveness and identifying potential areas for\nimprovement. Based on our findings, we propose machine learning-based detection\ntechniques for such emails to detect LLM-generated phishing emails that were\nmissed by the existing infrastructure, with an F1-score of 98.96.\n","authors":["Mazal Bethany","Athanasios Galiopoulos","Emet Bethany","Mohammad Bahrami Karkevandi","Nishant Vishwamitra","Peyman Najafirad"],"pdf_url":"https://arxiv.org/pdf/2401.09727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09724v1","updated":"2024-01-18T04:57:12Z","published":"2024-01-18T04:57:12Z","title":"Predicting Viral Rumors and Vulnerable Users for Infodemic Surveillance","summary":" In the age of the infodemic, it is crucial to have tools for effectively\nmonitoring the spread of rampant rumors that can quickly go viral, as well as\nidentifying vulnerable users who may be more susceptible to spreading such\nmisinformation. This proactive approach allows for timely preventive measures\nto be taken, mitigating the negative impact of false information on society. 
We\npropose a novel approach to predict viral rumors and vulnerable users using a\nunified graph neural network model. We pre-train network-based user embeddings\nand leverage a cross-attention mechanism between users and posts, together with\na community-enhanced vulnerability propagation (CVP) method to improve user and\npropagation graph representations. Furthermore, we employ two multi-task\ntraining strategies to mitigate negative transfer effects among tasks in\ndifferent settings, enhancing the overall performance of our approach. We also\nconstruct two datasets with ground-truth annotations on information virality\nand user vulnerability in rumor and non-rumor events, which are automatically\nderived from existing rumor detection datasets. Extensive evaluation results of\nour joint learning model confirm its superiority over strong baselines in all\nthree tasks: rumor detection, virality prediction, and user vulnerability\nscoring. For instance, compared to the best baselines based on the Weibo\ndataset, our model makes 3.8\\% and 3.0\\% improvements on Accuracy and MacF1 for\nrumor detection, and reduces mean squared error (MSE) by 23.9\\% and 16.5\\% for\nvirality prediction and user vulnerability scoring, respectively. Our findings\nsuggest that our approach effectively captures the correlation between rumor\nvirality and user vulnerability, leveraging this information to improve\nprediction performance and provide a valuable tool for infodemic surveillance.\n","authors":["Xuan Zhang","Wei Gao"],"pdf_url":"https://arxiv.org/pdf/2401.09724v1.pdf","comment":"Accepted by IP&M"},{"id":"http://arxiv.org/abs/2310.15141v2","updated":"2024-01-18T04:42:34Z","published":"2023-10-23T17:47:34Z","title":"SpecTr: Fast Speculative Decoding via Optimal Transport","summary":" Autoregressive sampling from large language models has led to\nstate-of-the-art results in several natural language tasks. However,\nautoregressive sampling generates tokens one at a time making it slow, and even\nprohibitive in certain tasks. One way to speed up sampling is\n$\\textit{speculative decoding}$: use a small model to sample a $\\textit{draft}$\n(block or sequence of tokens), and then score all tokens in the draft by the\nlarge language model in parallel. A subset of the tokens in the draft are\naccepted (and the rest rejected) based on a statistical method to guarantee\nthat the final output follows the distribution of the large model. In this\nwork, we provide a principled understanding of speculative decoding through the\nlens of optimal transport (OT) with $\\textit{membership cost}$. This framework\ncan be viewed as an extension of the well-known $\\textit{maximal-coupling}$\nproblem. This new formulation enables us to generalize the speculative decoding\nmethod to allow for a set of $k$ candidates at the token-level, which leads to\nan improved optimal membership cost. We show that the optimal draft selection\nalgorithm (transport plan) can be computed via linear programming, whose\nbest-known runtime is exponential in $k$. We then propose a valid draft\nselection algorithm whose acceptance probability is $(1-1/e)$-optimal\nmultiplicatively. Moreover, it can be computed in time almost linear with size\nof domain of a single token. Using this $new draft selection$ algorithm, we\ndevelop a new autoregressive sampling algorithm called $\\textit{SpecTr}$, which\nprovides speedup in decoding while ensuring that there is no quality\ndegradation in the decoded output. 
We experimentally demonstrate that for\nstate-of-the-art large language models, the proposed approach achieves a wall\nclock speedup of 2.13X, a further 1.37X speedup over speculative decoding on\nstandard benchmarks.\n","authors":["Ziteng Sun","Ananda Theertha Suresh","Jae Hun Ro","Ahmad Beirami","Himanshu Jain","Felix Yu"],"pdf_url":"https://arxiv.org/pdf/2310.15141v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.09699v1","updated":"2024-01-18T03:09:06Z","published":"2024-01-18T03:09:06Z","title":"Curriculum Recommendations Using Transformer Base Model with InfoNCE\n Loss And Language Switching Method","summary":" The Curriculum Recommendations paradigm is dedicated to fostering learning\nequality within the ever-evolving realms of educational technology and\ncurriculum development. In acknowledging the inherent obstacles posed by\nexisting methodologies, such as content conflicts and disruptions from language\ntranslation, this paradigm aims to confront and overcome these challenges.\nNotably, it addresses content conflicts and disruptions introduced by language\ntranslation, hindrances that can impede the creation of an all-encompassing and\npersonalized learning experience. The paradigm's objective is to cultivate an\neducational environment that not only embraces diversity but also customizes\nlearning experiences to suit the distinct needs of each learner. To overcome\nthese challenges, our approach builds upon notable contributions in curriculum\ndevelopment and personalized learning, introducing three key innovations. These\ninclude the integration of Transformer Base Model to enhance computational\nefficiency, the implementation of InfoNCE Loss for accurate content-topic\nmatching, and the adoption of a language switching strategy to alleviate\ntranslation-related ambiguities. Together, these innovations aim to\ncollectively tackle inherent challenges and contribute to forging a more\nequitable and effective learning journey for a diverse range of learners.\nCompetitive cross-validation scores underscore the efficacy of\nsentence-transformers/LaBSE, achieving 0.66314, showcasing our methodology's\neffectiveness in diverse linguistic nuances for content alignment prediction.\nIndex Terms-Curriculum Recommendation, Transformer model with InfoNCE Loss,\nLanguage Switching.\n","authors":["Xiaonan Xu","Bin Yuan","Yongyao Mo","Tianbo Song","Shulin Li"],"pdf_url":"https://arxiv.org/pdf/2401.09699v1.pdf","comment":"4pages, 2 figures, ICAICA2023"},{"id":"http://arxiv.org/abs/2401.06951v2","updated":"2024-01-18T02:18:43Z","published":"2024-01-13T02:11:20Z","title":"E^2-LLM: Efficient and Extreme Length Extension of Large Language Models","summary":" Typically, training LLMs with long context sizes is computationally\nexpensive, requiring extensive training hours and GPU resources. Existing\nlong-context extension methods usually need additional training procedures to\nsupport corresponding long-context windows, where the long-context training\ndata (e.g., 32k) is needed, and high GPU training costs are assumed. To address\nthe aforementioned issues, we propose an Efficient and Extreme length extension\nmethod for Large Language Models, called E 2 -LLM, with only one training\nprocedure and dramatically reduced computation cost, which also removes the\nneed to collect long-context data. Concretely, first, the training data of our\nE 2 -LLM only requires a short length (e.g., 4k), which reduces the tuning cost\ngreatly. 
Second, the training procedure on the short training context window is\nperformed only once, and we can support different evaluation context\nwindows at inference. Third, in E^2-LLM, based on RoPE position embeddings,\nwe introduce two different augmentation methods on the scale and position index\nparameters for different samples in training. It aims to make the model more\nrobust to different relative position differences when directly interpolating to\narbitrary context lengths at inference. Comprehensive experimental results on\nmultiple benchmark datasets demonstrate the effectiveness of our E^2-LLM on\nchallenging long-context tasks.\n","authors":["Jiaheng Liu","Zhiqi Bai","Yuanxing Zhang","Chenchen Zhang","Yu Zhang","Ge Zhang","Jiakai Wang","Haoran Que","Yukang Chen","Wenbo Su","Tiezheng Ge","Jie Fu","Wenhu Chen","Bo Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.06951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.15629v3","updated":"2024-01-18T00:43:41Z","published":"2022-10-27T17:20:50Z","title":"Language Control Diffusion: Efficiently Scaling through Space, Time, and\n Tasks","summary":" Training generalist agents is difficult across several axes, requiring us to\ndeal with high-dimensional inputs (space), long horizons (time), and\ngeneralization to novel tasks. Recent advances with architectures have allowed\nfor improved scaling along one or two of these axes, but are still\ncomputationally prohibitive to use. In this paper, we propose to address all\nthree axes by leveraging \textbf{L}anguage to \textbf{C}ontrol\n\textbf{D}iffusion models as a hierarchical planner conditioned on language\n(LCD). We effectively and efficiently scale diffusion models for planning in\nextended temporal, state, and task dimensions to tackle long horizon control\nproblems conditioned on natural language instructions, as a step towards\ngeneralist agents. Comparing LCD with other state-of-the-art models on the\nCALVIN language robotics benchmark finds that LCD outperforms other SOTA\nmethods in multi-task success rates, whilst improving inference speed over\nother comparable diffusion models by 3.3x~15x. We show that LCD can\nsuccessfully leverage the unique strength of diffusion models to produce\ncoherent long range plans while addressing their weakness in generating\nlow-level details and control.\n","authors":["Edwin Zhang","Yujie Lu","William Wang","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.15629v3.pdf","comment":"ICLR 2024, Project and code available at\n https://github.com/ezhang7423/language-control-diffusion"},{"id":"http://arxiv.org/abs/2401.10415v1","updated":"2024-01-18T23:00:54Z","published":"2024-01-18T23:00:54Z","title":"Can Large Language Model Summarizers Adapt to Diverse Scientific\n Communication Goals?","summary":" In this work, we investigate the controllability of large language models\n(LLMs) on scientific summarization tasks. We identify key stylistic and content\ncoverage factors that characterize different types of summaries such as paper\nreviews, abstracts, and lay summaries. By controlling stylistic features, we\nfind that non-fine-tuned LLMs outperform humans in the MuP review generation\ntask, both in terms of similarity to reference summaries and human preferences.\nAlso, we show that we can improve the controllability of LLMs with\nkeyword-based classifier-free guidance (CFG) while achieving lexical overlap\ncomparable to strong fine-tuned baselines on arXiv and PubMed. 
However, our\nresults also indicate that LLMs cannot consistently generate long summaries\nwith more than 8 sentences. Furthermore, these models exhibit limited capacity\nto produce highly abstractive lay summaries. Although LLMs demonstrate strong\ngeneric summarization competency, sophisticated content control without costly\nfine-tuning remains an open problem for domain-specific applications.\n","authors":["Marcio Fonseca","Shay B. Cohen"],"pdf_url":"https://arxiv.org/pdf/2401.10415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10407v1","updated":"2024-01-18T22:32:31Z","published":"2024-01-18T22:32:31Z","title":"Learning High-Quality and General-Purpose Phrase Representations","summary":" Phrase representations play an important role in data science and natural\nlanguage processing, benefiting various tasks like Entity Alignment, Record\nLinkage, Fuzzy Joins, and Paraphrase Classification. The current\nstate-of-the-art method involves fine-tuning pre-trained language models for\nphrasal embeddings using contrastive learning. However, we have identified\nareas for improvement. First, these pre-trained models tend to be unnecessarily\ncomplex and require pre-training on a corpus with context sentences.\nSecond, leveraging the phrase type and morphology gives phrase representations\nthat are both more precise and more flexible. We propose an improved framework\nto learn phrase representations in a context-free fashion. The framework\nemploys phrase type classification as an auxiliary task and incorporates\ncharacter-level information more effectively into the phrase representation.\nFurthermore, we design three granularities of data augmentation to increase the\ndiversity of training samples. Our experiments across a wide range of tasks\nshow that our approach generates superior phrase embeddings compared to\nprevious methods while requiring a smaller model size. The code is available at\n\url{https://github.com/tigerchen52/PEARL}.\n","authors":["Lihu Chen","Gaël Varoquaux","Fabian M. Suchanek"],"pdf_url":"https://arxiv.org/pdf/2401.10407v1.pdf","comment":"Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2310.04965v2","updated":"2024-01-18T21:17:04Z","published":"2023-10-08T01:51:17Z","title":"MULTISCRIPT: Multimodal Script Learning for Supporting Open Domain\n Everyday Tasks","summary":" Automatically generating scripts (i.e. sequences of key steps described in\ntext) from video demonstrations and reasoning about the subsequent steps are\ncrucial for modern AI virtual assistants to guide humans to complete\neveryday tasks, especially unfamiliar ones. However, current methods for\ngenerative script learning rely heavily on well-structured preceding steps\ndescribed in text and/or images or are limited to a certain domain, resulting\nin a disparity with real-world user scenarios. To address these limitations, we\npresent a new benchmark challenge -- MultiScript, with two new tasks on\ntask-oriented multimodal script learning: (1) multimodal script generation, and\n(2) subsequent step prediction. For both tasks, the input consists of a target\ntask name and a video illustrating what has been done to complete the target\ntask, and the expected output is (1) a sequence of structured step descriptions\nin text based on the demonstration video, and (2) a single text description for\nthe subsequent step, respectively. 
Built from WikiHow, MultiScript covers\nmultimodal scripts in videos and text descriptions for over 6,655 human\neveryday tasks across 19 diverse domains. To establish baseline performance on\nMultiScript, we propose two knowledge-guided multimodal generative frameworks\nthat incorporate the task-related knowledge prompted from large language models\nsuch as Vicuna. Experimental results show that our proposed approaches\nsignificantly improve over the competitive baselines.\n","authors":["Jingyuan Qi","Minqian Liu","Ying Shen","Zhiyang Xu","Lifu Huang"],"pdf_url":"https://arxiv.org/pdf/2310.04965v2.pdf","comment":"Accepted by AAAI 2024. 11 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.07106v2","updated":"2024-01-18T20:19:54Z","published":"2024-01-13T16:13:45Z","title":"Directed Regular and Context-Free Languages","summary":" We study the problem of deciding whether a given language is directed. A\nlanguage $L$ is \emph{directed} if every pair of words in $L$ has a common\n(scattered) superword in $L$. Deciding directedness is a fundamental problem in\nconnection with ideal decompositions of downward closed sets. Another\nmotivation is that whether two \emph{directed} context-free languages\nhave the same downward closures can be decided in polynomial time, whereas for\ngeneral context-free languages, this problem is known to be coNEXP-complete.\n We show that the directedness problem for regular languages, given as NFAs,\nbelongs to $AC^1$, and thus polynomial time. Moreover, it is NL-complete for\nfixed alphabet sizes. Furthermore, we show that for context-free languages, the\ndirectedness problem is PSPACE-complete.\n","authors":["Moses Ganardi","Irmak Saglam","Georg Zetzsche"],"pdf_url":"https://arxiv.org/pdf/2401.07106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10353v1","updated":"2024-01-18T19:46:04Z","published":"2024-01-18T19:46:04Z","title":"Inconsistent dialogue responses and how to recover from them","summary":" One critical issue for chat systems is to stay consistent about preferences,\nopinions, beliefs and facts of itself, which has been shown to be a difficult\nproblem. In this work, we study methods to assess and bolster utterance\nconsistency of chat systems. A dataset is first developed for studying the\ninconsistencies, where inconsistent dialogue responses, explanations of the\ninconsistencies, and recovery utterances are authored by annotators. This\ncovers the life span of inconsistencies, namely introduction, understanding,\nand resolution. Building on this, we introduce a set of tasks centered on\ndialogue consistency, specifically focused on its detection and resolution. Our\nexperimental findings indicate that our dataset significantly helps the\nprogress in identifying and resolving conversational inconsistencies, and that\ncurrent popular large language models like ChatGPT, while good at resolving\ninconsistencies, still struggle with detection.\n","authors":["Mian Zhang","Lifeng Jin","Linfeng Song","Haitao Mi","Dong Yu"],"pdf_url":"https://arxiv.org/pdf/2401.10353v1.pdf","comment":"Accepted in EACL 2024. Code and dataset available at\n https://github.com/mianzhang/CIDER"},{"id":"http://arxiv.org/abs/2401.10352v1","updated":"2024-01-18T19:42:04Z","published":"2024-01-18T19:42:04Z","title":"Bridging Cultural Nuances in Dialogue Agents through Cultural Value\n Surveys","summary":" The cultural landscape of interactions with dialogue agents is a compelling\nyet relatively unexplored territory. 
It's clear that various sociocultural\naspects -- from communication styles and beliefs to shared metaphors and\nknowledge -- profoundly impact these interactions. To delve deeper into this\ndynamic, we introduce cuDialog, a first-of-its-kind benchmark for dialogue\ngeneration with a cultural lens. We also develop baseline models capable of\nextracting cultural attributes from dialogue exchanges, with the goal of\nenhancing the predictive accuracy and quality of dialogue agents. To\neffectively co-learn cultural understanding and multi-turn dialogue\npredictions, we propose to incorporate cultural dimensions with dialogue\nencoding features. Our experimental findings highlight that incorporating\ncultural value surveys boosts alignment with references and cultural markers,\ndemonstrating its considerable influence on personalization and dialogue\nquality. To facilitate further exploration in this exciting domain, we publish\nour benchmark publicly accessible at https://github.com/yongcaoplus/cuDialog.\n","authors":["Yong Cao","Min Chen","Daniel Hershcovich"],"pdf_url":"https://arxiv.org/pdf/2401.10352v1.pdf","comment":"16pages, 7 figures, EACL 2024 main"},{"id":"http://arxiv.org/abs/2212.09726v2","updated":"2024-01-18T19:27:04Z","published":"2022-12-19T18:51:06Z","title":"Improving Faithfulness of Abstractive Summarization by Controlling\n Confounding Effect of Irrelevant Sentences","summary":" Lack of factual correctness is an issue that still plagues state-of-the-art\nsummarization systems despite their impressive progress on generating seemingly\nfluent summaries. In this paper, we show that factual inconsistency can be\ncaused by irrelevant parts of the input text, which act as confounders. To that\nend, we leverage information-theoretic measures of causal effects to quantify\nthe amount of confounding and precisely quantify how they affect the\nsummarization performance. Based on insights derived from our theoretical\nresults, we design a simple multi-task model to control such confounding by\nleveraging human-annotated relevant sentences when available. Crucially, we\ngive a principled characterization of data distributions where such confounding\ncan be large thereby necessitating the use of human annotated relevant\nsentences to generate factual summaries. Our approach improves faithfulness\nscores by 20\\% over strong baselines on AnswerSumm\n\\citep{fabbri2021answersumm}, a conversation summarization dataset where lack\nof faithfulness is a significant issue due to the subjective nature of the\ntask. Our best method achieves the highest faithfulness score while also\nachieving state-of-the-art results on standard metrics like ROUGE and METEOR.\nWe corroborate these improvements through human evaluation.\n","authors":["Asish Ghoshal","Arash Einolghozati","Ankit Arun","Haoran Li","Lili Yu","Vera Gor","Yashar Mehdad","Scott Wen-tau Yih","Asli Celikyilmaz"],"pdf_url":"https://arxiv.org/pdf/2212.09726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10337v1","updated":"2024-01-18T19:02:00Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. 
Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v1.pdf","comment":"accepted at EACL 2024, in ARR October 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.10232v1","updated":"2024-01-18T18:59:58Z","published":"2024-01-18T18:59:58Z","title":"ParaHome: Parameterizing Everyday Home Activities Towards 3D Generative\n Modeling of Human-Object Interactions","summary":" To enable machines to learn how humans interact with the physical world in\nour daily activities, it is crucial to provide rich data that encompasses the\n3D motion of humans as well as the motion of objects in a learnable 3D\nrepresentation. Ideally, this data should be collected in a natural setup,\ncapturing the authentic dynamic 3D signals during human-object interactions. To\naddress this challenge, we introduce the ParaHome system, designed to capture\nand parameterize dynamic 3D movements of humans and objects within a common\nhome environment. Our system consists of a multi-view setup with 70\nsynchronized RGB cameras, as well as wearable motion capture devices equipped\nwith an IMU-based body suit and hand motion capture gloves. By leveraging the\nParaHome system, we collect a novel large-scale dataset of human-object\ninteraction. Notably, our dataset offers key advancement over existing datasets\nin three main aspects: (1) capturing 3D body and dexterous hand manipulation\nmotion alongside 3D object movement within a contextual home environment during\nnatural activities; (2) encompassing human interaction with multiple objects in\nvarious episodic scenarios with corresponding descriptions in texts; (3)\nincluding articulated objects with multiple parts expressed with parameterized\narticulations. Building upon our dataset, we introduce new research tasks aimed\nat building a generative model for learning and synthesizing human-object\ninteractions in a real-world room setting.\n","authors":["Jeonghwan Kim","Jisoo Kim","Jeonghyeon Na","Hanbyul Joo"],"pdf_url":"https://arxiv.org/pdf/2401.10232v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10229v1","updated":"2024-01-18T18:59:34Z","published":"2024-01-18T18:59:34Z","title":"OMG-Seg: Is One Model Good Enough For All Segmentation?","summary":" In this work, we address various segmentation tasks, each traditionally\ntackled by distinct or partially unified models. We propose OMG-Seg, One Model\nthat is Good enough to efficiently and effectively handle all the segmentation\ntasks, including image semantic, instance, and panoptic segmentation, as well\nas their video counterparts, open vocabulary settings, prompt-driven,\ninteractive segmentation like SAM, and video object segmentation. 
To our\nknowledge, this is the first model to handle all these tasks in one model and\nachieve satisfactory performance. We show that OMG-Seg, a transformer-based\nencoder-decoder architecture with task-specific queries and outputs, can\nsupport over ten distinct segmentation tasks and yet significantly reduce\ncomputational and parameter overhead across various tasks and datasets. We\nrigorously evaluate the inter-task influences and correlations during\nco-training. Code and models are available at https://github.com/lxtGH/OMG-Seg.\n","authors":["Xiangtai Li","Haobo Yuan","Wei Li","Henghui Ding","Size Wu","Wenwei Zhang","Yining Li","Kai Chen","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2401.10229v1.pdf","comment":"Project Page: https://lxtgh.github.io/project/omg_seg/"},{"id":"http://arxiv.org/abs/2401.10228v1","updated":"2024-01-18T18:59:30Z","published":"2024-01-18T18:59:30Z","title":"RAP-SAM: Towards Real-Time All-Purpose Segment Anything","summary":" Advanced by transformer architecture, vision foundation models (VFMs) achieve\nremarkable progress in performance and generalization ability. Segment Anything\nModel (SAM) is one remarkable model that can achieve generalized segmentation.\nHowever, most VFMs cannot run in realtime, which makes it difficult to transfer\nthem into several products. On the other hand, current real-time segmentation\nmainly has one purpose, such as semantic segmentation on the driving scene. We\nargue that diverse outputs are needed for real applications. Thus, this work\nexplores a new real-time segmentation setting, named all-purpose segmentation\nin real-time, to transfer VFMs in real-time deployment. It contains three\ndifferent tasks, including interactive segmentation, panoptic segmentation, and\nvideo segmentation. We aim to use one model to achieve the above tasks in\nreal-time. We first benchmark several strong baselines. Then, we present\nReal-Time All Purpose SAM (RAP-SAM). It contains an efficient encoder and an\nefficient decoupled decoder to perform prompt-driven decoding. Moreover, we\nfurther explore different training strategies and tuning methods to boost\nco-training performance further. Our code and model are available at\nhttps://github.com/xushilin1/RAP-SAM/.\n","authors":["Shilin Xu","Haobo Yuan","Qingyu Shi","Lu Qi","Jingbo Wang","Yibo Yang","Yining Li","Kai Chen","Yunhai Tong","Bernard Ghanem","Xiangtai Li","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10228v1.pdf","comment":"Project Page: https://xushilin1.github.io/rap_sam/"},{"id":"http://arxiv.org/abs/2401.10227v1","updated":"2024-01-18T18:59:19Z","published":"2024-01-18T18:59:19Z","title":"A Simple Latent Diffusion Approach for Panoptic Segmentation and Mask\n Inpainting","summary":" Panoptic and instance segmentation networks are often trained with\nspecialized object detection modules, complex loss functions, and ad-hoc\npost-processing steps to handle the permutation-invariance of the instance\nmasks. This work builds upon Stable Diffusion and proposes a latent diffusion\napproach for panoptic segmentation, resulting in a simple architecture which\nomits these complexities. Our training process consists of two steps: (1)\ntraining a shallow autoencoder to project the segmentation masks to latent\nspace; (2) training a diffusion model to allow image-conditioned sampling in\nlatent space. 
The use of a generative model unlocks the exploration of mask\ncompletion or inpainting, which has applications in interactive segmentation.\nThe experimental validation yields promising results for both panoptic\nsegmentation and mask inpainting. While not setting a new state-of-the-art, our\nmodel's simplicity, generality, and mask completion capability are desirable\nproperties.\n","authors":["Wouter Van Gansbeke","Bert De Brabandere"],"pdf_url":"https://arxiv.org/pdf/2401.10227v1.pdf","comment":"Code: https://github.com/segments-ai/latent-diffusion-segmentation"},{"id":"http://arxiv.org/abs/2306.00977v3","updated":"2024-01-18T18:59:17Z","published":"2023-06-01T17:59:10Z","title":"AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation","summary":" During interactive segmentation, a model and a user work together to\ndelineate objects of interest in a 3D point cloud. In an iterative process, the\nmodel assigns each data point to an object (or the background), while the user\ncorrects errors in the resulting segmentation and feeds them back into the\nmodel. The current best practice formulates the problem as binary\nclassification and segments objects one at a time. The model expects the user\nto provide positive clicks to indicate regions wrongly assigned to the\nbackground and negative clicks on regions wrongly assigned to the object.\nSequentially visiting objects is wasteful since it disregards synergies between\nobjects: a positive click for a given object can, by definition, serve as a\nnegative click for nearby objects. Moreover, a direct competition between\nadjacent objects can speed up the identification of their common boundary. We\nintroduce AGILE3D, an efficient, attention-based model that (1) supports\nsimultaneous segmentation of multiple 3D objects, (2) yields more accurate\nsegmentation masks with fewer user clicks, and (3) offers faster inference. Our\ncore idea is to encode user clicks as spatial-temporal queries and enable\nexplicit interactions between click queries as well as between them and the 3D\nscene through a click attention module. Every time new clicks are added, we\nonly need to run a lightweight decoder that produces updated segmentation\nmasks. In experiments with four different 3D point cloud datasets, AGILE3D sets\na new state-of-the-art. Moreover, we also verify its practicality in real-world\nsetups with real user studies.\n","authors":["Yuanwen Yue","Sabarinath Mahadevan","Jonas Schult","Francis Engelmann","Bastian Leibe","Konrad Schindler","Theodora Kontogianni"],"pdf_url":"https://arxiv.org/pdf/2306.00977v3.pdf","comment":"Accepted to ICLR 2024. Project page: https://ywyue.github.io/AGILE3D"},{"id":"http://arxiv.org/abs/2401.10226v1","updated":"2024-01-18T18:59:13Z","published":"2024-01-18T18:59:13Z","title":"Towards Language-Driven Video Inpainting via Multimodal Large Language\n Models","summary":" We introduce a new task -- language-driven video inpainting, which uses\nnatural language instructions to guide the inpainting process. This approach\novercomes the limitations of traditional video inpainting methods that depend\non manually labeled binary masks, a process often tedious and labor-intensive.\nWe present the Remove Objects from Videos by Instructions (ROVI) dataset,\ncontaining 5,650 videos and 9,091 inpainting results, to support training and\nevaluation for this task. 
We also propose a novel diffusion-based\nlanguage-driven video inpainting framework, the first end-to-end baseline for\nthis task, integrating Multimodal Large Language Models to understand and\nexecute complex language-based inpainting requests effectively. Our\ncomprehensive results showcase the dataset's versatility and the model's\neffectiveness in various language-instructed inpainting scenarios. We will make\ndatasets, code, and models publicly available.\n","authors":["Jianzong Wu","Xiangtai Li","Chenyang Si","Shangchen Zhou","Jingkang Yang","Jiangning Zhang","Yining Li","Kai Chen","Yunhai Tong","Ziwei Liu","Chen Change Loy"],"pdf_url":"https://arxiv.org/pdf/2401.10226v1.pdf","comment":"Project Page: https://jianzongwu.github.io/projects/rovi"},{"id":"http://arxiv.org/abs/2401.10224v1","updated":"2024-01-18T18:59:09Z","published":"2024-01-18T18:59:09Z","title":"The Manga Whisperer: Automatically Generating Transcriptions for Comics","summary":" In the past few decades, Japanese comics, commonly referred to as Manga, have\ntranscended both cultural and linguistic boundaries to become a true worldwide\nsensation. Yet, the inherent reliance on visual cues and illustration within\nmanga renders it largely inaccessible to individuals with visual impairments.\nIn this work, we seek to address this substantial barrier, with the aim of\nensuring that manga can be appreciated and actively engaged by everyone.\nSpecifically, we tackle the problem of diarisation i.e. generating a\ntranscription of who said what and when, in a fully automatic way.\n To this end, we make the following contributions: (1) we present a unified\nmodel, Magi, that is able to (a) detect panels, text boxes and character boxes,\n(b) cluster characters by identity (without knowing the number of clusters\napriori), and (c) associate dialogues to their speakers; (2) we propose a novel\napproach that is able to sort the detected text boxes in their reading order\nand generate a dialogue transcript; (3) we annotate an evaluation benchmark for\nthis task using publicly available [English] manga pages. The code, evaluation\ndatasets and the pre-trained model can be found at:\nhttps://github.com/ragavsachdeva/magi.\n","authors":["Ragav Sachdeva","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2401.10224v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10222v1","updated":"2024-01-18T18:58:54Z","published":"2024-01-18T18:58:54Z","title":"Supervised Fine-tuning in turn Improves Visual Foundation Models","summary":" Image-text training like CLIP has dominated the pretraining of vision\nfoundation models in recent years. Subsequent efforts have been made to\nintroduce region-level visual learning into CLIP's pretraining but face\nscalability challenges due to the lack of large-scale region-level datasets.\nDrawing inspiration from supervised fine-tuning (SFT) in natural language\nprocessing such as instruction tuning, we explore the potential of fine-grained\nSFT in enhancing the generation of vision foundation models after their\npretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash\nthe fine-grained knowledge of vision foundation models. In ViSFT, the vision\nfoundation model is enhanced by performing visual joint learning on some\nin-domain tasks and then tested on out-of-domain benchmarks. 
With updating\nusing ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over\n4.4B parameters shows improvements across various out-of-domain benchmarks\nincluding vision and vision-linguistic scenarios.\n","authors":["Xiaohu Jiang","Yixiao Ge","Yuying Ge","Chun Yuan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2401.10222v1.pdf","comment":"14 pages, 3 figures, Project page:\n https://github.com/TencentARC/ViSFT/tree/main"},{"id":"http://arxiv.org/abs/2401.10220v1","updated":"2024-01-18T18:58:49Z","published":"2024-01-18T18:58:49Z","title":"AutoFT: Robust Fine-Tuning by Optimizing Hyperparameters on OOD Data","summary":" Foundation models encode rich representations that can be adapted to a\ndesired task by fine-tuning on task-specific data. However, fine-tuning a model\non one particular data distribution often compromises the model's original\nperformance on other distributions. Current methods for robust fine-tuning\nutilize hand-crafted regularization techniques to constrain the fine-tuning\nprocess towards the base foundation model. Yet, it is hard to precisely specify\nwhat characteristics of the foundation model to retain during fine-tuning, as\nthis depends on how the pre-training, fine-tuning, and evaluation data\ndistributions relate to each other. We propose AutoFT, a data-driven approach\nfor guiding foundation model fine-tuning. AutoFT optimizes fine-tuning\nhyperparameters to maximize performance on a small out-of-distribution (OOD)\nvalidation set. To guide fine-tuning in a granular way, AutoFT searches a\nhighly expressive hyperparameter space that includes weight coefficients for\nmany different losses, in addition to learning rate and weight decay values. We\nevaluate AutoFT on nine natural distribution shifts which include domain shifts\nand subpopulation shifts. Our experiments show that AutoFT significantly\nimproves generalization to new OOD data, outperforming existing robust\nfine-tuning methods. Notably, AutoFT achieves new state-of-the-art performance\non the WILDS-iWildCam and WILDS-FMoW benchmarks, outperforming the previous\nbest methods by $6.0\\%$ and $1.5\\%$, respectively.\n","authors":["Caroline Choi","Yoonho Lee","Annie Chen","Allan Zhou","Aditi Raghunathan","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2401.10220v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.10219v1","updated":"2024-01-18T18:58:44Z","published":"2024-01-18T18:58:44Z","title":"Edit One for All: Interactive Batch Image Editing","summary":" In recent years, image editing has advanced remarkably. With increased human\ncontrol, it is now possible to edit an image in a plethora of ways; from\nspecifying in text what we want to change, to straight up dragging the contents\nof the image in an interactive point-based manner. However, most of the focus\nhas remained on editing single images at a time. Whether and how we can\nsimultaneously edit large batches of images has remained understudied. With the\ngoal of minimizing human supervision in the editing process, this paper\npresents a novel method for interactive batch image editing using StyleGAN as\nthe medium. Given an edit specified by users in an example image (e.g., make\nthe face frontal), our method can automatically transfer that edit to other\ntest images, so that regardless of their initial state (pose), they all arrive\nat the same final state (e.g., all facing front). 
Extensive experiments\ndemonstrate that edits performed using our method have similar visual quality\nto existing single-image-editing methods, while having more visual consistency\nand saving significant time and human effort.\n","authors":["Thao Nguyen","Utkarsh Ojha","Yuheng Li","Haotian Liu","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2401.10219v1.pdf","comment":"Project page: https://thaoshibe.github.io/edit-one-for-all/"},{"id":"http://arxiv.org/abs/2401.10217v1","updated":"2024-01-18T18:57:40Z","published":"2024-01-18T18:57:40Z","title":"Explaining the Implicit Neural Canvas: Connecting Pixels to Neurons by\n Tracing their Contributions","summary":" The many variations of Implicit Neural Representations (INRs), where a neural\nnetwork is trained as a continuous representation of a signal, have tremendous\npractical utility for downstream tasks including novel view synthesis, video\ncompression, and image superresolution. Unfortunately, the inner workings of\nthese networks are seriously under-studied. Our work, eXplaining the Implicit\nNeural Canvas (XINC), is a unified framework for explaining properties of INRs\nby examining the strength of each neuron's contribution to each output pixel.\nWe call the aggregate of these contribution maps the Implicit Neural Canvas and\nwe use this concept to demonstrate that the INRs which we study learn to\n''see'' the frames they represent in surprising ways. For example, INRs tend to\nhave highly distributed representations. While lacking high-level object\nsemantics, they have a significant bias for color and edges, and are almost\nentirely space-agnostic. We arrive at our conclusions by examining how objects\nare represented across time in video INRs, using clustering to visualize\nsimilar neurons across layers and architectures, and show that this is\ndominated by motion. These insights demonstrate the general usefulness of our\nanalysis framework. Our project page is available at\nhttps://namithap10.github.io/xinc.\n","authors":["Namitha Padmanabhan","Matthew Gwilliam","Pulkit Kumar","Shishira R Maiya","Max Ehrlich","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2401.10217v1.pdf","comment":"Project site: https://namithap10.github.io/xinc"},{"id":"http://arxiv.org/abs/2401.10215v1","updated":"2024-01-18T18:56:34Z","published":"2024-01-18T18:56:34Z","title":"GPAvatar: Generalizable and Precise Head Avatar from Image(s)","summary":" Head avatar reconstruction, crucial for applications in virtual reality,\nonline meetings, gaming, and film industries, has garnered substantial\nattention within the computer vision community. The fundamental objective of\nthis field is to faithfully recreate the head avatar and precisely control\nexpressions and postures. Existing methods, categorized into 2D-based warping,\nmesh-based, and neural rendering approaches, present challenges in maintaining\nmulti-view consistency, incorporating non-facial information, and generalizing\nto new identities. In this paper, we propose a framework named GPAvatar that\nreconstructs 3D head avatars from one or several images in a single forward\npass. The key idea of this work is to introduce a dynamic point-based\nexpression field driven by a point cloud to precisely and effectively capture\nexpressions. Furthermore, we use a Multi Tri-planes Attention (MTA) fusion\nmodule in the tri-planes canonical field to leverage information from multiple\ninput images. 
The proposed method achieves faithful identity reconstruction,\nprecise expression control, and multi-view consistency, demonstrating promising\nresults for free-viewpoint rendering and novel view synthesis.\n","authors":["Xuangeng Chu","Yu Li","Ailing Zeng","Tianyu Yang","Lijian Lin","Yunfei Liu","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2401.10215v1.pdf","comment":"ICLR 2024, code is available at https://github.com/xg-chu/GPAvatar"},{"id":"http://arxiv.org/abs/2401.10208v1","updated":"2024-01-18T18:50:16Z","published":"2024-01-18T18:50:16Z","title":"MM-Interleaved: Interleaved Image-Text Generative Modeling via\n Multi-modal Feature Synchronizer","summary":" Developing generative models for interleaved image-text data has both\nresearch and practical value. It requires models to understand the interleaved\nsequences and subsequently generate images and text. However, existing attempts\nare limited by the issue that the fixed number of visual tokens cannot\nefficiently capture image details, which is particularly problematic in the\nmulti-image scenarios. To address this, this paper presents MM-Interleaved, an\nend-to-end generative model for interleaved image-text data. It introduces a\nmulti-scale and multi-image feature synchronizer module, allowing direct access\nto fine-grained image features in the previous context during the generation\nprocess. MM-Interleaved is end-to-end pre-trained on both paired and\ninterleaved image-text corpora. It is further enhanced through a supervised\nfine-tuning phase, wherein the model improves its ability to follow complex\nmulti-modal instructions. Experiments demonstrate the versatility of\nMM-Interleaved in recognizing visual details following multi-modal instructions\nand generating consistent images following both textual and visual conditions.\nCode and models are available at\n\\url{https://github.com/OpenGVLab/MM-Interleaved}.\n","authors":["Changyao Tian","Xizhou Zhu","Yuwen Xiong","Weiyun Wang","Zhe Chen","Wenhai Wang","Yuntao Chen","Lewei Lu","Tong Lu","Jie Zhou","Hongsheng Li","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2401.10208v1.pdf","comment":"20 pages, 9 figures, 17 tables"},{"id":"http://arxiv.org/abs/2401.10191v1","updated":"2024-01-18T18:25:29Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. 
The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v1.pdf","comment":"Accepted to ICLR2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2308.05021v3","updated":"2024-01-18T18:18:59Z","published":"2023-08-09T15:31:17Z","title":"On Error Propagation of Diffusion Models","summary":" Although diffusion models (DMs) have shown promising performances in a number\nof tasks (e.g., speech synthesis and image generation), they might suffer from\nerror propagation because of their sequential structure. However, this is not\ncertain because some sequential models, such as Conditional Random Field (CRF),\nare free from this problem. To address this issue, we develop a theoretical\nframework to mathematically formulate error propagation in the architecture of\nDMs. The framework contains three elements, including modular error, cumulative\nerror, and propagation equation. The modular and cumulative errors are related\nby the equation, which indicates that DMs are indeed affected by error\npropagation. Our theoretical study also suggests that the cumulative error is\nclosely related to the generation quality of DMs. Based on this finding, we\napply the cumulative error as a regularization term to reduce error\npropagation. Because the term is computationally intractable, we derive its\nupper bound and design a bootstrap algorithm to efficiently estimate the bound\nfor optimization. We have conducted extensive experiments on multiple image\ndatasets, showing that our proposed regularization reduces error propagation,\nsignificantly improves vanilla DMs, and outperforms previous baselines.\n","authors":["Yangming Li","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2308.05021v3.pdf","comment":"Accepted by ICLR-2024"},{"id":"http://arxiv.org/abs/2212.08044v2","updated":"2024-01-18T18:16:35Z","published":"2022-12-15T18:52:03Z","title":"Benchmarking Robustness of Multimodal Image-Text Models under\n Distribution Shift","summary":" Multimodal image-text models have shown remarkable performance in the past\nfew years. However, evaluating robustness against distribution shifts is\ncrucial before adopting them in real-world applications. In this work, we\ninvestigate the robustness of 12 popular open-sourced image-text models under\ncommon perturbations on five tasks (image-text retrieval, visual reasoning,\nvisual entailment, image captioning, and text-to-image generation). In\nparticular, we propose several new multimodal robustness benchmarks by applying\n17 image perturbation and 16 text perturbation techniques on top of existing\ndatasets. We observe that multimodal models are not robust to image and text\nperturbations, especially to image perturbations. Among the tested perturbation\nmethods, character-level perturbations constitute the most severe distribution\nshift for text, and zoom blur is the most severe shift for image data. We also\nintroduce two new robustness metrics (\textbf{MMI} for MultiModal Impact score\nand \textbf{MOR} for Missing Object Rate) for proper evaluations of multimodal\nmodels. 
We hope our extensive study sheds light on new directions for the\ndevelopment of robust multimodal models. More details can be found on the\nproject webpage: \url{https://MMRobustness.github.io}.\n","authors":["Jielin Qiu","Yi Zhu","Xingjian Shi","Florian Wenzel","Zhiqiang Tang","Ding Zhao","Bo Li","Mu Li"],"pdf_url":"https://arxiv.org/pdf/2212.08044v2.pdf","comment":"Accepted by Journal of Data-centric Machine Learning Research (DMLR)\n 2024"},{"id":"http://arxiv.org/abs/2309.14068v3","updated":"2024-01-18T18:16:33Z","published":"2023-09-25T12:03:32Z","title":"Soft Mixture Denoising: Beyond the Expressive Bottleneck of Diffusion\n Models","summary":" Because diffusion models have shown impressive performances in a number of\ntasks, such as image synthesis, there is a trend in recent works to prove (with\ncertain assumptions) that these models have strong approximation capabilities.\nIn this paper, we show that current diffusion models actually have an\nexpressive bottleneck in backward denoising and some assumption made by\nexisting theoretical guarantees is too strong. Based on this finding, we prove\nthat diffusion models have unbounded errors in both local and global denoising.\nIn light of our theoretical studies, we introduce soft mixture denoising (SMD),\nan expressive and efficient model for backward denoising. SMD not only permits\ndiffusion models to well approximate any Gaussian mixture distributions in\ntheory, but also is simple and efficient for implementation. Our experiments on\nmultiple image datasets show that SMD significantly improves different types of\ndiffusion models (e.g., DDPM), especially in the situation of few backward\niterations.\n","authors":["Yangming Li","Boris van Breugel","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2309.14068v3.pdf","comment":"Accepted by ICLR-2024"},{"id":"http://arxiv.org/abs/2308.03200v3","updated":"2024-01-18T18:10:38Z","published":"2023-08-06T20:10:12Z","title":"Uncovering local aggregated air quality index with smartphone captured\n images leveraging efficient deep convolutional neural network","summary":" The prevalence and mobility of smartphones make these a widely used tool for\nenvironmental health research. However, their potential for determining\naggregated air quality index (AQI) based on PM2.5 concentration in specific\nlocations remains largely unexplored in the existing literature. In this paper,\nwe thoroughly examine the challenges associated with predicting\nlocation-specific PM2.5 concentration using images taken with smartphone\ncameras. The focus of our study is on Dhaka, the capital of Bangladesh, due to\nits significant air pollution levels and the large population exposed to it.\nOur research involves the development of a Deep Convolutional Neural Network\n(DCNN), which we train using over a thousand outdoor images taken and\nannotated. These photos are captured at various locations in Dhaka, and their\nlabels are based on PM2.5 concentration data obtained from the local US\nconsulate, calculated using the NowCast algorithm. Through supervised learning,\nour model establishes a correlation index during training, enhancing its\nability to function as a Picture-based Predictor of PM2.5 Concentration (PPPC).\nThis enables the algorithm to calculate an equivalent daily averaged AQI index\nfrom a smartphone image. Unlike popular overly parameterized models, our model\nshows resource efficiency since it uses fewer parameters. 
Furthermore, test\nresults indicate that our model outperforms popular models like ViT and INN, as\nwell as popular CNN-based models such as VGG19, ResNet50, and MobileNetV2, in\npredicting location-specific PM2.5 concentration. Our dataset is the first\npublicly available collection that includes atmospheric images and\ncorresponding PM2.5 measurements from Dhaka. Our codes and dataset are\navailable at https://github.com/lepotatoguy/aqi.\n","authors":["Joyanta Jyoti Mondal","Md. Farhadul Islam","Raima Islam","Nowsin Kabir Rhidi","Sarfaraz Newaz","Meem Arafat Manab","A. B. M. Alim Al Islam","Jannatun Noor"],"pdf_url":"https://arxiv.org/pdf/2308.03200v3.pdf","comment":"18 pages, 7 figures, published to Nature Scientific Reports"},{"id":"http://arxiv.org/abs/2401.10178v1","updated":"2024-01-18T18:06:22Z","published":"2024-01-18T18:06:22Z","title":"Neural Echos: Depthwise Convolutional Filters Replicate Biological\n Receptive Fields","summary":" In this study, we present evidence suggesting that depthwise convolutional\nkernels are effectively replicating the structural intricacies of the\nbiological receptive fields observed in the mammalian retina. We provide\nanalytics of trained kernels from various state-of-the-art models\nsubstantiating this evidence. Inspired by this intriguing discovery, we propose\nan initialization scheme that draws inspiration from the biological receptive\nfields. Experimental analysis of the ImageNet dataset with multiple CNN\narchitectures featuring depthwise convolutions reveals a marked enhancement in\nthe accuracy of the learned model when initialized with biologically derived\nweights. This underlies the potential for biologically inspired computational\nmodels to further our understanding of vision processing systems and to improve\nthe efficacy of convolutional networks.\n","authors":["Zahra Babaiee","Peyman M. Kiasari","Daniela Rus","Radu Grosu"],"pdf_url":"https://arxiv.org/pdf/2401.10178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10176v1","updated":"2024-01-18T18:05:35Z","published":"2024-01-18T18:05:35Z","title":"Comprehensive OOD Detection Improvements","summary":" As machine learning becomes increasingly prevalent in impactful decisions,\nrecognizing when inference data is outside the model's expected input\ndistribution is paramount for giving context to predictions.\nOut-of-distribution (OOD) detection methods have been created for this task.\nSuch methods can be split into representation-based or logit-based methods from\nwhether they respectively utilize the model's embeddings or predictions for OOD\ndetection. In contrast to most papers which solely focus on one such group, we\naddress both. We employ dimensionality reduction on feature embeddings in\nrepresentation-based methods for both time speedups and improved performance.\nAdditionally, we propose DICE-COL, a modification of the popular logit-based\nmethod Directed Sparsification (DICE) that resolves an unnoticed flaw. 
We\ndemonstrate the effectiveness of our methods on the OpenOODv1.5 benchmark\nframework, where they significantly improve performance and set\nstate-of-the-art results.\n","authors":["Anish Lakkapragada","Amol Khanna","Edward Raff","Nathan Inkawhich"],"pdf_url":"https://arxiv.org/pdf/2401.10176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10171v1","updated":"2024-01-18T18:01:19Z","published":"2024-01-18T18:01:19Z","title":"SHINOBI: Shape and Illumination using Neural Object Decomposition via\n BRDF Optimization In-the-wild","summary":" We present SHINOBI, an end-to-end framework for the reconstruction of shape,\nmaterial, and illumination from object images captured with varying lighting,\npose, and background. Inverse rendering of an object based on unconstrained\nimage collections is a long-standing challenge in computer vision and graphics\nand requires a joint optimization over shape, radiance, and pose. We show that\nan implicit shape representation based on a multi-resolution hash encoding\nenables faster and robust shape reconstruction with joint camera alignment\noptimization that outperforms prior work. Further, to enable the editing of\nillumination and object reflectance (i.e. material) we jointly optimize BRDF\nand illumination together with the object's shape. Our method is class-agnostic\nand works on in-the-wild image collections of objects to produce relightable 3D\nassets for several use cases such as AR/VR, movies, games, etc. Project page:\nhttps://shinobi.aengelhardt.com Video:\nhttps://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be\n","authors":["Andreas Engelhardt","Amit Raj","Mark Boss","Yunzhi Zhang","Abhishek Kar","Yuanzhen Li","Deqing Sun","Ricardo Martin Brualla","Jonathan T. Barron","Hendrik P. A. Lensch","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2401.10171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10166v1","updated":"2024-01-18T17:55:39Z","published":"2024-01-18T17:55:39Z","title":"VMamba: Visual State Space Model","summary":" Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) stand as\nthe two most popular foundation models for visual representation learning.\nWhile CNNs exhibit remarkable scalability with linear complexity w.r.t. image\nresolution, ViTs surpass them in fitting capabilities despite contending with\nquadratic complexity. A closer inspection reveals that ViTs achieve superior\nvisual modeling performance through the incorporation of global receptive\nfields and dynamic weights. This observation motivates us to propose a novel\narchitecture that inherits these components while enhancing computational\nefficiency. To this end, we draw inspiration from the recently introduced state\nspace model and propose the Visual State Space Model (VMamba), which achieves\nlinear complexity without sacrificing global receptive fields. To address the\nencountered direction-sensitive issue, we introduce the Cross-Scan Module (CSM)\nto traverse the spatial domain and convert any non-causal visual image into\norder patch sequences. Extensive experimental results substantiate that VMamba\nnot only demonstrates promising capabilities across various visual perception\ntasks, but also exhibits more pronounced advantages over established benchmarks\nas the image resolution increases. 
Source code is available at\nhttps://github.com/MzeroMiko/VMamba.\n","authors":["Yue Liu","Yunjie Tian","Yuzhong Zhao","Hongtian Yu","Lingxi Xie","Yaowei Wang","Qixiang Ye","Yunfan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10166v1.pdf","comment":"13 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2310.19776v3","updated":"2024-01-18T17:53:45Z","published":"2023-10-30T17:45:32Z","title":"Learn to Categorize or Categorize to Learn? Self-Coding for Generalized\n Category Discovery","summary":" In the quest for unveiling novel categories at test time, we confront the\ninherent limitations of traditional supervised recognition models that are\nrestricted by a predefined category set. While strides have been made in the\nrealms of self-supervised and open-world learning towards test-time category\ndiscovery, a crucial yet often overlooked question persists: what exactly\ndelineates a category? In this paper, we conceptualize a category through the\nlens of optimization, viewing it as an optimal solution to a well-defined\nproblem. Harnessing this unique conceptualization, we propose a novel,\nefficient and self-supervised method capable of discovering previously unknown\ncategories at test time. A salient feature of our approach is the assignment of\nminimum length category codes to individual data instances, which encapsulates\nthe implicit category hierarchy prevalent in real-world datasets. This\nmechanism affords us enhanced control over category granularity, thereby\nequipping our model to handle fine-grained categories adeptly. Experimental\nevaluations, bolstered by state-of-the-art benchmark comparisons, testify to\nthe efficacy of our solution in managing unknown categories at test time.\nFurthermore, we fortify our proposition with a theoretical foundation,\nproviding proof of its optimality. Our code is available at\nhttps://github.com/SarahRastegar/InfoSieve.\n","authors":["Sarah Rastegar","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2310.19776v3.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.10150v1","updated":"2024-01-18T17:22:37Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during the inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process. 
Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos.\n","authors":["Changgu Chen","Junwei Shu","Lianggangxu Chen","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v1.pdf","comment":"9 pages, 4 figures, IJCAI paper"},{"id":"http://arxiv.org/abs/2401.10148v1","updated":"2024-01-18T17:22:11Z","published":"2024-01-18T17:22:11Z","title":"Explicitly Disentangled Representations in Object-Centric Learning","summary":" Extracting structured representations from raw visual data is an important\nand long-standing challenge in machine learning. Recently, techniques for\nunsupervised learning of object-centric representations have raised growing\ninterest. In this context, enhancing the robustness of the latent features can\nimprove the efficiency and effectiveness of the training of downstream tasks. A\npromising step in this direction is to disentangle the factors that cause\nvariation in the data. Previously, Invariant Slot Attention disentangled\nposition, scale, and orientation from the remaining features. Extending this\napproach, we focus on separating the shape and texture components. In\nparticular, we propose a novel architecture that biases object-centric models\ntoward disentangling shape and texture components into two non-overlapping\nsubsets of the latent space dimensions. These subsets are known a priori, hence\nbefore the training process. Experiments on a range of object-centric\nbenchmarks reveal that our approach achieves the desired disentanglement while\nalso numerically improving baseline performance in most cases. In addition, we\nshow that our method can generate novel textures for a specific object or\ntransfer textures between objects with distinct shapes.\n","authors":["Riccardo Majellaro","Jonathan Collu","Aske Plaat","Thomas M. Moerland"],"pdf_url":"https://arxiv.org/pdf/2401.10148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09172v3","updated":"2024-01-18T17:13:21Z","published":"2023-04-18T17:59:45Z","title":"Hyperbolic Image-Text Representations","summary":" Visual and linguistic concepts naturally organize themselves in a hierarchy,\nwhere a textual concept \"dog\" entails all images that contain dogs. Despite\nbeing intuitive, current large-scale vision and language models such as CLIP do\nnot explicitly capture such hierarchy. We propose MERU, a contrastive model\nthat yields hyperbolic representations of images and text. Hyperbolic spaces\nhave suitable geometric properties to embed tree-like data, so MERU can better\ncapture the underlying hierarchy in image-text datasets. Our results show that\nMERU learns a highly interpretable and structured representation space while\nbeing competitive with CLIP's performance on standard multi-modal tasks like\nimage classification and image-text retrieval. Our code and models are\navailable at https://www.github.com/facebookresearch/meru\n","authors":["Karan Desai","Maximilian Nickel","Tanmay Rajpurohit","Justin Johnson","Ramakrishna Vedantam"],"pdf_url":"https://arxiv.org/pdf/2304.09172v3.pdf","comment":"ICML 2023 (v3: Add link to code in abstract)"},{"id":"http://arxiv.org/abs/2401.10139v1","updated":"2024-01-18T17:06:21Z","published":"2024-01-18T17:06:21Z","title":"Model Compression Techniques in Biometrics Applications: A Survey","summary":" The development of deep learning algorithms has extensively empowered\nhumanity's task automatization capacity. 
However, the huge improvement in the\nperformance of these models is highly correlated with their increasing level of\ncomplexity, limiting their usefulness in human-oriented applications, which are\nusually deployed in resource-constrained devices. This led to the development\nof compression techniques that drastically reduce the computational and memory\ncosts of deep learning models without significant performance degradation. This\npaper aims to systematize the current literature on this topic by presenting a\ncomprehensive survey of model compression techniques in biometrics\napplications, namely quantization, knowledge distillation and pruning. We\nconduct a critical analysis of the comparative value of these techniques,\nfocusing on their advantages and disadvantages and presenting suggestions for\nfuture work directions that can potentially improve the current methods.\nAdditionally, we discuss and analyze the link between model bias and model\ncompression, highlighting the need to direct compression research toward model\nfairness in future works.\n","authors":["Eduarda Caldeira","Pedro C. Neto","Marco Huber","Naser Damer","Ana F. Sequeira"],"pdf_url":"https://arxiv.org/pdf/2401.10139v1.pdf","comment":"Under review at IEEE Journal"},{"id":"http://arxiv.org/abs/2401.10129v1","updated":"2024-01-18T16:59:27Z","published":"2024-01-18T16:59:27Z","title":"Few-shot learning for COVID-19 Chest X-Ray Classification with\n Imbalanced Data: An Inter vs. Intra Domain Study","summary":" Medical image datasets are essential for training models used in\ncomputer-aided diagnosis, treatment planning, and medical research. However,\nsome challenges are associated with these datasets, including variability in\ndata distribution, data scarcity, and transfer learning issues when using\nmodels pre-trained from generic images. This work studies the effect of these\nchallenges at the intra- and inter-domain level in few-shot learning scenarios\nwith severe data imbalance. For this, we propose a methodology based on Siamese\nneural networks in which a series of techniques are integrated to mitigate the\neffects of data scarcity and distribution imbalance. Specifically, different\ninitialization and data augmentation methods are analyzed, and four adaptations\nto Siamese networks of solutions to deal with imbalanced data are introduced,\nincluding data balancing and weighted loss, both separately and combined, and\nwith a different balance of pairing ratios. Moreover, we also assess the\ninference process considering four classifiers, namely Histogram, $k$NN, SVM,\nand Random Forest. Evaluation is performed on three chest X-ray datasets with\nannotated cases of both positive and negative COVID-19 diagnoses. The accuracy\nof each technique proposed for the Siamese architecture is analyzed separately\nand their results are compared to those obtained using equivalent methods on a\nstate-of-the-art CNN. 
We conclude that the introduced techniques offer\npromising improvements over the baseline in almost all cases, and that the\nselection of the technique may vary depending on the amount of data available\nand the level of imbalance.\n","authors":["Alejandro Galán-Cuenca","Antonio Javier Gallego","Marcelo Saval-Calvo","Antonio Pertusa"],"pdf_url":"https://arxiv.org/pdf/2401.10129v1.pdf","comment":"Submited to Pattern Analysis and Applications"},{"id":"http://arxiv.org/abs/2401.10128v1","updated":"2024-01-18T16:59:04Z","published":"2024-01-18T16:59:04Z","title":"Sub2Full: split spectrum to boost OCT despeckling without clean data","summary":" Optical coherence tomography (OCT) suffers from speckle noise, causing the\ndeterioration of image quality, especially in high-resolution modalities like\nvisible light OCT (vis-OCT). The potential of conventional supervised deep\nlearning denoising methods is limited by the difficulty of obtaining clean\ndata. Here, we proposed an innovative self-supervised strategy called Sub2Full\n(S2F) for OCT despeckling without clean data. This approach works by acquiring\ntwo repeated B-scans, splitting the spectrum of the first repeat as a\nlow-resolution input, and utilizing the full spectrum of the second repeat as\nthe high-resolution target. The proposed method was validated on vis-OCT\nretinal images visualizing sublaminar structures in outer retina and\ndemonstrated superior performance over conventional Noise2Noise and Noise2Void\nschemes. The code is available at\nhttps://github.com/PittOCT/Sub2Full-OCT-Denoising.\n","authors":["Lingyun Wang","Jose A Sahel","Shaohua Pi"],"pdf_url":"https://arxiv.org/pdf/2401.10128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06397v2","updated":"2024-01-18T16:40:22Z","published":"2024-01-12T06:35:09Z","title":"UMG-CLIP: A Unified Multi-Granularity Vision Generalist for Open-World\n Understanding","summary":" Vision-language foundation models, represented by Contrastive language-image\npre-training (CLIP), have gained increasing attention for jointly understanding\nboth vision and textual tasks. However, existing approaches primarily focus on\ntraining models to match global image representations with textual\ndescriptions, thereby overlooking the critical alignment between local regions\nand corresponding text tokens. This paper extends CLIP with multi-granularity\nalignment. Notably, we deliberately construct a new dataset comprising pseudo\nannotations at various levels of granularities, encompassing image-level,\nregion-level, and pixel-level captions/tags. Accordingly, we develop a unified\nmulti-granularity learning framework, named UMG-CLIP, that simultaneously\nempowers the model with versatile perception abilities across different levels\nof detail. Equipped with parameter efficient tuning, UMG-CLIP surpasses current\nwidely used CLIP models and achieves state-of-the-art performance on diverse\nimage understanding benchmarks, including open-world recognition, retrieval,\nsemantic segmentation, and panoptic segmentation tasks. 
We hope UMG-CLIP can\nserve as a valuable option for advancing vision-language foundation models.\n","authors":["Bowen Shi","Peisen Zhao","Zichen Wang","Yuhang Zhang","Yaoming Wang","Jin Li","Wenrui Dai","Junni Zou","Hongkai Xiong","Qi Tian","Xiaopeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.06397v2.pdf","comment":"The paper is undergoing internal legal review and will be resubmitted\n once it passes the review"},{"id":"http://arxiv.org/abs/2401.10113v1","updated":"2024-01-18T16:35:37Z","published":"2024-01-18T16:35:37Z","title":"Exposing Lip-syncing Deepfakes from Mouth Inconsistencies","summary":" A lip-syncing deepfake is a digitally manipulated video in which a person's\nlip movements are created convincingly using AI models to match altered or\nentirely new audio. Lip-syncing deepfakes are a dangerous type of deepfakes as\nthe artifacts are limited to the lip region and more difficult to discern. In\nthis paper, we describe a novel approach, LIP-syncing detection based on mouth\nINConsistency (LIPINC), for lip-syncing deepfake detection by identifying\ntemporal inconsistencies in the mouth region. These inconsistencies are seen in\nthe adjacent frames and throughout the video. Our model can successfully\ncapture these irregularities and outperforms the state-of-the-art methods on\nseveral benchmark deepfake datasets.\n","authors":["Soumyya Kanti Datta","Shan Jia","Siwei Lyu"],"pdf_url":"https://arxiv.org/pdf/2401.10113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10110v1","updated":"2024-01-18T16:27:09Z","published":"2024-01-18T16:27:09Z","title":"VIPTR: A Vision Permutable Extractor for Fast and Efficient Scene Text\n Recognition","summary":" Scene Text Recognition (STR) is a challenging task that involves recognizing\ntext within images of natural scenes. Although current state-of-the-art models\nfor STR exhibit high performance, they typically suffer from low inference\nefficiency due to their reliance on hybrid architectures comprised of visual\nencoders and sequence decoders. In this work, we propose the VIsion Permutable\nextractor for fast and efficient scene Text Recognition (VIPTR), which achieves\nan impressive balance between high performance and rapid inference speeds in\nthe domain of STR. Specifically, VIPTR leverages a visual-semantic extractor\nwith a pyramid structure, characterized by multiple self-attention layers,\nwhile eschewing the traditional sequence decoder. This design choice results in\na lightweight and efficient model capable of handling inputs of varying sizes.\nExtensive experimental results on various standard datasets for both Chinese\nand English scene text recognition validate the superiority of VIPTR. Notably,\nthe VIPTR-T (Tiny) variant delivers highly competitive accuracy on par with\nother lightweight models and achieves SOTA inference speeds. Meanwhile, the\nVIPTR-L (Large) variant attains greater recognition accuracy, while maintaining\na low parameter count and favorable inference speed. Our proposed method\nprovides a compelling solution for the STR challenge, which blends high\naccuracy with efficiency and greatly benefits real-world applications requiring\nfast and reliable text recognition. 
The code is publicly available at\nhttps://github.com/cxfyxl/VIPTR.\n","authors":["Xianfu Cheng","Weixiao Zhou","Xiang Li","Xiaoming Chen","Jian Yang","Tongliang Li","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2401.10110v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.00159 by other authors"},{"id":"http://arxiv.org/abs/2310.12086v2","updated":"2024-01-18T16:20:06Z","published":"2023-10-18T16:27:49Z","title":"FactCHD: Benchmarking Fact-Conflicting Hallucination Detection","summary":" Despite their impressive generative capabilities, LLMs are hindered by\nfact-conflicting hallucinations in real-world applications. The accurate\nidentification of hallucinations in texts generated by LLMs, especially in\ncomplex inferential scenarios, is a relatively unexplored area. To address this\ngap, we present FactCHD, a dedicated benchmark designed for the detection of\nfact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset\nthat spans various factuality patterns, including vanilla, multi-hop,\ncomparison, and set operation. A distinctive element of FactCHD is its\nintegration of fact-based evidence chains, significantly enhancing the depth of\nevaluating the detectors' explanations. Experiments on different LLMs expose\nthe shortcomings of current approaches in detecting factual errors accurately.\nFurthermore, we introduce Truth-Triangulator that synthesizes reflective\nconsiderations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming\nto yield more credible detection through the amalgamation of predictive results\nand evidence. The benchmark dataset is available at\nhttps://github.com/zjunlp/FactCHD.\n","authors":["Xiang Chen","Duanzheng Song","Honghao Gui","Chenxi Wang","Ningyu Zhang","Jiang Yong","Fei Huang","Chengfei Lv","Dan Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.12086v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.02119v2","updated":"2024-01-18T15:59:34Z","published":"2023-09-05T10:52:21Z","title":"Hierarchical Masked 3D Diffusion Model for Video Outpainting","summary":" Video outpainting aims to adequately complete missing areas at the edges of\nvideo frames. Compared to image outpainting, it presents an additional\nchallenge as the model should maintain the temporal consistency of the filled\narea. In this paper, we introduce a masked 3D diffusion model for video\noutpainting. We use the technique of mask modeling to train the 3D diffusion\nmodel. This allows us to use multiple guide frames to connect the results of\nmultiple video clip inferences, thus ensuring temporal consistency and reducing\njitter between adjacent frames. Meanwhile, we extract the global frames of the\nvideo as prompts and guide the model to obtain information other than the\ncurrent video clip using cross-attention. We also introduce a hybrid\ncoarse-to-fine inference pipeline to alleviate the artifact accumulation\nproblem. The existing coarse-to-fine pipeline only uses the infilling strategy,\nwhich brings degradation because the time interval of the sparse frames is too\nlarge. Our pipeline benefits from bidirectional learning of the mask modeling\nand thus can employ a hybrid strategy of infilling and interpolation when\ngenerating sparse frames. Experiments show that our method achieves\nstate-of-the-art results in video outpainting tasks. 
More results and codes are\nprovided at our https://fanfanda.github.io/M3DDM/.\n","authors":["Fanda Fan","Chaoxu Guo","Litong Gong","Biao Wang","Tiezheng Ge","Yuning Jiang","Chunjie Luo","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.02119v2.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2401.10090v1","updated":"2024-01-18T15:56:23Z","published":"2024-01-18T15:56:23Z","title":"Cross-Modality Perturbation Synergy Attack for Person Re-identification","summary":" In recent years, there has been significant research focusing on addressing\nsecurity concerns in single-modal person re-identification (ReID) systems that\nare based on RGB images. However, the safety of cross-modality scenarios, which\nare more commonly encountered in practical applications involving images\ncaptured by infrared cameras, has not received adequate attention. The main\nchallenge in cross-modality ReID lies in effectively dealing with visual\ndifferences between different modalities. For instance, infrared images are\ntypically grayscale, unlike visible images that contain color information.\nExisting attack methods have primarily focused on the characteristics of the\nvisible image modality, overlooking the features of other modalities and the\nvariations in data distribution among different modalities. This oversight can\npotentially undermine the effectiveness of these methods in image retrieval\nacross diverse modalities. This study represents the first exploration into the\nsecurity of cross-modality ReID models and proposes a universal perturbation\nattack specifically designed for cross-modality ReID. This attack optimizes\nperturbations by leveraging gradients from diverse modality data, thereby\ndisrupting the discriminator and reinforcing the differences between\nmodalities. We conducted experiments on two widely used cross-modality\ndatasets, namely RegDB and SYSU, which not only demonstrated the effectiveness\nof our method but also provided insights for future enhancements in the\nrobustness of cross-modality ReID systems.\n","authors":["Yunpeng Gong"," others"],"pdf_url":"https://arxiv.org/pdf/2401.10090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02116v2","updated":"2024-01-18T15:47:07Z","published":"2023-12-04T18:48:02Z","title":"GIVT: Generative Infinite-Vocabulary Transformers","summary":" We introduce generative infinite-vocabulary transformers (GIVT) which\ngenerate vector sequences with real-valued entries, instead of discrete tokens\nfrom a finite vocabulary. To this end, we propose two surprisingly simple\nmodifications to decoder-only transformers: 1) at the input, we replace the\nfinite-vocabulary lookup table with a linear projection of the input vectors;\nand 2) at the output, we replace the logits prediction (usually mapped to a\ncategorical distribution) with the parameters of a multivariate Gaussian\nmixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT,\nwhere transformers are used to model the discrete latent sequences of a VQ-VAE,\nwe use GIVT to model the unquantized real-valued latent sequences of a VAE.\nWhen applying GIVT to class-conditional image generation with iterative masked\nmodeling, we show competitive results with MaskGIT, while our approach\noutperforms both VQ-GAN and MaskGIT when using it for causal modeling. 
Finally,\nwe obtain competitive results outside of image generation when applying our\napproach to panoptic segmentation and depth estimation with a VAE-based variant\nof the UViM framework.\n","authors":["Michael Tschannen","Cian Eastwood","Fabian Mentzer"],"pdf_url":"https://arxiv.org/pdf/2312.02116v2.pdf","comment":"v2: add related NLP work, loss details"},{"id":"http://arxiv.org/abs/2401.10061v1","updated":"2024-01-18T15:30:58Z","published":"2024-01-18T15:30:58Z","title":"DiffusionGPT: LLM-Driven Text-to-Image Generation System","summary":" Diffusion models have opened up new avenues for the field of image\ngeneration, resulting in the proliferation of high-quality models shared on\nopen-source platforms. However, a major challenge persists in current\ntext-to-image systems are often unable to handle diverse inputs, or are limited\nto single model results. Current unified attempts often fall into two\northogonal aspects: i) parse Diverse Prompts in input stage; ii) activate\nexpert model to output. To combine the best of both worlds, we propose\nDiffusionGPT, which leverages Large Language Models (LLM) to offer a unified\ngeneration system capable of seamlessly accommodating various types of prompts\nand integrating domain-expert models. DiffusionGPT constructs domain-specific\nTrees for various generative models based on prior knowledge. When provided\nwith an input, the LLM parses the prompt and employs the Trees-of-Thought to\nguide the selection of an appropriate model, thereby relaxing input constraints\nand ensuring exceptional performance across diverse domains. Moreover, we\nintroduce Advantage Databases, where the Tree-of-Thought is enriched with human\nfeedback, aligning the model selection process with human preferences. Through\nextensive experiments and comparisons, we demonstrate the effectiveness of\nDiffusionGPT, showcasing its potential for pushing the boundaries of image\nsynthesis in diverse domains.\n","authors":["Jie Qin","Jie Wu","Weifeng Chen","Yuxi Ren","Huixia Li","Hefeng Wu","Xuefeng Xiao","Rui Wang","Shilei Wen"],"pdf_url":"https://arxiv.org/pdf/2401.10061v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12838v2","updated":"2024-01-18T15:27:37Z","published":"2023-12-20T08:42:57Z","title":"FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image\n Segmentation against Heterogeneous Annotation Noise","summary":" Federated learning (FL) has emerged as a promising paradigm for training\nsegmentation models on decentralized medical data, owing to its\nprivacy-preserving property. However, existing research overlooks the prevalent\nannotation noise encountered in real-world medical datasets, which limits the\nperformance ceilings of FL. In this paper, we, for the first time, identify and\ntackle this problem. For problem formulation, we propose a contour evolution\nfor modeling non-independent and identically distributed (Non-IID) noise across\npixels within each client and then extend it to the case of multi-source data\nto form a heterogeneous noise model (i.e., Non-IID annotation noise across\nclients). For robust learning from annotations with such two-level Non-IID\nnoise, we emphasize the importance of data quality in model aggregation,\nallowing high-quality clients to have a greater impact on FL. 
To achieve this,\nwe propose Federated learning with Annotation quAlity-aware AggregatIon, named\nFedA3I, by introducing a quality factor based on client-wise noise estimation.\nSpecifically, noise estimation at each client is accomplished through the\nGaussian mixture model and then incorporated into model aggregation in a\nlayer-wise manner to up-weight high-quality clients. Extensive experiments on\ntwo real-world medical image segmentation datasets demonstrate the superior\nperformance of FedA$^3$I against the state-of-the-art approaches in dealing\nwith cross-client annotation noise. The code is available at\nhttps://github.com/wnn2000/FedAAAI.\n","authors":["Nannan Wu","Zhaobin Sun","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2312.12838v2.pdf","comment":"Accepted at AAAI'24"},{"id":"http://arxiv.org/abs/2401.10050v1","updated":"2024-01-18T15:15:32Z","published":"2024-01-18T15:15:32Z","title":"ContextMix: A context-aware data augmentation method for industrial\n visual inspection systems","summary":" While deep neural networks have achieved remarkable performance, data\naugmentation has emerged as a crucial strategy to mitigate overfitting and\nenhance network performance. These techniques hold particular significance in\nindustrial manufacturing contexts. Recently, image mixing-based methods have\nbeen introduced, exhibiting improved performance on public benchmark datasets.\nHowever, their application to industrial tasks remains challenging. The\nmanufacturing environment generates massive amounts of unlabeled data on a\ndaily basis, with only a few instances of abnormal data occurrences. This leads\nto severe data imbalance. Thus, creating well-balanced datasets is not\nstraightforward due to the high costs associated with labeling. Nonetheless,\nthis is a crucial step for enhancing productivity. For this reason, we\nintroduce ContextMix, a method tailored for industrial applications and\nbenchmark datasets. ContextMix generates novel data by resizing entire images\nand integrating them into other images within the batch. This approach enables\nour method to learn discriminative features based on varying sizes from resized\nimages and train informative secondary features for object recognition using\noccluded images. With the minimal additional computation cost of image\nresizing, ContextMix enhances performance compared to existing augmentation\ntechniques. We evaluate its effectiveness across classification, detection, and\nsegmentation tasks using various network architectures on public benchmark\ndatasets. Our proposed method demonstrates improved results across a range of\nrobustness tasks. Its efficacy in real industrial environments is particularly\nnoteworthy, as demonstrated using the passive component dataset.\n","authors":["Hyungmin Kim","Donghun Kim","Pyunghwan Ahn","Sungho Suh","Hansang Cho","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10050v1.pdf","comment":"Accepted to EAAI"},{"id":"http://arxiv.org/abs/2401.10044v1","updated":"2024-01-18T15:08:42Z","published":"2024-01-18T15:08:42Z","title":"Deep spatial context: when attention-based models meet spatial\n regression","summary":" We propose 'Deep spatial context' (DSCon) method, which serves for\ninvestigation of the attention-based vision models using the concept of spatial\ncontext. It was inspired by histopathologists, however, the method can be\napplied to various domains. 
The DSCon allows for a quantitative measure of the\nspatial context's role using three Spatial Context Measures: $SCM_{features}$,\n$SCM_{targets}$, $SCM_{residuals}$ to distinguish whether the spatial context\nis observable within the features of neighboring regions, their target values\n(attention scores) or residuals, respectively. It is achieved by integrating\nspatial regression into the pipeline. The DSCon helps to verify research\nquestions. The experiments reveal that spatial relationships are much bigger in\nthe case of the classification of tumor lesions than normal tissues. Moreover,\nit turns out that the larger the size of the neighborhood taken into account\nwithin spatial regression, the less valuable contextual information is.\nFurthermore, it is observed that the spatial context measure is the largest\nwhen considered within the feature space as opposed to the targets and\nresiduals.\n","authors":["Paulina Tomaszewska","Elżbieta Sienkiewicz","Mai P. Hoang","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2401.10044v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17189v2","updated":"2024-01-18T15:06:38Z","published":"2023-09-29T12:38:00Z","title":"RTFS-Net: Recurrent time-frequency modelling for efficient audio-visual\n speech separation","summary":" Audio-visual speech separation methods aim to integrate different modalities\nto generate high-quality separated speech, thereby enhancing the performance of\ndownstream tasks such as speech recognition. Most existing state-of-the-art\n(SOTA) models operate in the time domain. However, their overly simplistic\napproach to modeling acoustic features often necessitates larger and more\ncomputationally intensive models in order to achieve SOTA performance. In this\npaper, we present a novel time-frequency domain audio-visual speech separation\nmethod: Recurrent Time-Frequency Separation Network (RTFS-Net), which applies\nits algorithms on the complex time-frequency bins yielded by the Short-Time\nFourier Transform. We model and capture the time and frequency dimensions of\nthe audio independently using a multi-layered RNN along each dimension.\nFurthermore, we introduce a unique attention-based fusion technique for the\nefficient integration of audio and visual information, and a new mask\nseparation approach that takes advantage of the intrinsic spectral nature of\nthe acoustic features for a clearer separation. RTFS-Net outperforms the\nprevious SOTA method using only 10% of the parameters and 18% of the MACs. This\nis the first time-frequency domain audio-visual speech separation method to\noutperform all contemporary time-domain counterparts.\n","authors":["Samuel Pegg","Kai Li","Xiaolin Hu"],"pdf_url":"https://arxiv.org/pdf/2309.17189v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10041v1","updated":"2024-01-18T15:05:57Z","published":"2024-01-18T15:05:57Z","title":"CMFN: Cross-Modal Fusion Network for Irregular Scene Text Recognition","summary":" Scene text recognition, as a cross-modal task involving vision and text, is\nan important research topic in computer vision. Most existing methods use\nlanguage models to extract semantic information for optimizing visual\nrecognition. However, the guidance of visual cues is ignored in the process of\nsemantic mining, which limits the performance of the algorithm in recognizing\nirregular scene text. 
To tackle this issue, we propose a novel cross-modal\nfusion network (CMFN) for irregular scene text recognition, which incorporates\nvisual cues into the semantic mining process. Specifically, CMFN consists of a\nposition self-enhanced encoder, a visual recognition branch and an iterative\nsemantic recognition branch. The position self-enhanced encoder provides\ncharacter sequence position encoding for both the visual recognition branch and\nthe iterative semantic recognition branch. The visual recognition branch\ncarries out visual recognition based on the visual features extracted by CNN\nand the position encoding information provided by the position self-enhanced\nencoder. The iterative semantic recognition branch, which consists of a\nlanguage recognition module and a cross-modal fusion gate, simulates the way\nthat human recognizes scene text and integrates cross-modal visual cues for\ntext recognition. The experiments demonstrate that the proposed CMFN algorithm\nachieves comparable performance to state-of-the-art algorithms, indicating its\neffectiveness.\n","authors":["Jinzhi Zheng","Ruyi Ji","Libo Zhang","Yanjun Wu","Chen Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10041v1.pdf","comment":"Accepted to ICONIP 2023"},{"id":"http://arxiv.org/abs/2401.10039v1","updated":"2024-01-18T15:04:46Z","published":"2024-01-18T15:04:46Z","title":"GPT4Ego: Unleashing the Potential of Pre-trained Models for Zero-Shot\n Egocentric Action Recognition","summary":" Vision-Language Models (VLMs), pre-trained on large-scale datasets, have\nshown impressive performance in various visual recognition tasks. This\nadvancement paves the way for notable performance in Zero-Shot Egocentric\nAction Recognition (ZS-EAR). Typically, VLMs handle ZS-EAR as a global\nvideo-text matching task, which often leads to suboptimal alignment of vision\nand linguistic knowledge. We propose a refined approach for ZS-EAR using VLMs,\nemphasizing fine-grained concept-description alignment that capitalizes on the\nrich semantic and contextual details in egocentric videos. In this paper, we\nintroduce GPT4Ego, a straightforward yet remarkably potent VLM framework for\nZS-EAR, designed to enhance the fine-grained alignment of concept and\ndescription between vision and language. Extensive experiments demonstrate\nGPT4Ego significantly outperforms existing VLMs on three large-scale egocentric\nvideo benchmarks, i.e., EPIC-KITCHENS-100 (33.2%, +9.4%), EGTEA (39.6%, +5.5%),\nand CharadesEgo (31.5%, +2.6%).\n","authors":["Guangzhao Dai","Xiangbo Shu","Wenhao Wu"],"pdf_url":"https://arxiv.org/pdf/2401.10039v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.08219v2","updated":"2024-01-18T15:02:56Z","published":"2022-06-16T14:41:32Z","title":"HaGRID - HAnd Gesture Recognition Image Dataset","summary":" This paper introduces an enormous dataset, HaGRID (HAnd Gesture Recognition\nImage Dataset), to build a hand gesture recognition (HGR) system concentrating\non interaction with devices to manage them. That is why all 18 chosen gestures\nare endowed with the semiotic function and can be interpreted as a specific\naction. Although the gestures are static, they were picked up, especially for\nthe ability to design several dynamic gestures. It allows the trained model to\nrecognize not only static gestures such as \"like\" and \"stop\" but also \"swipes\"\nand \"drag and drop\" dynamic gestures. The HaGRID contains 554,800 images and\nbounding box annotations with gesture labels to solve hand detection and\ngesture classification tasks. 
The low variability in context and subjects of\nother datasets was the reason for creating the dataset without such\nlimitations. Utilizing crowdsourcing platforms allowed us to collect samples\nrecorded by 37,583 subjects in at least as many scenes with subject-to-camera\ndistances from 0.5 to 4 meters in various natural light conditions. The\ninfluence of the diversity characteristics was assessed in ablation study\nexperiments. Also, we demonstrate the HaGRID ability to be used for pretraining\nmodels in HGR tasks. The HaGRID and pretrained models are publicly available.\n","authors":["Alexander Kapitanov","Karina Kvanchiani","Alexander Nagaev","Roman Kraynov","Andrei Makhliarchuk"],"pdf_url":"https://arxiv.org/pdf/2206.08219v2.pdf","comment":"12 pages, 5 figures, open-source dataset for computer vision"},{"id":"http://arxiv.org/abs/2401.10037v1","updated":"2024-01-18T15:00:28Z","published":"2024-01-18T15:00:28Z","title":"Depth Over RGB: Automatic Evaluation of Open Surgery Skills Using Depth\n Camera","summary":" Purpose: In this paper, we present a novel approach to the automatic\nevaluation of open surgery skills using depth cameras. This work is intended to\nshow that depth cameras achieve similar results to RGB cameras, which is the\ncommon method in the automatic evaluation of open surgery skills. Moreover,\ndepth cameras offer advantages such as robustness to lighting variations,\ncamera positioning, simplified data compression, and enhanced privacy, making\nthem a promising alternative to RGB cameras.\n Methods: Experts and novice surgeons completed two simulators of open\nsuturing. We focused on hand and tool detection, and action segmentation in\nsuturing procedures. YOLOv8 was used for tool detection in RGB and depth\nvideos. Furthermore, UVAST and MSTCN++ were used for action segmentation. Our\nstudy includes the collection and annotation of a dataset recorded with Azure\nKinect.\n Results: We demonstrated that using depth cameras in object detection and\naction segmentation achieves comparable results to RGB cameras. Furthermore, we\nanalyzed 3D hand path length, revealing significant differences between experts\nand novice surgeons, emphasizing the potential of depth cameras in capturing\nsurgical skills. We also investigated the influence of camera angles on\nmeasurement accuracy, highlighting the advantages of 3D cameras in providing a\nmore accurate representation of hand movements.\n Conclusion: Our research contributes to advancing the field of surgical skill\nassessment by leveraging depth cameras for more reliable and privacy\nevaluations. The findings suggest that depth cameras can be valuable in\nassessing surgical skills and provide a foundation for future research in this\narea.\n","authors":["Ido Zuckerman","Nicole Werner","Jonathan Kouchly","Emma Huston","Shannon DiMarco","Paul DiMusto","Shlomi Laufer"],"pdf_url":"https://arxiv.org/pdf/2401.10037v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11086v2","updated":"2024-01-18T14:40:43Z","published":"2022-11-20T21:18:41Z","title":"An Embarrassingly Simple Baseline for Imbalanced Semi-Supervised\n Learning","summary":" Semi-supervised learning (SSL) has shown great promise in leveraging\nunlabeled data to improve model performance. While standard SSL assumes uniform\ndata distribution, we consider a more realistic and challenging setting called\nimbalanced SSL, where imbalanced class distributions occur in both labeled and\nunlabeled data. 
Although there are existing endeavors to tackle this challenge,\ntheir performance degenerates when facing severe imbalance since they can not\nreduce the class imbalance sufficiently and effectively. In this paper, we\nstudy a simple yet overlooked baseline -- SimiS -- which tackles data imbalance\nby simply supplementing labeled data with pseudo-labels, according to the\ndifference in class distribution from the most frequent class. Such a simple\nbaseline turns out to be highly effective in reducing class imbalance. It\noutperforms existing methods by a significant margin, e.g., 12.8%, 13.6%, and\n16.7% over previous SOTA on CIFAR100-LT, FOOD101-LT, and ImageNet127\nrespectively. The reduced imbalance results in faster convergence and better\npseudo-label accuracy of SimiS. The simplicity of our method also makes it\npossible to be combined with other re-balancing techniques to improve the\nperformance further. Moreover, our method shows great robustness to a wide\nrange of data distributions, which holds enormous potential in practice. Code\nwill be publicly available.\n","authors":["Hao Chen","Yue Fan","Yidong Wang","Jindong Wang","Bernt Schiele","Xing Xie","Marios Savvides","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2211.11086v2.pdf","comment":"Issues in the paper, will re-open later"},{"id":"http://arxiv.org/abs/2401.10017v1","updated":"2024-01-18T14:36:51Z","published":"2024-01-18T14:36:51Z","title":"Text Region Multiple Information Perception Network for Scene Text\n Detection","summary":" Segmentation-based scene text detection algorithms can handle arbitrary shape\nscene texts and have strong robustness and adaptability, so it has attracted\nwide attention. Existing segmentation-based scene text detection algorithms\nusually only segment the pixels in the center region of the text, while\nignoring other information of the text region, such as edge information,\ndistance information, etc., thus limiting the detection accuracy of the\nalgorithm for scene text. This paper proposes a plug-and-play module called the\nRegion Multiple Information Perception Module (RMIPM) to enhance the detection\nperformance of segmentation-based algorithms. Specifically, we design an\nimproved module that can perceive various types of information about scene text\nregions, such as text foreground classification maps, distance maps, direction\nmaps, etc. Experiments on MSRA-TD500 and TotalText datasets show that our\nmethod achieves comparable performance with current state-of-the-art\nalgorithms.\n","authors":["Jinzhi Zheng","Libo Zhang","Yanjun Wu","Chen Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10017v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.10011v1","updated":"2024-01-18T14:27:01Z","published":"2024-01-18T14:27:01Z","title":"CPCL: Cross-Modal Prototypical Contrastive Learning for Weakly\n Supervised Text-based Person Re-Identification","summary":" Weakly supervised text-based person re-identification (TPRe-ID) seeks to\nretrieve images of a target person using textual descriptions, without relying\non identity annotations and is more challenging and practical. The primary\nchallenge is the intra-class differences, encompassing intra-modal feature\nvariations and cross-modal semantic gaps. Prior works have focused on\ninstance-level samples and ignored prototypical features of each person which\nare intrinsic and invariant. Toward this, we propose a Cross-Modal Prototypical\nContrastive Learning (CPCL) method. 
In practice, the CPCL introduces the CLIP\nmodel to weakly supervised TPRe-ID for the first time, mapping visual and\ntextual instances into a shared latent space. Subsequently, the proposed\nPrototypical Multi-modal Memory (PMM) module captures associations between\nheterogeneous modalities of image-text pairs belonging to the same person\nthrough the Hybrid Cross-modal Matching (HCM) module in a many-to-many mapping\nfashion. Moreover, the Outlier Pseudo Label Mining (OPLM) module further\ndistinguishes valuable outlier samples from each modality, enhancing the\ncreation of more reliable clusters by mining implicit relationships between\nimage-text pairs. Experimental results demonstrate that our proposed CPCL\nattains state-of-the-art performance on all three public datasets, with a\nsignificant improvement of 11.58%, 8.77% and 5.25% in Rank@1 accuracy on\nCUHK-PEDES, ICFG-PEDES and RSTPReid datasets, respectively. The code is\navailable at https://github.com/codeGallery24/CPCL.\n","authors":["Yanwei Zheng","Xinpeng Zhao","Chuanlin Lan","Xiaowei Zhang","Bowen Huang","Jibin Yang","Dongxiao Yu"],"pdf_url":"https://arxiv.org/pdf/2401.10011v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.10005v1","updated":"2024-01-18T14:21:56Z","published":"2024-01-18T14:21:56Z","title":"Advancing Large Multi-modal Models with Explicit Chain-of-Reasoning and\n Visual Question Generation","summary":" The increasing demand for intelligent systems capable of interpreting and\nreasoning about visual content requires the development of Large Multi-Modal\nModels (LMMs) that are not only accurate but also have explicit reasoning\ncapabilities. This paper presents a novel approach to imbue an LMM with the\nability to conduct explicit reasoning based on visual content and textual\ninstructions. We introduce a system that can ask a question to acquire\nnecessary knowledge, thereby enhancing the robustness and explicability of the\nreasoning process. Our method comprises the development of a novel dataset\ngenerated by a Large Language Model (LLM), designed to promote chain-of-thought\nreasoning combined with a question-asking mechanism. We designed an LMM, which\nhas high capabilities on region awareness to address the intricate requirements\nof image-text alignment. The model undergoes a three-stage training phase,\nstarting with large-scale image-text alignment using a large-scale datasets,\nfollowed by instruction tuning, and fine-tuning with a focus on\nchain-of-thought reasoning. The results demonstrate a stride toward a more\nrobust, accurate, and interpretable LMM, capable of reasoning explicitly and\nseeking information proactively when confronted with ambiguous visual input.\n","authors":["Kohei Uehara","Nabarun Goswami","Hanqin Wang","Toshiaki Baba","Kohtaro Tanaka","Tomohiro Hashimoto","Kai Wang","Rei Ito","Takagi Naoya","Ryo Umagami","Yingyi Wen","Tanachai Anakewat","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2401.10005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12307v2","updated":"2024-01-18T14:15:19Z","published":"2023-03-22T04:49:23Z","title":"Curvature-Balanced Feature Manifold Learning for Long-Tailed\n Classification","summary":" To address the challenges of long-tailed classification, researchers have\nproposed several approaches to reduce model bias, most of which assume that\nclasses with few samples are weak classes. 
However, recent studies have shown\nthat tail classes are not always hard to learn, and model bias has been\nobserved on sample-balanced datasets, suggesting the existence of other factors\nthat affect model bias. In this work, we systematically propose a series of\ngeometric measurements for perceptual manifolds in deep neural networks, and\nthen explore the effect of the geometric characteristics of perceptual\nmanifolds on classification difficulty and how learning shapes the geometric\ncharacteristics of perceptual manifolds. An unanticipated finding is that the\ncorrelation between the class accuracy and the separation degree of perceptual\nmanifolds gradually decreases during training, while the negative correlation\nwith the curvature gradually increases, implying that curvature imbalance leads\nto model bias. Therefore, we propose curvature regularization to facilitate the\nmodel to learn curvature-balanced and flatter perceptual manifolds. Evaluations\non multiple long-tailed and non-long-tailed datasets show the excellent\nperformance and exciting generality of our approach, especially in achieving\nsignificant performance improvements based on current state-of-the-art\ntechniques. Our work opens up a geometric analysis perspective on model bias\nand reminds researchers to pay attention to model bias on non-long-tailed and\neven sample-balanced datasets. The code and model will be made public.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Lingling Li"],"pdf_url":"https://arxiv.org/pdf/2303.12307v2.pdf","comment":"20pages, Accepted by CVPR 2023"},{"id":"http://arxiv.org/abs/2304.00429v5","updated":"2024-01-18T14:15:19Z","published":"2023-04-02T02:36:30Z","title":"Information Recovery-Driven Deep Incomplete Multiview Clustering Network","summary":" Incomplete multi-view clustering is a hot and emerging topic. It is well\nknown that unavoidable data incompleteness greatly weakens the effective\ninformation of multi-view data. To date, existing incomplete multi-view\nclustering methods usually bypass unavailable views according to prior missing\ninformation, which is considered as a second-best scheme based on evasion.\nOther methods that attempt to recover missing information are mostly applicable\nto specific two-view datasets. To handle these problems, in this paper, we\npropose an information recovery-driven deep incomplete multi-view clustering\nnetwork, termed as RecFormer. Concretely, a two-stage autoencoder network with\nthe self-attention structure is built to synchronously extract high-level\nsemantic representations of multiple views and recover the missing data.\nBesides, we develop a recurrent graph reconstruction mechanism that cleverly\nleverages the restored views to promote the representation learning and the\nfurther data reconstruction. Visualization of recovery results are given and\nsufficient experimental results confirm that our RecFormer has obvious\nadvantages over other top methods.\n","authors":["Chengliang Liu","Jie Wen","Zhihao Wu","Xiaoling Luo","Chao Huang","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2304.00429v5.pdf","comment":"Accepted by TNNLS 2023. Please contact me if you have any questions:\n liucl1996@163.com. 
The code is available at:\n https://github.com/justsmart/RecFormer"},{"id":"http://arxiv.org/abs/2401.09997v1","updated":"2024-01-18T14:13:46Z","published":"2024-01-18T14:13:46Z","title":"BPDO:Boundary Points Dynamic Optimization for Arbitrary Shape Scene Text\n Detection","summary":" Arbitrary shape scene text detection is of great importance in scene\nunderstanding tasks. Due to the complexity and diversity of text in natural\nscenes, existing scene text algorithms have limited accuracy for detecting\narbitrary shape text. In this paper, we propose a novel arbitrary shape scene\ntext detector through boundary points dynamic optimization(BPDO). The proposed\nmodel is designed with a text aware module (TAM) and a boundary point dynamic\noptimization module (DOM). Specifically, the model designs a text aware module\nbased on segmentation to obtain boundary points describing the central region\nof the text by extracting a priori information about the text region. Then,\nbased on the idea of deformable attention, it proposes a dynamic optimization\nmodel for boundary points, which gradually optimizes the exact position of the\nboundary points based on the information of the adjacent region of each\nboundary point. Experiments on CTW-1500, Total-Text, and MSRA-TD500 datasets\nshow that the model proposed in this paper achieves a performance that is\nbetter than or comparable to the state-of-the-art algorithm, proving the\neffectiveness of the model.\n","authors":["Jinzhi Zheng","Libo Zhang","Yanjun Wu","Chen Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.09997v1.pdf","comment":"Accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.15144v3","updated":"2024-01-18T14:10:02Z","published":"2023-12-23T02:54:41Z","title":"Spatial-Temporal Decoupling Contrastive Learning for Skeleton-based\n Human Action Recognition","summary":" Skeleton-based action recognition is a central task in human-computer\ninteraction. However, most previous methods suffer from two issues: (i)\nsemantic ambiguity arising from spatial-temporal information mixture; and (ii)\noverlooking the explicit exploitation of the latent data distributions (i.e.,\nthe intra-class variations and inter-class relations), thereby leading to\nsub-optimum solutions of the skeleton encoders. To mitigate this, we propose a\nspatial-temporal decoupling contrastive learning (STD-CL) framework to obtain\ndiscriminative and semantically distinct representations from the sequences,\nwhich can be incorporated into various previous skeleton encoders and can be\nremoved when testing. Specifically, we decouple the global features into\nspatial-specific and temporal-specific features to reduce the spatial-temporal\ncoupling of features. Furthermore, to explicitly exploit the latent data\ndistributions, we employ the attentive features to contrastive learning, which\nmodels the cross-sequence semantic relations by pulling together the features\nfrom the positive pairs and pushing away the negative pairs. Extensive\nexperiments show that STD-CL with four various skeleton encoders (HCN, 2S-AGCN,\nCTR-GCN, and Hyperformer) achieves solid improvements on NTU60, NTU120, and\nNW-UCLA benchmarks. 
The code will be released soon.\n","authors":["Shaojie Zhang","Jianqin Yin","Yonghao Dang"],"pdf_url":"https://arxiv.org/pdf/2312.15144v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09988v1","updated":"2024-01-18T14:06:29Z","published":"2024-01-18T14:06:29Z","title":"Developing an AI-based Integrated System for Bee Health Evaluation","summary":" Honey bees pollinate about one-third of the world's food supply, but bee\ncolonies have alarmingly declined by nearly 40% over the past decade due to\nseveral factors, including pesticides and pests. Traditional methods for\nmonitoring beehives, such as human inspection, are subjective, disruptive, and\ntime-consuming. To overcome these limitations, artificial intelligence has been\nused to assess beehive health. However, previous studies have lacked an\nend-to-end solution and primarily relied on data from a single source, either\nbee images or sounds. This study introduces a comprehensive system consisting\nof bee object detection and health evaluation. Additionally, it utilized a\ncombination of visual and audio signals to analyze bee behaviors. An\nAttention-based Multimodal Neural Network (AMNN) was developed to adaptively\nfocus on key features from each type of signal for accurate bee health\nassessment. The AMNN achieved an overall accuracy of 92.61%, surpassing eight\nexisting single-signal Convolutional Neural Networks and Recurrent Neural\nNetworks. It outperformed the best image-based model by 32.51% and the top\nsound-based model by 13.98% while maintaining efficient processing times.\nFurthermore, it improved prediction robustness, attaining an F1-score higher\nthan 90% across all four evaluated health conditions. The study also shows that\naudio signals are more reliable than images for assessing bee health. By\nseamlessly integrating AMNN with image and sound data in a comprehensive bee\nhealth monitoring system, this approach provides a more efficient and\nnon-invasive solution for the early detection of bee diseases and the\npreservation of bee colonies.\n","authors":["Andrew Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11918v3","updated":"2024-01-18T14:04:32Z","published":"2023-08-23T05:03:45Z","title":"AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet\n Underwater Object Detection","summary":" In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation\nand Vortex Convolutional Network, AMSP-UOD, designed for underwater object\ndetection. AMSP-UOD specifically addresses the impact of non-ideal imaging\nfactors on detection accuracy in complex underwater environments. To mitigate\nthe influence of noise on object detection performance, we propose AMSP Vortex\nConvolution (AMSP-VConv) to disrupt the noise distribution, enhance feature\nextraction capabilities, effectively reduce parameters, and improve network\nrobustness. We design the Feature Association Decoupling Cross Stage Partial\n(FAD-CSP) module, which strengthens the association of long and short range\nfeatures, improving the network performance in complex underwater environments.\nAdditionally, our sophisticated post-processing method, based on Non-Maximum\nSuppression (NMS) with aspect-ratio similarity thresholds, optimizes detection\nin dense scenes, such as waterweed and schools of fish, improving object\ndetection accuracy. 
Extensive experiments on the URPC and RUOD datasets\ndemonstrate that our method outperforms existing state-of-the-art methods in\nterms of accuracy and noise immunity. AMSP-UOD proposes an innovative solution\nwith the potential for real-world applications. Our code is available at\nhttps://github.com/zhoujingchun03/AMSP-UOD.\n","authors":["Jingchun Zhou","Zongxin He","Kin-Man Lam","Yudong Wang","Weishi Zhang","ChunLe Guo","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11918v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02070v3","updated":"2024-01-18T14:03:28Z","published":"2023-02-04T02:47:41Z","title":"Semantic-Guided Generative Image Augmentation Method with Diffusion\n Models for Image Classification","summary":" Existing image augmentation methods consist of two categories:\nperturbation-based methods and generative methods. Perturbation-based methods\napply pre-defined perturbations to augment an original image, but only locally\nvary the image, thus lacking image diversity. In contrast, generative methods\nbring more image diversity in the augmented images but may not preserve\nsemantic consistency, thus incorrectly changing the essential semantics of the\noriginal image. To balance image diversity and semantic consistency in\naugmented images, we propose SGID, a Semantic-guided Generative Image\naugmentation method with Diffusion models for image classification.\nSpecifically, SGID employs diffusion models to generate augmented images with\ngood image diversity. More importantly, SGID takes image labels and captions as\nguidance to maintain semantic consistency between the augmented and original\nimages. Experimental results show that SGID outperforms the best augmentation\nbaseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and\n0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image\naugmentation baselines and further improves the overall performance. We\ndemonstrate the semantic consistency and image diversity of SGID through\nquantitative human and automated evaluations, as well as qualitative case\nstudies.\n","authors":["Bohan Li","Xiao Xu","Xinghao Wang","Yutai Hou","Yunlong Feng","Feng Wang","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2302.02070v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2401.09985v1","updated":"2024-01-18T14:01:20Z","published":"2024-01-18T14:01:20Z","title":"WorldDreamer: Towards General World Models for Video Generation via\n Predicting Masked Tokens","summary":" World models play a crucial role in understanding and predicting the dynamics\nof the world, which is essential for video generation. However, existing world\nmodels are confined to specific scenarios such as gaming or driving, limiting\ntheir ability to capture the complexity of general world dynamic environments.\nTherefore, we introduce WorldDreamer, a pioneering world model to foster a\ncomprehensive comprehension of general world physics and motions, which\nsignificantly enhances the capabilities of video generation. Drawing\ninspiration from the success of large language models, WorldDreamer frames\nworld modeling as an unsupervised visual sequence modeling challenge. This is\nachieved by mapping visual inputs to discrete tokens and predicting the masked\nones. During this process, we incorporate multi-modal prompts to facilitate\ninteraction within the world model. 
Our experiments show that WorldDreamer\nexcels in generating videos across different scenarios, including natural\nscenes and driving environments. WorldDreamer showcases versatility in\nexecuting tasks such as text-to-video conversion, image-tovideo synthesis, and\nvideo editing. These results underscore WorldDreamer's effectiveness in\ncapturing dynamic elements within diverse general world environments.\n","authors":["Xiaofeng Wang","Zheng Zhu","Guan Huang","Boyuan Wang","Xinze Chen","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.09985v1.pdf","comment":"project page: https://world-dreamer.github.io/"},{"id":"http://arxiv.org/abs/2308.11932v4","updated":"2024-01-18T13:57:05Z","published":"2023-08-23T05:40:55Z","title":"Synergistic Multiscale Detail Refinement via Intrinsic Supervision for\n Underwater Image Enhancement","summary":" Visually restoring underwater scenes primarily involves mitigating\ninterference from underwater media. Existing methods ignore the inherent\nscale-related characteristics in underwater scenes. Therefore, we present the\nsynergistic multi-scale detail refinement via intrinsic supervision (SMDR-IS)\nfor enhancing underwater scene details, which contain multi-stages. The\nlow-degradation stage from the original images furnishes the original stage\nwith multi-scale details, achieved through feature propagation using the\nAdaptive Selective Intrinsic Supervised Feature (ASISF) module. By using\nintrinsic supervision, the ASISF module can precisely control and guide feature\ntransmission across multi-degradation stages, enhancing multi-scale detail\nrefinement and minimizing the interference from irrelevant information in the\nlow-degradation stage. In multi-degradation encoder-decoder framework of\nSMDR-IS, we introduce the Bifocal Intrinsic-Context Attention Module (BICA).\nBased on the intrinsic supervision principles, BICA efficiently exploits\nmulti-scale scene information in images. BICA directs higher-resolution spaces\nby tapping into the insights of lower-resolution ones, underscoring the pivotal\nrole of spatial contextual relationships in underwater image restoration.\nThroughout training, the inclusion of a multi-degradation loss function can\nenhance the network, allowing it to adeptly extract information across diverse\nscales. When benchmarked against state-of-the-art methods, SMDR-IS consistently\nshowcases superior performance. 
The code is publicly available at:\nhttps://github.com/zhoujingchun03/SMDR-IS.\n","authors":["Dehuan Zhang","Jingchun Zhou","ChunLe Guo","Weishi Zhang","Chongyi Li"],"pdf_url":"https://arxiv.org/pdf/2308.11932v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07450v2","updated":"2024-01-18T13:55:56Z","published":"2024-01-15T03:38:57Z","title":"Hierarchical Fashion Design with Multi-stage Diffusion Models","summary":" Cross-modal fashion synthesis and editing offer intelligent support to\nfashion designers by enabling the automatic generation and local modification\nof design drafts.While current diffusion models demonstrate commendable\nstability and controllability in image synthesis,they still face significant\nchallenges in generating fashion design from abstract design elements and\nfine-grained editing.Abstract sensory expressions, \\eg office, business, and\nparty, form the high-level design concepts, while measurable aspects like\nsleeve length, collar type, and pant length are considered the low-level\nattributes of clothing.Controlling and editing fashion images using lengthy\ntext descriptions poses a difficulty.In this paper, we propose HieraFashDiff,a\nnovel fashion design method using the shared multi-stage diffusion model\nencompassing high-level design concepts and low-level clothing attributes in a\nhierarchical structure.Specifically, we categorized the input text into\ndifferent levels and fed them in different time step to the diffusion model\naccording to the criteria of professional clothing designers.HieraFashDiff\nallows designers to add low-level attributes after high-level prompts for\ninteractive editing incrementally.In addition, we design a differentiable loss\nfunction in the sampling process with a mask to keep non-edit\nareas.Comprehensive experiments performed on our newly conducted Hierarchical\nfashion dataset,demonstrate that our proposed method outperforms other\nstate-of-the-art competitors.\n","authors":["Zhifeng Xie","Hao li","Huiming Ding","Mengtian Li","Ying Cao"],"pdf_url":"https://arxiv.org/pdf/2401.07450v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09980v1","updated":"2024-01-18T13:51:20Z","published":"2024-01-18T13:51:20Z","title":"Ventricular Segmentation: A Brief Comparison of U-Net Derivatives","summary":" Medical imaging refers to the technologies and methods utilized to view the\nhuman body and its inside, in order to diagnose, monitor, or even treat medical\ndisorders. This paper aims to explore the application of deep learning\ntechniques in the semantic segmentation of Cardiac short-axis MRI (Magnetic\nResonance Imaging) images, aiming to enhance the diagnosis, monitoring, and\ntreatment of medical disorders related to the heart. The focus centers on\nimplementing various architectures that are derivatives of U-Net, to\neffectively isolate specific parts of the heart for comprehensive anatomical\nand functional analysis. Through a combination of images, graphs, and\nquantitative metrics, the efficacy of the models and their predictions are\nshowcased. Additionally, this paper addresses encountered challenges and\noutline strategies for future improvements. 
This abstract provides a concise\noverview of the efforts in utilizing deep learning for cardiac image\nsegmentation, emphasizing both the accomplishments and areas for further\nrefinement.\n","authors":["Ketan Suhaas Saichandran"],"pdf_url":"https://arxiv.org/pdf/2401.09980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09962v1","updated":"2024-01-18T13:23:51Z","published":"2024-01-18T13:23:51Z","title":"CustomVideo: Customizing Text-to-Video Generation with Multiple Subjects","summary":" Customized text-to-video generation aims to generate high-quality videos\nguided by text prompts and subject references. Current approaches designed for\nsingle subjects struggle to handle multiple subjects, which is a more\nchallenging and practical scenario. In this work, we aim to promote\nmulti-subject guided text-to-video customization. We propose CustomVideo, a\nnovel framework that can generate identity-preserving videos with the guidance\nof multiple subjects. To be specific, firstly, we encourage the co-occurrence\nof multiple subjects via composing them in a single image. Further, upon a\nbasic text-to-video diffusion model, we design a simple yet effective attention\ncontrol strategy to disentangle different subjects in the latent space of\nthe diffusion model. Moreover, to help the model focus on the specific object area,\nwe segment the object from given reference images and provide a corresponding\nobject mask for attention learning. Also, we collect a multi-subject\ntext-to-video generation dataset as a comprehensive benchmark, with 69\nindividual subjects and 57 meaningful pairs. Extensive qualitative,\nquantitative, and user study results demonstrate the superiority of our method,\ncompared with the previous state-of-the-art approaches.\n","authors":["Zhao Wang","Aoxue Li","Enze Xie","Lingting Zhu","Yong Guo","Qi Dou","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2401.09962v1.pdf","comment":"10 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2308.11164v2","updated":"2024-01-18T13:01:03Z","published":"2023-08-22T03:45:13Z","title":"Decoupled Contrastive Multi-View Clustering with High-Order Random Walks","summary":" Recently, some robust contrastive multi-view clustering (MvC) methods have\nbeen proposed, which construct data pairs from neighborhoods to alleviate the\nfalse negative issue, i.e., some intra-cluster samples are wrongly treated as\nnegative pairs. Although promising performance has been achieved by these\nmethods, the false negative issue is still far from addressed and the false\npositive issue emerges because all in- and out-of-neighborhood samples are\nsimply treated as positive and negative, respectively. To address the issues,\nwe propose a novel robust method, dubbed decoupled contrastive multi-view\nclustering with high-order random walks (DIVIDE). In brief, DIVIDE leverages\nrandom walks to progressively identify data pairs in a global instead of local\nmanner. As a result, DIVIDE could identify in-neighborhood negatives and\nout-of-neighborhood positives. Moreover, DIVIDE embraces a novel MvC\narchitecture to perform inter- and intra-view contrastive learning in different\nembedding spaces, thus boosting clustering performance and embracing the\nrobustness against missing views. 
To verify the efficacy of DIVIDE, we carry\nout extensive experiments on four benchmark datasets comparing with nine\nstate-of-the-art MvC methods in both complete and incomplete MvC settings.\n","authors":["Yiding Lu","Yijie Lin","Mouxing Yang","Dezhong Peng","Peng Hu","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2308.11164v2.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.09942v1","updated":"2024-01-18T12:45:14Z","published":"2024-01-18T12:45:14Z","title":"Multi-task Learning for Joint Re-identification, Team Affiliation, and\n Role Classification for Sports Visual Tracking","summary":" Effective tracking and re-identification of players is essential for\nanalyzing soccer videos. But, it is a challenging task due to the non-linear\nmotion of players, the similarity in appearance of players from the same team,\nand frequent occlusions. Therefore, the ability to extract meaningful\nembeddings to represent players is crucial in developing an effective tracking\nand re-identification system. In this paper, a multi-purpose part-based person\nrepresentation method, called PRTreID, is proposed that performs three tasks of\nrole classification, team affiliation, and re-identification, simultaneously.\nIn contrast to available literature, a single network is trained with\nmulti-task supervision to solve all three tasks, jointly. The proposed joint\nmethod is computationally efficient due to the shared backbone. Also, the\nmulti-task learning leads to richer and more discriminative representations, as\ndemonstrated by both quantitative and qualitative results. To demonstrate the\neffectiveness of PRTreID, it is integrated with a state-of-the-art tracking\nmethod, using a part-based post-processing module to handle long-term tracking.\nThe proposed tracking method outperforms all existing tracking methods on the\nchallenging SoccerNet tracking dataset.\n","authors":["Amir M. Mansourian","Vladimir Somers","Christophe De Vleeschouwer","Shohreh Kasaei"],"pdf_url":"https://arxiv.org/pdf/2401.09942v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09939v1","updated":"2024-01-18T12:41:41Z","published":"2024-01-18T12:41:41Z","title":"ICGNet: A Unified Approach for Instance-Centric Grasping","summary":" Accurate grasping is the key to several robotic tasks including assembly and\nhousehold robotics. Executing a successful grasp in a cluttered environment\nrequires multiple levels of scene understanding: First, the robot needs to\nanalyze the geometric properties of individual objects to find feasible grasps.\nThese grasps need to be compliant with the local object geometry. Second, for\neach proposed grasp, the robot needs to reason about the interactions with\nother objects in the scene. Finally, the robot must compute a collision-free\ngrasp trajectory while taking into account the geometry of the target object.\nMost grasp detection algorithms directly predict grasp poses in a monolithic\nfashion, which does not capture the composability of the environment. In this\npaper, we introduce an end-to-end architecture for object-centric grasping. The\nmethod uses pointcloud data from a single arbitrary viewing direction as an\ninput and generates an instance-centric representation for each partially\nobserved object in the scene. This representation is further used for object\nreconstruction and grasp detection in cluttered table-top scenes. 
We show the\neffectiveness of the proposed method by extensively evaluating it against\nstate-of-the-art methods on synthetic datasets, indicating superior performance\nfor grasping and reconstruction. Additionally, we demonstrate real-world\napplicability by decluttering scenes with varying numbers of objects.\n","authors":["René Zurbrügg","Yifan Liu","Francis Engelmann","Suryansh Kumar","Marco Hutter","Vaishakh Patil","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2401.09939v1.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.17670v2","updated":"2024-01-18T12:29:31Z","published":"2023-12-29T16:37:08Z","title":"TopCoW: Benchmarking Topology-Aware Anatomical Segmentation of the\n Circle of Willis (CoW) for CTA and MRA","summary":" The Circle of Willis (CoW) is an important network of arteries connecting\nmajor circulations of the brain. Its vascular architecture is believed to\naffect the risk, severity, and clinical outcome of serious neuro-vascular\ndiseases. However, characterizing the highly variable CoW anatomy is still a\nmanual and time-consuming expert task. The CoW is usually imaged by two\nangiographic imaging modalities, magnetic resonance angiography (MRA) and\ncomputed tomography angiography (CTA), but there exist limited public datasets\nwith annotations on CoW anatomy, especially for CTA. Therefore we organized the\nTopCoW Challenge in 2023 with the release of an annotated CoW dataset. The\nTopCoW dataset was the first public dataset with voxel-level annotations for\nthirteen possible CoW vessel components, enabled by virtual-reality (VR)\ntechnology. It was also the first large dataset with paired MRA and CTA from\nthe same patients. TopCoW challenge formalized the CoW characterization problem\nas a multiclass anatomical segmentation task with an emphasis on topological\nmetrics. We invited submissions worldwide for the CoW segmentation task, which\nattracted over 140 registered participants from four continents. The top\nperforming teams managed to segment many CoW components to Dice scores around\n90%, but with lower scores for communicating arteries and rare variants. There\nwere also topological mistakes for predictions with high Dice scores.\nAdditional topological analysis revealed further areas for improvement in\ndetecting certain CoW components and matching CoW variant topology accurately.\nTopCoW represented a first attempt at benchmarking the CoW anatomical\nsegmentation task for MRA and CTA, both morphologically and topologically.\n","authors":["Kaiyuan Yang","Fabio Musio","Yihui Ma","Norman Juchler","Johannes C. Paetzold","Rami Al-Maskari","Luciano Höher","Hongwei Bran Li","Ibrahim Ethem Hamamci","Anjany Sekuboyina","Suprosanna Shit","Houjing Huang","Diana Waldmannstetter","Florian Kofler","Fernando Navarro","Martin Menten","Ivan Ezhov","Daniel Rueckert","Iris Vos","Ynte Ruigrok","Birgitta Velthuis","Hugo Kuijf","Julien Hämmerli","Catherine Wurster","Philippe Bijlenga","Laura Westphal","Jeroen Bisschop","Elisa Colombo","Hakim Baazaoui","Andrew Makmur","James Hallinan","Bene Wiestler","Jan S. Kirschke","Roland Wiest","Emmanuel Montagnon","Laurent Letourneau-Guillon","Adrian Galdran","Francesco Galati","Daniele Falcetta","Maria A. Zuluaga","Chaolong Lin","Haoran Zhao","Zehan Zhang","Sinyoung Ra","Jongyun Hwang","Hyunjin Park","Junqiang Chen","Marek Wodzinski","Henning Müller","Pengcheng Shi","Wei Liu","Ting Ma","Cansu Yalçin","Rachika E. 
Hamadache","Joaquim Salvi","Xavier Llado","Uma Maria Lal-Trehan Estrada","Valeriia Abramova","Luca Giancardo","Arnau Oliver","Jialu Liu","Haibin Huang","Yue Cui","Zehang Lin","Yusheng Liu","Shunzhi Zhu","Tatsat R. Patel","Vincent M. Tutino","Maysam Orouskhani","Huayu Wang","Mahmud Mossa-Basha","Chengcheng Zhu","Maximilian R. Rokuss","Yannick Kirchhoff","Nico Disch","Julius Holzschuh","Fabian Isensee","Klaus Maier-Hein","Yuki Sato","Sven Hirsch","Susanne Wegener","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2312.17670v2.pdf","comment":"23 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW\n 2023 Challenge"},{"id":"http://arxiv.org/abs/2401.09923v1","updated":"2024-01-18T12:13:06Z","published":"2024-01-18T12:13:06Z","title":"MAMBA: Multi-level Aggregation via Memory Bank for Video Object\n Detection","summary":" State-of-the-art video object detection methods maintain a memory structure,\neither a sliding window or a memory queue, to enhance the current frame using\nattention mechanisms. However, we argue that these memory structures are not\nefficient or sufficient because of two implied operations: (1) concatenating\nall features in memory for enhancement, leading to a heavy computational cost;\n(2) frame-wise memory updating, preventing the memory from capturing more\ntemporal information. In this paper, we propose a multi-level aggregation\narchitecture via memory bank called MAMBA. Specifically, our memory bank\nemploys two novel operations to eliminate the disadvantages of existing\nmethods: (1) light-weight key-set construction which can significantly reduce\nthe computational cost; (2) fine-grained feature-wise updating strategy which\nenables our method to utilize knowledge from the whole video. To better enhance\nfeatures from complementary levels, i.e., feature maps and proposals, we\nfurther propose a generalized enhancement operation (GEO) to aggregate\nmulti-level features in a unified manner. We conduct extensive evaluations on\nthe challenging ImageNetVID dataset. Compared with existing state-of-the-art\nmethods, our method achieves superior performance in terms of both speed and\naccuracy. More remarkably, MAMBA achieves mAP of 83.7/84.6% at 12.6/9.1 FPS\nwith ResNet-101. Code is available at\nhttps://github.com/guanxiongsun/video_feature_enhancement.\n","authors":["Guanxiong Sun","Yang Hua","Guosheng Hu","Neil Robertson"],"pdf_url":"https://arxiv.org/pdf/2401.09923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09921v1","updated":"2024-01-18T12:07:39Z","published":"2024-01-18T12:07:39Z","title":"BlenDA: Domain Adaptive Object Detection through diffusion-based\n blending","summary":" Unsupervised domain adaptation (UDA) aims to transfer a model learned using\nlabeled data from the source domain to unlabeled data in the target domain. To\naddress the large domain gap issue between the source and target domains, we\npropose a novel regularization method for domain adaptive object detection,\nBlenDA, by generating the pseudo samples of the intermediate domains and their\ncorresponding soft domain labels for adaptation training. 
The intermediate\nsamples are generated by dynamically blending the source images with their\ncorresponding translated images using an off-the-shelf pre-trained\ntext-to-image diffusion model which takes the text label of the target domain\nas input and has demonstrated superior image-to-image translation quality.\nBased on experimental results from two adaptation benchmarks, our proposed\napproach can significantly enhance the performance of the state-of-the-art\ndomain adaptive object detector, Adversarial Query Transformer (AQT).\nParticularly, in the Cityscapes to Foggy Cityscapes adaptation, we achieve an\nimpressive 53.4% mAP on the Foggy Cityscapes dataset, surpassing the previous\nstate-of-the-art by 1.5%. It is worth noting that our proposed method is also\napplicable to various paradigms of domain adaptive object detection. The code\nis available at:https://github.com/aiiu-lab/BlenDA\n","authors":["Tzuhsuan Huang","Chen-Che Huang","Chung-Hao Ku","Jun-Cheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.09921v1.pdf","comment":"ICASSP(2024):2024 IEEE International Conference on Acoustics, Speech\n and Signal Processing"},{"id":"http://arxiv.org/abs/2307.07142v2","updated":"2024-01-18T11:30:47Z","published":"2023-07-14T03:55:54Z","title":"Quantity-Aware Coarse-to-Fine Correspondence for Image-to-Point Cloud\n Registration","summary":" Image-to-point cloud registration aims to determine the relative camera pose\nbetween an RGB image and a reference point cloud, serving as a general solution\nfor locating 3D objects from 2D observations. Matching individual points with\npixels can be inherently ambiguous due to modality gaps. To address this\nchallenge, we propose a framework to capture quantity-aware correspondences\nbetween local point sets and pixel patches and refine the results at both the\npoint and pixel levels. This framework aligns the high-level semantics of point\nsets and pixel patches to improve the matching accuracy. On a coarse scale, the\nset-to-patch correspondence is expected to be influenced by the quantity of 3D\npoints. To achieve this, a novel supervision strategy is proposed to adaptively\nquantify the degrees of correlation as continuous values. On a finer scale,\npoint-to-pixel correspondences are refined from a smaller search space through\na well-designed scheme, which incorporates both resampling and quantity-aware\npriors. Particularly, a confidence sorting strategy is proposed to\nproportionally select better correspondences at the final stage. Leveraging the\nadvantages of high-quality correspondences, the problem is successfully\nresolved using an efficient Perspective-n-Point solver within the framework of\nrandom sample consensus (RANSAC). Extensive experiments on the KITTI Odometry\nand NuScenes datasets demonstrate the superiority of our method over the\nstate-of-the-art methods.\n","authors":["Gongxin Yao","Yixin Xuan","Yiwei Chen","Yu Pan"],"pdf_url":"https://arxiv.org/pdf/2307.07142v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09900v1","updated":"2024-01-18T11:26:20Z","published":"2024-01-18T11:26:20Z","title":"XAI-Enhanced Semantic Segmentation Models for Visual Quality Inspection","summary":" Visual quality inspection systems, crucial in sectors like manufacturing and\nlogistics, employ computer vision and machine learning for precise, rapid\ndefect detection. However, their unexplained nature can hinder trust, error\nidentification, and system improvement. 
This paper presents a framework to\nbolster visual quality inspection by using CAM-based explanations to refine\nsemantic segmentation models. Our approach consists of 1) Model Training, 2)\nXAI-based Model Explanation, 3) XAI Evaluation, and 4) Annotation Augmentation\nfor Model Enhancement, informed by explanations and expert insights.\nEvaluations show XAI-enhanced models surpass original DeepLabv3-ResNet101\nmodels, especially in intricate object segmentation.\n","authors":["Tobias Clement","Truong Thanh Hung Nguyen","Mohamed Abdelaal","Hung Cao"],"pdf_url":"https://arxiv.org/pdf/2401.09900v1.pdf","comment":"IEEE ICCE 2024"},{"id":"http://arxiv.org/abs/2401.09895v1","updated":"2024-01-18T11:14:32Z","published":"2024-01-18T11:14:32Z","title":"Skeleton-Guided Instance Separation for Fine-Grained Segmentation in\n Microscopy","summary":" One of the fundamental challenges in microscopy (MS) image analysis is\ninstance segmentation (IS), particularly when segmenting cluster regions where\nmultiple objects of varying sizes and shapes may be connected or even\noverlapped in arbitrary orientations. Existing IS methods usually fail in\nhandling such scenarios, as they rely on coarse instance representations such\nas keypoints and horizontal bounding boxes (h-bboxes). In this paper, we\npropose a novel one-stage framework named A2B-IS to address this challenge and\nenhance the accuracy of IS in MS images. Our approach represents each instance\nwith a pixel-level mask map and a rotated bounding box (r-bbox). Unlike\ntwo-stage methods that use box proposals for segmentations, our method\ndecouples mask and box predictions, enabling simultaneous processing to\nstreamline the model pipeline. Additionally, we introduce a Gaussian skeleton\nmap to aid the IS task in two key ways: (1) It guides anchor placement,\nreducing computational costs while improving the model's capacity to learn\nRoI-aware features by filtering out noise from background regions. (2) It\nensures accurate isolation of densely packed instances by rectifying erroneous\nbox predictions near instance boundaries. To further enhance the performance,\nwe integrate two modules into the framework: (1) An Atrous Attention Block\n(A2B) designed to extract high-resolution feature maps with fine-grained\nmultiscale information, and (2) A Semi-Supervised Learning (SSL) strategy that\nleverages both labeled and unlabeled images for model training. Our method has\nbeen thoroughly validated on two large-scale MS datasets, demonstrating its\nsuperiority over most state-of-the-art approaches.\n","authors":["Jun Wang","Chengfeng Zhou","Zhaoyan Ming","Lina Wei","Xudong Jiang","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2401.09895v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09883v1","updated":"2024-01-18T10:55:13Z","published":"2024-01-18T10:55:13Z","title":"Question-Answer Cross Language Image Matching for Weakly Supervised\n Semantic Segmentation","summary":" Class Activation Map (CAM) has emerged as a popular tool for weakly\nsupervised semantic segmentation (WSSS), allowing the localization of object\nregions in an image using only image-level labels. 
However, existing CAM\nmethods suffer from under-activation of target object regions and\nfalse-activation of background regions due to the fact that a lack of detailed\nsupervision can hinder the model's ability to understand the image as a whole.\nIn this paper, we propose a novel Question-Answer Cross-Language-Image Matching\nframework for WSSS (QA-CLIMS), leveraging the vision-language foundation model\nto maximize the text-based understanding of images and guide the generation of\nactivation maps. First, a series of carefully designed questions are posed to\nthe VQA (Visual Question Answering) model with Question-Answer Prompt\nEngineering (QAPE) to generate a corpus of both foreground target objects and\nbackgrounds that are adaptive to query images. We then employ contrastive\nlearning in a Region Image Text Contrastive (RITC) network to compare the\nobtained foreground and background regions with the generated corpus. Our\napproach exploits the rich textual information from the open vocabulary as\nadditional supervision, enabling the model to generate high-quality CAMs with a\nmore complete object region and reduce false-activation of background regions.\nWe conduct extensive analysis to validate the proposed method and show that our\napproach performs state-of-the-art on both PASCAL VOC 2012 and MS COCO\ndatasets. Code is available at: https://github.com/CVI-SZU/QA-CLIMS\n","authors":["Songhe Deng","Wei Zhuo","Jinheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2401.09883v1.pdf","comment":"ACM MM 2023"},{"id":"http://arxiv.org/abs/2212.09129v3","updated":"2024-01-18T10:52:34Z","published":"2022-12-18T16:53:13Z","title":"SUCRe: Leveraging Scene Structure for Underwater Color Restoration","summary":" Underwater images are altered by the physical characteristics of the medium\nthrough which light rays pass before reaching the optical sensor. Scattering\nand wavelength-dependent absorption significantly modify the captured colors\ndepending on the distance of observed elements to the image plane. In this\npaper, we aim to recover an image of the scene as if the water had no effect on\nlight propagation. We introduce SUCRe, a novel method that exploits the scene's\n3D structure for underwater color restoration. By following points in multiple\nimages and tracking their intensities at different distances to the sensor, we\nconstrain the optimization of the parameters in an underwater image formation\nmodel and retrieve unattenuated pixel intensities. We conduct extensive\nquantitative and qualitative analyses of our approach in a variety of scenarios\nranging from natural light to deep-sea environments using three underwater\ndatasets acquired from real-world scenarios and one synthetic dataset. We also\ncompare the performance of the proposed approach with that of a wide range of\nexisting state-of-the-art methods. The results demonstrate a consistent benefit\nof exploiting multiple views across a spectrum of objective metrics. 
Our code\nis publicly available at https://github.com/clementinboittiaux/sucre.\n","authors":["Clémentin Boittiaux","Ricard Marxer","Claire Dune","Aurélien Arnaubec","Maxime Ferrera","Vincent Hugel"],"pdf_url":"https://arxiv.org/pdf/2212.09129v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09866v1","updated":"2024-01-18T10:29:10Z","published":"2024-01-18T10:29:10Z","title":"Boosting Few-Shot Segmentation via Instance-Aware Data Augmentation and\n Local Consensus Guided Cross Attention","summary":" Few-shot segmentation aims to train a segmentation model that can fast adapt\nto a novel task for which only a few annotated images are provided. Most recent\nmodels have adopted a prototype-based paradigm for few-shot inference. These\napproaches may have limited generalization capacity beyond the standard 1- or\n5-shot settings. In this paper, we closely examine and reevaluate the\nfine-tuning based learning scheme that fine-tunes the classification layer of a\ndeep segmentation network pre-trained on diverse base classes. To improve the\ngeneralizability of the classification layer optimized with sparsely annotated\nsamples, we introduce an instance-aware data augmentation (IDA) strategy that\naugments the support images based on the relative sizes of the target objects.\nThe proposed IDA effectively increases the support set's diversity and promotes\nthe distribution consistency between support and query images. On the other\nhand, the large visual difference between query and support images may hinder\nknowledge transfer and cripple the segmentation performance. To cope with this\nchallenge, we introduce the local consensus guided cross attention (LCCA) to\nalign the query feature with support features based on their dense correlation,\nfurther improving the model's generalizability to the query image. The\nsignificant performance improvements on the standard few-shot segmentation\nbenchmarks PASCAL-$5^i$ and COCO-$20^i$ verify the efficacy of our proposed\nmethod.\n","authors":["Li Guo","Haoming Liu","Yuxuan Xia","Chengyu Zhang","Xiaochen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.09866v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09865v1","updated":"2024-01-18T10:28:45Z","published":"2024-01-18T10:28:45Z","title":"Improving fine-grained understanding in image-text pre-training","summary":" We introduce SPARse Fine-grained Contrastive Alignment (SPARC), a simple\nmethod for pretraining more fine-grained multimodal representations from\nimage-text pairs. Given that multiple image patches often correspond to single\nwords, we propose to learn a grouping of image patches for every token in the\ncaption. To achieve this, we use a sparse similarity metric between image\npatches and language tokens and compute for each token a language-grouped\nvision embedding as the weighted average of patches. The token and\nlanguage-grouped vision embeddings are then contrasted through a fine-grained\nsequence-wise loss that only depends on individual samples and does not require\nother batch samples as negatives. This enables more detailed information to be\nlearned in a computationally inexpensive manner. SPARC combines this\nfine-grained loss with a contrastive loss between global image and text\nembeddings to learn representations that simultaneously encode global and local\ninformation. We thoroughly evaluate our proposed method and show improved\nperformance over competing approaches both on image-level tasks relying on\ncoarse-grained information, e.g. 
classification, as well as region-level tasks\nrelying on fine-grained information, e.g. retrieval, object detection, and\nsegmentation. Moreover, SPARC improves model faithfulness and captioning in\nfoundational vision-language models.\n","authors":["Ioana Bica","Anastasija Ilić","Matthias Bauer","Goker Erdogan","Matko Bošnjak","Christos Kaplanis","Alexey A. Gritsenko","Matthias Minderer","Charles Blundell","Razvan Pascanu","Jovana Mitrović"],"pdf_url":"https://arxiv.org/pdf/2401.09865v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2401.09861v1","updated":"2024-01-18T10:18:48Z","published":"2024-01-18T10:18:48Z","title":"Temporal Insight Enhancement: Mitigating Temporal Hallucination in\n Multimodal Large Language Models","summary":" Recent advancements in Multimodal Large Language Models (MLLMs) have\nsignificantly enhanced the comprehension of multimedia content, bringing\ntogether diverse modalities such as text, images, and videos. However, a\ncritical challenge faced by these models, especially when processing video\ninputs, is the occurrence of hallucinations - erroneous perceptions or\ninterpretations, particularly at the event level. This study introduces an\ninnovative method to address event-level hallucinations in MLLMs, focusing on\nspecific temporal understanding in video content. Our approach leverages a\nnovel framework that extracts and utilizes event-specific information from both\nthe event query and the provided video to refine MLLMs' response. We propose a\nunique mechanism that decomposes on-demand event queries into iconic actions.\nSubsequently, we employ models like CLIP and BLIP2 to predict specific\ntimestamps for event occurrences. Our evaluation, conducted using the\nCharades-STA dataset, demonstrates a significant reduction in temporal\nhallucinations and an improvement in the quality of event-related responses.\nThis research not only provides a new perspective in addressing a critical\nlimitation of MLLMs but also contributes a quantitatively measurable method for\nevaluating MLLMs in the context of temporal-related questions.\n","authors":["Li Sun","Liuan Wang","Jun Sun","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2401.09861v1.pdf","comment":"7 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.13646v3","updated":"2024-01-18T10:13:51Z","published":"2023-12-21T08:16:26Z","title":"Weakly Supervised Semantic Segmentation for Driving Scenes","summary":" State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS)\nusing image-level labels exhibit severe performance degradation on driving\nscene datasets such as Cityscapes. To address this challenge, we develop a new\nWSSS framework tailored to driving scene datasets. Based on extensive analysis\nof dataset characteristics, we employ Contrastive Language-Image Pre-training\n(CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key\nchallenges: (1) pseudo-masks from CLIP lack in representing small object\nclasses, and (2) these masks contain notable noise. We propose solutions for\neach issue as follows. (1) We devise Global-Local View Training that seamlessly\nincorporates small-scale patches during model training, thereby enhancing the\nmodel's capability to handle small-sized yet critical objects in driving scenes\n(e.g., traffic light). (2) We introduce Consistency-Aware Region Balancing\n(CARB), a novel technique that discerns reliable and noisy regions through\nevaluating the consistency between CLIP masks and segmentation predictions. 
It\nprioritizes reliable pixels over noisy pixels via adaptive loss weighting.\nNotably, the proposed method achieves 51.8\\% mIoU on the Cityscapes test\ndataset, showcasing its potential as a strong WSSS baseline on driving scene\ndatasets. Experimental results on CamVid and WildDash2 demonstrate the\neffectiveness of our method across diverse datasets, even with small-scale\ndatasets or visually challenging conditions. The code is available at\nhttps://github.com/k0u-id/CARB.\n","authors":["Dongseob Kim","Seungho Lee","Junsuk Choe","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.13646v3.pdf","comment":"AAAI 2024 accepted. First two authors contributed equally"},{"id":"http://arxiv.org/abs/2401.09852v1","updated":"2024-01-18T10:08:24Z","published":"2024-01-18T10:08:24Z","title":"Enhancing the Fairness and Performance of Edge Cameras with Explainable\n AI","summary":" The rising use of Artificial Intelligence (AI) in human detection on Edge\ncamera systems has led to accurate but complex models, challenging to interpret\nand debug. Our research presents a diagnostic method using Explainable AI (XAI)\nfor model debugging, with expert-driven problem identification and solution\ncreation. Validated on the Bytetrack model in a real-world office Edge network,\nwe found the training dataset as the main bias source and suggested model\naugmentation as a solution. Our approach helps identify model biases, essential\nfor achieving fair and trustworthy models.\n","authors":["Truong Thanh Hung Nguyen","Vo Thanh Khang Nguyen","Quoc Hung Cao","Van Binh Truong","Quoc Khanh Nguyen","Hung Cao"],"pdf_url":"https://arxiv.org/pdf/2401.09852v1.pdf","comment":"IEEE ICCE 2024"},{"id":"http://arxiv.org/abs/2401.09836v1","updated":"2024-01-18T09:53:03Z","published":"2024-01-18T09:53:03Z","title":"Exploring Latent Cross-Channel Embedding for Accurate 3D Human Pose\n Reconstruction in a Diffusion Framework","summary":" Monocular 3D human pose estimation poses significant challenges due to the\ninherent depth ambiguities that arise during the reprojection process from 2D\nto 3D. Conventional approaches that rely on estimating an over-fit projection\nmatrix struggle to effectively address these challenges and often result in\nnoisy outputs. Recent advancements in diffusion models have shown promise in\nincorporating structural priors to address reprojection ambiguities. However,\nthere is still ample room for improvement as these methods often overlook the\nexploration of correlation between the 2D and 3D joint-level features. In this\nstudy, we propose a novel cross-channel embedding framework that aims to fully\nexplore the correlation between joint-level features of 3D coordinates and\ntheir 2D projections. In addition, we introduce a context guidance mechanism to\nfacilitate the propagation of joint graph attention across latent channels\nduring the iterative diffusion process. To evaluate the effectiveness of our\nproposed method, we conduct experiments on two benchmark datasets, namely\nHuman3.6M and MPI-INF-3DHP. Our results demonstrate a significant improvement\nin terms of reconstruction accuracy compared to state-of-the-art methods. 
The\ncode for our method will be made available online for further reference.\n","authors":["Junkun Jiang","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.09836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09180v2","updated":"2024-01-18T09:51:46Z","published":"2024-01-17T12:43:28Z","title":"Unsupervised Multiple Domain Translation through Controlled\n Disentanglement in Variational Autoencoder","summary":" Unsupervised Multiple Domain Translation is the task of transforming data\nfrom one domain to other domains without having paired data to train the\nsystems. Typically, methods based on Generative Adversarial Networks (GANs) are\nused to address this task. However, our proposal exclusively relies on a\nmodified version of a Variational Autoencoder. This modification consists of\nthe use of two latent variables disentangled in a controlled way by design. One\nof these latent variables is imposed to depend exclusively on the domain, while\nthe other one must depend on the rest of the variability factors of the data.\nAdditionally, the conditions imposed over the domain latent variable allow for\nbetter control and understanding of the latent space. We empirically\ndemonstrate that our approach works on different vision datasets, improving upon the\nperformance of other well-known methods. Finally, we prove that, indeed, one of\nthe latent variables stores all the information related to the domain and the\nother one hardly contains any domain information.\n","authors":["Antonio Almudévar","Théo Mariotte","Alfonso Ortega","Marie Tahon"],"pdf_url":"https://arxiv.org/pdf/2401.09180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09833v1","updated":"2024-01-18T09:50:26Z","published":"2024-01-18T09:50:26Z","title":"Slicer Networks","summary":" In medical imaging, scans often reveal objects with varied contrasts but\nconsistent internal intensities or textures. This characteristic enables the\nuse of low-frequency approximations for tasks such as segmentation and\ndeformation field estimation. Yet, integrating this concept into neural network\narchitectures for medical image analysis remains underexplored. In this paper,\nwe propose the Slicer Network, a novel architecture designed to leverage these\ntraits. Comprising an encoder utilizing models like vision transformers for\nfeature extraction and a slicer employing a learnable bilateral grid, the\nSlicer Network strategically refines and upsamples feature maps via a\nsplatting-blurring-slicing process. This introduces an edge-preserving\nlow-frequency approximation for the network outcome, effectively enlarging the\neffective receptive field. The enhancement not only reduces computational\ncomplexity but also boosts overall performance. 
Experiments across different\nmedical imaging applications, including unsupervised and keypoints-based image\nregistration and lesion segmentation, have verified the Slicer Network's\nimproved accuracy and efficiency.\n","authors":["Hang Zhang","Xiang Chen","Rongguang Wang","Renjiu Hu","Dongdong Liu","Gaolei Li"],"pdf_url":"https://arxiv.org/pdf/2401.09833v1.pdf","comment":"8 figures and 3 tables"},{"id":"http://arxiv.org/abs/2401.09828v1","updated":"2024-01-18T09:42:47Z","published":"2024-01-18T09:42:47Z","title":"Enhanced Automated Quality Assessment Network for Interactive Building\n Segmentation in High-Resolution Remote Sensing Imagery","summary":" In this research, we introduce the enhanced automated quality assessment\nnetwork (IBS-AQSNet), an innovative solution for assessing the quality of\ninteractive building segmentation within high-resolution remote sensing\nimagery. This is a new challenge in segmentation quality assessment, and our\nproposed IBS-AQSNet alleviates this by identifying missed and mistaken segment\nareas. First of all, to acquire robust image features, our method combines a\nrobust, pre-trained backbone with a lightweight counterpart for comprehensive\nfeature extraction from imagery and segmentation results. These features are\nthen fused through a simple combination of concatenation, convolution layers,\nand residual connections. Additionally, IBS-AQSNet incorporates a multi-scale\ndifferential quality assessment decoder, proficient in pinpointing areas where\nthe segmentation result is either missed or mistaken. Experiments on a newly-built\nEVLab-BGZ dataset, which includes over 39,198 buildings, demonstrate the\nsuperiority of the proposed method in automating segmentation quality\nassessment, thereby setting a new benchmark in the field.\n","authors":["Zhili Zhang","Xiangyun Hu","Jiabo Xu"],"pdf_url":"https://arxiv.org/pdf/2401.09828v1.pdf","comment":"The manuscript is submitted to IEEE International Geoscience and\n Remote Sensing Symposium(IGARSS2024)"},{"id":"http://arxiv.org/abs/2401.09826v1","updated":"2024-01-18T09:34:40Z","published":"2024-01-18T09:34:40Z","title":"Boosting Few-Shot Semantic Segmentation Via Segment Anything Model","summary":" In semantic segmentation, accurate prediction masks are crucial for\ndownstream tasks such as medical image analysis and image editing. Due to the\nlack of annotated data, few-shot semantic segmentation (FSS) performs poorly in\npredicting masks with precise contours. Recently, we have noticed that the\nlarge foundation model segment anything model (SAM) performs well in processing\ndetailed features. Inspired by SAM, we propose FSS-SAM to boost FSS methods by\naddressing the issue of inaccurate contours. FSS-SAM is training-free. It\nworks as a post-processing tool for any FSS method and can improve the\naccuracy of predicted masks. Specifically, we use predicted masks from FSS\nmethods to generate prompts and then use SAM to predict new masks. To avoid\npredicting wrong masks with SAM, we propose a prediction result selection (PRS)\nalgorithm. The algorithm can remarkably decrease wrong predictions. 
Experiment\nresults on public datasets show that our method is superior to base FSS methods\nin both quantitative and qualitative aspects.\n","authors":["Chen-Bin Feng","Qi Lai","Kangdao Liu","Houcheng Su","Chi-Man Vong"],"pdf_url":"https://arxiv.org/pdf/2401.09826v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05163v2","updated":"2024-01-18T09:34:31Z","published":"2024-01-10T13:56:40Z","title":"MISS: A Generative Pretraining and Finetuning Approach for Med-VQA","summary":" Medical visual question answering (VQA) is a challenging multimodal task,\nwhere Vision-Language Pre-training (VLP) models can effectively improve the\ngeneralization performance. However, most methods in the medical field treat\nVQA as an answer classification task which is difficult to transfer to\npractical application scenarios. Additionally, due to the privacy of medical\nimages and the expensive annotation process, large-scale medical image-text\npairs datasets for pretraining are severely lacking. In this paper, we propose\na large-scale MultI-task Self-Supervised learning based framework (MISS) for\nmedical VQA tasks. Unlike existing methods, we treat medical VQA as a\ngenerative task. We unify the text encoder and multimodal encoder and align\nimage-text features through multi-task learning. Furthermore, we propose a\nTransfer-and-Caption method that extends the feature space of single-modal\nimage datasets using large language models (LLMs), enabling those traditional\nmedical vision field task data to be applied to VLP. Experiments show that our\nmethod achieves excellent results with fewer multimodal datasets and\ndemonstrates the advantages of generative VQA models. The code and model\nweights will be released upon the paper's acceptance.\n","authors":["Jiawei Chen","Dingkang Yang","Yue Jiang","Yuxuan Lei","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.05163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09823v1","updated":"2024-01-18T09:31:25Z","published":"2024-01-18T09:31:25Z","title":"Enhancing Small Object Encoding in Deep Neural Networks: Introducing\n Fast&Focused-Net with Volume-wise Dot Product Layer","summary":" In this paper, we introduce Fast&Focused-Net, a novel deep neural network\narchitecture tailored for efficiently encoding small objects into fixed-length\nfeature vectors. Contrary to conventional Convolutional Neural Networks (CNNs),\nFast&Focused-Net employs a series of our newly proposed layer, the Volume-wise\nDot Product (VDP) layer, designed to address several inherent limitations of\nCNNs. Specifically, CNNs often exhibit a smaller effective receptive field than\ntheir theoretical counterparts, limiting their vision span. Additionally, the\ninitial layers in CNNs produce low-dimensional feature vectors, presenting a\nbottleneck for subsequent learning. Lastly, the computational overhead of CNNs,\nparticularly in capturing diverse image regions by parameter sharing, is\nsignificantly high. The VDP layer, at the heart of Fast&Focused-Net, aims to\nremedy these issues by efficiently covering the entire image patch information\nwith reduced computational demand. Experimental results demonstrate the prowess\nof Fast&Focused-Net in a variety of applications. 
For small object\nclassification tasks, our network outperformed state-of-the-art methods on\ndatasets such as CIFAR-10, CIFAR-100, STL-10, SVHN-Cropped, and Fashion-MNIST.\nIn the context of larger image classification, when combined with a transformer\nencoder (ViT), Fast&Focused-Net produced competitive results for OpenImages V6,\nImageNet-1K, and Places365 datasets. Moreover, the same combination showcased\nunparalleled performance in text recognition tasks across SVT, IC15, SVTP, and\nHOST datasets. This paper presents the architecture, the underlying motivation,\nand extensive empirical evidence suggesting that Fast&Focused-Net is a\npromising direction for efficient and focused deep learning.\n","authors":["Ali Tofik","Roy Partha Pratim"],"pdf_url":"https://arxiv.org/pdf/2401.09823v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10642v2","updated":"2024-01-18T09:29:36Z","published":"2023-10-16T17:57:43Z","title":"Real-time Photorealistic Dynamic Scene Representation and Rendering with\n 4D Gaussian Splatting","summary":" Reconstructing dynamic 3D scenes from 2D images and generating diverse views\nover time is challenging due to scene complexity and temporal dynamics. Despite\nadvancements in neural implicit models, limitations persist: (i) Inadequate\nScene Structure: Existing methods struggle to reveal the spatial and temporal\nstructure of dynamic scenes from directly learning the complex 6D plenoptic\nfunction. (ii) Scaling Deformation Modeling: Explicitly modeling scene element\ndeformation becomes impractical for complex dynamics. To address these issues,\nwe consider the spacetime as an entirety and propose to approximate the\nunderlying spatio-temporal 4D volume of a dynamic scene by optimizing a\ncollection of 4D primitives, with explicit geometry and appearance modeling.\nLearning to optimize the 4D primitives enables us to synthesize novel views at\nany desired time with our tailored rendering routine. Our model is conceptually\nsimple, consisting of a 4D Gaussian parameterized by anisotropic ellipses that\ncan rotate arbitrarily in space and time, as well as view-dependent and\ntime-evolved appearance represented by the coefficient of 4D spherindrical\nharmonics. This approach offers simplicity, flexibility for variable-length\nvideo and end-to-end training, and efficient real-time rendering, making it\nsuitable for capturing complex dynamic scene motions. Experiments across\nvarious benchmarks, including monocular and multi-view scenarios, demonstrate\nour 4DGS model's superior visual quality and efficiency.\n","authors":["Zeyu Yang","Hongye Yang","Zijie Pan","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.10642v2.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2308.16573v3","updated":"2024-01-18T09:25:19Z","published":"2023-08-31T09:13:34Z","title":"Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for\n Semi-Supervised Medical Image Segmentation","summary":" While supervised learning has achieved remarkable success, obtaining\nlarge-scale labeled datasets in biomedical imaging is often impractical due to\nhigh costs and the time-consuming annotations required from radiologists.\nSemi-supervised learning emerges as an effective strategy to overcome this\nlimitation by leveraging useful information from unlabeled datasets. In this\npaper, we present a novel semi-supervised learning method, Dual-Decoder\nConsistency via Pseudo-Labels Guided Data Augmentation (DCPA), for medical\nimage segmentation. 
We devise a consistency regularization to promote\nconsistent representations during the training process. Specifically, we use\ndistinct decoders for student and teacher networks while maintaining the same\nencoder. Moreover, to learn from unlabeled data, we use pseudo-labels\ngenerated by the teacher networks and augment the training data with the\npseudo-labels. Both techniques contribute to enhancing the performance of the\nproposed method. The method is evaluated on three representative medical image\nsegmentation datasets. Comprehensive comparisons with state-of-the-art\nsemi-supervised medical image segmentation methods were conducted under typical\nscenarios, utilizing 10% and 20% labeled data, as well as in the extreme\nscenario of only 5% labeled data. The experimental results consistently\ndemonstrate the superior performance of our method compared to other methods\nacross the three semi-supervised settings. The source code is publicly\navailable at https://github.com/BinYCn/DCPA.git.\n","authors":["Yuanbin Chen","Tao Wang","Hui Tang","Longxuan Zhao","Ruige Zong","Shun Chen","Tao Tan","Xinlin Zhang","Tong Tong"],"pdf_url":"https://arxiv.org/pdf/2308.16573v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.07929v4","updated":"2024-01-18T09:03:23Z","published":"2022-07-16T12:46:10Z","title":"Towards Lightweight Super-Resolution with Dual Regression Learning","summary":" Deep neural networks have exhibited remarkable performance in image\nsuper-resolution (SR) tasks by learning a mapping from low-resolution (LR)\nimages to high-resolution (HR) images. However, the SR problem is typically an\nill-posed problem and existing methods come with several limitations.\nFirst, the possible mapping space of SR can be extremely large since there may\nexist many different HR images that can be super-resolved from the same LR\nimage. As a result, it is hard to directly learn a promising SR mapping from\nsuch a large space. Second, it is often inevitable to develop very large models\nwith extremely high computational cost to yield promising SR performance. In\npractice, one can use model compression techniques to obtain compact models by\nreducing model redundancy. Nevertheless, it is hard for existing model\ncompression methods to accurately identify the redundant components due to the\nextremely large SR mapping space. To alleviate the first challenge, we propose\na dual regression learning scheme to reduce the space of possible SR mappings.\nSpecifically, in addition to the mapping from LR to HR images, we learn an\nadditional dual regression mapping to estimate the downsampling kernel and\nreconstruct LR images. In this way, the dual mapping acts as a constraint to\nreduce the space of possible mappings. To address the second challenge, we\npropose a dual regression compression (DRC) method to reduce model redundancy\nat both the layer level and the channel level based on channel pruning. Specifically,\nwe first develop a channel number search method that minimizes the dual\nregression loss to determine the redundancy of each layer. Given the searched\nchannel numbers, we further exploit the dual regression manner to evaluate the\nimportance of channels and prune the redundant ones. 
Extensive experiments show\nthe effectiveness of our method in obtaining accurate and efficient SR models.\n","authors":["Yong Guo","Jingdong Wang","Qi Chen","Jiezhang Cao","Zeshuai Deng","Yanwu Xu","Jian Chen","Mingkui Tan"],"pdf_url":"https://arxiv.org/pdf/2207.07929v4.pdf","comment":"Journal extension of DRN for lightweight super-resolution"},{"id":"http://arxiv.org/abs/2311.15111v3","updated":"2024-01-18T09:02:36Z","published":"2023-11-25T20:01:20Z","title":"UAE: Universal Anatomical Embedding on Multi-modality Medical Images","summary":" Identifying specific anatomical structures (\textit{e.g.}, lesions or\nlandmarks) in medical images plays a fundamental role in medical image\nanalysis. Exemplar-based landmark detection methods are receiving increasing\nattention since they can detect arbitrary anatomical points in inference while\nrequiring no landmark annotations in training. They use self-supervised learning\nto acquire a discriminative embedding for each voxel within the image. These\napproaches can identify corresponding landmarks through nearest neighbor\nmatching and have demonstrated promising results across various tasks. However,\ncurrent methods still face challenges in: (1) differentiating voxels with\nsimilar appearance but different semantic meanings (\textit{e.g.}, two adjacent\nstructures without clear borders); (2) matching voxels with similar semantics\nbut markedly different appearance (\textit{e.g.}, the same vessel before and\nafter contrast injection); and (3) cross-modality matching (\textit{e.g.},\nCT-MRI landmark-based registration). To overcome these challenges, we propose\nuniversal anatomical embedding (UAE), which is a unified framework designed to\nlearn appearance, semantic, and cross-modality anatomical embeddings.\nSpecifically, UAE incorporates three key innovations: (1) semantic embedding\nlearning with prototypical contrastive loss; (2) a fixed-point-based matching\nstrategy; and (3) an iterative approach for cross-modality embedding learning.\nWe thoroughly evaluated UAE across intra- and inter-modality tasks, including\none-shot landmark detection, lesion tracking on longitudinal CT scans, and\nCT-MRI affine/rigid registration with varying field of view. Our results\nsuggest that UAE outperforms state-of-the-art methods, offering a robust and\nversatile approach for landmark-based medical image analysis tasks. Code and\ntrained models are available at: \href{https://shorturl.at/bgsB3}\n","authors":["Xiaoyu Bai","Fan Bai","Xiaofei Huo","Jia Ge","Jingjing Lu","Xianghua Ye","Ke Yan","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.15111v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09802v1","updated":"2024-01-18T08:46:02Z","published":"2024-01-18T08:46:02Z","title":"Multilingual Visual Speech Recognition with a Single Model by Learning\n with Discrete Visual Speech Units","summary":" This paper explores sentence-level Multilingual Visual Speech Recognition\nwith a single model for the first time. As the massive multilingual modeling of\nvisual data requires huge computational costs, we propose a novel strategy,\nprocessing with visual speech units. Motivated by the recent success of the\naudio speech unit, the proposed visual speech unit is obtained by discretizing\nthe visual speech features extracted from the self-supervised visual speech\nmodel. To correctly capture multilingual visual speech, we first train the\nself-supervised visual speech model on 5,512 hours of multilingual audio-visual\ndata. 
Through analysis, we verify that the visual speech units mainly contain\nviseme information while suppressing non-linguistic information. By using the\nvisual speech units as the inputs of our system, we pre-train the model to\npredict corresponding text outputs on massive multilingual data constructed by\nmerging several VSR databases. As both the inputs and outputs are discrete, we\ncan greatly improve the training efficiency compared to the standard VSR\ntraining. Specifically, the input data size is reduced to 0.016% of the\noriginal video inputs. In order to complement the insufficient visual\ninformation in speech recognition, we apply curriculum learning where the\ninputs of the system begin with audio-visual speech units and gradually change\nto visual speech units. After pre-training, the model is finetuned on\ncontinuous features. We set new state-of-the-art multilingual VSR performances\nby achieving comparable performances to the previous language-specific VSR\nmodels, with a single trained model.\n","authors":["Minsu Kim","Jeong Hun Yeo","Jeongsoo Choi","Se Jin Park","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2401.09802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05448v2","updated":"2024-01-18T08:33:35Z","published":"2023-09-11T13:41:27Z","title":"Panoptic Vision-Language Feature Fields","summary":" Recently, methods have been proposed for 3D open-vocabulary semantic\nsegmentation. Such methods are able to segment scenes into arbitrary classes\nbased on text descriptions provided during runtime. In this paper, we propose\nto the best of our knowledge the first algorithm for open-vocabulary panoptic\nsegmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature\nFields (PVLFF), learns a semantic feature field of the scene by distilling\nvision-language features from a pretrained 2D model, and jointly fits an\ninstance feature field through contrastive learning using 2D instance segments\non input frames. Despite not being trained on the target classes, our method\nachieves panoptic segmentation performance similar to the state-of-the-art\nclosed-set 3D systems on the HyperSim, ScanNet and Replica dataset and\nadditionally outperforms current 3D open-vocabulary systems in terms of\nsemantic segmentation. We ablate the components of our method to demonstrate\nthe effectiveness of our model architecture. Our code will be available at\nhttps://github.com/ethz-asl/pvlff.\n","authors":["Haoran Chen","Kenneth Blomqvist","Francesco Milano","Roland Siegwart"],"pdf_url":"https://arxiv.org/pdf/2309.05448v2.pdf","comment":"This work has been accepted by IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2306.16003v2","updated":"2024-01-18T08:31:46Z","published":"2023-06-28T08:22:53Z","title":"Text-driven Talking Face Synthesis by Reprogramming Audio-driven Models","summary":" In this paper, we present a method for reprogramming pre-trained audio-driven\ntalking face synthesis models to operate in a text-driven manner. Consequently,\nwe can easily generate face videos that articulate the provided textual\nsentences, eliminating the necessity of recording speech for each inference, as\nrequired in the audio-driven model. To this end, we propose to embed the input\ntext into the learned audio latent space of the pre-trained audio-driven model,\nwhile preserving the face synthesis capability of the original pre-trained\nmodel. 
Specifically, we devise a Text-to-Audio Embedding Module (TAEM) which\nmaps a given text input into the audio latent space by modeling pronunciation\nand duration characteristics. Furthermore, to consider the speaker\ncharacteristics in audio while using text inputs, TAEM is designed to accept a\nvisual speaker embedding. The visual speaker embedding is derived from a single\ntarget face image and enables improved mapping of input text to the learned\naudio latent space by incorporating the speaker characteristics inherent in the\naudio. The main advantages of the proposed framework are that 1) it can be\napplied to diverse audio-driven talking face synthesis models and 2) we can\ngenerate talking face videos with either text inputs or audio inputs with high\nflexibility.\n","authors":["Jeongsoo Choi","Minsu Kim","Se Jin Park","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2306.16003v2.pdf","comment":"ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.09794v1","updated":"2024-01-18T08:26:37Z","published":"2024-01-18T08:26:37Z","title":"Wavelet-Guided Acceleration of Text Inversion in Diffusion-Based Image\n Editing","summary":" In the field of image editing, Null-text Inversion (NTI) enables fine-grained\nediting while preserving the structure of the original image by optimizing null\nembeddings during the DDIM sampling process. However, the NTI process is\ntime-consuming, taking more than two minutes per image. To address this, we\nintroduce an innovative method that maintains the principles of the NTI while\naccelerating the image editing process. We propose the WaveOpt-Estimator, which\ndetermines the text optimization endpoint based on frequency characteristics.\nUtilizing wavelet transform analysis to identify the image's frequency\ncharacteristics, we can limit text optimization to specific timesteps during\nthe DDIM sampling process. By adopting the Negative-Prompt Inversion (NPI)\nconcept, a target prompt representing the original image serves as the initial\ntext value for optimization. This approach maintains performance comparable to\nNTI while reducing the average editing time by over 80% compared to the NTI\nmethod. Our method presents a promising approach for efficient, high-quality\nimage editing based on diffusion models.\n","authors":["Gwanhyeong Koo","Sunjae Yoon","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2401.09794v1.pdf","comment":"The International Conference on Acoustics, Speech, & Signal\n Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2401.09791v1","updated":"2024-01-18T08:23:29Z","published":"2024-01-18T08:23:29Z","title":"BreastRegNet: A Deep Learning Framework for Registration of Breast\n Faxitron and Histopathology Images","summary":" A standard treatment protocol for breast cancer entails administering\nneoadjuvant therapy followed by surgical removal of the tumor and surrounding\ntissue. Pathologists typically rely on cabinet X-ray radiographs, known as\nFaxitron, to examine the excised breast tissue and diagnose the extent of\nresidual disease. However, accurately determining the location, size, and\nfocality of residual cancer can be challenging, and incorrect assessments can\nlead to clinical consequences. The utilization of automated methods can improve\nthe histopathology process, allowing pathologists to choose regions for\nsampling more effectively and precisely. Despite the recognized necessity,\nthere are currently no such methods available. 
Training such automated\ndetection models require accurate ground truth labels on ex-vivo radiology\nimages, which can be acquired through registering Faxitron and histopathology\nimages and mapping the extent of cancer from histopathology to x-ray images.\nThis study introduces a deep learning-based image registration approach trained\non mono-modal synthetic image pairs. The models were trained using data from 50\nwomen who received neoadjuvant chemotherapy and underwent surgery. The results\ndemonstrate that our method is faster and yields significantly lower average\nlandmark error ($2.1\\pm1.96$ mm) over the state-of-the-art iterative\n($4.43\\pm4.1$ mm) and deep learning ($4.02\\pm3.15$ mm) approaches. Improved\nperformance of our approach in integrating radiology and pathology information\nfacilitates generating large datasets, which allows training models for more\naccurate breast cancer detection.\n","authors":["Negar Golestani","Aihui Wang","Gregory R Bean","Mirabela Rusu"],"pdf_url":"https://arxiv.org/pdf/2401.09791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07931v4","updated":"2024-01-18T08:22:14Z","published":"2023-05-13T14:48:09Z","title":"GSB: Group Superposition Binarization for Vision Transformer with\n Limited Training Samples","summary":" Vision Transformer (ViT) has performed remarkably in various computer vision\ntasks. Nonetheless, affected by the massive amount of parameters, ViT usually\nsuffers from serious overfitting problems with a relatively limited number of\ntraining samples. In addition, ViT generally demands heavy computing resources,\nwhich limit its deployment on resource-constrained devices. As a type of\nmodel-compression method, model binarization is potentially a good choice to\nsolve the above problems. Compared with the full-precision one, the model with\nthe binarization method replaces complex tensor multiplication with simple\nbit-wise binary operations and represents full-precision model parameters and\nactivations with only 1-bit ones, which potentially solves the problem of model\nsize and computational complexity, respectively. In this paper, we investigate\na binarized ViT model. Empirically, we observe that the existing binarization\ntechnology designed for Convolutional Neural Networks (CNN) cannot migrate well\nto a ViT's binarization task. We also find that the decline of the accuracy of\nthe binary ViT model is mainly due to the information loss of the Attention\nmodule and the Value vector. Therefore, we propose a novel model binarization\ntechnique, called Group Superposition Binarization (GSB), to deal with these\nissues. Furthermore, in order to further improve the performance of the\nbinarization model, we have investigated the gradient calculation procedure in\nthe binarization process and derived more proper gradient calculation equations\nfor GSB to reduce the influence of gradient mismatch. Then, the knowledge\ndistillation technique is introduced to alleviate the performance degradation\ncaused by model binarization. 
Analytically, model binarization can limit the\nparameters search space during parameter updates while training a model....\n","authors":["Tian Gao","Cheng-Zhong Xu","Le Zhang","Hui Kong"],"pdf_url":"https://arxiv.org/pdf/2305.07931v4.pdf","comment":"Accepted by Neural Networks"},{"id":"http://arxiv.org/abs/2310.09760v2","updated":"2024-01-18T08:14:32Z","published":"2023-10-15T07:19:23Z","title":"GPT-Prompt Controlled Diffusion for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly supervised semantic segmentation (WSSS), aiming to train segmentation\nmodels solely using image-level labels, has received significant attention.\nExisting approaches mainly concentrate on creating high-quality pseudo labels\nby utilizing existing images and their corresponding image-level labels.\nHowever, the quality of pseudo labels degrades significantly when the size of\navailable dataset is limited. Thus, in this paper, we tackle this problem from\na different view by introducing a novel approach called GPT-Prompt Controlled\nDiffusion (GPCD) for data augmentation. This approach enhances the current\nlabeled datasets by augmenting with a variety of images, achieved through\ncontrolled diffusion guided by GPT prompts. In this process, the existing\nimages and image-level labels provide the necessary control information, where\nGPT is employed to enrich the prompts, leading to the generation of diverse\nbackgrounds. Moreover, we integrate data source information as tokens into the\nVision Transformer (ViT) framework. These tokens are specifically designed to\nimprove the ability of downstream WSSS framework to recognize the origins of\naugmented images. Our proposed GPCD approach clearly surpasses existing\nstate-of-the-art methods. This effect is more obvious when the amount of\navailable data is small, demonstrating the effectiveness of our method.\n","authors":["Wangyu Wu","Tianhong Dai","Xiaowei Huang","Fei Ma","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2310.09760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09786v1","updated":"2024-01-18T08:10:34Z","published":"2024-01-18T08:10:34Z","title":"Adaptive Self-training Framework for Fine-grained Scene Graph Generation","summary":" Scene graph generation (SGG) models have suffered from inherent problems\nregarding the benchmark datasets such as the long-tailed predicate distribution\nand missing annotation problems. In this work, we aim to alleviate the\nlong-tailed problem of SGG by utilizing unannotated triplets. To this end, we\nintroduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels\nfor unannotated triplets based on which the SGG models are trained. While there\nhas been significant progress in self-training for image recognition, designing\na self-training framework for the SGG task is more challenging due to its\ninherent nature such as the semantic ambiguity and the long-tailed distribution\nof predicate classes. Hence, we propose a novel pseudo-labeling technique for\nSGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is\na model-agnostic framework that can be applied to any existing SGG models.\nFurthermore, we devise a graph structure learner (GSL) that is beneficial when\nadopting our proposed self-training framework to the state-of-the-art\nmessage-passing neural network (MPNN)-based SGG models. 
Our extensive\nexperiments verify the effectiveness of ST-SGG on various SGG models,\nparticularly in enhancing the performance on fine-grained predicate classes.\n","authors":["Kibum Kim","Kanghoon Yoon","Yeonjun In","Jinyoung Moon","Donghyun Kim","Chanyoung Park"],"pdf_url":"https://arxiv.org/pdf/2401.09786v1.pdf","comment":"9 pages; ICLR 2024"},{"id":"http://arxiv.org/abs/2401.08209v2","updated":"2024-01-18T07:59:03Z","published":"2024-01-16T08:50:44Z","title":"Transcending the Limit of Local Window: Advanced Super-Resolution\n Transformer with Adaptive Token Dictionary","summary":" Single Image Super-Resolution is a classic computer vision problem that\ninvolves estimating high-resolution (HR) images from low-resolution (LR) ones.\nAlthough deep neural networks (DNNs), especially Transformers for\nsuper-resolution, have seen significant advancements in recent years,\nchallenges still remain, particularly in limited receptive field caused by\nwindow-based self-attention. To address these issues, we introduce a group of\nauxiliary Adaptive Token Dictionary to SR Transformer and establish an ATD-SR\nmethod. The introduced token dictionary could learn prior information from\ntraining data and adapt the learned prior to specific testing image through an\nadaptive refinement step. The refinement strategy could not only provide global\ninformation to all input tokens but also group image tokens into categories.\nBased on category partitions, we further propose a category-based\nself-attention mechanism designed to leverage distant but similar tokens for\nenhancing input features. The experimental results show that our method\nachieves the best performance on various single image super-resolution\nbenchmarks.\n","authors":["Leheng Zhang","Yawei Li","Xingyu Zhou","Xiaorui Zhao","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.08209v2.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.09774v1","updated":"2024-01-18T07:50:07Z","published":"2024-01-18T07:50:07Z","title":"On the Audio Hallucinations in Large Audio-Video Language Models","summary":" Large audio-video language models can generate descriptions for both video\nand audio. However, they sometimes ignore audio content, producing audio\ndescriptions solely reliant on visual information. This paper refers to this as\naudio hallucinations and analyzes them in large audio-video language models. We\ngather 1,000 sentences by inquiring about audio information and annotate them\nwhether they contain hallucinations. If a sentence is hallucinated, we also\ncategorize the type of hallucination. The results reveal that 332 sentences are\nhallucinated with distinct trends observed in nouns and verbs for each\nhallucination type. Based on this, we tackle a task of audio hallucination\nclassification using pre-trained audio-text models in the zero-shot and\nfine-tuning settings. 
Our experimental results reveal that the zero-shot models\nachieve higher performance (52.2% in F1) than the random (40.3%) and the\nfine-tuning models achieve 87.9%, outperforming the zero-shot models.\n","authors":["Taichi Nishimura","Shota Nakada","Masayoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2401.09774v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2311.18243v2","updated":"2024-01-18T07:47:24Z","published":"2023-11-30T04:21:10Z","title":"DKiS: Decay weight invertible image steganography with private key","summary":" Image steganography, defined as the practice of concealing information within\nanother image, traditionally encounters security challenges when its methods\nbecome publicly known or are under attack. To address this, a novel private\nkey-based image steganography technique has been introduced. This approach\nensures the security of the hidden information, as access requires a\ncorresponding private key, regardless of the public knowledge of the\nsteganography method. Experimental evidence has been presented, demonstrating\nthe effectiveness of our method and showcasing its real-world applicability.\nFurthermore, a critical challenge in the invertible image steganography process\nhas been identified by us: the transfer of non-essential, or `garbage',\ninformation from the secret to the host pipeline. To tackle this issue, the\ndecay weight has been introduced to control the information transfer,\neffectively filtering out irrelevant data and enhancing the performance of\nimage steganography. The code for this technique is publicly accessible at\nhttps://github.com/yanghangAI/DKiS, and a practical demonstration can be found\nat http://yanghang.site/hidekey.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18243v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15646v5","updated":"2024-01-18T07:44:08Z","published":"2023-10-24T09:07:47Z","title":"Mean Teacher DETR with Masked Feature Alignment: A Robust Domain\n Adaptive Detection Transformer Framework","summary":" Unsupervised domain adaptation object detection (UDAOD) research on Detection\nTransformer(DETR) mainly focuses on feature alignment and existing methods can\nbe divided into two kinds, each of which has its unresolved issues. One-stage\nfeature alignment methods can easily lead to performance fluctuation and\ntraining stagnation. Two-stage feature alignment method based on mean teacher\ncomprises a pretraining stage followed by a self-training stage, each facing\nproblems in obtaining reliable pretrained model and achieving consistent\nperformance gains. Methods mentioned above have not yet explore how to utilize\nthe third related domain such as target-like domain to assist adaptation. To\naddress these issues, we propose a two-stage framework named MTM, i.e. Mean\nTeacher-DETR with Masked Feature Alignment. In the pretraining stage, we\nutilize labeled target-like images produced by image style transfer to avoid\nperformance fluctuation. In the self-training stage, we leverage unlabeled\ntarget images by pseudo labels based on mean teacher and propose a module\ncalled Object Queries Knowledge Transfer (OQKT) to ensure consistent\nperformance gains of the student model. 
Most importantly, we propose masked\nfeature alignment methods including Masked Domain Query-based Feature Alignment\n(MDQFA) and Masked Token-wise Feature Alignment (MTWFA) to alleviate domain\nshift in a more robust way, which not only prevent training stagnation and lead\nto a robust pretrained model in the pretraining stage, but also enhance the\nmodel's target performance in the self-training stage. Experiments on three\nchallenging scenarios and a theoretical analysis verify the effectiveness of\nMTM.\n","authors":["Weixi Weng","Chun Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.15646v5.pdf","comment":"AAAI2024"},{"id":"http://arxiv.org/abs/2401.09773v1","updated":"2024-01-18T07:44:04Z","published":"2024-01-18T07:44:04Z","title":"SEINE: Structure Encoding and Interaction Network for Nuclei Instance\n Segmentation","summary":" Nuclei instance segmentation in histopathological images is of great\nimportance for biological analysis and cancer diagnosis but remains challenging\nfor two reasons. (1) Similar visual presentation of intranuclear and\nextranuclear regions of chromophobe nuclei often causes under-segmentation, and\n(2) current methods lack the exploration of nuclei structure, resulting in\nfragmented instance predictions. To address these problems, this paper proposes\na structure encoding and interaction network, termed SEINE, which develops the\nstructure modeling scheme of nuclei and exploits the structure similarity\nbetween nuclei to improve the integrality of each segmented instance.\nConcretely, SEINE introduces a contour-based structure encoding (SE) that\nconsiders the correlation between nuclei structure and semantics, realizing a\nreasonable representation of the nuclei structure. Based on the encoding, we\npropose a structure-guided attention (SGA) that takes the clear nuclei as\nprototypes to enhance the structure learning for the fuzzy nuclei. To\nstrengthen the structural learning ability, a semantic feature fusion (SFF) is\npresented to boost the semantic consistency of semantic and structure branches.\nFurthermore, a position enhancement (PE) method is applied to suppress\nincorrect nuclei boundary predictions. Extensive experiments demonstrate the\nsuperiority of our approaches, and SEINE achieves state-of-the-art (SOTA)\nperformance on four datasets. The code is available at\n\\href{https://github.com/zhangye-zoe/SEINE}{https://github.com/zhangye-zoe/SEINE}.\n","authors":["Ye Zhang","Linghan Cai","Ziyue Wang","Yongbing Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09773v1.pdf","comment":"10 pages, 12 figures, 6 tables, submitted to TMI"},{"id":"http://arxiv.org/abs/2310.17212v2","updated":"2024-01-18T07:37:32Z","published":"2023-10-26T07:56:17Z","title":"Affective Video Content Analysis: Decade Review and New Perspectives","summary":" Video content is rich in semantics and has the ability to evoke various\nemotions in viewers. In recent years, with the rapid development of affective\ncomputing and the explosive growth of visual data, affective video content\nanalysis (AVCA) as an essential branch of affective computing has become a\nwidely researched topic. In this study, we comprehensively review the\ndevelopment of AVCA over the past decade, particularly focusing on the most\nadvanced methods adopted to address the three major challenges of video feature\nextraction, expression subjectivity, and multimodal feature fusion. We first\nintroduce the widely used emotion representation models in AVCA and describe\ncommonly used datasets. 
We summarize and compare representative methods in the\nfollowing aspects: (1) unimodal AVCA models, including facial expression\nrecognition and posture emotion recognition; (2) multimodal AVCA models,\nincluding feature fusion, decision fusion, and attention-based multimodal\nmodels; (3) model performance evaluation standards. Finally, we discuss future\nchallenges and promising research directions, such as emotion recognition and\npublic opinion analysis, human-computer interaction, and emotional\nintelligence.\n","authors":["Junxiao Xue","Jie Wang","Xuecheng Wu","Qian Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.17212v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09763v1","updated":"2024-01-18T07:28:17Z","published":"2024-01-18T07:28:17Z","title":"CLIP Model for Images to Textual Prompts Based on Top-k Neighbors","summary":" Text-to-image synthesis, a subfield of multimodal generation, has gained\nsignificant attention in recent years. We propose a cost-effective approach for\nimage-to-prompt generation that leverages generative models to generate textual\nprompts without the need for large amounts of annotated data. We divide our\nmethod into two stages: online stage and offline stage. We use a combination of\nthe CLIP model and K-nearest neighbors (KNN) algorithm. The proposed system\nconsists of two main parts: an offline task and an online task. Our method owns\nthe highest metric 0.612 among these models, which is 0.013, 0.055, 0.011\nhigher than Clip, Clip + KNN(top 10) respectively.\n","authors":["Xin Zhang","Xin Zhang","YeMing Cai","Tianzhi Jia"],"pdf_url":"https://arxiv.org/pdf/2401.09763v1.pdf","comment":"CLIP model, KNN, image-to-prompts"},{"id":"http://arxiv.org/abs/2303.02472v2","updated":"2024-01-18T07:27:09Z","published":"2023-03-04T18:06:36Z","title":"ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration\n Measure","summary":" Studies have shown that modern neural networks tend to be poorly calibrated\ndue to over-confident predictions. Traditionally, post-processing methods have\nbeen used to calibrate the model after training. In recent years, various\ntrainable calibration measures have been proposed to incorporate them directly\ninto the training process. However, these methods all incorporate internal\nhyperparameters, and the performance of these calibration objectives relies on\ntuning these hyperparameters, incurring more computational costs as the size of\nneural networks and datasets become larger. As such, we present Expected\nSquared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable\ncalibration objective loss, where we view the calibration error from the\nperspective of the squared difference between the two expectations. With\nextensive experiments on several architectures (CNNs, Transformers) and\ndatasets, we demonstrate that (1) incorporating ESD into the training improves\nmodel calibration in various batch size settings without the need for internal\nhyperparameter tuning, (2) ESD yields the best-calibrated results compared with\nprevious approaches, and (3) ESD drastically improves the computational costs\nrequired for calibration during training due to the absence of internal\nhyperparameter. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/ESD.\n","authors":["Hee Suk Yoon","Joshua Tian Jin Tee","Eunseop Yoon","Sunjae Yoon","Gwangsu Kim","Yingzhen Li","Chang D. 
Yoo"],"pdf_url":"https://arxiv.org/pdf/2303.02472v2.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2401.09759v1","updated":"2024-01-18T07:19:10Z","published":"2024-01-18T07:19:10Z","title":"SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech\n Recognition","summary":" Audio-visual speech recognition (AVSR) is a multimodal extension of automatic\nspeech recognition (ASR), using video as a complement to audio. In AVSR,\nconsiderable efforts have been directed at datasets for facial features such as\nlip-readings, while they often fall short in evaluating the image comprehension\ncapabilities in broader contexts. In this paper, we construct SlideAVSR, an\nAVSR dataset using scientific paper explanation videos. SlideAVSR provides a\nnew benchmark where models transcribe speech utterances with texts on the\nslides on the presentation recordings. As technical terminologies that are\nfrequent in paper explanations are notoriously challenging to transcribe\nwithout reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR\nproblems. As a simple yet effective baseline, we propose DocWhisper, an AVSR\nmodel that can refer to textual information from slides, and confirm its\neffectiveness on SlideAVSR.\n","authors":["Hao Wang","Shuhei Kurita","Shuichiro Shimizu","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2401.09759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08868v2","updated":"2024-01-18T07:14:00Z","published":"2024-01-16T22:46:29Z","title":"B-Cos Aligned Transformers Learn Human-Interpretable Features","summary":" Vision Transformers (ViTs) and Swin Transformers (Swin) are currently\nstate-of-the-art in computational pathology. However, domain experts are still\nreluctant to use these models due to their lack of interpretability. This is\nnot surprising, as critical decisions need to be transparent and\nunderstandable. The most common approach to understanding transformers is to\nvisualize their attention. However, attention maps of ViTs are often\nfragmented, leading to unsatisfactory explanations. Here, we introduce a novel\narchitecture called the B-cos Vision Transformer (BvT) that is designed to be\nmore interpretable. It replaces all linear transformations with the B-cos\ntransform to promote weight-input alignment. In a blinded study, medical\nexperts clearly ranked BvTs above ViTs, suggesting that our network is better\nat capturing biomedically relevant structures. This is also true for the B-cos\nSwin Transformer (Bwin). Compared to the Swin Transformer, it even improves the\nF1-score by up to 4.7% on two public datasets.\n","authors":["Manuel Tran","Amal Lahiani","Yashin Dicente Cid","Melanie Boxberg","Peter Lienemann","Christian Matek","Sophia J. Wagner","Fabian J. Theis","Eldad Klaiman","Tingying Peng"],"pdf_url":"https://arxiv.org/pdf/2401.08868v2.pdf","comment":"Accepted at MICCAI 2023 (oral). 
Camera-ready available at\n https://doi.org/10.1007/978-3-031-43993-3_50"},{"id":"http://arxiv.org/abs/2309.04780v3","updated":"2024-01-18T07:13:40Z","published":"2023-09-09T12:50:06Z","title":"Latent Degradation Representation Constraint for Single Image Deraining","summary":" Since rain streaks show a variety of shapes and directions, learning the\ndegradation representation is extremely challenging for single image deraining.\nExisting methods are mainly targeted at designing complicated modules to\nimplicitly learn latent degradation representation from coupled rainy images.\nThis way, it is hard to decouple the content-independent degradation\nrepresentation due to the lack of explicit constraint, resulting in over- or\nunder-enhancement problems. To tackle this issue, we propose a novel Latent\nDegradation Representation Constraint Network (LDRCNet) that consists of\nDirection-Aware Encoder (DAEncoder), UNet Deraining Network, and Multi-Scale\nInteraction Block (MSIBlock). Specifically, the DAEncoder is proposed to\nadaptively extract latent degradation representation by using the deformable\nconvolutions to exploit the direction consistency of rain streaks. Next, a\nconstraint loss is introduced to explicitly constraint the degradation\nrepresentation learning during training. Last, we propose an MSIBlock to fuse\nwith the learned degradation representation and decoder features of the\nderaining network for adaptive information interaction, which enables the\nderaining network to remove various complicated rainy patterns and reconstruct\nimage details. Experimental results on synthetic and real datasets demonstrate\nthat our method achieves new state-of-the-art performance.\n","authors":["Yuhong He","Long Peng","Lu Wang","Jun Cheng"],"pdf_url":"https://arxiv.org/pdf/2309.04780v3.pdf","comment":"This paper is accepted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08140v2","updated":"2024-01-18T07:01:15Z","published":"2024-01-16T06:19:18Z","title":"ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Process","summary":" Neural radiance fields (NeRFs) have gained popularity across various\napplications. However, they face challenges in the sparse view setting, lacking\nsufficient constraints from volume rendering. Reconstructing and understanding\na 3D scene from sparse and unconstrained cameras is a long-standing problem in\nclassical computer vision with diverse applications. While recent works have\nexplored NeRFs in sparse, unconstrained view scenarios, their focus has been\nprimarily on enhancing reconstruction and novel view synthesis. Our approach\ntakes a broader perspective by posing the question: \"from where has each point\nbeen seen?\" -- which gates how well we can understand and reconstruct it. In\nother words, we aim to determine the origin or provenance of each 3D point and\nits associated information under sparse, unconstrained views. We introduce\nProvNeRF, a model that enriches a traditional NeRF representation by\nincorporating per-point provenance, modeling likely source locations for each\npoint. We achieve this by extending implicit maximum likelihood estimation\n(IMLE) for stochastic processes. Notably, our method is compatible with any\npre-trained NeRF model and the associated training camera poses. We demonstrate\nthat modeling per-point provenance offers several advantages, including\nuncertainty estimation, criteria-based view selection, and improved novel view\nsynthesis, compared to state-of-the-art methods. 
Please visit our project page\nat https://provnerf.github.io\n","authors":["Kiyohiro Nakayama","Mikaela Angelina Uy","Yang You","Ke Li","Leonidas Guibas"],"pdf_url":"https://arxiv.org/pdf/2401.08140v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08689v2","updated":"2024-01-18T06:45:16Z","published":"2024-01-13T08:30:13Z","title":"NODI: Out-Of-Distribution Detection with Noise from Diffusion","summary":" Out-of-distribution (OOD) detection is a crucial part of deploying machine\nlearning models safely. It has been extensively studied with a plethora of\nmethods developed in the literature. This problem is tackled with an OOD score\ncomputation, however, previous methods compute the OOD scores with limited\nusage of the in-distribution dataset. For instance, the OOD scores are computed\nwith information from a small portion of the in-distribution data. Furthermore,\nthese methods encode images with a neural image encoder. The robustness of\nthese methods is rarely checked with respect to image encoders of different\ntraining methods and architectures. In this work, we introduce the diffusion\nprocess into the OOD task. The diffusion model integrates information on the\nwhole training set into the predicted noise vectors. What's more, we deduce a\nclosed-form solution for the noise vector (stable point). Then the noise vector\nis converted into our OOD score, we test both the deep model predicted noise\nvector and the closed-form noise vector on the OOD benchmarks \\cite{openood}.\nOur method outperforms previous OOD methods across all types of image encoders\n(Table. \\ref{main}). A $3.5\\%$ performance gain is achieved with the MAE-based\nimage encoder. Moreover, we studied the robustness of OOD methods by applying\ndifferent types of image encoders. Some OOD methods failed to generalize well\nwhen switching image encoders from ResNet to Vision Transformers, our method\nperforms exhibits good robustness with all the image encoders.\n","authors":["Jingqiu Zhou","Aojun Zhou","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2401.08689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09742v1","updated":"2024-01-18T05:50:09Z","published":"2024-01-18T05:50:09Z","title":"Image Translation as Diffusion Visual Programmers","summary":" We introduce the novel Diffusion Visual Programmer (DVP), a neuro-symbolic\nimage translation framework. Our proposed DVP seamlessly embeds a\ncondition-flexible diffusion model within the GPT architecture, orchestrating a\ncoherent sequence of visual programs (i.e., computer vision models) for various\npro-symbolic steps, which span RoI identification, style transfer, and position\nmanipulation, facilitating transparent and controllable image translation\nprocesses. Extensive experiments demonstrate DVP's remarkable performance,\nsurpassing concurrent arts. This success can be attributed to several key\nfeatures of DVP: First, DVP achieves condition-flexible translation via\ninstance normalization, enabling the model to eliminate sensitivity caused by\nthe manual guidance and optimally focus on textual descriptions for\nhigh-quality content generation. Second, the framework enhances in-context\nreasoning by deciphering intricate high-dimensional concepts in feature spaces\ninto more accessible low-dimensional symbols (e.g., [Prompt], [RoI object]),\nallowing for localized, context-free editing while maintaining overall\ncoherence. 
Last but not least, DVP improves systemic controllability and\nexplainability by offering explicit symbolic representations at each\nprogramming stage, empowering users to intuitively interpret and modify\nresults. Our research marks a substantial step towards harmonizing artificial\nimage translation processes with cognitive intelligence, promising broader\napplications.\n","authors":["Cheng Han","James C. Liang","Qifan Wang","Majid Rabbani","Sohail Dianat","Raghuveer Rao","Ying Nian Wu","Dongfang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09742v1.pdf","comment":"25 pages, 20 figures"},{"id":"http://arxiv.org/abs/2401.07402v2","updated":"2024-01-18T05:33:10Z","published":"2024-01-15T00:40:41Z","title":"Improved Implicity Neural Representation with Fourier Bases\n Reparameterized Training","summary":" Implicit Neural Representation (INR) as a mighty representation paradigm has\nachieved success in various computer vision tasks recently. Due to the\nlow-frequency bias issue of vanilla multi-layer perceptron (MLP), existing\nmethods have investigated advanced techniques, such as positional encoding and\nperiodic activation function, to improve the accuracy of INR. In this paper, we\nconnect the network training bias with the reparameterization technique and\ntheoretically prove that weight reparameterization could provide us a chance to\nalleviate the spectral bias of MLP. Based on our theoretical analysis, we\npropose a Fourier reparameterization method which learns coefficient matrix of\nfixed Fourier bases to compose the weights of MLP. We evaluate the proposed\nFourier reparameterization method on different INR tasks with various MLP\narchitectures, including vanilla MLP, MLP with positional encoding and MLP with\nadvanced activation function, etc. The superiority approximation results on\ndifferent MLP architectures clearly validate the advantage of our proposed\nmethod. Armed with our Fourier reparameterization method, better INR with more\ntextures and less artifacts can be learned from the training data.\n","authors":["Kexuan Shi","Xingyu Zhou","Shuhang Gu"],"pdf_url":"https://arxiv.org/pdf/2401.07402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09736v1","updated":"2024-01-18T05:31:53Z","published":"2024-01-18T05:31:53Z","title":"Measuring the Discrepancy between 3D Geometric Models using Directional\n Distance Fields","summary":" Qualifying the discrepancy between 3D geometric models, which could be\nrepresented with either point clouds or triangle meshes, is a pivotal issue\nwith board applications. Existing methods mainly focus on directly establishing\nthe correspondence between two models and then aggregating point-wise distance\nbetween corresponding points, resulting in them being either inefficient or\nineffective. In this paper, we propose DirDist, an efficient, effective,\nrobust, and differentiable distance metric for 3D geometry data. Specifically,\nwe construct DirDist based on the proposed implicit representation of 3D\nmodels, namely directional distance field (DDF), which defines the directional\ndistances of 3D points to a model to capture its local surface geometry. We\nthen transfer the discrepancy between two 3D geometric models as the\ndiscrepancy between their DDFs defined on an identical domain, naturally\nestablishing model correspondence. 
To demonstrate the advantage of our DirDist,\nwe explore various distance metric-driven 3D geometric modeling tasks,\nincluding template surface fitting, rigid registration, non-rigid registration,\nscene flow estimation and human pose optimization. Extensive experiments show\nthat our DirDist achieves significantly higher accuracy under all tasks. As a\ngeneric distance metric, DirDist has the potential to advance the field of 3D\ngeometric modeling. The source code is available at\n\\url{https://github.com/rsy6318/DirDist}.\n","authors":["Siyu Ren","Junhui Hou","Xiaodong Chen","Hongkai Xiong","Wenping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09736v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12474v4","updated":"2024-01-18T05:29:09Z","published":"2023-10-19T05:15:17Z","title":"Enhancing High-Resolution 3D Generation through Pixel-wise Gradient\n Clipping","summary":" High-resolution 3D object generation remains a challenging task primarily due\nto the limited availability of comprehensive annotated training data. Recent\nadvancements have aimed to overcome this constraint by harnessing image\ngenerative models, pretrained on extensive curated web datasets, using\nknowledge transfer techniques like Score Distillation Sampling (SDS).\nEfficiently addressing the requirements of high-resolution rendering often\nnecessitates the adoption of latent representation-based models, such as the\nLatent Diffusion Model (LDM). In this framework, a significant challenge\narises: To compute gradients for individual image pixels, it is necessary to\nbackpropagate gradients from the designated latent space through the frozen\ncomponents of the image model, such as the VAE encoder used within LDM.\nHowever, this gradient propagation pathway has never been optimized, remaining\nuncontrolled during training. We find that the unregulated gradients adversely\naffect the 3D model's capacity in acquiring texture-related information from\nthe image generative model, leading to poor quality appearance synthesis. To\naddress this overarching challenge, we propose an innovative operation termed\nPixel-wise Gradient Clipping (PGC) designed for seamless integration into\nexisting 3D generative models, thereby enhancing their synthesis quality.\nSpecifically, we control the magnitude of stochastic gradients by clipping the\npixel-wise gradients efficiently, while preserving crucial texture-related\ngradient directions. Despite this simplicity and minimal extra cost, extensive\nexperiments demonstrate the efficacy of our PGC in enhancing the performance of\nexisting 3D generative models for high-resolution object rendering.\n","authors":["Zijie Pan","Jiachen Lu","Xiatian Zhu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.12474v4.pdf","comment":"Accepted at ICLR 2024. Project page:\n https://fudan-zvg.github.io/PGC-3D"},{"id":"http://arxiv.org/abs/2401.09732v1","updated":"2024-01-18T05:20:07Z","published":"2024-01-18T05:20:07Z","title":"Instance Brownian Bridge as Texts for Open-vocabulary Video Instance\n Segmentation","summary":" Temporally locating objects with arbitrary class texts is the primary pursuit\nof open-vocabulary Video Instance Segmentation (VIS). Because of the\ninsufficient vocabulary of video data, previous methods leverage image-text\npretraining model for recognizing object instances by separately aligning each\nframe and class texts, ignoring the correlation between frames. 
As a result,\nthe separation breaks the instance movement context of videos, causing inferior\nalignment between video and text. To tackle this issue, we propose to link\nframe-level instance representations as a Brownian Bridge to model instance\ndynamics and align bridge-level instance representation to class texts for more\nprecisely open-vocabulary VIS (BriVIS). Specifically, we build our system upon\na frozen video segmentor to generate frame-level instance queries, and design\nTemporal Instance Resampler (TIR) to generate queries with temporal context\nfrom frame queries. To mold instance queries to follow Brownian bridge and\naccomplish alignment with class texts, we design Bridge-Text Alignment (BTA) to\nlearn discriminative bridge-level representations of instances via contrastive\nobjectives. Setting MinVIS as the basic video segmentor, BriVIS surpasses the\nOpen-vocabulary SOTA (OV2Seg) by a clear margin. For example, on the\nchallenging large-vocabulary VIS dataset (BURST), BriVIS achieves 7.43 mAP and\nexhibits 49.49% improvement compared to OV2Seg (4.97 mAP).\n","authors":["Zesen Cheng","Kehan Li","Hao Li","Peng Jin","Chang Liu","Xiawu Zheng","Rongrong Ji","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.09732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09721v1","updated":"2024-01-18T04:51:41Z","published":"2024-01-18T04:51:41Z","title":"fast graph-based denoising for point cloud color information","summary":" Point clouds are utilized in various 3D applications such as cross-reality\n(XR) and realistic 3D displays. In some applications, e.g., for live streaming\nusing a 3D point cloud, real-time point cloud denoising methods are required to\nenhance the visual quality. However, conventional high-precision denoising\nmethods cannot be executed in real time for large-scale point clouds owing to\nthe complexity of graph constructions with K nearest neighbors and noise level\nestimation. This paper proposes a fast graph-based denoising (FGBD) for a\nlarge-scale point cloud. First, high-speed graph construction is achieved by\nscanning a point cloud in various directions and searching adjacent\nneighborhoods on the scanning lines. Second, we propose a fast noise level\nestimation method using eigenvalues of the covariance matrix on a graph.\nFinally, we also propose a new low-cost filter selection method to enhance\ndenoising accuracy to compensate for the degradation caused by the acceleration\nalgorithms. In our experiments, we succeeded in reducing the processing time\ndramatically while maintaining accuracy relative to conventional denoising\nmethods. Denoising was performed at 30fps, with frames containing approximately\n1 million points.\n","authors":["Ryosuke Watanabe","Keisuke Nonaka","Eduardo Pavez","Tatsuya Kobayashi","Antonio Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.09721v1.pdf","comment":"Published in the proceeding of 2024 IEEE International Conference on\n Acoustics, Speech and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.09720v1","updated":"2024-01-18T04:48:13Z","published":"2024-01-18T04:48:13Z","title":"GaussianBody: Clothed Human Reconstruction via 3d Gaussian Splatting","summary":" In this work, we propose a novel clothed human reconstruction method called\nGaussianBody, based on 3D Gaussian Splatting. Compared with the costly neural\nradiance based models, 3D Gaussian Splatting has recently demonstrated great\nperformance in terms of training time and rendering quality. 
However, applying\nthe static 3D Gaussian Splatting model to the dynamic human reconstruction\nproblem is non-trivial due to complicated non-rigid deformations and rich cloth\ndetails. To address these challenges, our method considers explicit pose-guided\ndeformation to associate dynamic Gaussians across the canonical space and the\nobservation space, introducing a physically-based prior with regularized\ntransformations helps mitigate ambiguity between the two spaces. During the\ntraining process, we further propose a pose refinement strategy to update the\npose regression for compensating the inaccurate initial estimation and a\nsplit-with-scale mechanism to enhance the density of regressed point clouds.\nThe experiments validate that our method can achieve state-of-the-art\nphotorealistic novel-view rendering results with high-quality details for\ndynamic clothed human bodies, along with explicit geometry reconstruction.\n","authors":["Mengtian Li","Shengxiang Yao","Zhifeng Xie","Keyu Chen","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.09720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09716v1","updated":"2024-01-18T04:23:21Z","published":"2024-01-18T04:23:21Z","title":"HCVP: Leveraging Hierarchical Contrastive Visual Prompt for Domain\n Generalization","summary":" Domain Generalization (DG) endeavors to create machine learning models that\nexcel in unseen scenarios by learning invariant features. In DG, the prevalent\npractice of constraining models to a fixed structure or uniform\nparameterization to encapsulate invariant features can inadvertently blend\nspecific aspects. Such an approach struggles with nuanced differentiation of\ninter-domain variations and may exhibit bias towards certain domains, hindering\nthe precise learning of domain-invariant features. Recognizing this, we\nintroduce a novel method designed to supplement the model with domain-level and\ntask-specific characteristics. This approach aims to guide the model in more\neffectively separating invariant features from specific characteristics,\nthereby boosting the generalization. Building on the emerging trend of visual\nprompts in the DG paradigm, our work introduces the novel \\textbf{H}ierarchical\n\\textbf{C}ontrastive \\textbf{V}isual \\textbf{P}rompt (HCVP) methodology. This\nrepresents a significant advancement in the field, setting itself apart with a\nunique generative approach to prompts, alongside an explicit model structure\nand specialized loss functions. Differing from traditional visual prompts that\nare often shared across entire datasets, HCVP utilizes a hierarchical prompt\ngeneration network enhanced by prompt contrastive learning. These generative\nprompts are instance-dependent, catering to the unique characteristics inherent\nto different domains and tasks. Additionally, we devise a prompt modulation\nnetwork that serves as a bridge, effectively incorporating the generated visual\nprompts into the vision transformer backbone. 
Experiments conducted on five DG\ndatasets demonstrate the effectiveness of HCVP, outperforming both established\nDG algorithms and adaptation protocols.\n","authors":["Guanglin Zhou","Zhongyi Han","Shiming Chen","Biwei Huang","Liming Zhu","Tongliang Liu","Lina Yao","Kun Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.09716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05156v3","updated":"2024-01-18T04:19:09Z","published":"2023-04-11T11:47:34Z","title":"Accelerating Globally Optimal Consensus Maximization in Geometric Vision","summary":" Branch-and-bound-based consensus maximization stands out due to its important\nability of retrieving the globally optimal solution to outlier-affected\ngeometric problems. However, while the discovery of such solutions caries high\nscientific value, its application in practical scenarios is often prohibited by\nits computational complexity growing exponentially as a function of the\ndimensionality of the problem at hand. In this work, we convey a novel, general\ntechnique that allows us to branch over an n-1 dimensional space for an\nn-dimensional problem. The remaining degree of freedom can be solved globally\noptimally within each bound calculation by applying the efficient interval\nstabbing technique. While each individual bound derivation is harder to compute\nowing to the additional need for solving a sorting problem, the reduced number\nof intervals and tighter bounds in practice lead to a significant reduction in\nthe overall number of required iterations. Besides an abstract introduction of\nthe approach, we present applications to four fundamental geometric computer\nvision problems: camera resectioning, relative camera pose estimation, point\nset registration, and rotation and focal length estimation. Through our\nexhaustive tests, we demonstrate significant speed-up factors at times\nexceeding two orders of magnitude, thereby increasing the viability of globally\noptimal consensus maximizers in online application scenarios.\n","authors":["Xinyue Zhang","Liangzu Peng","Wanting Xu","Laurent Kneip"],"pdf_url":"https://arxiv.org/pdf/2304.05156v3.pdf","comment":"Accepted by IEEE Transactions on Pattern Analysis and Machine\n Intelligence, 2024"},{"id":"http://arxiv.org/abs/2401.09712v1","updated":"2024-01-18T04:10:20Z","published":"2024-01-18T04:10:20Z","title":"SkyEyeGPT: Unifying Remote Sensing Vision-Language Tasks via Instruction\n Tuning with Large Language Model","summary":" Large language models (LLMs) have recently been extended to the\nvision-language realm, obtaining impressive general multi-modal capabilities.\nHowever, the exploration of multi-modal large language models (MLLMs) for\nremote sensing (RS) data is still in its infancy, and the performance is not\nsatisfactory. In this work, we introduce SkyEyeGPT, a unified multi-modal large\nlanguage model specifically designed for RS vision-language understanding. To\nthis end, we meticulously curate an RS multi-modal instruction tuning dataset,\nincluding single-task and multi-task conversation instructions. After manual\nverification, we obtain a high-quality RS instruction-following dataset with\n968k samples. Our research demonstrates that with a simple yet effective\ndesign, SkyEyeGPT works surprisingly well on considerably different tasks\nwithout the need for extra encoding modules. 
Specifically, after projecting RS\nvisual features to the language domain via an alignment layer, they are fed\njointly with task-specific instructions into an LLM-based RS decoder to predict\nanswers for RS open-ended tasks. In addition, we design a two-stage tuning\nmethod to enhance instruction-following and multi-turn dialogue ability at\ndifferent granularities. Experiments on 8 datasets for RS vision-language tasks\ndemonstrate SkyEyeGPT's superiority in image-level and region-level tasks, such\nas captioning and visual grounding. In particular, SkyEyeGPT exhibits\nencouraging results compared to GPT-4V in some qualitative tests. The online\ndemo, code, and dataset will be released in\nhttps://github.com/ZhanYang-nwpu/SkyEyeGPT.\n","authors":["Yang Zhan","Zhitong Xiong","Yuan Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.09712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07778v5","updated":"2024-01-18T03:55:30Z","published":"2023-09-14T15:09:35Z","title":"Virchow: A Million-Slide Digital Pathology Foundation Model","summary":" The use of artificial intelligence to enable precision medicine and decision\nsupport systems through the analysis of pathology images has the potential to\nrevolutionize the diagnosis and treatment of cancer. Such applications will\ndepend on models' abilities to capture the diverse patterns observed in\npathology images. To address this challenge, we present Virchow, a foundation\nmodel for computational pathology. Using self-supervised learning empowered by\nthe DINOv2 algorithm, Virchow is a vision transformer model with 632 million\nparameters trained on 1.5 million hematoxylin and eosin stained whole slide\nimages from diverse tissue and specimen types, which is orders of magnitude\nmore data than previous works. The Virchow model enables the development of a\npan-cancer detection system with 0.949 overall specimen-level AUC across 17\ndifferent cancer types, while also achieving 0.937 AUC on 7 rare cancer types.\nThe Virchow model sets the state-of-the-art on the internal and external image\ntile level benchmarks and slide level biomarker prediction tasks. The gains in\nperformance highlight the importance of training on massive pathology image\ndatasets, suggesting scaling up the data and network architecture can improve\nthe accuracy for many high-impact computational pathology applications where\nlimited amounts of training data are available.\n","authors":["Eugene Vorontsov","Alican Bozkurt","Adam Casson","George Shaikovski","Michal Zelechowski","Siqi Liu","Kristen Severson","Eric Zimmermann","James Hall","Neil Tenenholtz","Nicolo Fusi","Philippe Mathieu","Alexander van Eck","Donghun Lee","Julian Viret","Eric Robert","Yi Kan Wang","Jeremy D. Kunz","Matthew C. H. Lee","Jan Bernhard","Ran A. Godrich","Gerard Oakley","Ewan Millar","Matthew Hanna","Juan Retamero","William A. Moye","Razik Yousfi","Christopher Kanan","David Klimstra","Brandon Rothrock","Thomas J. Fuchs"],"pdf_url":"https://arxiv.org/pdf/2309.07778v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09709v1","updated":"2024-01-18T03:41:38Z","published":"2024-01-18T03:41:38Z","title":"P2Seg: Pointly-supervised Segmentation via Mutual Distillation","summary":" Point-level Supervised Instance Segmentation (PSIS) aims to enhance the\napplicability and scalability of instance segmentation by utilizing low-cost\nyet instance-informative annotations. 
Existing PSIS methods usually rely on\npositional information to distinguish objects, but predicting precise\nboundaries remains challenging due to the lack of contour annotations.\nNevertheless, weakly supervised semantic segmentation methods are proficient in\nutilizing intra-class feature consistency to capture the boundary contours of\nthe same semantic regions. In this paper, we design a Mutual Distillation\nModule (MDM) to leverage the complementary strengths of both instance position\nand semantic information and achieve accurate instance-level object perception.\nThe MDM consists of Semantic to Instance (S2I) and Instance to Semantic (I2S).\nS2I is guided by the precise boundaries of semantic regions to learn the\nassociation between annotated points and instance contours. I2S leverages\ndiscriminative relationships between instances to facilitate the\ndifferentiation of various objects within the semantic map. Extensive\nexperiments substantiate the efficacy of MDM in fostering the synergy between\ninstance and semantic information, consequently improving the quality of\ninstance-level object representations. Our method achieves 55.7 mAP$_{50}$ and\n17.6 mAP on the PASCAL VOC and MS COCO datasets, significantly outperforming\nrecent PSIS methods and several box-supervised instance segmentation\ncompetitors.\n","authors":["Zipeng Wang","Xuehui Yu","Xumeng Han","Wenwen Yu","Zhixun Huang","Jianbin Jiao","Zhenjun Han"],"pdf_url":"https://arxiv.org/pdf/2401.09709v1.pdf","comment":"14 pages, 12 figures, published to ICLR2024"},{"id":"http://arxiv.org/abs/2301.00114v4","updated":"2024-01-18T03:20:19Z","published":"2022-12-31T04:11:25Z","title":"Skeletal Video Anomaly Detection using Deep Learning: Survey, Challenges\n and Future Directions","summary":" The existing methods for video anomaly detection mostly utilize videos\ncontaining identifiable facial and appearance-based features. The use of videos\nwith identifiable faces raises privacy concerns, especially when used in a\nhospital or community-based setting. Appearance-based features can also be\nsensitive to pixel-based noise, straining the anomaly detection methods to\nmodel the changes in the background and making it difficult to focus on the\nactions of humans in the foreground. Structural information in the form of\nskeletons describing the human motion in the videos is privacy-protecting and\ncan overcome some of the problems posed by appearance-based features. In this\npaper, we present a survey of privacy-protecting deep learning anomaly\ndetection methods using skeletons extracted from videos. We present a novel\ntaxonomy of algorithms based on the various learning approaches. We conclude\nthat skeleton-based approaches for anomaly detection can be a plausible\nprivacy-protecting alternative for video anomaly detection. Lastly, we identify\nmajor open research questions and provide guidelines to address them.\n","authors":["Pratik K. Mishra","Alex Mihailidis","Shehroz S. 
Khan"],"pdf_url":"https://arxiv.org/pdf/2301.00114v4.pdf","comment":"This work has been accepted by IEEE Transactions on Emerging Topics\n in Computational Intelligence"},{"id":"http://arxiv.org/abs/2401.09112v2","updated":"2024-01-18T03:19:53Z","published":"2024-01-17T10:26:17Z","title":"Stream Query Denoising for Vectorized HD Map Construction","summary":" To enhance perception performance in complex and extensive scenarios within\nthe realm of autonomous driving, there has been a noteworthy focus on temporal\nmodeling, with a particular emphasis on streaming methods. The prevailing trend\nin streaming models involves the utilization of stream queries for the\npropagation of temporal information. Despite the prevalence of this approach,\nthe direct application of the streaming paradigm to the construction of\nvectorized high-definition maps (HD-maps) fails to fully harness the inherent\npotential of temporal information. This paper introduces the Stream Query\nDenoising (SQD) strategy as a novel approach for temporal modeling in\nhigh-definition map (HD-map) construction. SQD is designed to facilitate the\nlearning of temporal consistency among map elements within the streaming model.\nThe methodology involves denoising the queries that have been perturbed by the\naddition of noise to the ground-truth information from the preceding frame.\nThis denoising process aims to reconstruct the ground-truth information for the\ncurrent frame, thereby simulating the prediction process inherent in stream\nqueries. The SQD strategy can be applied to those streaming methods (e.g.,\nStreamMapNet) to enhance the temporal modeling. The proposed SQD-MapNet is the\nStreamMapNet equipped with SQD. Extensive experiments on nuScenes and\nArgoverse2 show that our method is remarkably superior to other existing\nmethods across all settings of close range and long range. The code will be\navailable soon.\n","authors":["Shuo Wang","Fan Jia","Yingfei Liu","Yucheng Zhao","Zehui Chen","Tiancai Wang","Chi Zhang","Xiangyu Zhang","Feng Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.09112v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12379v2","updated":"2024-01-18T03:08:49Z","published":"2023-12-19T18:11:19Z","title":"Mixture of Cluster-conditional LoRA Experts for Vision-language\n Instruction Tuning","summary":" Instruction tuning of the Large Vision-language Models (LVLMs) has\nrevolutionized the development of versatile models with zero-shot\ngeneralization across a wide range of downstream vision-language tasks.\nHowever, diversity of training tasks of different sources and formats would\nlead to inevitable task conflicts, where different tasks conflicts for the same\nset of model parameters, resulting in sub-optimal instruction-following\nabilities. To address that, we propose the Mixture of Cluster-conditional LoRA\nExperts (MoCLE), a novel Mixture of Experts (MoE) architecture designed to\nactivate the task-customized model parameters based on the instruction\nclusters. A separate universal expert is further incorporated to improve the\ngeneralization capabilities of MoCLE for novel instructions. Extensive\nexperiments on 10 zero-shot tasks demonstrate the effectiveness of MoCLE.\n","authors":["Yunhao Gou","Zhili Liu","Kai Chen","Lanqing Hong","Hang Xu","Aoxue Li","Dit-Yan Yeung","James T. 
Kwok","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.12379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.06657v2","updated":"2024-01-18T02:10:39Z","published":"2023-01-17T01:46:45Z","title":"Free Lunch for Generating Effective Outlier Supervision","summary":" When deployed in practical applications, computer vision systems will\nencounter numerous unexpected images (\\emph{{i.e.}}, out-of-distribution data).\nDue to the potentially raised safety risks, these aforementioned unseen data\nshould be carefully identified and handled. Generally, existing approaches in\ndealing with out-of-distribution (OOD) detection mainly focus on the\nstatistical difference between the features of OOD and in-distribution (ID)\ndata extracted by the classifiers. Although many of these schemes have brought\nconsiderable performance improvements, reducing the false positive rate (FPR)\nwhen processing open-set images, they necessarily lack reliable theoretical\nanalysis and generalization guarantees. Unlike the observed ways, in this\npaper, we investigate the OOD detection problem based on the Bayes rule and\npresent a convincing description of the reason for failures encountered by\nconventional classifiers. Concretely, our analysis reveals that refining the\nprobability distribution yielded by the vanilla neural networks is necessary\nfor OOD detection, alleviating the issues of assigning high confidence to OOD\ndata. To achieve this effortlessly, we propose an ultra-effective method to\ngenerate near-realistic outlier supervision. Extensive experiments on\nlarge-scale benchmarks reveal that our proposed \\texttt{BayesAug} significantly\nreduces the FPR95 over 12.50\\% compared with the previous schemes, boosting the\nreliability of machine learning systems. The code will be made publicly\navailable.\n","authors":["Sen Pei","Jiaxi Sun","Richard Yi Da Xu","Bin Fan","Shiming Xiang","Gaofeng Meng"],"pdf_url":"https://arxiv.org/pdf/2301.06657v2.pdf","comment":"We have rewritten this paper, and published as \"Image Background\n Serves as Good Proxy for Out-of-distribution Data\" arXiv:2307.00519"},{"id":"http://arxiv.org/abs/2305.00163v2","updated":"2024-01-18T02:10:01Z","published":"2023-04-29T03:59:36Z","title":"Enhancing Video Super-Resolution via Implicit Resampling-based Alignment","summary":" In video super-resolution, it is common to use a frame-wise alignment to\nsupport the propagation of information over time. The role of alignment is\nwell-studied for low-level enhancement in video, but existing works overlook a\ncritical step -- resampling. We show through extensive experiments that for\nalignment to be effective, the resampling should preserve the reference\nfrequency spectrum while minimizing spatial distortions. However, most existing\nworks simply use a default choice of bilinear interpolation for resampling even\nthough bilinear interpolation has a smoothing effect and hinders\nsuper-resolution. From these observations, we propose an implicit\nresampling-based alignment. The sampling positions are encoded by a sinusoidal\npositional encoding, while the value is estimated with a coordinate network and\na window-based cross-attention. We show that bilinear interpolation inherently\nattenuates high-frequency information while an MLP-based coordinate network can\napproximate more frequencies. 
Experiments on synthetic and real-world datasets\nshow that alignment with our proposed implicit resampling enhances the\nperformance of state-of-the-art frameworks with minimal impact on both compute\nand parameters.\n","authors":["Kai Xu","Ziwei Yu","Xin Wang","Michael Bi Mi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2305.00163v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09677v1","updated":"2024-01-18T01:47:55Z","published":"2024-01-18T01:47:55Z","title":"Eye Motion Matters for 3D Face Reconstruction","summary":" Recent advances in single-image 3D face reconstruction have shown remarkable\nprogress in various applications. Nevertheless, prevailing techniques tend to\nprioritize the global facial contour and expression, often neglecting the\nnuanced dynamics of the eye region. In response, we introduce an Eye Landmark\nAdjustment Module, complemented by a Local Dynamic Loss, designed to capture\nthe dynamic features of the eyes area. Our module allows for flexible\nadjustment of landmarks, resulting in accurate recreation of various eye\nstates. In this paper, we present a comprehensive evaluation of our approach,\nconducting extensive experiments on two datasets. The results underscore the\nsuperior performance of our approach, highlighting its significant\ncontributions in addressing this particular challenge.\n","authors":["Xuan Wang","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09677v1.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.09673v1","updated":"2024-01-18T01:18:59Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) is widely adopted in computer vision to generate\nnew images with arbitrary styles. This process leverages neural networks to\nmerge aesthetic elements of a style image with the structural aspects of a\ncontent image into a harmoniously integrated visual result. However,\nunauthorized NST can exploit artwork. Such misuse raises socio-technical\nconcerns regarding artists' rights and motivates the development of technical\napproaches for the proactive protection of original creations. Adversarial\nattack is a concept primarily explored in machine learning security. Our work\nintroduces this technique to protect artists' intellectual property. In this\npaper Locally Adaptive Adversarial Color Attack (LAACA), a method for altering\nimages in a manner imperceptible to the human eyes but disruptive to NST.\nSpecifically, we design perturbations targeting image areas rich in\nhigh-frequency content, generated by disrupting intermediate features. Our\nexperiments and user study confirm that by attacking NST using the proposed\nmethod results in visually worse neural style transfer, thus making it an\neffective solution for visual artwork protection.\n","authors":["Zhongliang Guo","Kaixuan Wang","Weiye Li","Yifei Qian","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.09671v1","updated":"2024-01-18T01:07:00Z","published":"2024-01-18T01:07:00Z","title":"Towards Identifiable Unsupervised Domain Translation: A Diversified\n Distribution Matching Approach","summary":" Unsupervised domain translation (UDT) aims to find functions that convert\nsamples from one domain (e.g., sketches) to another domain (e.g., photos)\nwithout changing the high-level semantic meaning (also referred to as\n``content''). 
The translation functions are often sought by probability\ndistribution matching of the transformed source domain and target domain.\nCycleGAN stands as arguably the most representative approach among this line of\nwork. However, it was noticed in the literature that CycleGAN and variants\ncould fail to identify the desired translation functions and produce\ncontent-misaligned translations. This limitation arises due to the presence of\nmultiple translation functions -- referred to as ``measure-preserving\nautomorphism\" (MPA) -- in the solution space of the learning criteria. Despite\nawareness of such identifiability issues, solutions have remained elusive. This\nstudy delves into the core identifiability inquiry and introduces an MPA\nelimination theory. Our analysis shows that MPA is unlikely to exist, if\nmultiple pairs of diverse cross-domain conditional distributions are matched by\nthe learning function. Our theory leads to a UDT learner using distribution\nmatching over auxiliary variable-induced subsets of the domains -- other than\nover the entire data domains as in the classical approaches. The proposed\nframework is the first to rigorously establish translation identifiability\nunder reasonable UDT settings, to our best knowledge. Experiments corroborate\nwith our theoretical claims.\n","authors":["Sagar Shrestha","Xiao Fu"],"pdf_url":"https://arxiv.org/pdf/2401.09671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04938v2","updated":"2024-01-18T00:44:11Z","published":"2023-11-08T00:24:50Z","title":"Improved DDIM Sampling with Moment Matching Gaussian Mixtures","summary":" We propose using a Gaussian Mixture Model (GMM) as reverse transition\noperator (kernel) within the Denoising Diffusion Implicit Models (DDIM)\nframework, which is one of the most widely used approaches for accelerated\nsampling from pre-trained Denoising Diffusion Probabilistic Models (DDPM).\nSpecifically we match the first and second order central moments of the DDPM\nforward marginals by constraining the parameters of the GMM. We see that moment\nmatching is sufficient to obtain samples with equal or better quality than the\noriginal DDIM with Gaussian kernels. We provide experimental results with\nunconditional models trained on CelebAHQ and FFHQ and class-conditional models\ntrained on ImageNet datasets respectively. Our results suggest that using the\nGMM kernel leads to significant improvements in the quality of the generated\nsamples when the number of sampling steps is small, as measured by FID and IS\nmetrics. For example on ImageNet 256x256, using 10 sampling steps, we achieve a\nFID of 6.94 and IS of 207.85 with a GMM kernel compared to 10.15 and 196.73\nrespectively with a Gaussian kernel.\n","authors":["Prasad Gabbur"],"pdf_url":"https://arxiv.org/pdf/2311.04938v2.pdf","comment":"29 pages, 14 figures; Analysis of DDIM-GMM as a multimodal denoiser;\n Additional experiments on LSUN datasets and text-to-image generation with\n Stable Diffusion; Comparison with DPM-Solver; Ablations on GMM parameters;\n Updated equations with bold font for vectors and matrices"},{"id":"http://arxiv.org/abs/2401.10419v1","updated":"2024-01-18T23:10:08Z","published":"2024-01-18T23:10:08Z","title":"M3BUNet: Mobile Mean Max UNet for Pancreas Segmentation on CT-Scans","summary":" Segmenting organs in CT scan images is a necessary process for multiple\ndownstream medical image analysis tasks. 
Currently, manual CT scan segmentation\nby radiologists is prevalent, especially for organs like the pancreas, which\nrequires a high level of domain expertise for reliable segmentation due to\nfactors like small organ size, occlusion, and varying shapes. When resorting to\nautomated pancreas segmentation, these factors translate to limited reliable\nlabeled data to train effective segmentation models. Consequently, the\nperformance of contemporary pancreas segmentation models is still not within\nacceptable ranges. To improve that, we propose M3BUNet, a fusion of MobileNet\nand U-Net neural networks, equipped with a novel Mean-Max (MM) attention that\noperates in two stages to gradually segment pancreas CT images from coarse to\nfine with mask guidance for object detection. This approach empowers the\nnetwork to surpass segmentation performance achieved by similar network\narchitectures and achieve results that are on par with complex state-of-the-art\nmethods, all while maintaining a low parameter count. Additionally, we\nintroduce external contour segmentation as a preprocessing step for the coarse\nstage to assist in the segmentation process through image standardization. For\nthe fine segmentation stage, we found that applying a wavelet decomposition\nfilter to create multi-input images enhances pancreas segmentation performance.\nWe extensively evaluate our approach on the widely known NIH pancreas dataset\nand MSD pancreas dataset. Our approach demonstrates a considerable performance\nimprovement, achieving an average Dice Similarity Coefficient (DSC) value of up\nto 89.53% and an Intersection Over Union (IOU) score of up to 81.16 for the NIH\npancreas dataset, and 88.60% DSC and 79.90% IOU for the MSD Pancreas dataset.\n","authors":["Juwita juwita","Ghulam Mubashar Hassan","Naveed Akhtar","Amitava Datta"],"pdf_url":"https://arxiv.org/pdf/2401.10419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10416v1","updated":"2024-01-18T23:02:08Z","published":"2024-01-18T23:02:08Z","title":"DataViz3D: An Novel Method Leveraging Online Holographic Modeling for\n Extensive Dataset Preprocessing and Visualization","summary":" DataViz3D is an innovative online software that transforms complex datasets\ninto interactive 3D spatial models using holographic technology. This tool\nenables users to generate scatter plot within a 3D space, accurately mapped to\nthe XYZ coordinates of the dataset, providing a vivid and intuitive\nunderstanding of the spatial relationships inherent in the data. DataViz3D's\nuser friendly interface makes advanced 3D modeling and holographic\nvisualization accessible to a wide range of users, fostering new opportunities\nfor collaborative research and education across various disciplines.\n","authors":["Jinli Duan"],"pdf_url":"https://arxiv.org/pdf/2401.10416v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10404v1","updated":"2024-01-18T22:25:16Z","published":"2024-01-18T22:25:16Z","title":"Inflation with Diffusion: Efficient Temporal Adaptation for\n Text-to-Video Super-Resolution","summary":" We propose an efficient diffusion-based text-to-video super-resolution (SR)\ntuning approach that leverages the readily learned capacity of pixel level\nimage diffusion model to capture spatial information for video generation. 
To\naccomplish this goal, we design an efficient architecture by inflating the\nweightings of the text-to-image SR model into our video generation framework.\nAdditionally, we incorporate a temporal adapter to ensure temporal coherence\nacross video frames. We investigate different tuning approaches based on our\ninflated architecture and report trade-offs between computational costs and\nsuper-resolution quality. Empirical evaluation, both quantitative and\nqualitative, on the Shutterstock video dataset, demonstrates that our approach\nis able to perform text-to-video SR generation with good visual quality and\ntemporal consistency. To evaluate temporal coherence, we also present\nvisualizations in video format in\nhttps://drive.google.com/drive/folders/1YVc-KMSJqOrEUdQWVaI-Yfu8Vsfu_1aO?usp=sharing .\n","authors":["Xin Yuan","Jinoo Baek","Keyang Xu","Omer Tov","Hongliang Fei"],"pdf_url":"https://arxiv.org/pdf/2401.10404v1.pdf","comment":"WACV'24 workshop"},{"id":"http://arxiv.org/abs/2401.10402v1","updated":"2024-01-18T22:23:01Z","published":"2024-01-18T22:23:01Z","title":"Reconstructing the Invisible: Video Frame Restoration through Siamese\n Masked Conditional Variational Autoencoder","summary":" In the domain of computer vision, the restoration of missing information in\nvideo frames is a critical challenge, particularly in applications such as\nautonomous driving and surveillance systems. This paper introduces the Siamese\nMasked Conditional Variational Autoencoder (SiamMCVAE), leveraging a siamese\narchitecture with twin encoders based on vision transformers. This innovative\ndesign enhances the model's ability to comprehend lost content by capturing\nintrinsic similarities between paired frames. SiamMCVAE proficiently\nreconstructs missing elements in masked frames, effectively addressing issues\narising from camera malfunctions through variational inferences. Experimental\nresults robustly demonstrate the model's effectiveness in restoring missing\ninformation, thus enhancing the resilience of computer vision systems. The\nincorporation of Siamese Vision Transformer (SiamViT) encoders in SiamMCVAE\nexemplifies promising potential for addressing real-world challenges in\ncomputer vision, reinforcing the adaptability of autonomous systems in dynamic\nenvironments.\n","authors":["Yongchen Zhou","Richard Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.10402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10397v1","updated":"2024-01-18T22:10:46Z","published":"2024-01-18T22:10:46Z","title":"Analyzing and Mitigating Bias for Vulnerable Classes: Towards Balanced\n Representation in Dataset","summary":" The accuracy and fairness of perception systems in autonomous driving are\ncrucial, particularly for vulnerable road users. Mainstream research has looked\ninto improving the performance metrics for classification accuracy. However,\nthe hidden traits of bias inheritance in the AI models, class imbalances and\ndisparities in the datasets are often overlooked. In this context, our study\nexamines the class imbalances for vulnerable road users by focusing on class\ndistribution analysis, performance evaluation, and bias impact assessment. We\nidentify the concern of imbalances in class representation, leading to\npotential biases in detection accuracy. Utilizing popular CNN models and Vision\nTransformers (ViTs) with the nuScenes dataset, our performance evaluation\nreveals detection disparities for underrepresented classes. 
We propose a\nmethodology for model optimization and bias mitigation, which includes data\naugmentation, resampling, and metric-specific learning. Using the proposed\nmitigation approaches, we see improvement in IoU(%) and NDS(%) metrics from\n71.3 to 75.6 and 80.6 to 83.7 respectively, for the CNN model. Similarly, for\nViT, we observe improvement in IoU and NDS metrics from 74.9 to 79.2 and 83.8\nto 87.1 respectively. This research contributes to developing more reliable\nmodels and datasets, enhancing inclusiveness for minority classes.\n","authors":["Dewant Katare","David Solans Noguero","Souneil Park","Nicolas Kourtellis","Marijn Janssen","Aaron Yi Ding"],"pdf_url":"https://arxiv.org/pdf/2401.10397v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02506v3","updated":"2024-01-18T22:09:40Z","published":"2023-03-04T21:22:47Z","title":"Prismer: A Vision-Language Model with Multi-Task Experts","summary":" Recent vision-language models have shown impressive multi-modal generation\ncapabilities. However, typically they require training huge models on massive\ndatasets. As a more scalable alternative, we introduce Prismer, a data- and\nparameter-efficient vision-language model that leverages an ensemble of\ntask-specific experts. Prismer only requires training of a small number of\ncomponents, with the majority of network weights inherited from multiple\nreadily-available, pre-trained experts, and kept frozen during training. By\nleveraging experts from a wide range of domains, we show Prismer can\nefficiently pool this expert knowledge and adapt it to various vision-language\nreasoning tasks. In our experiments, we show that Prismer achieves fine-tuned\nand few-shot learning performance which is competitive with current\nstate-of-the-arts, whilst requiring up to two orders of magnitude less training\ndata. Code is available at https://github.com/NVlabs/prismer.\n","authors":["Shikun Liu","Linxi Fan","Edward Johns","Zhiding Yu","Chaowei Xiao","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2303.02506v3.pdf","comment":"Published at TMLR 2024. Project Page:\n https://shikun.io/projects/prismer Code: https://github.com/NVlabs/prismer"},{"id":"http://arxiv.org/abs/2401.10379v1","updated":"2024-01-18T21:04:25Z","published":"2024-01-18T21:04:25Z","title":"Agricultural Object Detection with You Look Only Once (YOLO) Algorithm:\n A Bibliometric and Systematic Literature Review","summary":" Vision is a major component in several digital technologies and tools used in\nagriculture. The object detector, You Look Only Once (YOLO), has gained\npopularity in agriculture in a relatively short span due to its\nstate-of-the-art performance. YOLO offers real-time detection with good\naccuracy and is implemented in various agricultural tasks, including\nmonitoring, surveillance, sensing, automation, and robotics. The research and\napplication of YOLO in agriculture are accelerating rapidly but are fragmented\nand multidisciplinary. Moreover, the performance characteristics (i.e.,\naccuracy, speed, computation) of the object detector influence the rate of\ntechnology implementation and adoption in agriculture. Thus, the study aims to\ncollect extensive literature to document and critically evaluate the advances\nand application of YOLO for agricultural object recognition. First, we\nconducted a bibliometric review of 257 articles to understand the scholarly\nlandscape of YOLO in agricultural domain. 
Secondly, we conducted a systematic\nreview of 30 articles to identify current knowledge, gaps, and modifications in\nYOLO for specific agricultural tasks. The study critically assesses and\nsummarizes the information on YOLO's end-to-end learning approach, including\ndata acquisition, processing, network modification, integration, and\ndeployment. We also discussed task-specific YOLO algorithm modification and\nintegration to meet the agricultural object or environment-specific challenges.\nIn general, YOLO-integrated digital tools and technologies show the potential\nfor real-time, automated monitoring, surveillance, and object handling to\nreduce labor, production cost, and environmental impact while maximizing\nresource efficiency. The study provides detailed documentation and\nsignificantly advances the existing knowledge on applying YOLO in agriculture,\nwhich can greatly benefit the scientific community.\n","authors":["Chetan M Badgujar","Alwin Poulose","Hao Gan"],"pdf_url":"https://arxiv.org/pdf/2401.10379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00067v2","updated":"2024-01-18T20:58:50Z","published":"2023-11-29T19:52:53Z","title":"Predicting breast cancer with AI for individual risk-adjusted MRI\n screening and early detection","summary":" Women with an increased life-time risk of breast cancer undergo supplemental\nannual screening MRI. We propose to predict the risk of developing breast\ncancer within one year based on the current MRI, with the objective of reducing\nscreening burden and facilitating early detection. An AI algorithm was\ndeveloped on 53,858 breasts from 12,694 patients who underwent screening or\ndiagnostic MRI and accrued over 12 years, with 2,331 confirmed cancers. A first\nU-Net was trained to segment lesions and identify regions of concern. A second\nconvolutional network was trained to detect malignant cancer using features\nextracted by the U-Net. This network was then fine-tuned to estimate the risk\nof developing cancer within a year in cases that radiologists considered normal\nor likely benign. Risk predictions from this AI were evaluated with a\nretrospective analysis of 9,183 breasts from a high-risk screening cohort,\nwhich were not used for training. Statistical analysis focused on the tradeoff\nbetween number of omitted exams versus negative predictive value, and number of\npotential early detections versus positive predictive value. The AI algorithm\nidentified regions of concern that coincided with future tumors in 52% of\nscreen-detected cancers. Upon directed review, a radiologist found that 71.3%\nof cancers had a visible correlate on the MRI prior to diagnosis, 65% of these\ncorrelates were identified by the AI model. Reevaluating these regions in 10%\nof all cases with higher AI-predicted risk could have resulted in up to 33%\nearly detections by a radiologist. Additionally, screening burden could have\nbeen reduced in 16% of lower-risk cases by recommending a later follow-up\nwithout compromising current interval cancer rate. With increasing datasets and\nimproving image quality we expect this new AI-aided, adaptive screening to\nmeaningfully reduce screening burden and improve early detection.\n","authors":["Lukas Hirsch","Yu Huang","Hernan A. Makse","Danny F. Martinez","Mary Hughes","Sarah Eskreis-Winkler","Katja Pinker","Elizabeth Morris","Lucas C. Parra","Elizabeth J. 
Sutton"],"pdf_url":"https://arxiv.org/pdf/2312.00067v2.pdf","comment":"Major revisions and rewriting in progress"},{"id":"http://arxiv.org/abs/2401.10373v1","updated":"2024-01-18T20:43:43Z","published":"2024-01-18T20:43:43Z","title":"Harmonized Spatial and Spectral Learning for Robust and Generalized\n Medical Image Segmentation","summary":" Deep learning has demonstrated remarkable achievements in medical image\nsegmentation. However, prevailing deep learning models struggle with poor\ngeneralization due to (i) intra-class variations, where the same class appears\ndifferently in different samples, and (ii) inter-class independence, resulting\nin difficulties capturing intricate relationships between distinct objects,\nleading to higher false negative cases. This paper presents a novel approach\nthat synergies spatial and spectral representations to enhance\ndomain-generalized medical image segmentation. We introduce the innovative\nSpectral Correlation Coefficient objective to improve the model's capacity to\ncapture middle-order features and contextual long-range dependencies. This\nobjective complements traditional spatial objectives by incorporating valuable\nspectral information. Extensive experiments reveal that optimizing this\nobjective with existing architectures like UNet and TransUNet significantly\nenhances generalization, interpretability, and noise robustness, producing more\nconfident predictions. For instance, in cardiac segmentation, we observe a 0.81\npp and 1.63 pp (pp = percentage point) improvement in DSC over UNet and\nTransUNet, respectively. Our interpretability study demonstrates that, in most\ntasks, objectives optimized with UNet outperform even TransUNet by introducing\nglobal contextual information alongside local details. These findings\nunderscore the versatility and effectiveness of our proposed method across\ndiverse imaging modalities and medical domains.\n","authors":["Vandan Gorade","Sparsh Mittal","Debesh Jha","Rekha Singhal","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2401.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14501v2","updated":"2024-01-18T20:26:45Z","published":"2023-03-25T15:42:27Z","title":"Link Prediction for Flow-Driven Spatial Networks","summary":" Link prediction algorithms aim to infer the existence of connections (or\nlinks) between nodes in network-structured data and are typically applied to\nrefine the connectivity among nodes. In this work, we focus on link prediction\nfor flow-driven spatial networks, which are embedded in a Euclidean space and\nrelate to physical exchange and transportation processes (e.g., blood flow in\nvessels or traffic flow in road networks). To this end, we propose the Graph\nAttentive Vectors (GAV) link prediction framework. GAV models simplified\ndynamics of physical flow in spatial networks via an attentive,\nneighborhood-aware message-passing paradigm, updating vector embeddings in a\nconstrained manner. We evaluate GAV on eight flow-driven spatial networks given\nby whole-brain vessel graphs and road networks. GAV demonstrates superior\nperformances across all datasets and metrics and outperformed the\nstate-of-the-art on the ogbl-vessel benchmark at the time of submission by 12%\n(98.38 vs. 87.98 AUC). All code is publicly available on GitHub.\n","authors":["Bastian Wittmann","Johannes C. 
Paetzold","Chinmay Prabhakar","Daniel Rueckert","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2303.14501v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16031v3","updated":"2024-01-18T19:59:42Z","published":"2022-03-30T03:32:05Z","title":"How Deep is Your Art: An Experimental Study on the Limits of Artistic\n Understanding in a Single-Task, Single-Modality Neural Network","summary":" Computational modeling of artwork meaning is complex and difficult. This is\nbecause art interpretation is multidimensional and highly subjective. This\npaper experimentally investigated the degree to which a state-of-the-art Deep\nConvolutional Neural Network (DCNN), a popular Machine Learning approach, can\ncorrectly distinguish modern conceptual art work into the galleries devised by\nart curators. Two hypotheses were proposed to state that the DCNN model uses\nExhibited Properties for classification, like shape and color, but not\nNon-Exhibited Properties, such as historical context and artist intention. The\ntwo hypotheses were experimentally validated using a methodology designed for\nthis purpose. VGG-11 DCNN pre-trained on ImageNet dataset and discriminatively\nfine-tuned was trained on handcrafted datasets designed from real-world\nconceptual photography galleries. Experimental results supported the two\nhypotheses showing that the DCNN model ignores Non-Exhibited Properties and\nuses only Exhibited Properties for artwork classification. This work points to\ncurrent DCNN limitations, which should be addressed by future DNN models.\n","authors":["Mahan Agha Zahedi","Niloofar Gholamrezaei","Alex Doboli"],"pdf_url":"https://arxiv.org/pdf/2203.16031v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10341v1","updated":"2024-01-18T19:09:47Z","published":"2024-01-18T19:09:47Z","title":"ELRT: Efficient Low-Rank Training for Compact Convolutional Neural\n Networks","summary":" Low-rank compression, a popular model compression technique that produces\ncompact convolutional neural networks (CNNs) with low rankness, has been\nwell-studied in the literature. On the other hand, low-rank training, as an\nalternative way to train low-rank CNNs from scratch, has been exploited little\nyet. Unlike low-rank compression, low-rank training does not need pre-trained\nfull-rank models, and the entire training phase is always performed on the\nlow-rank structure, bringing attractive benefits for practical applications.\nHowever, the existing low-rank training solutions still face several\nchallenges, such as a considerable accuracy drop and/or still needing to update\nfull-size models during the training. In this paper, we perform a systematic\ninvestigation on low-rank CNN training. By identifying the proper low-rank\nformat and performance-improving strategy, we propose ELRT, an efficient\nlow-rank training solution for high-accuracy, high-compactness, low-rank CNN\nmodels. Our extensive evaluation results for training various CNNs on different\ndatasets demonstrate the effectiveness of ELRT.\n","authors":["Yang Sui","Miao Yin","Yu Gong","Jinqi Xiao","Huy Phan","Bo Yuan"],"pdf_url":"https://arxiv.org/pdf/2401.10341v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.10225v1","updated":"2024-01-18T18:59:11Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Building GPT-4 Level Conversational QA Models","summary":" In this work, we introduce ChatQA, a family of conversational question\nanswering (QA) models, that obtain GPT-4 level accuracies. 
Specifically, we\npropose a two-stage instruction tuning method that can significantly improve\nthe zero-shot conversational QA results from large language models (LLMs). To\nhandle retrieval in conversational QA, we fine-tune a dense retriever on a\nmulti-turn QA dataset, which provides comparable results to using the\nstate-of-the-art query rewriting model while largely reducing deployment cost.\nNotably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10\nconversational QA datasets (54.14 vs. 53.90), without relying on any synthetic\ndata from OpenAI GPT models.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10184v1","updated":"2024-01-18T18:12:28Z","published":"2024-01-18T18:12:28Z","title":"Comparing Traditional and LLM-based Search for Image Geolocation","summary":" Web search engines have long served as indispensable tools for information\nretrieval; user behavior and query formulation strategies have been well\nstudied. The introduction of search engines powered by large language models\n(LLMs) suggested more conversational search and new types of query strategies.\nIn this paper, we compare traditional and LLM-based search for the task of\nimage geolocation, i.e., determining the location where an image was captured.\nOur work examines user interactions, with a particular focus on query\nformulation strategies. In our study, 60 participants were assigned either\ntraditional or LLM-based search engines as assistants for geolocation.\nParticipants using traditional search more accurately predicted the location of\nthe image compared to those using the LLM-based search. Distinct strategies\nemerged between users depending on the type of assistant. Participants using\nthe LLM-based search issued longer, more natural language queries, but had\nshorter search sessions. When reformulating their search queries, traditional\nsearch participants tended to add more terms to their initial queries, whereas\nparticipants using the LLM-based search consistently rephrased their initial\nqueries.\n","authors":["Albatool Wazzan","Stephen MacNeil","Richard Souvenir"],"pdf_url":"https://arxiv.org/pdf/2401.10184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.02631v2","updated":"2024-01-18T17:35:39Z","published":"2022-05-31T01:54:29Z","title":"A Survey on Modern Recommendation System based on Big Data","summary":" This survey provides an exhaustive exploration of the evolution and current\nstate of recommendation systems, which have seen widespread integration in\nvarious web applications. It focuses on the advancement of personalized\nrecommendation strategies for online products or services. We categorize\nrecommendation techniques into four primary types: content-based, collaborative\nfiltering-based, knowledge-based, and hybrid-based, each addressing unique\nscenarios. The survey offers a detailed examination of the historical context\nand the latest innovative approaches in recommendation systems, particularly\nthose employing big data. Additionally, it identifies and discusses key\nchallenges faced by modern recommendation systems, such as data sparsity,\nscalability issues, and the need for diversity in recommendations. 
The survey\nconcludes by highlighting these challenges as potential areas for fruitful\nfuture research in the field.\n","authors":["Anchen Sun","Yuanzhe Peng"],"pdf_url":"https://arxiv.org/pdf/2206.02631v2.pdf","comment":"10 pages, 8 figures, 1 table"},{"id":"http://arxiv.org/abs/2310.12086v2","updated":"2024-01-18T16:20:06Z","published":"2023-10-18T16:27:49Z","title":"FactCHD: Benchmarking Fact-Conflicting Hallucination Detection","summary":" Despite their impressive generative capabilities, LLMs are hindered by\nfact-conflicting hallucinations in real-world applications. The accurate\nidentification of hallucinations in texts generated by LLMs, especially in\ncomplex inferential scenarios, is a relatively unexplored area. To address this\ngap, we present FactCHD, a dedicated benchmark designed for the detection of\nfact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset\nthat spans various factuality patterns, including vanilla, multi-hop,\ncomparison, and set operation. A distinctive element of FactCHD is its\nintegration of fact-based evidence chains, significantly enhancing the depth of\nevaluating the detectors' explanations. Experiments on different LLMs expose\nthe shortcomings of current approaches in detecting factual errors accurately.\nFurthermore, we introduce Truth-Triangulator that synthesizes reflective\nconsiderations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming\nto yield more credible detection through the amalgamation of predictive results\nand evidence. The benchmark dataset is available at\nhttps://github.com/zjunlp/FactCHD.\n","authors":["Xiang Chen","Duanzheng Song","Honghao Gui","Chenxi Wang","Ningyu Zhang","Jiang Yong","Fei Huang","Chengfei Lv","Dan Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.12086v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2304.09048v2","updated":"2024-01-18T16:14:35Z","published":"2023-04-18T15:12:34Z","title":"CodeKGC: Code Language Model for Generative Knowledge Graph Construction","summary":" Current generative knowledge graph construction approaches usually fail to\ncapture structural knowledge by simply flattening natural language into\nserialized texts or a specification language. However, large generative\nlanguage model trained on structured data such as code has demonstrated\nimpressive capability in understanding natural language for structural\nprediction and reasoning tasks. Intuitively, we address the task of generative\nknowledge graph construction with code language model: given a code-format\nnatural language input, the target is to generate triples which can be\nrepresented as code completion tasks. Specifically, we develop schema-aware\nprompts that effectively utilize the semantic structure within the knowledge\ngraph. As code inherently possesses structure, such as class and function\ndefinitions, it serves as a useful model for prior semantic structural\nknowledge. Furthermore, we employ a rationale-enhanced generation method to\nboost the performance. Rationales provide intermediate steps, thereby improving\nknowledge extraction abilities. Experimental results indicate that the proposed\napproach can obtain better performance on benchmark datasets compared with\nbaselines. 
Code and datasets are available in\nhttps://github.com/zjunlp/DeepKE/tree/main/example/llm.\n","authors":["Zhen Bi","Jing Chen","Yinuo Jiang","Feiyu Xiong","Wei Guo","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.09048v2.pdf","comment":"ACM Transactions on Asian and Low-Resource Language Information\n Processing"},{"id":"http://arxiv.org/abs/2401.10036v1","updated":"2024-01-18T15:00:01Z","published":"2024-01-18T15:00:01Z","title":"LOCALINTEL: Generating Organizational Threat Intelligence from Global\n and Local Cyber Knowledge","summary":" Security Operations Center (SoC) analysts gather threat reports from openly\naccessible global threat databases and customize them manually to suit a\nparticular organization's needs. These analysts also depend on internal\nrepositories, which act as private local knowledge database for an\norganization. Credible cyber intelligence, critical operational details, and\nrelevant organizational information are all stored in these local knowledge\ndatabases. Analysts undertake a labor intensive task utilizing these global and\nlocal knowledge databases to manually create organization's unique threat\nresponse and mitigation strategies. Recently, Large Language Models (LLMs) have\nshown the capability to efficiently process large diverse knowledge sources. We\nleverage this ability to process global and local knowledge databases to\nautomate the generation of organization-specific threat intelligence.\n In this work, we present LOCALINTEL, a novel automated knowledge\ncontextualization system that, upon prompting, retrieves threat reports from\nthe global threat repositories and uses its local knowledge database to\ncontextualize them for a specific organization. LOCALINTEL comprises of three\nkey phases: global threat intelligence retrieval, local knowledge retrieval,\nand contextualized completion generation. The former retrieves intelligence\nfrom global threat repositories, while the second retrieves pertinent knowledge\nfrom the local knowledge database. Finally, the fusion of these knowledge\nsources is orchestrated through a generator to produce a contextualized\ncompletion.\n","authors":["Shaswata Mitra","Subash Neupane","Trisha Chakraborty","Sudip Mittal","Aritran Piplai","Manas Gaur","Shahram Rahimi"],"pdf_url":"https://arxiv.org/pdf/2401.10036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09945v1","updated":"2024-01-18T12:47:13Z","published":"2024-01-18T12:47:13Z","title":"HGAttack: Transferable Heterogeneous Graph Adversarial Attack","summary":" Heterogeneous Graph Neural Networks (HGNNs) are increasingly recognized for\ntheir performance in areas like the web and e-commerce, where resilience\nagainst adversarial attacks is crucial. However, existing adversarial attack\nmethods, which are primarily designed for homogeneous graphs, fall short when\napplied to HGNNs due to their limited ability to address the structural and\nsemantic complexity of HGNNs. This paper introduces HGAttack, the first\ndedicated gray box evasion attack method for heterogeneous graphs. We design a\nnovel surrogate model to closely resemble the behaviors of the target HGNN and\nutilize gradient-based methods for perturbation generation. Specifically, the\nproposed surrogate model effectively leverages heterogeneous information by\nextracting meta-path induced subgraphs and applying GNNs to learn node\nembeddings with distinct semantics from each subgraph. 
This approach improves\nthe transferability of generated attacks on the target HGNN and significantly\nreduces memory costs. For perturbation generation, we introduce a\nsemantics-aware mechanism that leverages subgraph gradient information to\nautonomously identify vulnerable edges across a wide range of relations within\na constrained perturbation budget. We validate HGAttack's efficacy with\ncomprehensive experiments on three datasets, providing empirical analyses of\nits generated perturbations. Outperforming baseline methods, HGAttack\ndemonstrated significant efficacy in diminishing the performance of target HGNN\nmodels, affirming the effectiveness of our approach in evaluating the\nrobustness of HGNNs against adversarial attacks.\n","authors":["He Zhao","Zhiwei Zeng","Yongwei Wang","Deheng Ye","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2401.09945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09885v1","updated":"2024-01-18T10:56:27Z","published":"2024-01-18T10:56:27Z","title":"Source Code Clone Detection Using Unsupervised Similarity Measures","summary":" Assessing similarity in source code has gained significant attention in\nrecent years due to its importance in software engineering tasks such as clone\ndetection and code search and recommendation. This work presents a comparative\nanalysis of unsupervised similarity measures for identifying source code clone\ndetection. The goal is to overview the current state-of-the-art techniques,\ntheir strengths, and weaknesses. To do that, we compile the existing\nunsupervised strategies and evaluate their performance on a benchmark dataset\nto guide software engineers in selecting appropriate methods for their specific\nuse cases. The source code of this study is available at\n\\url{https://github.com/jorge-martinez-gil/codesim}\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2401.09885v1.pdf","comment":"Accepted for publication as Full Paper in the Software Quality Days\n 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2401.09839v1","updated":"2024-01-18T09:54:18Z","published":"2024-01-18T09:54:18Z","title":"MatSciRE: Leveraging Pointer Networks to Automate Entity and Relation\n Extraction for Material Science Knowledge-base Construction","summary":" Material science literature is a rich source of factual information about\nvarious categories of entities (like materials and compositions) and various\nrelations between these entities, such as conductivity, voltage, etc.\nAutomatically extracting this information to generate a material science\nknowledge base is a challenging task. In this paper, we propose MatSciRE\n(Material Science Relation Extractor), a Pointer Network-based encoder-decoder\nframework, to jointly extract entities and relations from material science\narticles as a triplet ($entity1, relation, entity2$). Specifically, we target\nthe battery materials and identify five relations to work on - conductivity,\ncoulombic efficiency, capacity, voltage, and energy. Our proposed approach\nachieved a much better F1-score (0.771) than a previous attempt using\nChemDataExtractor (0.716). The overall graphical framework of MatSciRE is shown\nin Fig 1. 
The material information is extracted from material science\nliterature in the form of entity-relation triplets using MatSciRE.\n","authors":["Ankan Mullick","Akash Ghosh","G Sai Chaitanya","Samir Ghui","Tapas Nayak","Seung-Cheol Lee","Satadeep Bhattacharjee","Pawan Goyal"],"pdf_url":"https://arxiv.org/pdf/2401.09839v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09725v1","updated":"2024-01-18T05:00:18Z","published":"2024-01-18T05:00:18Z","title":"Enhancing Image-Text Matching with Adaptive Feature Aggregation","summary":" Image-text matching aims to find matched cross-modal pairs accurately. While\ncurrent methods often rely on projecting cross-modal features into a common\nembedding space, they frequently suffer from imbalanced feature representations\nacross different modalities, leading to unreliable retrieval results. To\naddress these limitations, we introduce a novel Feature Enhancement Module that\nadaptively aggregates single-modal features for more balanced and robust\nimage-text retrieval. Additionally, we propose a new loss function that\novercomes the shortcomings of original triplet ranking loss, thereby\nsignificantly improving retrieval performance. The proposed model has been\nevaluated on two public datasets and achieves competitive retrieval performance\nwhen compared with several state-of-the-art models. Implementation codes can be\nfound here.\n","authors":["Zuhui Wang","Yunting Yin","I. V. Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2401.09725v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.09693v1","updated":"2024-01-18T02:48:06Z","published":"2024-01-18T02:48:06Z","title":"EfficientRec an unlimited user-item scale recommendation system based on\n clustering and users interaction embedding profile","summary":" Recommendation systems are highly interested in technology companies\nnowadays. The businesses are constantly growing users and products, causing the\nnumber of users and items to continuously increase over time, to very large\nnumbers. Traditional recommendation algorithms with complexity dependent on the\nnumber of users and items make them difficult to adapt to the industrial\nenvironment. In this paper, we introduce a new method applying graph neural\nnetworks with a contrastive learning framework in extracting user preferences.\nWe incorporate a soft clustering architecture that significantly reduces the\ncomputational cost of the inference process. Experiments show that the model is\nable to learn user preferences with low computational cost in both training and\nprediction phases. At the same time, the model gives a very good accuracy. We\ncall this architecture EfficientRec with the implication of model compactness\nand the ability to scale to unlimited users and products.\n","authors":["Vu Hong Quan","Le Hoang Ngan","Le Minh Duc","Nguyen Tran Ngoc Linh","Hoang Quynh-Le"],"pdf_url":"https://arxiv.org/pdf/2401.09693v1.pdf","comment":"Published in 14th Asian Conference on Intelligent Information and\n Database Systems (ACIIDS), 2022"},{"id":"http://arxiv.org/abs/2309.11623v2","updated":"2024-01-18T22:37:48Z","published":"2023-09-20T20:21:13Z","title":"Leveraging Negative Signals with Self-Attention for Sequential Music\n Recommendation","summary":" Music streaming services heavily rely on their recommendation engines to\ncontinuously provide content to their consumers. 
Sequential recommendation\nconsequently has seen considerable attention in current literature, where state\nof the art approaches focus on self-attentive models leveraging contextual\ninformation such as long and short-term user history and item features;\nhowever, most of these studies focus on long-form content domains (retail,\nmovie, etc.) rather than short-form, such as music. Additionally, many do not\nexplore incorporating negative session-level feedback during training. In this\nstudy, we investigate the use of transformer-based self-attentive architectures\nto learn implicit session-level information for sequential music\nrecommendation. We additionally propose a contrastive learning task to\nincorporate negative feedback (e.g skipped tracks) to promote positive hits and\npenalize negative hits. This task is formulated as a simple loss term that can\nbe incorporated into a variety of deep learning architectures for sequential\nrecommendation. Our experiments show that this results in consistent\nperformance gains over the baseline architectures ignoring negative user\nfeedback.\n","authors":["Pavan Seshadri","Peter Knees"],"pdf_url":"https://arxiv.org/pdf/2309.11623v2.pdf","comment":"Accepted to the 1st Workshop on Music Recommender Systems, co-located\n with the 17th ACM Conference on Recommender Systems (MuRS @ RecSys 2023)"},{"id":"http://arxiv.org/abs/2401.10316v1","updated":"2024-01-18T18:59:55Z","published":"2024-01-18T18:59:55Z","title":"Improving One-class Recommendation with Multi-tasking on Various\n Preference Intensities","summary":" In the one-class recommendation problem, it's required to make\nrecommendations basing on users' implicit feedback, which is inferred from\ntheir action and inaction. Existing works obtain representations of users and\nitems by encoding positive and negative interactions observed from training\ndata. However, these efforts assume that all positive signals from implicit\nfeedback reflect a fixed preference intensity, which is not realistic.\nConsequently, representations learned with these methods usually fail to\ncapture informative entity features that reflect various preference\nintensities.\n In this paper, we propose a multi-tasking framework taking various preference\nintensities of each signal from implicit feedback into consideration.\nRepresentations of entities are required to satisfy the objective of each\nsubtask simultaneously, making them more robust and generalizable. Furthermore,\nwe incorporate attentive graph convolutional layers to explore high-order\nrelationships in the user-item bipartite graph and dynamically capture the\nlatent tendencies of users toward the items they interact with. Experimental\nresults show that our method performs better than state-of-the-art methods by a\nlarge margin on three large-scale real-world benchmark datasets.\n","authors":["Chu-Jen Shao","Hao-Ming Fu","Pu-Jen Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.10316v1.pdf","comment":"RecSys 2020 (ACM Conference on Recommender Systems 2020)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.10227v1","updated":"2024-01-18T18:59:19Z","published":"2024-01-18T18:59:19Z","title":"A Simple Latent Diffusion Approach for Panoptic Segmentation and Mask\n Inpainting","summary":" Panoptic and instance segmentation networks are often trained with\nspecialized object detection modules, complex loss functions, and ad-hoc\npost-processing steps to handle the permutation-invariance of the instance\nmasks. 
This work builds upon Stable Diffusion and proposes a latent diffusion\napproach for panoptic segmentation, resulting in a simple architecture which\nomits these complexities. Our training process consists of two steps: (1)\ntraining a shallow autoencoder to project the segmentation masks to latent\nspace; (2) training a diffusion model to allow image-conditioned sampling in\nlatent space. The use of a generative model unlocks the exploration of mask\ncompletion or inpainting, which has applications in interactive segmentation.\nThe experimental validation yields promising results for both panoptic\nsegmentation and mask inpainting. While not setting a new state-of-the-art, our\nmodel's simplicity, generality, and mask completion capability are desirable\nproperties.\n","authors":["Wouter Van Gansbeke","Bert De Brabandere"],"pdf_url":"https://arxiv.org/pdf/2401.10227v1.pdf","comment":"Code: https://github.com/segments-ai/latent-diffusion-segmentation"},{"id":"http://arxiv.org/abs/2401.10225v1","updated":"2024-01-18T18:59:11Z","published":"2024-01-18T18:59:11Z","title":"ChatQA: Building GPT-4 Level Conversational QA Models","summary":" In this work, we introduce ChatQA, a family of conversational question\nanswering (QA) models, that obtain GPT-4 level accuracies. Specifically, we\npropose a two-stage instruction tuning method that can significantly improve\nthe zero-shot conversational QA results from large language models (LLMs). To\nhandle retrieval in conversational QA, we fine-tune a dense retriever on a\nmulti-turn QA dataset, which provides comparable results to using the\nstate-of-the-art query rewriting model while largely reducing deployment cost.\nNotably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10\nconversational QA datasets (54.14 vs. 53.90), without relying on any synthetic\ndata from OpenAI GPT models.\n","authors":["Zihan Liu","Wei Ping","Rajarshi Roy","Peng Xu","Mohammad Shoeybi","Bryan Catanzaro"],"pdf_url":"https://arxiv.org/pdf/2401.10225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10220v1","updated":"2024-01-18T18:58:49Z","published":"2024-01-18T18:58:49Z","title":"AutoFT: Robust Fine-Tuning by Optimizing Hyperparameters on OOD Data","summary":" Foundation models encode rich representations that can be adapted to a\ndesired task by fine-tuning on task-specific data. However, fine-tuning a model\non one particular data distribution often compromises the model's original\nperformance on other distributions. Current methods for robust fine-tuning\nutilize hand-crafted regularization techniques to constrain the fine-tuning\nprocess towards the base foundation model. Yet, it is hard to precisely specify\nwhat characteristics of the foundation model to retain during fine-tuning, as\nthis depends on how the pre-training, fine-tuning, and evaluation data\ndistributions relate to each other. We propose AutoFT, a data-driven approach\nfor guiding foundation model fine-tuning. AutoFT optimizes fine-tuning\nhyperparameters to maximize performance on a small out-of-distribution (OOD)\nvalidation set. To guide fine-tuning in a granular way, AutoFT searches a\nhighly expressive hyperparameter space that includes weight coefficients for\nmany different losses, in addition to learning rate and weight decay values. We\nevaluate AutoFT on nine natural distribution shifts which include domain shifts\nand subpopulation shifts. 
Our experiments show that AutoFT significantly\nimproves generalization to new OOD data, outperforming existing robust\nfine-tuning methods. Notably, AutoFT achieves new state-of-the-art performance\non the WILDS-iWildCam and WILDS-FMoW benchmarks, outperforming the previous\nbest methods by $6.0\\%$ and $1.5\\%$, respectively.\n","authors":["Caroline Choi","Yoonho Lee","Annie Chen","Allan Zhou","Aditi Raghunathan","Chelsea Finn"],"pdf_url":"https://arxiv.org/pdf/2401.10220v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2401.10216v1","updated":"2024-01-18T18:57:10Z","published":"2024-01-18T18:57:10Z","title":"Enabling Efficient Equivariant Operations in the Fourier Basis via Gaunt\n Tensor Products","summary":" Developing equivariant neural networks for the E(3) group plays an important\nrole in modeling 3D data across real-world applications. Enforcing this\nequivariance primarily involves the tensor products of irreducible\nrepresentations (irreps). However, the computational complexity of such\noperations increases significantly as higher-order tensors are used. In this\nwork, we propose a systematic approach to substantially accelerate the\ncomputation of the tensor products of irreps. We mathematically connect the\ncommonly used Clebsch-Gordan coefficients to the Gaunt coefficients, which are\nintegrals of products of three spherical harmonics. Through Gaunt coefficients,\nthe tensor product of irreps becomes equivalent to the multiplication between\nspherical functions represented by spherical harmonics. This perspective\nfurther allows us to change the basis for the equivariant operations from\nspherical harmonics to a 2D Fourier basis. Consequently, the multiplication\nbetween spherical functions represented by a 2D Fourier basis can be\nefficiently computed via the convolution theorem and Fast Fourier Transforms.\nThis transformation reduces the complexity of full tensor products of irreps\nfrom $\\mathcal{O}(L^6)$ to $\\mathcal{O}(L^3)$, where $L$ is the max degree of\nirreps. Leveraging this approach, we introduce the Gaunt Tensor Product, which\nserves as a new method to construct efficient equivariant operations across\ndifferent model architectures. Our experiments on the Open Catalyst Project and\n3BPA datasets demonstrate both the increased efficiency and improved\nperformance of our approach.\n","authors":["Shengjie Luo","Tianlang Chen","Aditi S. Krishnapriyan"],"pdf_url":"https://arxiv.org/pdf/2401.10216v1.pdf","comment":"36 pages; ICLR 2024 (Spotlight Presentation); Code:\n https://github.com/lsj2408/Gaunt-Tensor-Product"},{"id":"http://arxiv.org/abs/2401.10207v1","updated":"2024-01-18T18:45:29Z","published":"2024-01-18T18:45:29Z","title":"Eclectic Rule Extraction for Explainability of Deep Neural Network based\n Intrusion Detection Systems","summary":" This paper addresses trust issues created from the ubiquity of black box\nalgorithms and surrogate explainers in Explainable Intrusion Detection Systems\n(X-IDS). While Explainable Artificial Intelligence (XAI) aims to enhance\ntransparency, black box surrogate explainers, such as Local Interpretable\nModel-Agnostic Explanation (LIME) and SHapley Additive exPlanation (SHAP), are\ndifficult to trust. The black box nature of these surrogate explainers makes\nthe process behind explanation generation opaque and difficult to understand.\nTo avoid this problem, one can use transparent white box algorithms such as\nRule Extraction (RE). 
There are three types of RE algorithms: pedagogical,\ndecompositional, and eclectic. Pedagogical methods offer fast but untrustworthy\nwhite-box explanations, while decompositional RE provides trustworthy\nexplanations with poor scalability. This work explores eclectic rule\nextraction, which strikes a balance between scalability and trustworthiness. By\ncombining techniques from pedagogical and decompositional approaches, eclectic\nrule extraction leverages the advantages of both, while mitigating some of\ntheir drawbacks. The proposed Hybrid X-IDS architecture features eclectic RE as\na white box surrogate explainer for black box Deep Neural Networks (DNN). The\npresented eclectic RE algorithm extracts human-readable rules from hidden\nlayers, facilitating explainable and trustworthy rulesets. Evaluations on\nUNSW-NB15 and CIC-IDS-2017 datasets demonstrate the algorithm's ability to\ngenerate rulesets with 99.9% accuracy, mimicking DNN outputs. The contributions\nof this work include the hybrid X-IDS architecture, the eclectic rule\nextraction algorithm applicable to intrusion detection datasets, and a thorough\nanalysis of performance and explainability, demonstrating the trade-offs\ninvolved in rule extraction speed and accuracy.\n","authors":["Jesse Ables","Nathaniel Childers","William Anderson","Sudip Mittal","Shahram Rahimi","Ioana Banicescu","Maria Seale"],"pdf_url":"https://arxiv.org/pdf/2401.10207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.07580v2","updated":"2024-01-18T18:42:28Z","published":"2023-02-15T10:43:31Z","title":"Unboxing Tree Ensembles for interpretability: a hierarchical\n visualization tool and a multivariate optimal re-built tree","summary":" The interpretability of models has become a crucial issue in Machine Learning\nbecause of algorithmic decisions' growing impact on real-world applications.\nTree ensemble methods, such as Random Forests or XgBoost, are powerful learning\ntools for classification tasks. However, while combining multiple trees may\nprovide higher prediction quality than a single one, it sacrifices the\ninterpretability property resulting in \"black-box\" models. In light of this, we\naim to develop an interpretable representation of a tree-ensemble model that\ncan provide valuable insights into its behavior. First, given a target\ntree-ensemble model, we develop a hierarchical visualization tool based on a\nheatmap representation of the forest's feature use, considering the frequency\nof a feature and the level at which it is selected as an indicator of\nimportance. Next, we propose a mixed-integer linear programming (MILP)\nformulation for constructing a single optimal multivariate tree that accurately\nmimics the target model predictions. The goal is to provide an interpretable\nsurrogate model based on oblique hyperplane splits, which uses only the most\nrelevant features according to the defined forest's importance indicators. The\nMILP model includes a penalty on feature selection based on their frequency in\nthe forest to further induce sparsity of the splits. The natural formulation\nhas been strengthened to improve the computational performance of\n{mixed-integer} software. 
Computational experience is carried out on benchmark\ndatasets from the UCI repository using a state-of-the-art off-the-shelf solver.\nResults show that the proposed model is effective in yielding a shallow\ninterpretable tree approximating the tree-ensemble decision function.\n","authors":["Giulia Di Teodoro","Marta Monaci","Laura Palagi"],"pdf_url":"https://arxiv.org/pdf/2302.07580v2.pdf","comment":"44 pages, 9 figures, 20 tables"},{"id":"http://arxiv.org/abs/2309.16467v2","updated":"2024-01-18T18:25:38Z","published":"2023-09-28T14:33:20Z","title":"Compositional Program Generation for Few-Shot Systematic Generalization","summary":" Compositional generalization is a key ability of humans that enables us to\nlearn new concepts from only a handful examples. Neural machine learning\nmodels, including the now ubiquitous Transformers, struggle to generalize in\nthis way, and typically require thousands of examples of a concept during\ntraining in order to generalize meaningfully. This difference in ability\nbetween humans and artificial neural architectures, motivates this study on a\nneuro-symbolic architecture called the Compositional Program Generator (CPG).\nCPG has three key features: \\textit{modularity}, \\textit{composition}, and\n\\textit{abstraction}, in the form of grammar rules, that enable it to\ngeneralize both systematically to new concepts in a few-shot manner, as well as\nproductively by length on various sequence-to-sequence language tasks. For each\ninput, CPG uses a grammar of the input language and a parser to generate a\nparse in which each grammar rule is assigned its own unique semantic module, a\nprobabilistic copy or substitution program. Instances with the same parse are\nalways processed with the same composed modules, while those with different\nparses may be processed with different modules. CPG learns parameters for the\nmodules and is able to learn the semantics for new rules and types\nincrementally, without forgetting or retraining on rules it's already seen. It\nachieves perfect generalization on both the SCAN and COGS benchmarks using just\n14 examples for SCAN and 22 examples for COGS -- state-of-the-art accuracy with\na 1000x improvement in sample efficiency.\n","authors":["Tim Klinger","Luke Liu","Soham Dan","Maxwell Crouse","Parikshit Ram","Alexander Gray"],"pdf_url":"https://arxiv.org/pdf/2309.16467v2.pdf","comment":"7 pages of text with 1 page of references"},{"id":"http://arxiv.org/abs/2401.10191v1","updated":"2024-01-18T18:25:29Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. 
Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v1.pdf","comment":"Accepted to ICLR2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2401.10190v1","updated":"2024-01-18T18:23:10Z","published":"2024-01-18T18:23:10Z","title":"A Kaczmarz-inspired approach to accelerate the optimization of neural\n network wavefunctions","summary":" Neural network wavefunctions optimized using the variational Monte Carlo\nmethod have been shown to produce highly accurate results for the electronic\nstructure of atoms and small molecules, but the high cost of optimizing such\nwavefunctions prevents their application to larger systems. We propose the\nSubsampled Projected-Increment Natural Gradient Descent (SPRING) optimizer to\nreduce this bottleneck. SPRING combines ideas from the recently introduced\nminimum-step stochastic reconfiguration optimizer (MinSR) and the classical\nrandomized Kaczmarz method for solving linear least-squares problems. We\ndemonstrate that SPRING outperforms both MinSR and the popular\nKronecker-Factored Approximate Curvature method (KFAC) across a number of small\natoms and molecules, given that the learning rates of all methods are optimally\ntuned. For example, on the oxygen atom, SPRING attains chemical accuracy after\nforty thousand training iterations, whereas both MinSR and KFAC fail to do so\neven after one hundred thousand iterations.\n","authors":["Gil Goldshlager","Nilin Abrahamsen","Lin Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10189v1","updated":"2024-01-18T18:20:15Z","published":"2024-01-18T18:20:15Z","title":"Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through\n Text Reconstruction","summary":" Fine-grained few-shot entity extraction in the chemical domain faces two\nunique challenges. First, compared with entity extraction tasks in the general\ndomain, sentences from chemical papers usually contain more entities. Moreover,\nentity extraction models usually have difficulty extracting entities of\nlong-tailed types. In this paper, we propose Chem-FINESE, a novel\nsequence-to-sequence (seq2seq) based few-shot entity extraction approach, to\naddress these two challenges. Our Chem-FINESE has two components: a seq2seq\nentity extractor to extract named entities from the input sentence and a\nseq2seq self-validation module to reconstruct the original input sentence from\nextracted entities. Inspired by the fact that a good entity extraction system\nneeds to extract entities faithfully, our new self-validation module leverages\nentity extraction results to reconstruct the original input sentence. Besides,\nwe design a new contrastive loss to reduce excessive copying during the\nextraction process. Finally, we release ChemNER+, a new fine-grained chemical\nentity extraction dataset that is annotated by domain experts with the ChemNER\nschema. 
Experiments in few-shot settings with both ChemNER+ and CHEMET datasets\nshow that our newly proposed framework has contributed up to 8.26% and 6.84%\nabsolute F1-score gains respectively.\n","authors":["Qingyun Wang","Zixuan Zhang","Hongxiang Li","Xuan Liu","Jiawei Han","Heng Ji","Huimin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10189v1.pdf","comment":"16 pages. Accepted by Findings of the Association for Computational\n Linguistics: EACL 2024. Code and resources are available at\n https://github.com/EagleW/Chem-FINESE"},{"id":"http://arxiv.org/abs/2308.05021v3","updated":"2024-01-18T18:18:59Z","published":"2023-08-09T15:31:17Z","title":"On Error Propagation of Diffusion Models","summary":" Although diffusion models (DMs) have shown promising performances in a number\nof tasks (e.g., speech synthesis and image generation), they might suffer from\nerror propagation because of their sequential structure. However, this is not\ncertain because some sequential models, such as Conditional Random Field (CRF),\nare free from this problem. To address this issue, we develop a theoretical\nframework to mathematically formulate error propagation in the architecture of\nDMs, The framework contains three elements, including modular error, cumulative\nerror, and propagation equation. The modular and cumulative errors are related\nby the equation, which interprets that DMs are indeed affected by error\npropagation. Our theoretical study also suggests that the cumulative error is\nclosely related to the generation quality of DMs. Based on this finding, we\napply the cumulative error as a regularization term to reduce error\npropagation. Because the term is computationally intractable, we derive its\nupper bound and design a bootstrap algorithm to efficiently estimate the bound\nfor optimization. We have conducted extensive experiments on multiple image\ndatasets, showing that our proposed regularization reduces error propagation,\nsignificantly improves vanilla DMs, and outperforms previous baselines.\n","authors":["Yangming Li","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2308.05021v3.pdf","comment":"Accepted by ICLR-2024"},{"id":"http://arxiv.org/abs/2309.14068v3","updated":"2024-01-18T18:16:33Z","published":"2023-09-25T12:03:32Z","title":"Soft Mixture Denoising: Beyond the Expressive Bottleneck of Diffusion\n Models","summary":" Because diffusion models have shown impressive performances in a number of\ntasks, such as image synthesis, there is a trend in recent works to prove (with\ncertain assumptions) that these models have strong approximation capabilities.\nIn this paper, we show that current diffusion models actually have an\nexpressive bottleneck in backward denoising and some assumption made by\nexisting theoretical guarantees is too strong. Based on this finding, we prove\nthat diffusion models have unbounded errors in both local and global denoising.\nIn light of our theoretical studies, we introduce soft mixture denoising (SMD),\nan expressive and efficient model for backward denoising. SMD not only permits\ndiffusion models to well approximate any Gaussian mixture distributions in\ntheory, but also is simple and efficient for implementation. 
Our experiments on\nmultiple image datasets show that SMD significantly improves different types of\ndiffusion models (e.g., DDPM), espeically in the situation of few backward\niterations.\n","authors":["Yangming Li","Boris van Breugel","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2309.14068v3.pdf","comment":"Accepted by ICLR-2024"},{"id":"http://arxiv.org/abs/2401.10185v1","updated":"2024-01-18T18:12:35Z","published":"2024-01-18T18:12:35Z","title":"Transfer Learning in Human Activity Recognition: A Survey","summary":" Sensor-based human activity recognition (HAR) has been an active research\narea, owing to its applications in smart environments, assisted living,\nfitness, healthcare, etc. Recently, deep learning based end-to-end training has\nresulted in state-of-the-art performance in domains such as computer vision and\nnatural language, where large amounts of annotated data are available. However,\nlarge quantities of annotated data are not available for sensor-based HAR.\nMoreover, the real-world settings on which the HAR is performed differ in terms\nof sensor modalities, classification tasks, and target users. To address this\nproblem, transfer learning has been employed extensively. In this survey, we\nfocus on these transfer learning methods in the application domains of smart\nhome and wearables-based HAR. In particular, we provide a problem-solution\nperspective by categorizing and presenting the works in terms of their\ncontributions and the challenges they address. We also present an updated view\nof the state-of-the-art for both application domains. Based on our analysis of\n205 papers, we highlight the gaps in the literature and provide a roadmap for\naddressing them. This survey provides a reference to the HAR community, by\nsummarizing the existing works and providing a promising research agenda.\n","authors":["Sourish Gunesh Dhekane","Thomas Ploetz"],"pdf_url":"https://arxiv.org/pdf/2401.10185v1.pdf","comment":"40 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2401.10176v1","updated":"2024-01-18T18:05:35Z","published":"2024-01-18T18:05:35Z","title":"Comprehensive OOD Detection Improvements","summary":" As machine learning becomes increasingly prevalent in impactful decisions,\nrecognizing when inference data is outside the model's expected input\ndistribution is paramount for giving context to predictions.\nOut-of-distribution (OOD) detection methods have been created for this task.\nSuch methods can be split into representation-based or logit-based methods from\nwhether they respectively utilize the model's embeddings or predictions for OOD\ndetection. In contrast to most papers which solely focus on one such group, we\naddress both. We employ dimensionality reduction on feature embeddings in\nrepresentation-based methods for both time speedups and improved performance.\nAdditionally, we propose DICE-COL, a modification of the popular logit-based\nmethod Directed Sparsification (DICE) that resolves an unnoticed flaw. We\ndemonstrate the effectiveness of our methods on the OpenOODv1.5 benchmark\nframework, where they significantly improve performance and set\nstate-of-the-art results.\n","authors":["Anish Lakkapragada","Amol Khanna","Edward Raff","Nathan Inkawhich"],"pdf_url":"https://arxiv.org/pdf/2401.10176v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19776v3","updated":"2024-01-18T17:53:45Z","published":"2023-10-30T17:45:32Z","title":"Learn to Categorize or Categorize to Learn? 
Self-Coding for Generalized\n Category Discovery","summary":" In the quest for unveiling novel categories at test time, we confront the\ninherent limitations of traditional supervised recognition models that are\nrestricted by a predefined category set. While strides have been made in the\nrealms of self-supervised and open-world learning towards test-time category\ndiscovery, a crucial yet often overlooked question persists: what exactly\ndelineates a category? In this paper, we conceptualize a category through the\nlens of optimization, viewing it as an optimal solution to a well-defined\nproblem. Harnessing this unique conceptualization, we propose a novel,\nefficient and self-supervised method capable of discovering previously unknown\ncategories at test time. A salient feature of our approach is the assignment of\nminimum length category codes to individual data instances, which encapsulates\nthe implicit category hierarchy prevalent in real-world datasets. This\nmechanism affords us enhanced control over category granularity, thereby\nequipping our model to handle fine-grained categories adeptly. Experimental\nevaluations, bolstered by state-of-the-art benchmark comparisons, testify to\nthe efficacy of our solution in managing unknown categories at test time.\nFurthermore, we fortify our proposition with a theoretical foundation,\nproviding proof of its optimality. Our code is available at\nhttps://github.com/SarahRastegar/InfoSieve.\n","authors":["Sarah Rastegar","Hazel Doughty","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2310.19776v3.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.10149v1","updated":"2024-01-18T17:22:22Z","published":"2024-01-18T17:22:22Z","title":"Multi-Agent Reinforcement Learning for Maritime Operational Technology\n Cyber Security","summary":" This paper demonstrates the potential for autonomous cyber defence to be\napplied on industrial control systems and provides a baseline environment to\nfurther explore Multi-Agent Reinforcement Learning's (MARL) application to this\nproblem domain. It introduces a simulation environment, IPMSRL, of a generic\nIntegrated Platform Management System (IPMS) and explores the use of MARL for\nautonomous cyber defence decision-making on generic maritime based IPMS\nOperational Technology (OT). OT cyber defensive actions are less mature than\nthey are for Enterprise IT. This is due to the relatively brittle nature of OT\ninfrastructure originating from the use of legacy systems, design-time\nengineering assumptions, and lack of full-scale modern security controls. There\nare many obstacles to be tackled across the cyber landscape due to continually\nincreasing cyber-attack sophistication and the limitations of traditional\nIT-centric cyber defence solutions. Traditional IT controls are rarely deployed\non OT infrastructure, and where they are, some threats aren't fully addressed.\nIn our experiments, a shared critic implementation of Multi Agent Proximal\nPolicy Optimisation (MAPPO) outperformed Independent Proximal Policy\nOptimisation (IPPO). MAPPO reached an optimal policy (episode outcome mean of\n1) after 800K timesteps, whereas IPPO was only able to reach an episode outcome\nmean of 0.966 after one million timesteps. Hyperparameter tuning greatly\nimproved training performance. Across one million timesteps the tuned\nhyperparameters reached an optimal policy whereas the default hyperparameters\nonly managed to win sporadically, with most simulations resulting in a draw. 
We\ntested a real-world constraint, attack detection alert success, and found that\nwhen alert success probability is reduced to 0.75 or 0.9, the MARL defenders\nwere still able to win in over 97.5% or 99.5% of episodes, respectively.\n","authors":["Alec Wilson","Ryan Menzies","Neela Morarji","David Foster","Marco Casassa Mont","Esin Turkbeyler","Lisa Gralewski"],"pdf_url":"https://arxiv.org/pdf/2401.10149v1.pdf","comment":"13 pages, 7 figures, Proceedings of the Conference on Applied Machine\n Learning in Information Security 2023 (CAMLIS)"},{"id":"http://arxiv.org/abs/2401.10148v1","updated":"2024-01-18T17:22:11Z","published":"2024-01-18T17:22:11Z","title":"Explicitly Disentangled Representations in Object-Centric Learning","summary":" Extracting structured representations from raw visual data is an important\nand long-standing challenge in machine learning. Recently, techniques for\nunsupervised learning of object-centric representations have raised growing\ninterest. In this context, enhancing the robustness of the latent features can\nimprove the efficiency and effectiveness of the training of downstream tasks. A\npromising step in this direction is to disentangle the factors that cause\nvariation in the data. Previously, Invariant Slot Attention disentangled\nposition, scale, and orientation from the remaining features. Extending this\napproach, we focus on separating the shape and texture components. In\nparticular, we propose a novel architecture that biases object-centric models\ntoward disentangling shape and texture components into two non-overlapping\nsubsets of the latent space dimensions. These subsets are known a priori, hence\nbefore the training process. Experiments on a range of object-centric\nbenchmarks reveal that our approach achieves the desired disentanglement while\nalso numerically improving baseline performance in most cases. In addition, we\nshow that our method can generate novel textures for a specific object or\ntransfer textures between objects with distinct shapes.\n","authors":["Riccardo Majellaro","Jonathan Collu","Aske Plaat","Thomas M. Moerland"],"pdf_url":"https://arxiv.org/pdf/2401.10148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15188v4","updated":"2024-01-18T17:21:42Z","published":"2023-09-26T18:49:30Z","title":"ICML 2023 Topological Deep Learning Challenge : Design and Results","summary":" This paper presents the computational challenge on topological deep learning\nthat was hosted within the ICML 2023 Workshop on Topology and Geometry in\nMachine Learning. The competition asked participants to provide open-source\nimplementations of topological neural networks from the literature by\ncontributing to the python packages TopoNetX (data processing) and TopoModelX\n(deep learning). The challenge attracted twenty-eight qualifying submissions in\nits two-month duration. This paper describes the design of the challenge and\nsummarizes its main findings.\n","authors":["Mathilde Papillon","Mustafa Hajij","Helen Jenne","Johan Mathe","Audun Myers","Theodore Papamarkou","Tolga Birdal","Tamal Dey","Tim Doster","Tegan Emerson","Gurusankar Gopalakrishnan","Devendra Govil","Aldo Guzmán-Sáenz","Henry Kvinge","Neal Livesay","Soham Mukherjee","Shreyas N. 
Samaga","Karthikeyan Natesan Ramamurthy","Maneel Reddy Karri","Paul Rosen","Sophia Sanborn","Robin Walters","Jens Agerberg","Sadrodin Barikbin","Claudio Battiloro","Gleb Bazhenov","Guillermo Bernardez","Aiden Brent","Sergio Escalera","Simone Fiorellino","Dmitrii Gavrilev","Mohammed Hassanin","Paul Häusner","Odin Hoff Gardaa","Abdelwahed Khamis","Manuel Lecha","German Magai","Tatiana Malygina","Rubén Ballester","Kalyan Nadimpalli","Alexander Nikitin","Abraham Rabinowitz","Alessandro Salatiello","Simone Scardapane","Luca Scofano","Suraj Singh","Jens Sjölund","Pavel Snopov","Indro Spinelli","Lev Telyatnikov","Lucia Testa","Maosheng Yang","Yixiao Yue","Olga Zaghen","Ali Zia","Nina Miolane"],"pdf_url":"https://arxiv.org/pdf/2309.15188v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09172v3","updated":"2024-01-18T17:13:21Z","published":"2023-04-18T17:59:45Z","title":"Hyperbolic Image-Text Representations","summary":" Visual and linguistic concepts naturally organize themselves in a hierarchy,\nwhere a textual concept \"dog\" entails all images that contain dogs. Despite\nbeing intuitive, current large-scale vision and language models such as CLIP do\nnot explicitly capture such hierarchy. We propose MERU, a contrastive model\nthat yields hyperbolic representations of images and text. Hyperbolic spaces\nhave suitable geometric properties to embed tree-like data, so MERU can better\ncapture the underlying hierarchy in image-text datasets. Our results show that\nMERU learns a highly interpretable and structured representation space while\nbeing competitive with CLIP's performance on standard multi-modal tasks like\nimage classification and image-text retrieval. Our code and models are\navailable at https://www.github.com/facebookresearch/meru\n","authors":["Karan Desai","Maximilian Nickel","Tanmay Rajpurohit","Justin Johnson","Ramakrishna Vedantam"],"pdf_url":"https://arxiv.org/pdf/2304.09172v3.pdf","comment":"ICML 2023 (v3: Add link to code in abstract)"},{"id":"http://arxiv.org/abs/2401.10134v1","updated":"2024-01-18T17:03:59Z","published":"2024-01-18T17:03:59Z","title":"Spatial-Temporal Large Language Model for Traffic Prediction","summary":" Traffic prediction, a critical component for intelligent transportation\nsystems, endeavors to foresee future traffic at specific locations using\nhistorical data. Although existing traffic prediction models often emphasize\ndeveloping complex neural network structures, their accuracy has not seen\nimprovements accordingly. Recently, Large Language Models (LLMs) have shown\noutstanding capabilities in time series analysis. Differing from existing\nmodels, LLMs progress mainly through parameter expansion and extensive\npre-training while maintaining their fundamental structures. In this paper, we\npropose a Spatial-Temporal Large Language Model (ST-LLM) for traffic\nprediction. Specifically, ST-LLM redefines the timesteps at each location as\ntokens and incorporates a spatial-temporal embedding module to learn the\nspatial location and global temporal representations of tokens. Then these\nrepresentations are fused to provide each token with unified spatial and\ntemporal information. Furthermore, we propose a novel partially frozen\nattention strategy of the LLM, which is designed to capture spatial-temporal\ndependencies for traffic prediction. 
Comprehensive experiments on real traffic\ndatasets offer evidence that ST-LLM outperforms state-of-the-art models.\nNotably, the ST-LLM also exhibits robust performance in both few-shot and\nzero-shot prediction scenarios.\n","authors":["Chenxi Liu","Sun Yang","Qianxiong Xu","Zhishuai Li","Cheng Long","Ziyue Li","Rui Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10134v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20496v2","updated":"2024-01-18T16:51:21Z","published":"2023-10-31T14:34:00Z","title":"BasisFormer: Attention-based Time Series Forecasting with Learnable and\n Interpretable Basis","summary":" Bases have become an integral part of modern deep learning-based models for\ntime series forecasting due to their ability to act as feature extractors or\nfuture references. To be effective, a basis must be tailored to the specific\nset of time series data and exhibit distinct correlation with each time series\nwithin the set. However, current state-of-the-art methods are limited in their\nability to satisfy both of these requirements simultaneously. To address this\nchallenge, we propose BasisFormer, an end-to-end time series forecasting\narchitecture that leverages learnable and interpretable bases. This\narchitecture comprises three components: First, we acquire bases through\nadaptive self-supervised learning, which treats the historical and future\nsections of the time series as two distinct views and employs contrastive\nlearning. Next, we design a Coef module that calculates the similarity\ncoefficients between the time series and bases in the historical view via\nbidirectional cross-attention. Finally, we present a Forecast module that\nselects and consolidates the bases in the future view based on the similarity\ncoefficients, resulting in accurate future predictions. Through extensive\nexperiments on six datasets, we demonstrate that BasisFormer outperforms\nprevious state-of-the-art methods by 11.04\\% and 15.78\\% respectively for\nunivariate and multivariate forecasting tasks. Code is available at:\n\\url{https://github.com/nzl5116190/Basisformer}\n","authors":["Zelin Ni","Hang Yu","Shizhan Liu","Jianguo Li","Weiyao Lin"],"pdf_url":"https://arxiv.org/pdf/2310.20496v2.pdf","comment":"NeurIPS 2023(poster)"},{"id":"http://arxiv.org/abs/2401.10119v1","updated":"2024-01-18T16:50:55Z","published":"2024-01-18T16:50:55Z","title":"Towards Principled Graph Transformers","summary":" Graph learning architectures based on the k-dimensional Weisfeiler-Leman\n(k-WL) hierarchy offer a theoretically well-understood expressive power.\nHowever, such architectures often fail to deliver solid predictive performance\non real-world tasks, limiting their practical impact. In contrast, global\nattention-based models such as graph transformers demonstrate strong\nperformance in practice, but comparing their expressive power with the k-WL\nhierarchy remains challenging, particularly since these architectures rely on\npositional or structural encodings for their expressivity and predictive\nperformance. To address this, we show that the recently proposed Edge\nTransformer, a global attention model operating on node pairs instead of nodes,\nhas at least 3-WL expressive power. 
Empirically, we demonstrate that the Edge\nTransformer surpasses other theoretically aligned architectures regarding\npredictive performance while not relying on positional or structural encodings.\n","authors":["Luis Müller","Christopher Morris"],"pdf_url":"https://arxiv.org/pdf/2401.10119v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10359v4","updated":"2024-01-18T16:47:43Z","published":"2023-11-17T07:25:18Z","title":"FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel\n Identification","summary":" Highly parallelized workloads like machine learning training, inferences and\ngeneral HPC tasks are greatly accelerated using GPU devices. In a cloud\ncomputing cluster, serving a GPU's computation power through multi-tasks\nsharing is highly demanded since there are always more task requests than the\nnumber of GPU available. Existing GPU sharing solutions focus on reducing\ntask-level waiting time or task-level switching costs when multiple jobs\ncompeting for a single GPU. Non-stopped computation requests come with\ndifferent priorities, having non-symmetric impact on QoS for sharing a GPU\ndevice. Existing work missed the kernel-level optimization opportunity brought\nby this setting. To address this problem, we present a novel kernel-level\nscheduling strategy called FIKIT: Filling Inter-kernel Idle Time. FIKIT\nincorporates task-level priority information, fine-grained kernel\nidentification, and kernel measurement, allowing low priorities task's\nexecution during high priority task's inter-kernel idle time. Thereby, filling\nthe GPU's device runtime fully, and reduce overall GPU sharing impact to cloud\nservices. Across a set of ML models, the FIKIT based inference system\naccelerated high priority tasks by 1.33 to 14.87 times compared to the JCT in\nGPU sharing mode, and more than half of the cases are accelerated by more than\n3.5 times. Alternatively, under preemptive sharing, the low-priority tasks have\na comparable to default GPU sharing mode JCT, with a 0.84 to 1 times ratio. We\nfurther limit the kernel measurement and runtime fine-grained kernel scheduling\noverhead to less than 5%.\n","authors":["Wenqing Wu"],"pdf_url":"https://arxiv.org/pdf/2311.10359v4.pdf","comment":"20 pages, 20 figures. Update the Abstract on the arXiv page"},{"id":"http://arxiv.org/abs/2306.04961v2","updated":"2024-01-18T16:47:33Z","published":"2023-06-08T06:35:47Z","title":"Recovering Simultaneously Structured Data via Non-Convex Iteratively\n Reweighted Least Squares","summary":" We propose a new algorithm for the problem of recovering data that adheres to\nmultiple, heterogeneous low-dimensional structures from linear observations.\nFocusing on data matrices that are simultaneously row-sparse and low-rank, we\npropose and analyze an iteratively reweighted least squares (IRLS) algorithm\nthat is able to leverage both structures. In particular, it optimizes a\ncombination of non-convex surrogates for row-sparsity and rank, a balancing of\nwhich is built into the algorithm. We prove locally quadratic convergence of\nthe iterates to a simultaneously structured data matrix in a regime of minimal\nsample complexity (up to constants and a logarithmic factor), which is known to\nbe impossible for a combination of convex surrogates. In experiments, we show\nthat the IRLS method exhibits favorable empirical convergence, identifying\nsimultaneously row-sparse and low-rank matrices from fewer measurements than\nstate-of-the-art methods. 
Code is available at\nhttps://github.com/ckuemmerle/simirls.\n","authors":["Christian Kümmerle","Johannes Maly"],"pdf_url":"https://arxiv.org/pdf/2306.04961v2.pdf","comment":"35 pages, 7 figures"},{"id":"http://arxiv.org/abs/2305.17198v2","updated":"2024-01-18T16:25:38Z","published":"2023-05-26T18:43:16Z","title":"A Model-Based Solution to the Offline Multi-Agent Reinforcement Learning\n Coordination Problem","summary":" Training multiple agents to coordinate is an essential problem with\napplications in robotics, game theory, economics, and social sciences. However,\nmost existing Multi-Agent Reinforcement Learning (MARL) methods are online and\nthus impractical for real-world applications in which collecting new\ninteractions is costly or dangerous. While these algorithms should leverage\noffline data when available, doing so gives rise to what we call the offline\ncoordination problem. Specifically, we identify and formalize the strategy\nagreement (SA) and the strategy fine-tuning (SFT) coordination challenges, two\nissues at which current offline MARL algorithms fail. Concretely, we reveal\nthat the prevalent model-free methods are severely deficient and cannot handle\ncoordination-intensive offline multi-agent tasks in either toy or MuJoCo\ndomains. To address this setback, we emphasize the importance of inter-agent\ninteractions and propose the very first model-based offline MARL method. Our\nresulting algorithm, Model-based Offline Multi-Agent Proximal Policy\nOptimization (MOMA-PPO) generates synthetic interaction data and enables agents\nto converge on a strategy while fine-tuning their policies accordingly. This\nsimple model-based solution solves the coordination-intensive offline tasks,\nsignificantly outperforming the prevalent model-free methods even under severe\npartial observability and with learned world models.\n","authors":["Paul Barde","Jakob Foerster","Derek Nowrouzezahrai","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12086v2","updated":"2024-01-18T16:20:06Z","published":"2023-10-18T16:27:49Z","title":"FactCHD: Benchmarking Fact-Conflicting Hallucination Detection","summary":" Despite their impressive generative capabilities, LLMs are hindered by\nfact-conflicting hallucinations in real-world applications. The accurate\nidentification of hallucinations in texts generated by LLMs, especially in\ncomplex inferential scenarios, is a relatively unexplored area. To address this\ngap, we present FactCHD, a dedicated benchmark designed for the detection of\nfact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset\nthat spans various factuality patterns, including vanilla, multi-hop,\ncomparison, and set operation. A distinctive element of FactCHD is its\nintegration of fact-based evidence chains, significantly enhancing the depth of\nevaluating the detectors' explanations. Experiments on different LLMs expose\nthe shortcomings of current approaches in detecting factual errors accurately.\nFurthermore, we introduce Truth-Triangulator that synthesizes reflective\nconsiderations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming\nto yield more credible detection through the amalgamation of predictive results\nand evidence. 
The benchmark dataset is available at\nhttps://github.com/zjunlp/FactCHD.\n","authors":["Xiang Chen","Duanzheng Song","Honghao Gui","Chenxi Wang","Ningyu Zhang","Jiang Yong","Fei Huang","Chengfei Lv","Dan Zhang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.12086v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.10107v1","updated":"2024-01-18T16:18:18Z","published":"2024-01-18T16:18:18Z","title":"Comparison analysis between standard polysomnographic data and\n in-ear-EEG signals: A preliminary study","summary":" Study Objectives: Polysomnography (PSG) currently serves as the benchmark for\nevaluating sleep disorders. Its discomfort, impracticality for home-use, and\nintroduction of bias in sleep quality assessment necessitate the exploration of\nless invasive, cost-effective, and portable alternatives. One promising\ncontender is the in-ear-EEG sensor, which offers advantages in terms of\ncomfort, fixed electrode positions, resistance to electromagnetic interference,\nand user-friendliness. This study aims to establish a methodology to assess the\nsimilarity between the in-ear-EEG signal and standard PSG.\n Methods: We assess the agreement between the PSG and in-ear-EEG derived\nhypnograms. We extract features in the time- and frequency- domain from PSG and\nin-ear-EEG 30-second epochs. We only consider the epochs where the PSG-scorers\nand the in-ear-EEG-scorers were in agreement. We introduce a methodology to\nquantify the similarity between PSG derivations and the single-channel\nin-ear-EEG. The approach relies on a comparison of distributions of selected\nfeatures -- extracted for each sleep stage and subject on both PSG and the\nin-ear-EEG signals -- via a Jensen-Shannon Divergence Feature-based Similarity\nIndex (JSD-FSI).\n Results: We found a high intra-scorer variability, mainly due to the\nuncertainty the scorers had in evaluating the in-ear-EEG signals. We show that\nthe similarity between PSG and in-ear-EEG signals is high (JSD-FSI: 0.61 +/-\n0.06 in awake, 0.60 +/- 0.07 in NREM and 0.51 +/- 0.08 in REM), and in line\nwith the similarity values computed independently on standard\nPSG-channel-combinations.\n Conclusions: In-ear-EEG is a valuable solution for home-based sleep\nmonitoring, however further studies with a larger and more heterogeneous\ndataset are needed.\n","authors":["Gianpaolo Palo","Luigi Fiorillo","Giuliana Monachino","Michal Bechny","Mark Melnykowycz","Athina Tzovara","Valentina Agostini","Francesca Dalia Faraci"],"pdf_url":"https://arxiv.org/pdf/2401.10107v1.pdf","comment":"29 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2304.09048v2","updated":"2024-01-18T16:14:35Z","published":"2023-04-18T15:12:34Z","title":"CodeKGC: Code Language Model for Generative Knowledge Graph Construction","summary":" Current generative knowledge graph construction approaches usually fail to\ncapture structural knowledge by simply flattening natural language into\nserialized texts or a specification language. However, large generative\nlanguage model trained on structured data such as code has demonstrated\nimpressive capability in understanding natural language for structural\nprediction and reasoning tasks. Intuitively, we address the task of generative\nknowledge graph construction with code language model: given a code-format\nnatural language input, the target is to generate triples which can be\nrepresented as code completion tasks. 
Specifically, we develop schema-aware\nprompts that effectively utilize the semantic structure within the knowledge\ngraph. As code inherently possesses structure, such as class and function\ndefinitions, it serves as a useful model for prior semantic structural\nknowledge. Furthermore, we employ a rationale-enhanced generation method to\nboost the performance. Rationales provide intermediate steps, thereby improving\nknowledge extraction abilities. Experimental results indicate that the proposed\napproach can obtain better performance on benchmark datasets compared with\nbaselines. Code and datasets are available in\nhttps://github.com/zjunlp/DeepKE/tree/main/example/llm.\n","authors":["Zhen Bi","Jing Chen","Yinuo Jiang","Feiyu Xiong","Wei Guo","Huajun Chen","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.09048v2.pdf","comment":"ACM Transactions on Asian and Low-Resource Language Information\n Processing"},{"id":"http://arxiv.org/abs/2401.10095v1","updated":"2024-01-18T16:05:00Z","published":"2024-01-18T16:05:00Z","title":"Learning shallow quantum circuits","summary":" Despite fundamental interests in learning quantum circuits, the existence of\na computationally efficient algorithm for learning shallow quantum circuits\nremains an open question. Because shallow quantum circuits can generate\ndistributions that are classically hard to sample from, existing learning\nalgorithms do not apply. In this work, we present a polynomial-time classical\nalgorithm for learning the description of any unknown $n$-qubit shallow quantum\ncircuit $U$ (with arbitrary unknown architecture) within a small diamond\ndistance using single-qubit measurement data on the output states of $U$. We\nalso provide a polynomial-time classical algorithm for learning the description\nof any unknown $n$-qubit state $\\lvert \\psi \\rangle = U \\lvert 0^n \\rangle$\nprepared by a shallow quantum circuit $U$ (on a 2D lattice) within a small\ntrace distance using single-qubit measurements on copies of $\\lvert \\psi\n\\rangle$. Our approach uses a quantum circuit representation based on local\ninversions and a technique to combine these inversions. This circuit\nrepresentation yields an optimization landscape that can be efficiently\nnavigated and enables efficient learning of quantum circuits that are\nclassically hard to simulate.\n","authors":["Hsin-Yuan Huang","Yunchao Liu","Michael Broughton","Isaac Kim","Anurag Anshu","Zeph Landau","Jarrod R. McClean"],"pdf_url":"https://arxiv.org/pdf/2401.10095v1.pdf","comment":"10 pages, 14 figures (7 inline; 7 floating) + 76-page appendix"},{"id":"http://arxiv.org/abs/2306.00788v3","updated":"2024-01-18T16:00:30Z","published":"2023-06-01T15:18:55Z","title":"Understanding Augmentation-based Self-Supervised Representation Learning\n via RKHS Approximation and Regression","summary":" Data augmentation is critical to the empirical success of modern\nself-supervised representation learning, such as contrastive learning and\nmasked language modeling. However, a theoretical understanding of the exact\nrole of augmentation remains limited. Recent work has built the connection\nbetween self-supervised learning and the approximation of the top eigenspace of\na graph Laplacian operator, suggesting that learning a linear probe atop such\nrepresentation can be connected to RKHS regression. 
Building on this insight,\nthis work delves into a statistical analysis of augmentation-based pretraining.\nStarting from the isometry property, a geometric characterization of the target\nfunction given by the augmentation, we disentangle the effects of the model and\nthe augmentation, and prove two generalization bounds that are free of model\ncomplexity. Our first bound works for an arbitrary encoder, where the\nprediction error is decomposed as the sum of an estimation error incurred by\nfitting a linear probe with RKHS regression, and an approximation error\nentailed by RKHS approximation. Our second bound specifically addresses the\ncase where the encoder is near-optimal, that is it approximates the top-d\neigenspace of the RKHS induced by the augmentation. A key ingredient in our\nanalysis is the augmentation complexity, which we use to quantitatively compare\ndifferent augmentations and analyze their impact on downstream performance.\n","authors":["Runtian Zhai","Bingbin Liu","Andrej Risteski","Zico Kolter","Pradeep Ravikumar"],"pdf_url":"https://arxiv.org/pdf/2306.00788v3.pdf","comment":"ICLR 2024 spotlight. 34 pages"},{"id":"http://arxiv.org/abs/2305.18417v2","updated":"2024-01-18T15:50:01Z","published":"2023-05-28T19:07:55Z","title":"Determinantal Point Process Attention Over Grid Cell Code Supports Out\n of Distribution Generalization","summary":" Deep neural networks have made tremendous gains in emulating human-like\nintelligence, and have been used increasingly as ways of understanding how the\nbrain may solve the complex computational problems on which this relies.\nHowever, these still fall short of, and therefore fail to provide insight into\nhow the brain supports strong forms of generalization of which humans are\ncapable. One such case is out-of-distribution (OOD) generalization-successful\nperformance on test examples that lie outside the distribution of the training\nset. Here, we identify properties of processing in the brain that may\ncontribute to this ability. We describe a two-part algorithm that draws on\nspecific features of neural computation to achieve OOD generalization, and\nprovide a proof of concept by evaluating performance on two challenging\ncognitive tasks. First we draw on the fact that the mammalian brain represents\nmetric spaces using grid cell code (e.g., in entorhinal cortex): abstract\nrepresentations of relational structure, organized in recurring motifs that\ncover the representational space. Second, we propose an attentional mechanism\nthat operates over the grid cell code using Determinantal Point Process (DPP),\nthat we call DPP attention (DPP-A) -- a transformation that ensures maximum\nsparseness in the coverage of that space. We show that a loss function that\ncombines standard task-optimized error with DPP-A can exploit the recurring\nmotifs in the grid cell code, and can be integrated with common architectures\nto achieve strong OOD generalization performance on analogy and arithmetic\ntasks. This provides both an interpretation of how the grid cell code in the\nmammalian brain may contribute to generalization performance, and at the same\ntime a potential means for improving such capabilities in artificial neural\nnetworks.\n","authors":["Shanka Subhra Mondal","Steven Frankland","Taylor Webb","Jonathan D. 
Cohen"],"pdf_url":"https://arxiv.org/pdf/2305.18417v2.pdf","comment":"29 pages (including Appendix), 21 figures"},{"id":"http://arxiv.org/abs/2311.13594v2","updated":"2024-01-18T15:39:09Z","published":"2023-11-22T18:55:25Z","title":"Labeling Neural Representations with Inverse Recognition","summary":" Deep Neural Networks (DNNs) demonstrate remarkable capabilities in learning\ncomplex hierarchical data representations, but the nature of these\nrepresentations remains largely unknown. Existing global explainability\nmethods, such as Network Dissection, face limitations such as reliance on\nsegmentation masks, lack of statistical significance testing, and high\ncomputational demands. We propose Inverse Recognition (INVERT), a scalable\napproach for connecting learned representations with human-understandable\nconcepts by leveraging their capacity to discriminate between these concepts.\nIn contrast to prior work, INVERT is capable of handling diverse types of\nneurons, exhibits less computational complexity, and does not rely on the\navailability of segmentation masks. Moreover, INVERT provides an interpretable\nmetric assessing the alignment between the representation and its corresponding\nexplanation and delivering a measure of statistical significance. We\ndemonstrate the applicability of INVERT in various scenarios, including the\nidentification of representations affected by spurious correlations, and the\ninterpretation of the hierarchical structure of decision-making within the\nmodels.\n","authors":["Kirill Bykov","Laura Kopf","Shinichi Nakajima","Marius Kloft","Marina M. -C. Höhne"],"pdf_url":"https://arxiv.org/pdf/2311.13594v2.pdf","comment":"25 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.10462v2","updated":"2024-01-18T15:37:33Z","published":"2023-08-21T04:31:06Z","title":"Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation\n with Large Language Models","summary":" Large Language Models (LLMs) demonstrate impressive capabilities to generate\naccurate code snippets given natural language intents in zero-shot, i.e.,\nwithout the need for specific fine-tuning. While prior studies have highlighted\nthe advantages of fine-tuning LLMs, this process incurs high computational\ncosts, making it impractical in resource-scarce environments, particularly for\nmodels with billions of parameters. To address these challenges, previous\nresearch explored In-Context Learning (ICL) as a strategy to guide the LLM\ngenerative process with task-specific prompt examples. However, ICL introduces\ninconveniences, such as the need for designing contextually relevant prompts\nand the absence of learning task-specific parameters, thereby limiting\ndownstream task performance. In this context, we foresee Parameter-Efficient\nFine-Tuning (PEFT) techniques as a promising approach to efficiently specialize\nLLMs to task-specific data while maintaining reasonable resource consumption.\nIn this paper, we deliver a comprehensive study of PEFT techniques for LLMs\nunder the automated code generation scenario. Our comprehensive investigation\nof PEFT techniques for LLMs reveals their superiority and potential over ICL\nacross a diverse set of LLMs. Additionally, we demonstrate the extended\ncapabilities of PEFT, showcasing its ability to learn from two distinct\ndatasets jointly without compromising performance. Furthermore, our study\nhighlights the potential for tuning larger LLMs and significant reductions in\nmemory usage by combining PEFT with quantization. 
Therefore, this study opens\nopportunities for broader applications of PEFT in software engineering\nscenarios. Our code is available at\nhttps://github.com/martin-wey/peft-llm-code/.\n","authors":["Martin Weyssow","Xin Zhou","Kisub Kim","David Lo","Houari Sahraoui"],"pdf_url":"https://arxiv.org/pdf/2308.10462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.03708v2","updated":"2024-01-18T15:35:38Z","published":"2023-09-07T13:36:03Z","title":"Chat Failures and Troubles: Reasons and Solutions","summary":" This paper examines some common problems in Human-Robot Interaction (HRI)\ncausing failures and troubles in Chat. A given use case's design decisions\nstart with the suitable robot, the suitable chatting model, identifying common\nproblems that cause failures, identifying potential solutions, and planning\ncontinuous improvement. In conclusion, it is recommended to use a closed-loop\ncontrol algorithm that guides the use of trained Artificial Intelligence (AI)\npre-trained models and provides vocabulary filtering, re-train batched models\non new datasets, learn online from data streams, and/or use reinforcement\nlearning models to self-update the trained models and reduce errors.\n","authors":["Manal Helal","Patrick Holthaus","Gabriella Lakatos","Farshid Amirabdollahian"],"pdf_url":"https://arxiv.org/pdf/2309.03708v2.pdf","comment":"In WTF Workshop Proceedings (arXiv:2401.04108) held in conjunction\n with the ACM conference on Conversational User Interfaces (CUI), 19 - 21/07\n 2023, in Eindhoven, The Netherlands"},{"id":"http://arxiv.org/abs/2312.12838v2","updated":"2024-01-18T15:27:37Z","published":"2023-12-20T08:42:57Z","title":"FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image\n Segmentation against Heterogeneous Annotation Noise","summary":" Federated learning (FL) has emerged as a promising paradigm for training\nsegmentation models on decentralized medical data, owing to its\nprivacy-preserving property. However, existing research overlooks the prevalent\nannotation noise encountered in real-world medical datasets, which limits the\nperformance ceilings of FL. In this paper, we, for the first time, identify and\ntackle this problem. For problem formulation, we propose a contour evolution\nfor modeling non-independent and identically distributed (Non-IID) noise across\npixels within each client and then extend it to the case of multi-source data\nto form a heterogeneous noise model (i.e., Non-IID annotation noise across\nclients). For robust learning from annotations with such two-level Non-IID\nnoise, we emphasize the importance of data quality in model aggregation,\nallowing high-quality clients to have a greater impact on FL. To achieve this,\nwe propose Federated learning with Annotation quAlity-aware AggregatIon, named\nFedA3I, by introducing a quality factor based on client-wise noise estimation.\nSpecifically, noise estimation at each client is accomplished through the\nGaussian mixture model and then incorporated into model aggregation in a\nlayer-wise manner to up-weight high-quality clients. Extensive experiments on\ntwo real-world medical image segmentation datasets demonstrate the superior\nperformance of FedA$^3$I against the state-of-the-art approaches in dealing\nwith cross-client annotation noise. 
The code is available at\nhttps://github.com/wnn2000/FedAAAI.\n","authors":["Nannan Wu","Zhaobin Sun","Zengqiang Yan","Li Yu"],"pdf_url":"https://arxiv.org/pdf/2312.12838v2.pdf","comment":"Accepted at AAAI'24"},{"id":"http://arxiv.org/abs/2307.13275v2","updated":"2024-01-18T15:14:42Z","published":"2023-07-25T06:13:01Z","title":"CTAGE: Curvature-Based Topology-Aware Graph Embedding for Learning\n Molecular Representations","summary":" AI-driven drug design relies significantly on predicting molecular\nproperties, which is a complex task. In current approaches, the most commonly\nused feature representations for training deep neural network models are based\non SMILES and molecular graphs. While these methods are concise and efficient,\nthey have limitations in capturing complex spatial information. Recently,\nresearchers have recognized the importance of incorporating three-dimensional\ninformation of molecular structures into models. However, capturing spatial\ninformation requires the introduction of additional units in the generator,\nbringing additional design and computational costs. Therefore, it is necessary\nto develop a method for predicting molecular properties that effectively\ncombines spatial structural information while maintaining the simplicity and\nefficiency of graph neural networks. In this work, we propose an embedding\napproach CTAGE, utilizing $k$-hop discrete Ricci curvature to extract\nstructural insights from molecular graph data. This effectively integrates\nspatial structural information while preserving the training complexity of the\nnetwork. Experimental results indicate that introducing node curvature\nsignificantly improves the performance of current graph neural network\nframeworks, validating that the information from k-hop node curvature\neffectively reflects the relationship between molecular structure and function.\n","authors":["Yili Chen","Zhengyu Li","Zheng Wan","Hui Yu","Xian Wei"],"pdf_url":"https://arxiv.org/pdf/2307.13275v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03686v2","updated":"2024-01-18T14:54:37Z","published":"2023-08-07T16:01:14Z","title":"Nearly $d$-Linear Convergence Bounds for Diffusion Models via Stochastic\n Localization","summary":" Denoising diffusions are a powerful method to generate approximate samples\nfrom high-dimensional data distributions. Recent results provide polynomial\nbounds on their convergence rate, assuming $L^2$-accurate scores. Until now,\nthe tightest bounds were either superlinear in the data dimension or required\nstrong smoothness assumptions. We provide the first convergence bounds which\nare linear in the data dimension (up to logarithmic factors) assuming only\nfinite second moments of the data distribution. We show that diffusion models\nrequire at most $\\tilde O(\\frac{d \\log^2(1/\\delta)}{\\varepsilon^2})$ steps to\napproximate an arbitrary distribution on $\\mathbb{R}^d$ corrupted with Gaussian\nnoise of variance $\\delta$ to within $\\varepsilon^2$ in KL divergence. Our\nproof extends the Girsanov-based methods of previous works. 
We introduce a\nrefined treatment of the error from discretizing the reverse SDE inspired by\nstochastic localization.\n","authors":["Joe Benton","Valentin De Bortoli","Arnaud Doucet","George Deligiannidis"],"pdf_url":"https://arxiv.org/pdf/2308.03686v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11086v2","updated":"2024-01-18T14:40:43Z","published":"2022-11-20T21:18:41Z","title":"An Embarrassingly Simple Baseline for Imbalanced Semi-Supervised\n Learning","summary":" Semi-supervised learning (SSL) has shown great promise in leveraging\nunlabeled data to improve model performance. While standard SSL assumes uniform\ndata distribution, we consider a more realistic and challenging setting called\nimbalanced SSL, where imbalanced class distributions occur in both labeled and\nunlabeled data. Although there are existing endeavors to tackle this challenge,\ntheir performance degenerates when facing severe imbalance since they can not\nreduce the class imbalance sufficiently and effectively. In this paper, we\nstudy a simple yet overlooked baseline -- SimiS -- which tackles data imbalance\nby simply supplementing labeled data with pseudo-labels, according to the\ndifference in class distribution from the most frequent class. Such a simple\nbaseline turns out to be highly effective in reducing class imbalance. It\noutperforms existing methods by a significant margin, e.g., 12.8%, 13.6%, and\n16.7% over previous SOTA on CIFAR100-LT, FOOD101-LT, and ImageNet127\nrespectively. The reduced imbalance results in faster convergence and better\npseudo-label accuracy of SimiS. The simplicity of our method also makes it\npossible to be combined with other re-balancing techniques to improve the\nperformance further. Moreover, our method shows great robustness to a wide\nrange of data distributions, which holds enormous potential in practice. Code\nwill be publicly available.\n","authors":["Hao Chen","Yue Fan","Yidong Wang","Jindong Wang","Bernt Schiele","Xing Xie","Marios Savvides","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2211.11086v2.pdf","comment":"Issues in the paper, will re-open later"},{"id":"http://arxiv.org/abs/2311.01356v3","updated":"2024-01-18T14:39:26Z","published":"2023-11-02T16:03:26Z","title":"Upper and lower bounds for the Lipschitz constant of random neural\n networks","summary":" Empirical studies have widely demonstrated that neural networks are highly\nsensitive to small, adversarial perturbations of the input. The worst-case\nrobustness against these so-called adversarial examples can be quantified by\nthe Lipschitz constant of the neural network. In this paper, we study upper and\nlower bounds for the Lipschitz constant of random ReLU neural networks.\nSpecifically, we assume that the weights and biases follow a generalization of\nthe He initialization, where general symmetric distributions for the biases are\npermitted. For shallow neural networks, we characterize the Lipschitz constant\nup to an absolute numerical constant. 
For deep networks with fixed depth and\nsufficiently large width, our established upper bound is larger than the lower\nbound by a factor that is logarithmic in the width.\n","authors":["Paul Geuchen","Thomas Heindl","Dominik Stöger","Felix Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2311.01356v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01300v3","updated":"2024-01-18T14:33:34Z","published":"2023-04-03T18:52:01Z","title":"On Mitigating the Utility-Loss in Differentially Private Learning: A new\n Perspective by a Geometrically Inspired Kernel Approach","summary":" Privacy-utility tradeoff remains as one of the fundamental issues of\ndifferentially private machine learning. This paper introduces a geometrically\ninspired kernel-based approach to mitigate the accuracy-loss issue in\nclassification. In this approach, a representation of the affine hull of given\ndata points is learned in Reproducing Kernel Hilbert Spaces (RKHS). This leads\nto a novel distance measure that hides privacy-sensitive information about\nindividual data points and improves the privacy-utility tradeoff via\nsignificantly reducing the risk of membership inference attacks. The\neffectiveness of the approach is demonstrated through experiments on MNIST\ndataset, Freiburg groceries dataset, and a real biomedical dataset. It is\nverified that the approach remains computationally practical. The application\nof the approach to federated learning is considered and it is observed that the\naccuracy-loss due to data being distributed is either marginal or not\nsignificantly high.\n","authors":["Mohit Kumar","Bernhard A. Moser","Lukas Fischer"],"pdf_url":"https://arxiv.org/pdf/2304.01300v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13184v2","updated":"2024-01-18T14:32:15Z","published":"2023-11-22T06:23:18Z","title":"Large Language Model-Enhanced Algorithm Selection: Towards Comprehensive\n Algorithm Representation","summary":" Algorithm selection aims to identify the most suitable algorithm for solving\na specific problem before execution, which has become a critical process of the\nAutoML. Current mainstream algorithm selection techniques rely heavily on\nfeature representations of various problems and employ the performance of each\nalgorithm as supervised information. However, there is a significant research\ngap concerning the consideration of algorithm features. This gap is primarily\nattributed to the inherent complexity of algorithms, making it particularly\nchallenging to find a universally effective feature extraction method that is\napplicable across a diverse range of algorithms. Unfortunately, neglecting this\naspect undoubtedly impacts the accuracy of algorithm selection and indirectly\nnecessitates an increased volume of problem data for training purposes. This\npaper takes a significant stride towards addressing this gap by proposing an\napproach that integrates algorithm representation into the algorithm selection\nprocess. Specifically, our proposed model employs distinct modules to extract\nrepresentations of both problems and algorithms, where the algorithm\nrepresentation leverages the capabilities of pre-trained LLMs in the realm of\ncode comprehension. Following the extraction of embedding vectors for both\nalgorithms and problems, the most suitable algorithm is determined through\ncalculations of matching degrees. 
Our experiments not only validate the\neffectiveness of the proposed model but also showcase the performance of\ndifferent embedded pre-trained LLMs, which suggests that the proposed algorithm\nselection framework holds the potential to serve as a baseline task for\nevaluating the code representation capabilities of LLMs.\n","authors":["Xingyu Wu","Yan Zhong","Jibin Wu","Bingbing Jiang","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2311.13184v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10014v1","updated":"2024-01-18T14:31:11Z","published":"2024-01-18T14:31:11Z","title":"Optimizing Medication Decisions for Patients with Atrial Fibrillation\n through Path Development Network","summary":" Atrial fibrillation (AF) is a common cardiac arrhythmia characterized by\nrapid and irregular contractions of the atria. It significantly elevates the\nrisk of strokes due to slowed blood flow in the atria, especially in the left\natrial appendage, which is prone to blood clot formation. Such clots can\nmigrate into cerebral arteries, leading to ischemic stroke. To assess whether\nAF patients should be prescribed anticoagulants, doctors often use the\nCHA2DS2-VASc scoring system. However, anticoagulant use must be approached with\ncaution as it can impact clotting functions. This study introduces a machine\nlearning algorithm that predicts whether patients with AF should be recommended\nanticoagulant therapy using 12-lead ECG data. In this model, we use STOME to\nenhance time-series data and then process it through a Convolutional Neural\nNetwork (CNN). By incorporating a path development layer, the model achieves a\nspecificity of 30.6% under the condition of an NPV of 1. In contrast, LSTM\nalgorithms without path development yield a specificity of only 2.7% under the\nsame NPV condition.\n","authors":["Tian Xie"],"pdf_url":"https://arxiv.org/pdf/2401.10014v1.pdf","comment":"Master's thesis"},{"id":"http://arxiv.org/abs/2401.09988v1","updated":"2024-01-18T14:06:29Z","published":"2024-01-18T14:06:29Z","title":"Developing an AI-based Integrated System for Bee Health Evaluation","summary":" Honey bees pollinate about one-third of the world's food supply, but bee\ncolonies have alarmingly declined by nearly 40% over the past decade due to\nseveral factors, including pesticides and pests. Traditional methods for\nmonitoring beehives, such as human inspection, are subjective, disruptive, and\ntime-consuming. To overcome these limitations, artificial intelligence has been\nused to assess beehive health. However, previous studies have lacked an\nend-to-end solution and primarily relied on data from a single source, either\nbee images or sounds. This study introduces a comprehensive system consisting\nof bee object detection and health evaluation. Additionally, it utilized a\ncombination of visual and audio signals to analyze bee behaviors. An\nAttention-based Multimodal Neural Network (AMNN) was developed to adaptively\nfocus on key features from each type of signal for accurate bee health\nassessment. The AMNN achieved an overall accuracy of 92.61%, surpassing eight\nexisting single-signal Convolutional Neural Networks and Recurrent Neural\nNetworks. It outperformed the best image-based model by 32.51% and the top\nsound-based model by 13.98% while maintaining efficient processing times.\nFurthermore, it improved prediction robustness, attaining an F1-score higher\nthan 90% across all four evaluated health conditions. 
The study also shows that\naudio signals are more reliable than images for assessing bee health. By\nseamlessly integrating AMNN with image and sound data in a comprehensive bee\nhealth monitoring system, this approach provides a more efficient and\nnon-invasive solution for the early detection of bee diseases and the\npreservation of bee colonies.\n","authors":["Andrew Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09988v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02070v3","updated":"2024-01-18T14:03:28Z","published":"2023-02-04T02:47:41Z","title":"Semantic-Guided Generative Image Augmentation Method with Diffusion\n Models for Image Classification","summary":" Existing image augmentation methods consist of two categories:\nperturbation-based methods and generative methods. Perturbation-based methods\napply pre-defined perturbations to augment an original image, but only locally\nvary the image, thus lacking image diversity. In contrast, generative methods\nbring more image diversity in the augmented images but may not preserve\nsemantic consistency, thus incorrectly changing the essential semantics of the\noriginal image. To balance image diversity and semantic consistency in\naugmented images, we propose SGID, a Semantic-guided Generative Image\naugmentation method with Diffusion models for image classification.\nSpecifically, SGID employs diffusion models to generate augmented images with\ngood image diversity. More importantly, SGID takes image labels and captions as\nguidance to maintain semantic consistency between the augmented and original\nimages. Experimental results show that SGID outperforms the best augmentation\nbaseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and\n0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image\naugmentation baselines and further improves the overall performance. We\ndemonstrate the semantic consistency and image diversity of SGID through\nquantitative human and automated evaluations, as well as qualitative case\nstudies.\n","authors":["Bohan Li","Xiao Xu","Xinghao Wang","Yutai Hou","Yunlong Feng","Feng Wang","Xuanliang Zhang","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2302.02070v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2401.09986v1","updated":"2024-01-18T14:02:23Z","published":"2024-01-18T14:02:23Z","title":"FLex&Chill: Improving Local Federated Learning Training with Logit\n Chilling","summary":" Federated learning are inherently hampered by data heterogeneity: non-iid\ndistributed training data over local clients. We propose a novel model training\napproach for federated learning, FLex&Chill, which exploits the Logit Chilling\nmethod. 
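The abstract does not define Logit Chilling; reading the name as scaling local-training logits by a temperature below 1 before the cross-entropy loss (purely an assumption for illustration, the actual method may differ), one local client step could be sketched as:

import torch.nn.functional as F

def local_step(model, batch, optimizer, temperature=0.5):
    """One local client step with 'chilled' (temperature < 1) logits.

    Assumption: Logit Chilling is read here as dividing logits by a small
    temperature before the loss; this is only an illustrative guess.
    """
    x, y = batch
    logits = model(x) / temperature          # sharpen the softmax distribution
    loss = F.cross_entropy(logits, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()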
Through extensive evaluations, we demonstrate that, in the presence of\nnon-iid data characteristics inherent in federated learning systems, this\napproach can expedite model convergence and improve inference accuracy.\nQuantitatively, from our experiments, we observe up to 6X improvement in the\nglobal federated learning model convergence time, and up to 3.37% improvement\nin inference accuracy.\n","authors":["Kichang Lee","Songkuk Kim","JeongGil Ko"],"pdf_url":"https://arxiv.org/pdf/2401.09986v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2401.09980v1","updated":"2024-01-18T13:51:20Z","published":"2024-01-18T13:51:20Z","title":"Ventricular Segmentation: A Brief Comparison of U-Net Derivatives","summary":" Medical imaging refers to the technologies and methods utilized to view the\nhuman body and its inside, in order to diagnose, monitor, or even treat medical\ndisorders. This paper aims to explore the application of deep learning\ntechniques in the semantic segmentation of Cardiac short-axis MRI (Magnetic\nResonance Imaging) images, aiming to enhance the diagnosis, monitoring, and\ntreatment of medical disorders related to the heart. The focus centers on\nimplementing various architectures that are derivatives of U-Net, to\neffectively isolate specific parts of the heart for comprehensive anatomical\nand functional analysis. Through a combination of images, graphs, and\nquantitative metrics, the efficacy of the models and their predictions are\nshowcased. Additionally, this paper addresses encountered challenges and\noutline strategies for future improvements. This abstract provides a concise\noverview of the efforts in utilizing deep learning for cardiac image\nsegmentation, emphasizing both the accomplishments and areas for further\nrefinement.\n","authors":["Ketan Suhaas Saichandran"],"pdf_url":"https://arxiv.org/pdf/2401.09980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09979v1","updated":"2024-01-18T13:46:41Z","published":"2024-01-18T13:46:41Z","title":"False Discovery Rate Control for Gaussian Graphical Models via\n Neighborhood Screening","summary":" Gaussian graphical models emerge in a wide range of fields. They model the\nstatistical relationships between variables as a graph, where an edge between\ntwo variables indicates conditional dependence. Unfortunately, well-established\nestimators, such as the graphical lasso or neighborhood selection, are known to\nbe susceptible to a high prevalence of false edge detections. False detections\nmay encourage inaccurate or even incorrect scientific interpretations, with\nmajor implications in applications, such as biomedicine or healthcare. In this\npaper, we introduce a nodewise variable selection approach to graph learning\nand provably control the false discovery rate of the selected edge set at a\nself-estimated level. A novel fusion method of the individual neighborhoods\noutputs an undirected graph estimate. The proposed method is parameter-free and\ndoes not require tuning by the user. 
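For context, the classical nodewise (neighborhood) regression that such nodewise selection builds on can be sketched as below; this is plain Meinshausen-Buhlmann-style neighborhood selection with a lasso, and the paper's FDR-controlling selection and fusion steps are deliberately not reproduced here:

import numpy as np
from sklearn.linear_model import LassoCV

def neighborhood_graph(X):
    """Estimate an undirected graph by regressing each variable on the rest."""
    n, p = X.shape
    adj = np.zeros((p, p), dtype=bool)
    for j in range(p):
        rest = np.delete(np.arange(p), j)
        coef = LassoCV(cv=5).fit(X[:, rest], X[:, j]).coef_
        adj[j, rest] = coef != 0             # selected neighbors of node j
    return adj | adj.T                       # symmetrize with the "OR" rule

X = np.random.default_rng(2).normal(size=(200, 8))
print(neighborhood_graph(X).astype(int))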
Benchmarks against competing false\ndiscovery rate controlling methods in numerical experiments considering\ndifferent graph topologies show a significant gain in performance.\n","authors":["Taulant Koka","Jasin Machkour","Michael Muma"],"pdf_url":"https://arxiv.org/pdf/2401.09979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13971v6","updated":"2024-01-18T13:35:55Z","published":"2023-05-23T11:54:37Z","title":"Grammar-Constrained Decoding for Structured NLP Tasks without Finetuning","summary":" Despite their impressive performance, large language models (LMs) still\nstruggle with reliably generating complex output structures when not finetuned\nto follow the required output format exactly. To address this issue,\ngrammar-constrained decoding (GCD) can be used to control the generation of\nLMs, guaranteeing that the output follows a given structure. Most existing GCD\nmethods are, however, limited to specific tasks, such as parsing or code\ngeneration. In this work, we demonstrate that formal grammars can describe the\noutput space for a much wider range of tasks and argue that GCD can serve as a\nunified framework for structured NLP tasks in general. For increased\nflexibility, we introduce input-dependent grammars, which allow the grammar to\ndepend on the input and thus enable the generation of different output\nstructures for different inputs. We then empirically demonstrate the power and\nflexibility of GCD-enhanced LMs on (1) information extraction, (2) entity\ndisambiguation, and (3) constituency parsing. Our results indicate that\ngrammar-constrained LMs substantially outperform unconstrained LMs or even beat\ntask-specific finetuned models. Grammar constraints thus hold great promise for\nharnessing off-the-shelf LMs for a wide range of structured NLP tasks,\nespecially where training data is scarce or finetuning is expensive. Code and\ndata: https://github.com/epfl-dlab/GCD.\n","authors":["Saibo Geng","Martin Josifoski","Maxime Peyrard","Robert West"],"pdf_url":"https://arxiv.org/pdf/2305.13971v6.pdf","comment":"Accepted at EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2305.19838v3","updated":"2024-01-18T13:26:17Z","published":"2023-05-31T13:26:49Z","title":"Relaxing the Additivity Constraints in Decentralized No-Regret\n High-Dimensional Bayesian Optimization","summary":" Bayesian Optimization (BO) is typically used to optimize an unknown function\n$f$ that is noisy and costly to evaluate, by exploiting an acquisition function\nthat must be maximized at each optimization step. Even if provably\nasymptotically optimal BO algorithms are efficient at optimizing\nlow-dimensional functions, scaling them to high-dimensional spaces remains an\nopen problem, often tackled by assuming an additive structure for $f$. By doing\nso, BO algorithms typically introduce additional restrictive assumptions on the\nadditive structure that reduce their applicability domain. This paper contains\ntwo main contributions: (i) we relax the restrictive assumptions on the\nadditive structure of $f$ without weakening the maximization guarantees of the\nacquisition function, and (ii) we address the over-exploration problem for\ndecentralized BO algorithms. 
To these ends, we propose DuMBO, an asymptotically\noptimal decentralized BO algorithm that achieves very competitive performance\nagainst state-of-the-art BO algorithms, especially when the additive structure\nof $f$ comprises high-dimensional factors.\n","authors":["Anthony Bardou","Patrick Thiran","Thomas Begin"],"pdf_url":"https://arxiv.org/pdf/2305.19838v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09953v1","updated":"2024-01-18T12:58:53Z","published":"2024-01-18T12:58:53Z","title":"Through the Dual-Prism: A Spectral Perspective on Graph Data\n Augmentation for Graph Classification","summary":" Graph Neural Networks (GNNs) have become the preferred tool to process graph\ndata, with their efficacy being boosted through graph data augmentation\ntechniques. Despite the evolution of augmentation methods, issues like graph\nproperty distortions and restricted structural changes persist. This leads to\nthe question: Is it possible to develop more property-conserving and\nstructure-sensitive augmentation methods? Through a spectral lens, we\ninvestigate the interplay between graph properties, their augmentation, and\ntheir spectral behavior, and found that keeping the low-frequency eigenvalues\nunchanged can preserve the critical properties at a large scale when generating\naugmented graphs. These observations inform our introduction of the Dual-Prism\n(DP) augmentation method, comprising DP-Noise and DP-Mask, which adeptly\nretains essential graph properties while diversifying augmented graphs.\nExtensive experiments validate the efficiency of our approach, providing a new\nand promising direction for graph data augmentation.\n","authors":["Yutong Xia","Runpeng Yu","Yuxuan Liang","Xavier Bresson","Xinchao Wang","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2401.09953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09949v1","updated":"2024-01-18T12:51:38Z","published":"2024-01-18T12:51:38Z","title":"SymbolNet: Neural Symbolic Regression with Adaptive Dynamic Pruning","summary":" Contrary to the use of genetic programming, the neural network approach to\nsymbolic regression can scale well with high input dimension and leverage\ngradient methods for faster equation searching. Common ways of constraining\nexpression complexity have relied on multistage pruning methods with\nfine-tuning, but these often lead to significant performance loss. In this\nwork, we propose SymbolNet, a neural network approach to symbolic regression in\na novel framework that enables dynamic pruning of model weights, input\nfeatures, and mathematical operators in a single training, where both training\nloss and expression complexity are optimized simultaneously. We introduce a\nsparsity regularization term per pruning type, which can adaptively adjust its\nown strength and lead to convergence to a target sparsity level. In contrast to\nmost existing symbolic regression methods that cannot efficiently handle\ndatasets with more than $O$(10) inputs, we demonstrate the effectiveness of our\nmodel on the LHC jet tagging task (16 inputs), MNIST (784 inputs), and SVHN\n(3072 inputs).\n","authors":["Ho Fung Tsoi","Vladimir Loncar","Sridhara Dasu","Philip Harris"],"pdf_url":"https://arxiv.org/pdf/2401.09949v1.pdf","comment":"11 pages. 
Submitted to IEEE TNNLS, under review"},{"id":"http://arxiv.org/abs/2401.09945v1","updated":"2024-01-18T12:47:13Z","published":"2024-01-18T12:47:13Z","title":"HGAttack: Transferable Heterogeneous Graph Adversarial Attack","summary":" Heterogeneous Graph Neural Networks (HGNNs) are increasingly recognized for\ntheir performance in areas like the web and e-commerce, where resilience\nagainst adversarial attacks is crucial. However, existing adversarial attack\nmethods, which are primarily designed for homogeneous graphs, fall short when\napplied to HGNNs due to their limited ability to address the structural and\nsemantic complexity of HGNNs. This paper introduces HGAttack, the first\ndedicated gray box evasion attack method for heterogeneous graphs. We design a\nnovel surrogate model to closely resemble the behaviors of the target HGNN and\nutilize gradient-based methods for perturbation generation. Specifically, the\nproposed surrogate model effectively leverages heterogeneous information by\nextracting meta-path induced subgraphs and applying GNNs to learn node\nembeddings with distinct semantics from each subgraph. This approach improves\nthe transferability of generated attacks on the target HGNN and significantly\nreduces memory costs. For perturbation generation, we introduce a\nsemantics-aware mechanism that leverages subgraph gradient information to\nautonomously identify vulnerable edges across a wide range of relations within\na constrained perturbation budget. We validate HGAttack's efficacy with\ncomprehensive experiments on three datasets, providing empirical analyses of\nits generated perturbations. Outperforming baseline methods, HGAttack\ndemonstrated significant efficacy in diminishing the performance of target HGNN\nmodels, affirming the effectiveness of our approach in evaluating the\nrobustness of HGNNs against adversarial attacks.\n","authors":["He Zhao","Zhiwei Zeng","Yongwei Wang","Deheng Ye","Chunyan Miao"],"pdf_url":"https://arxiv.org/pdf/2401.09945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09944v1","updated":"2024-01-18T12:46:26Z","published":"2024-01-18T12:46:26Z","title":"WindSeer: Real-time volumetric wind prediction over complex terrain\n aboard a small UAV","summary":" Real-time high-resolution wind predictions are beneficial for various\napplications including safe manned and unmanned aviation. Current weather\nmodels require too much compute and lack the necessary predictive capabilities\nas they are valid only at the scale of multiple kilometers and hours - much\nlower spatial and temporal resolutions than these applications require. Our\nwork, for the first time, demonstrates the ability to predict low-altitude wind\nin real-time on limited-compute devices, from only sparse measurement data. We\ntrain a neural network, WindSeer, using only synthetic data from computational\nfluid dynamics simulations and show that it can successfully predict real wind\nfields over terrain with known topography from just a few noisy and spatially\nclustered wind measurements. WindSeer can generate accurate predictions at\ndifferent resolutions and domain sizes on previously unseen topography without\nretraining. 
We demonstrate that the model successfully predicts historical wind\ndata collected by weather stations and wind measured onboard drones.\n","authors":["Florian Achermann","Thomas Stastny","Bogdan Danciu","Andrey Kolobov","Jen Jen Chung","Roland Siegwart","Nicholas Lawrance"],"pdf_url":"https://arxiv.org/pdf/2401.09944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09943v1","updated":"2024-01-18T12:45:46Z","published":"2024-01-18T12:45:46Z","title":"Infinite-Horizon Graph Filters: Leveraging Power Series to Enhance\n Sparse Information Aggregation","summary":" Graph Neural Networks (GNNs) have shown considerable effectiveness in a\nvariety of graph learning tasks, particularly those based on the\nmessage-passing approach in recent years. However, their performance is often\nconstrained by a limited receptive field, a challenge that becomes more acute\nin the presence of sparse graphs. In light of the power series, which possesses\ninfinite expansion capabilities, we propose a novel \\underline{G}raph\n\\underline{P}ower \\underline{F}ilter \\underline{N}eural Network (GPFN) that\nenhances node classification by employing a power series graph filter to\naugment the receptive field. Concretely, our GPFN designs a new way to build a\ngraph filter with an infinite receptive field based on the convergence power\nseries, which can be analyzed in the spectral and spatial domains. Besides, we\ntheoretically prove that our GPFN is a general framework that can integrate any\npower series and capture long-range dependencies. Finally, experimental results\non three datasets demonstrate the superiority of our GPFN over state-of-the-art\nbaselines.\n","authors":["Ruizhe Zhang","Xinke Jiang","Yuchen Fang","Jiayuan Luo","Yongxin Xu","Yichen Zhu","Xu Chu","Junfeng Zhao","Yasha Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.09943v1.pdf","comment":"v1"},{"id":"http://arxiv.org/abs/2401.09940v1","updated":"2024-01-18T12:41:58Z","published":"2024-01-18T12:41:58Z","title":"Biases in Expected Goals Models Confound Finishing Ability","summary":" Expected Goals (xG) has emerged as a popular tool for evaluating finishing\nskill in soccer analytics. It involves comparing a player's cumulative xG with\ntheir actual goal output, where consistent overperformance indicates strong\nfinishing ability. However, the assessment of finishing skill in soccer using\nxG remains contentious due to players' difficulty in consistently outperforming\ntheir cumulative xG. In this paper, we aim to address the limitations and\nnuances surrounding the evaluation of finishing skill using xG statistics.\nSpecifically, we explore three hypotheses: (1) the deviation between actual and\nexpected goals is an inadequate metric due to the high variance of shot\noutcomes and limited sample sizes, (2) the inclusion of all shots in cumulative\nxG calculation may be inappropriate, and (3) xG models contain biases arising\nfrom interdependencies in the data that affect skill measurement. 
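The quantity at stake in the finishing-skill debate below is simply the gap between actual and expected goals; a minimal sketch of that metric (goals above expected, GAX), assuming each shot carries an xG value and a binary outcome:

def goals_above_expected(shots):
    """shots: iterable of (xg, scored) pairs for one player."""
    total_xg = sum(xg for xg, _ in shots)
    goals = sum(scored for _, scored in shots)
    return goals - total_xg          # > 0 means overperforming the xG model

# Toy example: three shots worth 0.1, 0.3 and 0.7 xG, two of them scored.
print(goals_above_expected([(0.1, 0), (0.3, 1), (0.7, 1)]))   # 2 - 1.1 = 0.9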
We found that\nsustained overperformance of cumulative xG requires both high shot volumes and\nexceptional finishing, including all shot types can obscure the finishing\nability of proficient strikers, and that there is a persistent bias that makes\nthe actual and expected goals closer for excellent finishers than it really is.\nOverall, our analysis indicates that we need more nuanced quantitative\napproaches for investigating a player's finishing ability, which we achieved\nusing a technique from AI fairness to learn an xG model that is calibrated for\nmultiple subgroups of players. As a concrete use case, we show that (1) the\nstandard biased xG model underestimates Messi's GAX by 17% and (2) Messi's GAX\nis 27% higher than the typical elite high-shot-volume attacker, indicating that\nMessi is even a more exceptional finisher than people commonly believed.\n","authors":["Jesse Davis","Pieter Robberechts"],"pdf_url":"https://arxiv.org/pdf/2401.09940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14630v3","updated":"2024-01-18T12:31:30Z","published":"2022-12-30T10:56:56Z","title":"Detecting Change Intervals with Isolation Distributional Kernel","summary":" Detecting abrupt changes in data distribution is one of the most significant\ntasks in streaming data analysis. Although many unsupervised Change-Point\nDetection (CPD) methods have been proposed recently to identify those changes,\nthey still suffer from missing subtle changes, poor scalability, or/and\nsensitivity to outliers. To meet these challenges, we are the first to\ngeneralise the CPD problem as a special case of the Change-Interval Detection\n(CID) problem. Then we propose a CID method, named iCID, based on a recent\nIsolation Distributional Kernel (IDK). iCID identifies the change interval if\nthere is a high dissimilarity score between two non-homogeneous temporal\nadjacent intervals. The data-dependent property and finite feature map of IDK\nenabled iCID to efficiently identify various types of change-points in data\nstreams with the tolerance of outliers. Moreover, the proposed online and\noffline versions of iCID have the ability to optimise key parameter settings.\nThe effectiveness and efficiency of iCID have been systematically verified on\nboth synthetic and real-world datasets.\n","authors":["Yang Cao","Ye Zhu","Kai Ming Ting","Flora D. Salim","Hong Xian Li","Luxing Yang","Gang Li"],"pdf_url":"https://arxiv.org/pdf/2212.14630v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17670v2","updated":"2024-01-18T12:29:31Z","published":"2023-12-29T16:37:08Z","title":"TopCoW: Benchmarking Topology-Aware Anatomical Segmentation of the\n Circle of Willis (CoW) for CTA and MRA","summary":" The Circle of Willis (CoW) is an important network of arteries connecting\nmajor circulations of the brain. Its vascular architecture is believed to\naffect the risk, severity, and clinical outcome of serious neuro-vascular\ndiseases. However, characterizing the highly variable CoW anatomy is still a\nmanual and time-consuming expert task. The CoW is usually imaged by two\nangiographic imaging modalities, magnetic resonance angiography (MRA) and\ncomputed tomography angiography (CTA), but there exist limited public datasets\nwith annotations on CoW anatomy, especially for CTA. Therefore we organized the\nTopCoW Challenge in 2023 with the release of an annotated CoW dataset. 
The\nTopCoW dataset was the first public dataset with voxel-level annotations for\nthirteen possible CoW vessel components, enabled by virtual-reality (VR)\ntechnology. It was also the first large dataset with paired MRA and CTA from\nthe same patients. TopCoW challenge formalized the CoW characterization problem\nas a multiclass anatomical segmentation task with an emphasis on topological\nmetrics. We invited submissions worldwide for the CoW segmentation task, which\nattracted over 140 registered participants from four continents. The top\nperforming teams managed to segment many CoW components to Dice scores around\n90%, but with lower scores for communicating arteries and rare variants. There\nwere also topological mistakes for predictions with high Dice scores.\nAdditional topological analysis revealed further areas for improvement in\ndetecting certain CoW components and matching CoW variant topology accurately.\nTopCoW represented a first attempt at benchmarking the CoW anatomical\nsegmentation task for MRA and CTA, both morphologically and topologically.\n","authors":["Kaiyuan Yang","Fabio Musio","Yihui Ma","Norman Juchler","Johannes C. Paetzold","Rami Al-Maskari","Luciano Höher","Hongwei Bran Li","Ibrahim Ethem Hamamci","Anjany Sekuboyina","Suprosanna Shit","Houjing Huang","Diana Waldmannstetter","Florian Kofler","Fernando Navarro","Martin Menten","Ivan Ezhov","Daniel Rueckert","Iris Vos","Ynte Ruigrok","Birgitta Velthuis","Hugo Kuijf","Julien Hämmerli","Catherine Wurster","Philippe Bijlenga","Laura Westphal","Jeroen Bisschop","Elisa Colombo","Hakim Baazaoui","Andrew Makmur","James Hallinan","Bene Wiestler","Jan S. Kirschke","Roland Wiest","Emmanuel Montagnon","Laurent Letourneau-Guillon","Adrian Galdran","Francesco Galati","Daniele Falcetta","Maria A. Zuluaga","Chaolong Lin","Haoran Zhao","Zehan Zhang","Sinyoung Ra","Jongyun Hwang","Hyunjin Park","Junqiang Chen","Marek Wodzinski","Henning Müller","Pengcheng Shi","Wei Liu","Ting Ma","Cansu Yalçin","Rachika E. Hamadache","Joaquim Salvi","Xavier Llado","Uma Maria Lal-Trehan Estrada","Valeriia Abramova","Luca Giancardo","Arnau Oliver","Jialu Liu","Haibin Huang","Yue Cui","Zehang Lin","Yusheng Liu","Shunzhi Zhu","Tatsat R. Patel","Vincent M. Tutino","Maysam Orouskhani","Huayu Wang","Mahmud Mossa-Basha","Chengcheng Zhu","Maximilian R. Rokuss","Yannick Kirchhoff","Nico Disch","Julius Holzschuh","Fabian Isensee","Klaus Maier-Hein","Yuki Sato","Sven Hirsch","Susanne Wegener","Bjoern Menze"],"pdf_url":"https://arxiv.org/pdf/2312.17670v2.pdf","comment":"23 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW\n 2023 Challenge"},{"id":"http://arxiv.org/abs/2303.17045v2","updated":"2024-01-18T12:10:03Z","published":"2023-03-29T22:16:52Z","title":"Training Neural Networks is NP-Hard in Fixed Dimension","summary":" We study the parameterized complexity of training two-layer neural networks\nwith respect to the dimension of the input data and the number of hidden\nneurons, considering ReLU and linear threshold activation functions. Albeit the\ncomputational complexity of these problems has been studied numerous times in\nrecent years, several questions are still open. We answer questions by Arora et\nal. [ICLR '18] and Khalife and Basu [IPCO '22] showing that both problems are\nNP-hard for two dimensions, which excludes any polynomial-time algorithm for\nconstant dimension. We also answer a question by Froese et al. [JAIR '22]\nproving W[1]-hardness for four ReLUs (or two linear threshold neurons) with\nzero training error. 
Finally, in the ReLU case, we show fixed-parameter\ntractability for the combined parameter number of dimensions and number of\nReLUs if the network is assumed to compute a convex map. Our results settle the\ncomplexity status regarding these parameters almost completely.\n","authors":["Vincent Froese","Christoph Hertrich"],"pdf_url":"https://arxiv.org/pdf/2303.17045v2.pdf","comment":"Paper accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2401.09918v1","updated":"2024-01-18T12:03:19Z","published":"2024-01-18T12:03:19Z","title":"Probabilistic Truly Unordered Rule Sets","summary":" Rule set learning has recently been frequently revisited because of its\ninterpretability. Existing methods have several shortcomings though. First,\nmost existing methods impose orders among rules, either explicitly or\nimplicitly, which makes the models less comprehensible. Second, due to the\ndifficulty of handling conflicts caused by overlaps (i.e., instances covered by\nmultiple rules), existing methods often do not consider probabilistic rules.\nThird, learning classification rules for multi-class target is understudied, as\nmost existing methods focus on binary classification or multi-class\nclassification via the ``one-versus-rest\" approach.\n To address these shortcomings, we propose TURS, for Truly Unordered Rule\nSets. To resolve conflicts caused by overlapping rules, we propose a novel\nmodel that exploits the probabilistic properties of our rule sets, with the\nintuition of only allowing rules to overlap if they have similar probabilistic\noutputs. We next formalize the problem of learning a TURS model based on the\nMDL principle and develop a carefully designed heuristic algorithm. We\nbenchmark against a wide range of rule-based methods and demonstrate that our\nmethod learns rule sets that have lower model complexity and highly competitive\npredictive performance. In addition, we empirically show that rules in our\nmodel are empirically ``independent\" and hence truly unordered.\n","authors":["Lincen Yang","Matthijs van Leeuwen"],"pdf_url":"https://arxiv.org/pdf/2401.09918v1.pdf","comment":"Submitted to JMLR"},{"id":"http://arxiv.org/abs/2401.09916v1","updated":"2024-01-18T11:57:05Z","published":"2024-01-18T11:57:05Z","title":"Enabling On-device Continual Learning with Binary Neural Networks","summary":" On-device learning remains a formidable challenge, especially when dealing\nwith resource-constrained devices that have limited computational capabilities.\nThis challenge is primarily rooted in two key issues: first, the memory\navailable on embedded devices is typically insufficient to accommodate the\nmemory-intensive back-propagation algorithm, which often relies on\nfloating-point precision. Second, the development of learning algorithms on\nmodels with extreme quantization levels, such as Binary Neural Networks (BNNs),\nis critical due to the drastic reduction in bit representation. In this study,\nwe propose a solution that combines recent advancements in the field of\nContinual Learning (CL) and Binary Neural Networks to enable on-device training\nwhile maintaining competitive performance. Specifically, our approach leverages\nbinary latent replay (LR) activations and a novel quantization scheme that\nsignificantly reduces the number of bits required for gradient computation. 
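A minimal sketch of the kind of binarized activation used when training BNNs, with a straight-through estimator for the backward pass; the paper's specific binary latent-replay buffering and gradient quantization scheme are not shown here:

import torch

class BinarizeSTE(torch.autograd.Function):
    """Sign activation with a straight-through estimator gradient."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return torch.sign(x)

    @staticmethod
    def backward(ctx, grad_output):
        (x,) = ctx.saved_tensors
        # Pass gradients only where |x| <= 1 (standard STE clipping).
        return grad_output * (x.abs() <= 1).to(grad_output.dtype)

x = torch.randn(4, requires_grad=True)
BinarizeSTE.apply(x).sum().backward()
print(x.grad)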
The\nexperimental validation demonstrates a significant accuracy improvement in\ncombination with a noticeable reduction in memory requirement, confirming the\nsuitability of our approach in expanding the practical applications of deep\nlearning in real-world scenarios.\n","authors":["Lorenzo Vorabbi","Davide Maltoni","Guido Borghi","Stefano Santi"],"pdf_url":"https://arxiv.org/pdf/2401.09916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09915v1","updated":"2024-01-18T11:54:42Z","published":"2024-01-18T11:54:42Z","title":"Qadence: a differentiable interface for digital-analog programs","summary":" Digital-analog quantum computing (DAQC) is an alternative paradigm for\nuniversal quantum computation combining digital single-qubit gates with global\nanalog operations acting on a register of interacting qubits. Currently, no\navailable open-source software is tailored to express, differentiate, and\nexecute programs within the DAQC paradigm. In this work, we address this\nshortfall by presenting Qadence, a high-level programming interface for\nbuilding complex digital-analog quantum programs developed at Pasqal. Thanks to\nits flexible interface, native differentiability, and focus on real-device\nexecution, Qadence aims at advancing research on variational quantum algorithms\nbuilt for native DAQC platforms such as Rydberg atom arrays.\n","authors":["Dominik Seitz","Niklas Heim","João P. Moutinho","Roland Guichard","Vytautas Abramavicius","Aleksander Wennersteen","Gert-Jan Both","Anton Quelle","Caroline de Groot","Gergana V. Velikova","Vincent E. Elfving","Mario Dagrada"],"pdf_url":"https://arxiv.org/pdf/2401.09915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09902v1","updated":"2024-01-18T11:32:50Z","published":"2024-01-18T11:32:50Z","title":"Interplay between depth and width for interpolation in neural ODEs","summary":" Neural ordinary differential equations (neural ODEs) have emerged as a\nnatural tool for supervised learning from a control perspective, yet a complete\nunderstanding of their optimal architecture remains elusive. In this work, we\nexamine the interplay between their width $p$ and number of layer transitions\n$L$ (effectively the depth $L+1$). Specifically, we assess the model\nexpressivity in terms of its capacity to interpolate either a finite dataset\n$D$ comprising $N$ pairs of points or two probability measures in\n$\\mathbb{R}^d$ within a Wasserstein error margin $\\varepsilon>0$. Our findings\nreveal a balancing trade-off between $p$ and $L$, with $L$ scaling as\n$O(1+N/p)$ for dataset interpolation, and\n$L=O\\left(1+(p\\varepsilon^d)^{-1}\\right)$ for measure interpolation.\n In the autonomous case, where $L=0$, a separate study is required, which we\nundertake focusing on dataset interpolation. We address the relaxed problem of\n$\\varepsilon$-approximate controllability and establish an error decay of\n$\\varepsilon\\sim O(\\log(p)p^{-1/d})$. This decay rate is a consequence of\napplying a universal approximation theorem to a custom-built Lipschitz vector\nfield that interpolates $D$. 
In the high-dimensional setting, we further\ndemonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact\ncontrol.\n","authors":["Antonio Álvarez-López","Arselane Hadj Slimane","Enrique Zuazua Iriondo"],"pdf_url":"https://arxiv.org/pdf/2401.09902v1.pdf","comment":"16 pages, 10 figures, double column"},{"id":"http://arxiv.org/abs/2401.09890v1","updated":"2024-01-18T11:05:03Z","published":"2024-01-18T11:05:03Z","title":"A Survey on Hardware Accelerators for Large Language Models","summary":" Large Language Models (LLMs) have emerged as powerful tools for natural\nlanguage processing tasks, revolutionizing the field with their ability to\nunderstand and generate human-like text. As the demand for more sophisticated\nLLMs continues to grow, there is a pressing need to address the computational\nchallenges associated with their scale and complexity. This paper presents a\ncomprehensive survey on hardware accelerators designed to enhance the\nperformance and energy efficiency of Large Language Models. By examining a\ndiverse range of accelerators, including GPUs, FPGAs, and custom-designed\narchitectures, we explore the landscape of hardware solutions tailored to meet\nthe unique computational demands of LLMs. The survey encompasses an in-depth\nanalysis of architecture, performance metrics, and energy efficiency\nconsiderations, providing valuable insights for researchers, engineers, and\ndecision-makers aiming to optimize the deployment of LLMs in real-world\napplications.\n","authors":["Christoforos Kachris"],"pdf_url":"https://arxiv.org/pdf/2401.09890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09886v1","updated":"2024-01-18T10:59:18Z","published":"2024-01-18T10:59:18Z","title":"Cooperative Edge Caching Based on Elastic Federated and Multi-Agent Deep\n Reinforcement Learning in Next-Generation Network","summary":" Edge caching is a promising solution for next-generation networks by\nempowering caching units in small-cell base stations (SBSs), which allows user\nequipments (UEs) to fetch users' requested contents that have been pre-cached\nin SBSs. It is crucial for SBSs to predict accurate popular contents through\nlearning while protecting users' personal information. Traditional federated\nlearning (FL) can protect users' privacy but the data discrepancies among UEs\ncan lead to a degradation in model quality. Therefore, it is necessary to train\npersonalized local models for each UE to predict popular contents accurately.\nIn addition, the cached contents can be shared among adjacent SBSs in\nnext-generation networks, thus caching predicted popular contents in different\nSBSs may affect the cost to fetch contents. Hence, it is critical to determine\nwhere the popular contents are cached cooperatively. To address these issues,\nwe propose a cooperative edge caching scheme based on elastic federated and\nmulti-agent deep reinforcement learning (CEFMR) to optimize the cost in the\nnetwork. We first propose an elastic FL algorithm to train the personalized\nmodel for each UE, where adversarial autoencoder (AAE) model is adopted for\ntraining to improve the prediction accuracy, then {a popular} content\nprediction algorithm is proposed to predict the popular contents for each SBS\nbased on the trained AAE model. Finally, we propose a multi-agent deep\nreinforcement learning (MADRL) based algorithm to decide where the predicted\npopular contents are collaboratively cached among SBSs. 
Our experimental\nresults demonstrate the superiority of our proposed scheme to existing baseline\ncaching schemes.\n","authors":["Qiong Wu","Wenhua Wang","Pingyi Fan","Qiang Fan","Huiling Zhu","Khaled B. Letaief"],"pdf_url":"https://arxiv.org/pdf/2401.09886v1.pdf","comment":"This paper has been submitted to IEEE TNSM. The source code has been\n released at:\n https://github.com/qiongwu86/Edge-Caching-Based-on-Multi-Agent-Deep-Reinforcement-Learning-and-Federated-Learning"},{"id":"http://arxiv.org/abs/2401.09881v1","updated":"2024-01-18T10:53:45Z","published":"2024-01-18T10:53:45Z","title":"GA-SmaAt-GNet: Generative Adversarial Small Attention GNet for Extreme\n Precipitation Nowcasting","summary":" In recent years, data-driven modeling approaches have gained considerable\ntraction in various meteorological applications, particularly in the realm of\nweather forecasting. However, these approaches often encounter challenges when\ndealing with extreme weather conditions. In light of this, we propose\nGA-SmaAt-GNet, a novel generative adversarial architecture that makes use of\ntwo methodologies aimed at enhancing the performance of deep learning models\nfor extreme precipitation nowcasting. Firstly, it uses a novel SmaAt-GNet built\nupon the successful SmaAt-UNet architecture as generator. This network\nincorporates precipitation masks (binarized precipitation maps) as an\nadditional data source, leveraging valuable information for improved\npredictions. Additionally, GA-SmaAt-GNet utilizes an attention-augmented\ndiscriminator inspired by the well-established Pix2Pix architecture.\nFurthermore, we assess the performance of GA-SmaAt-GNet using real-life\nprecipitation dataset from the Netherlands. Our experimental results reveal a\nnotable improvement in both overall performance and for extreme precipitation\nevents. Furthermore, we conduct uncertainty analysis on the proposed\nGA-SmaAt-GNet model as well as on the precipitation dataset, providing\nadditional insights into the predictive capabilities of the model. Finally, we\noffer further insights into the predictions of our proposed model using\nGrad-CAM. This visual explanation technique generates activation heatmaps,\nillustrating areas of the input that are more activated for various parts of\nthe network.\n","authors":["Eloy Reulen","Siamak Mehrkanoon"],"pdf_url":"https://arxiv.org/pdf/2401.09881v1.pdf","comment":"16 pages, 11 figurs"},{"id":"http://arxiv.org/abs/2401.09880v1","updated":"2024-01-18T10:52:46Z","published":"2024-01-18T10:52:46Z","title":"Attention-Based Recurrent Neural Network For Automatic Behavior Laying\n Hen Recognition","summary":" One of the interests of modern poultry farming is the vocalization of laying\nhens which contain very useful information on health behavior. This information\nis used as health and well-being indicators that help breeders better monitor\nlaying hens, which involves early detection of problems for rapid and more\neffective intervention. In this work, we focus on the sound analysis for the\nrecognition of the types of calls of the laying hens in order to propose a\nrobust system of characterization of their behavior for a better monitoring. To\ndo this, we first collected and annotated laying hen call signals, then\ndesigned an optimal acoustic characterization based on the combination of time\nand frequency domain features. 
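A small sketch of the kind of combined time- and frequency-domain acoustic characterization described above, using standard librosa features; the exact feature set used in the paper is not specified here, so the choices and the file path are illustrative:

import numpy as np
import librosa

def call_features(path, n_mfcc=13):
    """Concatenate simple time-domain and frequency-domain descriptors."""
    y, sr = librosa.load(path, sr=None)
    # Time domain: energy and zero-crossing statistics.
    rms = librosa.feature.rms(y=y).mean()
    zcr = librosa.feature.zero_crossing_rate(y).mean()
    # Frequency domain: MFCCs averaged over frames.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc).mean(axis=1)
    return np.concatenate([[rms, zcr], mfcc])

# features = call_features("hen_call.wav")  # hypothetical audio file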
We then used these features to build the\nmulti-label classification models based on recurrent neural network to assign a\nsemantic class to the vocalization that characterize the laying hen behavior.\nThe results show an overall performance with our model based on the combination\nof time and frequency domain features that obtained the highest F1-score\n(F1=92.75) with a gain of 17% on the models using the frequency domain features\nand of 8% on the compared approaches from the litterature.\n","authors":["Fréjus A. A. Laleye","Mikaël A. Mousse"],"pdf_url":"https://arxiv.org/pdf/2401.09880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05109v3","updated":"2024-01-18T10:41:37Z","published":"2023-06-08T11:16:20Z","title":"Yet Another ICU Benchmark: A Flexible Multi-Center Framework for\n Clinical ML","summary":" Medical applications of machine learning (ML) have experienced a surge in\npopularity in recent years. The intensive care unit (ICU) is a natural habitat\nfor ML given the abundance of available data from electronic health records.\nModels have been proposed to address numerous ICU prediction tasks like the\nearly detection of complications. While authors frequently report\nstate-of-the-art performance, it is challenging to verify claims of\nsuperiority. Datasets and code are not always published, and cohort\ndefinitions, preprocessing pipelines, and training setups are difficult to\nreproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular\nframework that allows researchers to define reproducible and comparable\nclinical ML experiments; we offer an end-to-end solution from cohort definition\nto model evaluation. The framework natively supports most open-access ICU\ndatasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future\nICU datasets. Combined with a transparent preprocessing pipeline and extensible\ntraining code for multiple ML and deep learning models, YAIB enables unified\nmodel development. Our benchmark comes with five predefined established\nprediction tasks (mortality, acute kidney injury, sepsis, kidney function, and\nlength of stay) developed in collaboration with clinicians. Adding further\ntasks is straightforward by design. Using YAIB, we demonstrate that the choice\nof dataset, cohort definition, and preprocessing have a major impact on the\nprediction performance - often more so than model class - indicating an urgent\nneed for YAIB as a holistic benchmarking tool. We provide our work to the\nclinical ML community to accelerate method development and enable real-world\nclinical implementations. Software Repository:\nhttps://github.com/rvandewater/YAIB.\n","authors":["Robin van de Water","Hendrik Schmidt","Paul Elbers","Patrick Thoral","Bert Arnrich","Patrick Rockenschaub"],"pdf_url":"https://arxiv.org/pdf/2306.05109v3.pdf","comment":"Main benchmark: https://github.com/rvandewater/YAIB, Cohort\n generation: https://github.com/rvandewater/YAIB-cohorts, Models:\n https://github.com/rvandewater/YAIB-models"},{"id":"http://arxiv.org/abs/2401.09870v1","updated":"2024-01-18T10:33:30Z","published":"2024-01-18T10:33:30Z","title":"Reconciling Spatial and Temporal Abstractions for Goal Representation","summary":" Goal representation affects the performance of Hierarchical Reinforcement\nLearning (HRL) algorithms by decomposing the complex learning problem into\neasier subtasks. 
Recent studies show that representations that preserve\ntemporally abstract environment dynamics are successful in solving difficult\nproblems and provide theoretical guarantees for optimality. These methods\nhowever cannot scale to tasks where environment dynamics increase in complexity\ni.e. the temporally abstract transition relations depend on larger number of\nvariables. On the other hand, other efforts have tried to use spatial\nabstraction to mitigate the previous issues. Their limitations include\nscalability to high dimensional environments and dependency on prior knowledge.\n In this paper, we propose a novel three-layer HRL algorithm that introduces,\nat different levels of the hierarchy, both a spatial and a temporal goal\nabstraction. We provide a theoretical study of the regret bounds of the learned\npolicies. We evaluate the approach on complex continuous control tasks,\ndemonstrating the effectiveness of spatial and temporal abstractions learned by\nthis approach.\n","authors":["Mehdi Zadem","Sergio Mover","Sao Mai Nguyen"],"pdf_url":"https://arxiv.org/pdf/2401.09870v1.pdf","comment":"Accepted for publication in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09865v1","updated":"2024-01-18T10:28:45Z","published":"2024-01-18T10:28:45Z","title":"Improving fine-grained understanding in image-text pre-training","summary":" We introduce SPARse Fine-grained Contrastive Alignment (SPARC), a simple\nmethod for pretraining more fine-grained multimodal representations from\nimage-text pairs. Given that multiple image patches often correspond to single\nwords, we propose to learn a grouping of image patches for every token in the\ncaption. To achieve this, we use a sparse similarity metric between image\npatches and language tokens and compute for each token a language-grouped\nvision embedding as the weighted average of patches. The token and\nlanguage-grouped vision embeddings are then contrasted through a fine-grained\nsequence-wise loss that only depends on individual samples and does not require\nother batch samples as negatives. This enables more detailed information to be\nlearned in a computationally inexpensive manner. SPARC combines this\nfine-grained loss with a contrastive loss between global image and text\nembeddings to learn representations that simultaneously encode global and local\ninformation. We thoroughly evaluate our proposed method and show improved\nperformance over competing approaches both on image-level tasks relying on\ncoarse-grained information, e.g. classification, as well as region-level tasks\nrelying on fine-grained information, e.g. retrieval, object detection, and\nsegmentation. Moreover, SPARC improves model faithfulness and captioning in\nfoundational vision-language models.\n","authors":["Ioana Bica","Anastasija Ilić","Matthias Bauer","Goker Erdogan","Matko Bošnjak","Christos Kaplanis","Alexey A. Gritsenko","Matthias Minderer","Charles Blundell","Razvan Pascanu","Jovana Mitrović"],"pdf_url":"https://arxiv.org/pdf/2401.09865v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2312.06305v2","updated":"2024-01-18T10:26:13Z","published":"2023-12-11T11:26:43Z","title":"A Meta-Level Learning Algorithm for Sequential Hyper-Parameter Space\n Reduction in AutoML","summary":" AutoML platforms have numerous options for the algorithms to try for each\nstep of the analysis, i.e., different possible algorithms for imputation,\ntransformations, feature selection, and modelling. 
Finding the optimal\ncombination of algorithms and hyper-parameter values is computationally\nexpensive, as the number of combinations to explore leads to an exponential\nexplosion of the space. In this paper, we present the Sequential\nHyper-parameter Space Reduction (SHSR) algorithm that reduces the space for an\nAutoML tool with negligible drop in its predictive performance. SHSR is a\nmeta-level learning algorithm that analyzes past runs of an AutoML tool on\nseveral datasets and learns which hyper-parameter values to filter out from\nconsideration on a new dataset to analyze. SHSR is evaluated on 284\nclassification and 375 regression problems, showing an approximate 30%\nreduction in execution time with a performance drop of less than 0.1%.\n","authors":["Giorgos Borboudakis","Paulos Charonyktakis","Konstantinos Paraschakis","Ioannis Tsamardinos"],"pdf_url":"https://arxiv.org/pdf/2312.06305v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16316v2","updated":"2024-01-18T10:23:17Z","published":"2023-09-28T10:13:23Z","title":"Astroconformer: The Prospects of Analyzing Stellar Light Curves with\n Transformer-Based Deep Learning Models","summary":" Stellar light curves contain valuable information about oscillations and\ngranulation, offering insights into stars' internal structures and evolutionary\nstates. Traditional asteroseismic techniques, primarily focused on power\nspectral analysis, often overlook the crucial phase information in these light\ncurves. Addressing this gap, recent machine learning applications, particularly\nthose using Convolutional Neural Networks (CNNs), have made strides in\ninferring stellar properties from light curves. However, CNNs are limited by\ntheir localized feature extraction capabilities. In response, we introduce\n$\\textit{Astroconformer}$, a Transformer-based deep learning framework,\nspecifically designed to capture long-range dependencies in stellar light\ncurves. Our empirical analysis centers on estimating surface gravity ($\\log\ng$), using a dataset derived from single-quarter Kepler light curves with $\\log\ng$ values ranging from 0.2 to 4.4. $\\textit{Astroconformer}$ demonstrates\nsuperior performance, achieving a root-mean-square-error (RMSE) of 0.017 dex at\n$\\log g\\approx3$ in data-rich regimes and up to 0.1 dex in sparser areas. This\nperformance surpasses both K-nearest neighbor models and advanced CNNs.\nAblation studies highlight the influence of receptive field size on model\neffectiveness, with larger fields correlating to improved results.\n$\\textit{Astroconformer}$ also excels in extracting $\\nu_{\\max}$ with high\nprecision. It achieves less than 2% relative median absolute error for 90-day\nred giant light curves. Notably, the error remains under 3% for 30-day light\ncurves, whose oscillations are undetectable by a conventional pipeline in 30%\ncases. 
Furthermore, the attention mechanisms in $\\textit{Astroconformer}$ align\nclosely with the characteristics of stellar oscillations and granulation\nobserved in light curves.\n","authors":["Jia-Shu Pan","Yuan-Sen Ting","Jie Yu"],"pdf_url":"https://arxiv.org/pdf/2309.16316v2.pdf","comment":"15 pages, 10 figures, Accepted by MNRAS"},{"id":"http://arxiv.org/abs/2305.07376v2","updated":"2024-01-18T10:22:03Z","published":"2023-05-12T10:58:21Z","title":"DAISM: Digital Approximate In-SRAM Multiplier-based Accelerator for DNN\n Training and Inference","summary":" DNNs are widely used but face significant computational costs due to matrix\nmultiplications, especially from data movement between the memory and\nprocessing units. One promising approach is therefore Processing-in-Memory as\nit greatly reduces this overhead. However, most PIM solutions rely either on\nnovel memory technologies that have yet to mature or bit-serial computations\nthat have significant performance overhead and scalability issues. Our work\nproposes an in-SRAM digital multiplier, that uses a conventional memory to\nperform bit-parallel computations, leveraging multiple wordlines activation. We\nthen introduce DAISM, an architecture leveraging this multiplier, which\nachieves up to two orders of magnitude higher area efficiency compared to the\nSOTA counterparts, with competitive energy efficiency.\n","authors":["Lorenzo Sonnino","Shaswot Shresthamali","Yuan He","Masaaki Kondo"],"pdf_url":"https://arxiv.org/pdf/2305.07376v2.pdf","comment":"12 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.09862v1","updated":"2024-01-18T10:21:15Z","published":"2024-01-18T10:21:15Z","title":"Evolutionary Multi-Objective Optimization of Large Language Model\n Prompts for Balancing Sentiments","summary":" The advent of large language models (LLMs) such as ChatGPT has attracted\nconsiderable attention in various domains due to their remarkable performance\nand versatility. As the use of these models continues to grow, the importance\nof effective prompt engineering has come to the fore. Prompt optimization\nemerges as a crucial challenge, as it has a direct impact on model performance\nand the extraction of relevant information. Recently, evolutionary algorithms\n(EAs) have shown promise in addressing this issue, paving the way for novel\noptimization strategies. In this work, we propose a evolutionary\nmulti-objective (EMO) approach specifically tailored for prompt optimization\ncalled EMO-Prompts, using sentiment analysis as a case study. We use sentiment\nanalysis capabilities as our experimental targets. Our results demonstrate that\nEMO-Prompts effectively generates prompts capable of guiding the LLM to produce\ntexts embodying two conflicting emotions simultaneously.\n","authors":["Jill Baumann","Oliver Kramer"],"pdf_url":"https://arxiv.org/pdf/2401.09862v1.pdf","comment":"Accepted in EvoApps at EvoStar 2024"},{"id":"http://arxiv.org/abs/2305.02749v5","updated":"2024-01-18T09:57:32Z","published":"2023-05-04T11:38:25Z","title":"Explainable Reinforcement Learning via a Causal World Model","summary":" Generating explanations for reinforcement learning (RL) is challenging as\nactions may produce long-term effects on the future. In this paper, we develop\na novel framework for explainable RL by learning a causal world model without\nprior knowledge of the causal structure of the environment. 
The model captures\nthe influence of actions, allowing us to interpret the long-term effects of\nactions through causal chains, which present how actions influence\nenvironmental variables and finally lead to rewards. Different from most\nexplanatory models which suffer from low accuracy, our model remains accurate\nwhile improving explainability, making it applicable in model-based learning.\nAs a result, we demonstrate that our causal model can serve as the bridge\nbetween explainability and learning.\n","authors":["Zhongwei Yu","Jingqing Ruan","Dengpeng Xing"],"pdf_url":"https://arxiv.org/pdf/2305.02749v5.pdf","comment":"Accepted by IJCAI 2023"},{"id":"http://arxiv.org/abs/2401.09840v1","updated":"2024-01-18T09:54:19Z","published":"2024-01-18T09:54:19Z","title":"FREED++: Improving RL Agents for Fragment-Based Molecule Generation by\n Thorough Reproduction","summary":" A rational design of new therapeutic drugs aims to find a molecular structure\nwith desired biological functionality, e.g., an ability to activate or suppress\na specific protein via binding to it. Molecular docking is a common technique\nfor evaluating protein-molecule interactions. Recently, Reinforcement Learning\n(RL) has emerged as a promising approach to generating molecules with the\ndocking score (DS) as a reward. In this work, we reproduce, scrutinize and\nimprove the recent RL model for molecule generation called FREED\n(arXiv:2110.01219). Extensive evaluation of the proposed method reveals several\nlimitations and challenges despite the outstanding results reported for three\ntarget proteins. Our contributions include fixing numerous implementation bugs\nand simplifying the model while increasing its quality, significantly extending\nexperiments, and conducting an accurate comparison with current\nstate-of-the-art methods for protein-conditioned molecule generation. We show\nthat the resulting fixed model is capable of producing molecules with superior\ndocking scores compared to alternative approaches.\n","authors":["Alexander Telepov","Artem Tsypin","Kuzma Khrabrov","Sergey Yakukhnov","Pavel Strashnov","Petr Zhilyaev","Egor Rumiantsev","Daniel Ezhov","Manvel Avetisian","Olga Popova","Artur Kadurin"],"pdf_url":"https://arxiv.org/pdf/2401.09840v1.pdf","comment":"37 pages, 10 figures, to be published in TMLR journal\n (https://www.jmlr.org/tmlr/)"},{"id":"http://arxiv.org/abs/2401.09180v2","updated":"2024-01-18T09:51:46Z","published":"2024-01-17T12:43:28Z","title":"Unsupervised Multiple Domain Translation through Controlled\n Disentanglement in Variational Autoencoder","summary":" Unsupervised Multiple Domain Translation is the task of transforming data\nfrom one domain to other domains without having paired data to train the\nsystems. Typically, methods based on Generative Adversarial Networks (GANs) are\nused to address this task. However, our proposal exclusively relies on a\nmodified version of a Variational Autoencoder. This modification consists of\nthe use of two latent variables disentangled in a controlled way by design. One\nof this latent variables is imposed to depend exclusively on the domain, while\nthe other one must depend on the rest of the variability factors of the data.\nAdditionally, the conditions imposed over the domain latent variable allow for\nbetter control and understanding of the latent space. We empirically\ndemonstrate that our approach works on different vision datasets improving the\nperformance of other well known methods. 
Finally, we prove that, indeed, one of\nthe latent variables stores all the information related to the domain and the\nother one hardly contains any domain information.\n","authors":["Antonio Almudévar","Théo Mariotte","Alfonso Ortega","Marie Tahon"],"pdf_url":"https://arxiv.org/pdf/2401.09180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20708v2","updated":"2024-01-18T09:30:45Z","published":"2023-10-31T17:59:56Z","title":"Unexpected Improvements to Expected Improvement for Bayesian\n Optimization","summary":" Expected Improvement (EI) is arguably the most popular acquisition function\nin Bayesian optimization and has found countless successful applications, but\nits performance is often exceeded by that of more recent methods. Notably, EI\nand its variants, including for the parallel and multi-objective settings, are\nchallenging to optimize because their acquisition values vanish numerically in\nmany regions. This difficulty generally increases as the number of\nobservations, dimensionality of the search space, or the number of constraints\ngrow, resulting in performance that is inconsistent across the literature and\nmost often sub-optimal. Herein, we propose LogEI, a new family of acquisition\nfunctions whose members either have identical or approximately equal optima as\ntheir canonical counterparts, but are substantially easier to optimize\nnumerically. We demonstrate that numerical pathologies manifest themselves in\n\"classic\" analytic EI, Expected Hypervolume Improvement (EHVI), as well as\ntheir constrained, noisy, and parallel variants, and propose corresponding\nreformulations that remedy these pathologies. Our empirical results show that\nmembers of the LogEI family of acquisition functions substantially improve on\nthe optimization performance of their canonical counterparts and surprisingly,\nare on par with or exceed the performance of recent state-of-the-art\nacquisition functions, highlighting the understated role of numerical\noptimization in the literature.\n","authors":["Sebastian Ament","Samuel Daulton","David Eriksson","Maximilian Balandat","Eytan Bakshy"],"pdf_url":"https://arxiv.org/pdf/2310.20708v2.pdf","comment":"NeurIPS 2023 Spotlight"},{"id":"http://arxiv.org/abs/2305.02650v2","updated":"2024-01-18T09:25:44Z","published":"2023-05-04T08:41:03Z","title":"A Constrained BA Algorithm for Rate-Distortion and Distortion-Rate\n Functions","summary":" The Blahut-Arimoto (BA) algorithm has played a fundamental role in the\nnumerical computation of rate-distortion (RD) functions. This algorithm\npossesses a desirable monotonic convergence property by alternatively\nminimizing its Lagrangian with a fixed multiplier. In this paper, we propose a\nnovel modification of the BA algorithm, wherein the multiplier is updated\nthrough a one-dimensional root-finding step using a monotonic univariate\nfunction, efficiently implemented by Newton's method in each iteration.\nConsequently, the modified algorithm directly computes the RD function for a\ngiven target distortion, without exploring the entire RD curve as in the\noriginal BA algorithm. Moreover, this modification presents a versatile\nframework, applicable to a wide range of problems, including the computation of\ndistortion-rate (DR) functions. 
Theoretical analysis shows that the outputs of\nthe modified algorithms still converge to the solutions of the RD and DR\nfunctions with rate $O(1/n)$, where $n$ is the number of iterations.\nAdditionally, these algorithms provide $\\varepsilon$-approximation solutions\nwith $O\\left(\\frac{MN\\log N}{\\varepsilon}(1+\\log |\\log \\varepsilon|)\\right)$\narithmetic operations, where $M,N$ are the sizes of source and reproduced\nalphabets respectively. Numerical experiments demonstrate that the modified\nalgorithms exhibit significant acceleration compared with the original BA\nalgorithms and showcase commendable performance across classical source\ndistributions such as discretized Gaussian, Laplacian and uniform sources.\n","authors":["Lingyi Chen","Shitong Wu","Wenhao Ye","Huihui Wu","Wenyi Zhang","Hao Wu","Bo Bai"],"pdf_url":"https://arxiv.org/pdf/2305.02650v2.pdf","comment":"Version_2"},{"id":"http://arxiv.org/abs/2401.09819v1","updated":"2024-01-18T09:20:27Z","published":"2024-01-18T09:20:27Z","title":"PPNet: A Novel Neural Network Structure for End-to-End Near-Optimal Path\n Planning","summary":" The classical path planners, such as sampling-based path planners, have the\nlimitations of sensitivity to the initial solution and slow convergence to the\noptimal solution. However, finding a near-optimal solution in a short period is\nchallenging in many applications such as the autonomous vehicle with limited\npower/fuel. To achieve an end-to-end near-optimal path planner, we first divide\nthe path planning problem into two subproblems, which are path's space\nsegmentation and waypoints generation in the given path's space. We further\npropose a two-level cascade neural network named Path Planning Network (PPNet)\nto solve the path planning problem by solving the abovementioned subproblems.\nMoreover, we propose a novel efficient data generation method for path planning\nnamed EDaGe-PP. The results show the total computation time is less than 1/33\nand the success rate of PPNet trained by the dataset that is generated by\nEDaGe-PP is about $2 \\times$ compared to other methods. We validate PPNet\nagainst state-of-the-art path planning methods. The results show PPNet can find\na near-optimal solution in 15.3ms, which is much shorter than the\nstate-of-the-art path planners.\n","authors":["Qinglong Meng","Chongkun Xia","Xueqian Wang","Songping Mai","Bin Liang"],"pdf_url":"https://arxiv.org/pdf/2401.09819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09804v1","updated":"2024-01-18T08:48:54Z","published":"2024-01-18T08:48:54Z","title":"Clickbait vs. Quality: How Engagement-Based Optimization Shapes the\n Content Landscape in Online Platforms","summary":" Online content platforms commonly use engagement-based optimization when\nmaking recommendations. This encourages content creators to invest in quality,\nbut also rewards gaming tricks such as clickbait. To understand the total\nimpact on the content landscape, we study a game between content creators\ncompeting on the basis of engagement metrics and analyze the equilibrium\ndecisions about investment in quality and gaming. First, we show the content\ncreated at equilibrium exhibits a positive correlation between quality and\ngaming, and we empirically validate this finding on a Twitter dataset. 
Using\nthe equilibrium structure of the content landscape, we then examine the\ndownstream performance of engagement-based optimization along several axes.\nPerhaps counterintuitively, the average quality of content consumed by users\ncan decrease at equilibrium as gaming tricks become more costly for content\ncreators to employ. Moreover, engagement-based optimization can perform worse\nin terms of user utility than a baseline with random recommendations, and\nengagement-based optimization is also suboptimal in terms of realized\nengagement relative to quality-based optimization. Altogether, our results\nhighlight the need to consider content creator incentives when evaluating a\nplatform's choice of optimization metric.\n","authors":["Nicole Immorlica","Meena Jagadeesan","Brendan Lucier"],"pdf_url":"https://arxiv.org/pdf/2401.09804v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09796v1","updated":"2024-01-18T08:33:09Z","published":"2024-01-18T08:33:09Z","title":"A Fast, Performant, Secure Distributed Training Framework For Large\n Language Model","summary":" The distributed (federated) LLM is an important method for co-training the\ndomain-specific LLM using siloed data. However, maliciously stealing model\nparameters and data from the server or client side has become an urgent problem\nto be solved. In this paper, we propose a secure distributed LLM based on model\nslicing. In this case, we deploy the Trusted Execution Environment (TEE) on\nboth the client and server side, and put the fine-tuned structure (LoRA or\nembedding of P-tuning v2) into the TEE. Then, secure communication is executed\nin the TEE and general environments through lightweight encryption. In order to\nfurther reduce the equipment cost as well as increase the model performance and\naccuracy, we propose a split fine-tuning scheme. In particular, we split the\nLLM by layers and place the latter layers in a server-side TEE (the client does\nnot need a TEE). We then combine the proposed Sparsification Parameter\nFine-tuning (SPF) with the LoRA part to improve the accuracy of the downstream\ntask. Numerous experiments have shown that our method guarantees accuracy while\nmaintaining security.\n","authors":["Wei Huang","Yinggui Wang","Anda Cheng","Aihui Zhou","Chaofan Yu","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09796v1.pdf","comment":"Accept ICASSP2024"},{"id":"http://arxiv.org/abs/2401.09793v1","updated":"2024-01-18T08:26:33Z","published":"2024-01-18T08:26:33Z","title":"PatchAD: Patch-based MLP-Mixer for Time Series Anomaly Detection","summary":" Anomaly detection stands as a crucial aspect of time series analysis, aiming\nto identify abnormal events in time series samples. The central challenge of\nthis task lies in effectively learning the representations of normal and\nabnormal patterns in a label-lacking scenario. Previous research mostly relied\non reconstruction-based approaches, restricting the representational abilities\nof the models. In addition, most of the current deep learning-based methods are\nnot lightweight enough, which prompts us to design a more efficient framework\nfor anomaly detection. In this study, we introduce PatchAD, a novel multi-scale\npatch-based MLP-Mixer architecture that leverages contrastive learning for\nrepresentational extraction and anomaly detection. Specifically, PatchAD is\ncomposed of four distinct MLP Mixers, exclusively utilizing the MLP\narchitecture for high efficiency and lightweight architecture. 
Additionally, we\nalso innovatively crafted a dual project constraint module to mitigate\npotential model degradation. Comprehensive experiments demonstrate that PatchAD\nachieves state-of-the-art results across multiple real-world multivariate time\nseries datasets. Our code is publicly\navailable.\\footnote{\\url{https://github.com/EmorZz1G/PatchAD}}\n","authors":["Zhijie Zhong","Zhiwen Yu","Yiyuan Yang","Weizheng Wang","Kaixiang Yang"],"pdf_url":"https://arxiv.org/pdf/2401.09793v1.pdf","comment":"13 pages, 16 figures, IJCAI 2024 under review, paper id 3166"},{"id":"http://arxiv.org/abs/2401.09791v1","updated":"2024-01-18T08:23:29Z","published":"2024-01-18T08:23:29Z","title":"BreastRegNet: A Deep Learning Framework for Registration of Breast\n Faxitron and Histopathology Images","summary":" A standard treatment protocol for breast cancer entails administering\nneoadjuvant therapy followed by surgical removal of the tumor and surrounding\ntissue. Pathologists typically rely on cabinet X-ray radiographs, known as\nFaxitron, to examine the excised breast tissue and diagnose the extent of\nresidual disease. However, accurately determining the location, size, and\nfocality of residual cancer can be challenging, and incorrect assessments can\nlead to clinical consequences. The utilization of automated methods can improve\nthe histopathology process, allowing pathologists to choose regions for\nsampling more effectively and precisely. Despite the recognized necessity,\nthere are currently no such methods available. Training such automated\ndetection models require accurate ground truth labels on ex-vivo radiology\nimages, which can be acquired through registering Faxitron and histopathology\nimages and mapping the extent of cancer from histopathology to x-ray images.\nThis study introduces a deep learning-based image registration approach trained\non mono-modal synthetic image pairs. The models were trained using data from 50\nwomen who received neoadjuvant chemotherapy and underwent surgery. The results\ndemonstrate that our method is faster and yields significantly lower average\nlandmark error ($2.1\\pm1.96$ mm) over the state-of-the-art iterative\n($4.43\\pm4.1$ mm) and deep learning ($4.02\\pm3.15$ mm) approaches. Improved\nperformance of our approach in integrating radiology and pathology information\nfacilitates generating large datasets, which allows training models for more\naccurate breast cancer detection.\n","authors":["Negar Golestani","Aihui Wang","Gregory R Bean","Mirabela Rusu"],"pdf_url":"https://arxiv.org/pdf/2401.09791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04855v2","updated":"2024-01-18T08:18:58Z","published":"2024-01-10T00:08:00Z","title":"LPAC: Learnable Perception-Action-Communication Loops with Applications\n to Coverage Control","summary":" Coverage control is the problem of navigating a robot swarm to\ncollaboratively monitor features or a phenomenon of interest not known a\npriori. The problem is challenging in decentralized settings with robots that\nhave limited communication and sensing capabilities. We propose a learnable\nPerception-Action-Communication (LPAC) architecture for the problem, wherein a\nconvolution neural network (CNN) processes localized perception; a graph neural\nnetwork (GNN) facilitates robot communications; finally, a shallow multi-layer\nperceptron (MLP) computes robot actions. 
The GNN enables collaboration in the\nrobot swarm by computing what information to communicate with nearby robots and\nhow to incorporate received information. Evaluations show that the LPAC models\n-- trained using imitation learning -- outperform standard decentralized and\ncentralized coverage control algorithms. The learned policy generalizes to\nenvironments different from the training dataset, transfers to larger\nenvironments with more robots, and is robust to noisy position estimates. The\nresults indicate the suitability of LPAC architectures for decentralized\nnavigation in robot swarms to achieve collaborative behavior.\n","authors":["Saurav Agarwal","Ramya Muthukrishnan","Walker Gosrich","Vijay Kumar","Alejandro Ribeiro"],"pdf_url":"https://arxiv.org/pdf/2401.04855v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09787v1","updated":"2024-01-18T08:12:23Z","published":"2024-01-18T08:12:23Z","title":"Querying Easily Flip-flopped Samples for Deep Active Learning","summary":" Active learning is a machine learning paradigm that aims to improve the\nperformance of a model by strategically selecting and querying unlabeled data.\nOne effective selection strategy is to base it on the model's predictive\nuncertainty, which can be interpreted as a measure of how informative a sample\nis. The sample's distance to the decision boundary is a natural measure of\npredictive uncertainty, but it is often intractable to compute, especially for\ncomplex decision boundaries formed in multiclass classification tasks. To\naddress this issue, this paper proposes the {\\it least disagree metric} (LDM),\ndefined as the smallest probability of disagreement of the predicted label, and\nan estimator for LDM proven to be asymptotically consistent under mild\nassumptions. The estimator is computationally efficient and can be easily\nimplemented for deep learning models using parameter perturbation. The\nLDM-based active learning is performed by querying unlabeled data with the\nsmallest LDM. Experimental results show that our LDM-based active learning\nalgorithm obtains state-of-the-art overall performance on all considered\ndatasets and deep architectures.\n","authors":["Seong Jin Cho","Gwangsu Kim","Junghyun Lee","Jinwoo Shin","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2401.09787v1.pdf","comment":"34 pages, 17 figures, 5 tables. Accepted to the 12th International\n Conference on Learning Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2307.02140v2","updated":"2024-01-18T07:57:06Z","published":"2023-07-05T09:30:14Z","title":"Towards Open Federated Learning Platforms: Survey and Vision from\n Technical and Legal Perspectives","summary":" Traditional Federated Learning (FL) follows a server-domincated cooperation\nparadigm which narrows the application scenarios of FL and decreases the\nenthusiasm of data holders to participate. To fully unleash the potential of\nFL, we advocate rethinking the design of current FL frameworks and extending it\nto a more generalized concept: Open Federated Learning Platforms. We propose\ntwo reciprocal cooperation frameworks for FL to achieve this: query-based FL\nand contract-based FL. In this survey, we conduct a comprehensive review of the\nfeasibility of constructing an open FL platform from both technical and legal\nperspectives. We begin by reviewing the definition of FL and summarizing its\ninherent limitations, including server-client coupling, low model reusability,\nand non-public. 
In the query-based FL platform, which is an open model sharing\nand reusing platform empowered by the community for model mining, we explore a\nwide range of valuable topics, including the availability of up-to-date model\nrepositories for model querying, legal compliance analysis between different\nmodel licenses, and copyright issues and intellectual property protection in\nmodel reusing. In particular, we introduce a novel taxonomy to streamline the\nanalysis of model license compatibility in FL studies that involve batch model\nreusing methods, including combination, amalgamation, distillation, and\ngeneration. This taxonomy provides a systematic framework for identifying the\ncorresponding clauses of licenses and facilitates the identification of\npotential legal implications and restrictions when reusing models. Through this\nsurvey, we uncover the the current dilemmas faced by FL and advocate for the\ndevelopment of sustainable open FL platforms. We aim to provide guidance for\nestablishing such platforms in the future, while identifying potential problems\nand challenges that need to be addressed.\n","authors":["Moming Duan"],"pdf_url":"https://arxiv.org/pdf/2307.02140v2.pdf","comment":"This is an ongoing work. See the latest version on\n https://github.com/morningD/Model-Centric-FML"},{"id":"http://arxiv.org/abs/2311.18243v2","updated":"2024-01-18T07:47:24Z","published":"2023-11-30T04:21:10Z","title":"DKiS: Decay weight invertible image steganography with private key","summary":" Image steganography, defined as the practice of concealing information within\nanother image, traditionally encounters security challenges when its methods\nbecome publicly known or are under attack. To address this, a novel private\nkey-based image steganography technique has been introduced. This approach\nensures the security of the hidden information, as access requires a\ncorresponding private key, regardless of the public knowledge of the\nsteganography method. Experimental evidence has been presented, demonstrating\nthe effectiveness of our method and showcasing its real-world applicability.\nFurthermore, a critical challenge in the invertible image steganography process\nhas been identified by us: the transfer of non-essential, or `garbage',\ninformation from the secret to the host pipeline. To tackle this issue, the\ndecay weight has been introduced to control the information transfer,\neffectively filtering out irrelevant data and enhancing the performance of\nimage steganography. The code for this technique is publicly accessible at\nhttps://github.com/yanghangAI/DKiS, and a practical demonstration can be found\nat http://yanghang.site/hidekey.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18243v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00695v5","updated":"2024-01-18T07:39:29Z","published":"2023-02-01T19:00:10Z","title":"Versatile Energy-Based Probabilistic Models for High Energy Physics","summary":" As a classical generative modeling approach, energy-based models have the\nnatural advantage of flexibility in the form of the energy function. Recently,\nenergy-based models have achieved great success in modeling high-dimensional\ndata in computer vision and natural language processing. In line with these\nadvancements, we build a multi-purpose energy-based probabilistic model for\nHigh Energy Physics events at the Large Hadron Collider. 
This framework builds\non a powerful generative model and describes higher-order inter-particle\ninteractions. It suits different encoding architectures and builds on implicit\ngeneration. As for applicative aspects, it can serve as a powerful\nparameterized event generator for physics simulation, a generic anomalous\nsignal detector free from spurious correlations, and an augmented event\nclassifier for particle identification.\n","authors":["Taoli Cheng","Aaron Courville"],"pdf_url":"https://arxiv.org/pdf/2302.00695v5.pdf","comment":"17 pages, 9 figures. NeurIPS 2023 camera ready"},{"id":"http://arxiv.org/abs/2401.09769v1","updated":"2024-01-18T07:36:38Z","published":"2024-01-18T07:36:38Z","title":"Towards Learning from Graphs with Heterophily: Progress and Future","summary":" Graphs are structured data that models complex relations between real-world\nentities. Heterophilous graphs, where linked nodes are prone to be with\ndifferent labels or dissimilar features, have recently attracted significant\nattention and found many applications. Meanwhile, increasing efforts have been\nmade to advance learning from heterophilous graphs. Although there exist\nsurveys on the relevant topic, they focus on heterophilous GNNs, which are only\nsub-topics of heterophilous graph learning. In this survey, we comprehensively\noverview existing works on learning from graphs with heterophily.First, we\ncollect over 180 publications and introduce the development of this field.\nThen, we systematically categorize existing methods based on a hierarchical\ntaxonomy including learning strategies, model architectures and practical\napplications. Finally, we discuss the primary challenges of existing studies\nand highlight promising avenues for future research.More publication details\nand corresponding open-source codes can be accessed and will be continuously\nupdated at our\nrepositories:https://github.com/gongchenghua/Awesome-Survey-Graphs-with-Heterophily.\n","authors":["Chenghua Gong","Yao Cheng","Xiang Li","Caihua Shan","Siqiang Luo","Chuan Shi"],"pdf_url":"https://arxiv.org/pdf/2401.09769v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2303.02472v2","updated":"2024-01-18T07:27:09Z","published":"2023-03-04T18:06:36Z","title":"ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration\n Measure","summary":" Studies have shown that modern neural networks tend to be poorly calibrated\ndue to over-confident predictions. Traditionally, post-processing methods have\nbeen used to calibrate the model after training. In recent years, various\ntrainable calibration measures have been proposed to incorporate them directly\ninto the training process. However, these methods all incorporate internal\nhyperparameters, and the performance of these calibration objectives relies on\ntuning these hyperparameters, incurring more computational costs as the size of\nneural networks and datasets become larger. As such, we present Expected\nSquared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable\ncalibration objective loss, where we view the calibration error from the\nperspective of the squared difference between the two expectations. 
With\nextensive experiments on several architectures (CNNs, Transformers) and\ndatasets, we demonstrate that (1) incorporating ESD into the training improves\nmodel calibration in various batch size settings without the need for internal\nhyperparameter tuning, (2) ESD yields the best-calibrated results compared with\nprevious approaches, and (3) ESD drastically improves the computational costs\nrequired for calibration during training due to the absence of internal\nhyperparameter. The code is publicly accessible at\nhttps://github.com/hee-suk-yoon/ESD.\n","authors":["Hee Suk Yoon","Joshua Tian Jin Tee","Eunseop Yoon","Sunjae Yoon","Gwangsu Kim","Yingzhen Li","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2303.02472v2.pdf","comment":"ICLR 2023"},{"id":"http://arxiv.org/abs/2401.09756v1","updated":"2024-01-18T07:07:42Z","published":"2024-01-18T07:07:42Z","title":"Explaining Drift using Shapley Values","summary":" Machine learning models often deteriorate in their performance when they are\nused to predict the outcomes over data on which they were not trained. These\nscenarios can often arise in real world when the distribution of data changes\ngradually or abruptly due to major events like a pandemic. There have been many\nattempts in machine learning research to come up with techniques that are\nresilient to such Concept drifts. However, there is no principled framework to\nidentify the drivers behind the drift in model performance. In this paper, we\npropose a novel framework - DBShap that uses Shapley values to identify the\nmain contributors of the drift and quantify their respective contributions. The\nproposed framework not only quantifies the importance of individual features in\ndriving the drift but also includes the change in the underlying relation\nbetween the input and output as a possible driver. The explanation provided by\nDBShap can be used to understand the root cause behind the drift and use it to\nmake the model resilient to the drift.\n","authors":["Narayanan U. Edakunni","Utkarsh Tekriwal","Anukriti Jain"],"pdf_url":"https://arxiv.org/pdf/2401.09756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09754v1","updated":"2024-01-18T06:57:29Z","published":"2024-01-18T06:57:29Z","title":"Universally Robust Graph Neural Networks by Preserving Neighbor\n Similarity","summary":" Despite the tremendous success of graph neural networks in learning\nrelational data, it has been widely investigated that graph neural networks are\nvulnerable to structural attacks on homophilic graphs. Motivated by this, a\nsurge of robust models is crafted to enhance the adversarial robustness of\ngraph neural networks on homophilic graphs. However, the vulnerability based on\nheterophilic graphs remains a mystery to us. To bridge this gap, in this paper,\nwe start to explore the vulnerability of graph neural networks on heterophilic\ngraphs and theoretically prove that the update of the negative classification\nloss is negatively correlated with the pairwise similarities based on the\npowered aggregated neighbor features. This theoretical proof explains the\nempirical observations that the graph attacker tends to connect dissimilar node\npairs based on the similarities of neighbor features instead of ego features\nboth on homophilic and heterophilic graphs. In this way, we novelly introduce a\nnovel robust model termed NSPGNN which incorporates a dual-kNN graphs pipeline\nto supervise the neighbor similarity-guided propagation. 
This propagation\nutilizes the low-pass filter to smooth the features of node pairs along the\npositive kNN graphs and the high-pass filter to discriminate the features of\nnode pairs along the negative kNN graphs. Extensive experiments on both\nhomophilic and heterophilic graphs validate the universal robustness of NSPGNN\ncompared to the state-of-the-art methods.\n","authors":["Yulin Zhu","Yuni Lai","Xing Ai","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.09754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12971v2","updated":"2024-01-18T06:57:18Z","published":"2023-09-22T16:11:17Z","title":"Higher-order Graph Convolutional Network with Flower-Petals Laplacians\n on Simplicial Complexes","summary":" Despite the recent successes of vanilla Graph Neural Networks (GNNs) on\nvarious tasks, their foundation on pairwise networks inherently limits their\ncapacity to discern latent higher-order interactions in complex systems. To\nbridge this capability gap, we propose a novel approach exploiting the rich\nmathematical theory of simplicial complexes (SCs) - a robust tool for modeling\nhigher-order interactions. Current SC-based GNNs are burdened by high\ncomplexity and rigidity, and quantifying higher-order interaction strengths\nremains challenging. Innovatively, we present a higher-order Flower-Petals (FP)\nmodel, incorporating FP Laplacians into SCs. Further, we introduce a\nHigher-order Graph Convolutional Network (HiGCN) grounded in FP Laplacians,\ncapable of discerning intrinsic features across varying topological scales. By\nemploying learnable graph filters, a parameter group within each FP Laplacian\ndomain, we can identify diverse patterns where the filters' weights serve as a\nquantifiable measure of higher-order interaction strengths. The theoretical\nunderpinnings of HiGCN's advanced expressiveness are rigorously demonstrated.\nAdditionally, our empirical investigations reveal that the proposed model\naccomplishes state-of-the-art performance on a range of graph tasks and\nprovides a scalable and flexible solution to explore higher-order interactions\nin graphs. Codes and datasets are available at\nhttps://github.com/Yiminghh/HiGCN.\n","authors":["Yiming Huang","Yujie Zeng","Qiang Wu","Linyuan Lü"],"pdf_url":"https://arxiv.org/pdf/2309.12971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09753v1","updated":"2024-01-18T06:57:05Z","published":"2024-01-18T06:57:05Z","title":"Applications of Machine Learning to Optimizing Polyolefin Manufacturing","summary":" This chapter is a preprint from our book by , focusing on leveraging machine\nlearning (ML) in chemical and polyolefin manufacturing optimization. It's\ncrafted for both novices and seasoned professionals keen on the latest ML\napplications in chemical processes. We trace the evolution of AI and ML in\nchemical industries, delineate core ML components, and provide resources for ML\nbeginners. A detailed discussion on various ML methods is presented, covering\nregression, classification, and unsupervised learning techniques, with\nperformance metrics and examples. Ensemble methods, deep learning networks,\nincluding MLP, DNNs, RNNs, CNNs, and transformers, are explored for their\ngrowing role in chemical applications. Practical workshops guide readers\nthrough predictive modeling using advanced ML algorithms. The chapter\nculminates with insights into science-guided ML, advocating for a hybrid\napproach that enhances model accuracy. 
The extensive bibliography offers\nresources for further research and practical implementation. This chapter aims\nto be a thorough primer on ML's practical application in chemical engineering,\nparticularly for polyolefin production, and sets the stage for continued\nlearning in subsequent chapters. Please cite the original work [169,170] when\nreferencing.\n","authors":["Niket Sharma","Y. A. Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09752v1","updated":"2024-01-18T06:52:52Z","published":"2024-01-18T06:52:52Z","title":"Improving Speaker-independent Speech Emotion Recognition Using Dynamic\n Joint Distribution Adaptation","summary":" In speaker-independent speech emotion recognition, the training and testing\nsamples are collected from diverse speakers, leading to a multi-domain shift\nchallenge across the feature distributions of data from different speakers.\nConsequently, when the trained model is confronted with data from new speakers,\nits performance tends to degrade. To address the issue, we propose a Dynamic\nJoint Distribution Adaptation (DJDA) method under the framework of multi-source\ndomain adaptation. DJDA firstly utilizes joint distribution adaptation (JDA),\ninvolving marginal distribution adaptation (MDA) and conditional distribution\nadaptation (CDA), to more precisely measure the multi-domain distribution\nshifts caused by different speakers. This helps eliminate speaker bias in\nemotion features, allowing for learning discriminative and speaker-invariant\nspeech emotion features from coarse-level to fine-level. Furthermore, we\nquantify the adaptation contributions of MDA and CDA within JDA by using a\ndynamic balance factor based on $\\mathcal{A}$-Distance, promoting to\neffectively handle the unknown distributions encountered in data from new\nspeakers. Experimental results demonstrate the superior performance of our DJDA\nas compared to other state-of-the-art (SOTA) methods.\n","authors":["Cheng Lu","Yuan Zong","Hailun Lian","Yan Zhao","Björn Schuller","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.09752v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.08689v2","updated":"2024-01-18T06:45:16Z","published":"2024-01-13T08:30:13Z","title":"NODI: Out-Of-Distribution Detection with Noise from Diffusion","summary":" Out-of-distribution (OOD) detection is a crucial part of deploying machine\nlearning models safely. It has been extensively studied with a plethora of\nmethods developed in the literature. This problem is tackled with an OOD score\ncomputation, however, previous methods compute the OOD scores with limited\nusage of the in-distribution dataset. For instance, the OOD scores are computed\nwith information from a small portion of the in-distribution data. Furthermore,\nthese methods encode images with a neural image encoder. The robustness of\nthese methods is rarely checked with respect to image encoders of different\ntraining methods and architectures. In this work, we introduce the diffusion\nprocess into the OOD task. The diffusion model integrates information on the\nwhole training set into the predicted noise vectors. What's more, we deduce a\nclosed-form solution for the noise vector (stable point). 
Then the noise vector\nis converted into our OOD score, we test both the deep model predicted noise\nvector and the closed-form noise vector on the OOD benchmarks \\cite{openood}.\nOur method outperforms previous OOD methods across all types of image encoders\n(Table. \\ref{main}). A $3.5\\%$ performance gain is achieved with the MAE-based\nimage encoder. Moreover, we studied the robustness of OOD methods by applying\ndifferent types of image encoders. Some OOD methods failed to generalize well\nwhen switching image encoders from ResNet to Vision Transformers, our method\nperforms exhibits good robustness with all the image encoders.\n","authors":["Jingqiu Zhou","Aojun Zhou","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2401.08689v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09750v1","updated":"2024-01-18T06:32:53Z","published":"2024-01-18T06:32:53Z","title":"Exploration and Anti-Exploration with Distributional Random Network\n Distillation","summary":" Exploration remains a critical issue in deep reinforcement learning for an\nagent to attain high returns in unknown environments. Although the prevailing\nexploration Random Network Distillation (RND) algorithm has been demonstrated\nto be effective in numerous environments, it often needs more discriminative\npower in bonus allocation. This paper highlights the ``bonus inconsistency''\nissue within RND, pinpointing its primary limitation. To address this issue, we\nintroduce the Distributional RND (DRND), a derivative of the RND. DRND enhances\nthe exploration process by distilling a distribution of random networks and\nimplicitly incorporating pseudo counts to improve the precision of bonus\nallocation. This refinement encourages agents to engage in more extensive\nexploration. Our method effectively mitigates the inconsistency issue without\nintroducing significant computational overhead. Both theoretical analysis and\nexperimental results demonstrate the superiority of our approach over the\noriginal RND algorithm. Our method excels in challenging online exploration\nscenarios and effectively serves as an anti-exploration mechanism in D4RL\noffline tasks.\n","authors":["Kai Yang","Jian Tao","Jiafei Lyu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2401.09750v1.pdf","comment":"Submitted to ICML 2024"},{"id":"http://arxiv.org/abs/2401.09748v1","updated":"2024-01-18T06:19:05Z","published":"2024-01-18T06:19:05Z","title":"Bootstrapping OTS-Funcimg Pre-training Model (Botfip) -- A Comprehensive\n Symbolic Regression Framework","summary":" In the field of scientific computing, many problem-solving approaches tend to\nfocus only on the process and final outcome, even in AI for science, there is a\nlack of deep multimodal information mining behind the data, missing a\nmultimodal framework akin to that in the image-text domain. In this paper, we\ntake Symbolic Regression(SR) as our focal point and, drawing inspiration from\nthe BLIP model in the image-text domain, propose a scientific computing\nmultimodal framework based on Function Images (Funcimg) and Operation Tree\nSequence (OTS), named Bootstrapping OTS-Funcimg Pre-training Model (Botfip). In\nSR experiments, we validate the advantages of Botfip in low-complexity SR\nproblems, showcasing its potential. 
As a MED framework, Botfip holds promise\nfor future applications in a broader range of scientific computing problems.\n","authors":["Tianhao Chen","Pengbo Xu","Haibiao Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.09748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.00924v3","updated":"2024-01-18T06:13:12Z","published":"2023-01-03T01:36:31Z","title":"Increasing biases can be more efficient than increasing weights","summary":" We introduce a novel computational unit for neural networks that features\nmultiple biases, challenging the traditional perceptron structure. This unit\nemphasizes the importance of preserving uncorrupted information as it is passed\nfrom one unit to the next, applying activation functions later in the process\nwith specialized biases for each unit. Through both empirical and theoretical\nanalyses, we show that by focusing on increasing biases rather than weights,\nthere is potential for significant enhancement in a neural network model's\nperformance. This approach offers an alternative perspective on optimizing\ninformation flow within neural networks. See source code at\nhttps://github.com/CuriosAI/dac-dev.\n","authors":["Carlo Metta","Marco Fantozzi","Andrea Papini","Gianluca Amato","Matteo Bergamaschi","Silvia Giulia Galfrè","Alessandro Marchetti","Michelangelo Vegliò","Maurizio Parton","Francesco Morandin"],"pdf_url":"https://arxiv.org/pdf/2301.00924v3.pdf","comment":"Major rewriting. Supersedes v1 and v2. Focusing on the fact that not\n all parameters are born equal: biases can be more important than weights.\n Accordingly, new title and new abstract, and many more experiments on fully\n connected architectures. This is the extended version of the paper published\n at WACV 2024"},{"id":"http://arxiv.org/abs/2308.08469v5","updated":"2024-01-18T06:01:28Z","published":"2023-08-16T16:19:50Z","title":"LLM4TS: Aligning Pre-Trained LLMs as Data-Efficient Time-Series\n Forecasters","summary":" Multivariate time-series forecasting is vital in various domains, e.g.,\neconomic planning and weather prediction. Deep train-from-scratch models have\nexhibited effective performance yet require large amounts of data, which limits\nreal-world applicability. Recently, researchers have leveraged the\nrepresentation learning transferability of pre-trained Large Language Models\n(LLMs) to handle limited non-linguistic datasets effectively. However,\nincorporating LLMs with time-series data presents challenges of limited\nadaptation due to different compositions between time-series and linguistic\ndata, and the inability to process multi-scale temporal information. To tackle\nthese challenges, we propose LLM4TS, a framework for time-series forecasting\nwith pre-trained LLMs. LLM4TS consists of a two-stage fine-tuning strategy: the\n\\textit{time-series alignment} stage to align LLMs with the nuances of\ntime-series data, and the \\textit{forecasting fine-tuning} stage for downstream\ntime-series forecasting tasks. Furthermore, our framework features a novel\ntwo-level aggregation method that integrates multi-scale temporal data within\npre-trained LLMs, enhancing their ability to interpret time-specific\ninformation. In experiments across 7 time-series forecasting datasets, LLM4TS\nis superior to existing state-of-the-art methods compared with\ntrained-from-scratch models in full-shot scenarios, and also achieves an\naverage improvement of 6.84% in MSE in few-shot scenarios. 
In addition,\nevaluations compared with different self-supervised learning approaches\nhighlight LLM4TS's effectiveness with representation learning in forecasting\ntasks.\n","authors":["Ching Chang","Wei-Yao Wang","Wen-Chih Peng","Tien-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.08469v5.pdf","comment":"This paper is currently under review. The code will be made available\n upon acceptance"},{"id":"http://arxiv.org/abs/2303.09906v2","updated":"2024-01-18T05:42:20Z","published":"2023-03-17T11:49:17Z","title":"Discovering mesoscopic descriptions of collective movement with neural\n stochastic modelling","summary":" Collective motion is an ubiquitous phenomenon in nature, inspiring engineers,\nphysicists and mathematicians to develop mathematical models and bio-inspired\ndesigns. Collective motion at small to medium group sizes ($\\sim$10-1000\nindividuals, also called the `mesoscale'), can show nontrivial features due to\nstochasticity. Therefore, characterizing both the deterministic and stochastic\naspects of the dynamics is crucial in the study of mesoscale collective\nphenomena. Here, we use a physics-inspired, neural-network based approach to\ncharacterize the stochastic group dynamics of interacting individuals, through\na stochastic differential equation (SDE) that governs the collective dynamics\nof the group. We apply this technique on both synthetic and real-world\ndatasets, and identify the deterministic and stochastic aspects of the dynamics\nusing drift and diffusion fields, enabling us to make novel inferences about\nthe nature of order in these systems.\n","authors":["Utkarsh Pratiush","Arshed Nabeel","Vishwesha Guttal","Prathosh AP"],"pdf_url":"https://arxiv.org/pdf/2303.09906v2.pdf","comment":"(v2) Minor corrections and clarifications. Added funding sources"},{"id":"http://arxiv.org/abs/2401.08727v2","updated":"2024-01-18T05:25:22Z","published":"2024-01-16T14:22:44Z","title":"MA2GCN: Multi Adjacency relationship Attention Graph Convolutional\n Networks for Traffic Prediction using Trajectory data","summary":" The problem of traffic congestion not only causes a large amount of economic\nlosses, but also seriously endangers the urban environment. Predicting traffic\ncongestion has important practical significance. So far, most studies have been\nbased on historical data from sensors placed on different roads to predict\nfuture traffic flow and speed, to analyze the traffic congestion conditions of\na certain road segment. However, due to the fixed position of sensors, it is\ndifficult to mine new information. On the other hand, vehicle trajectory data\nis more flexible and can extract traffic information as needed. Therefore, we\nproposed a new traffic congestion prediction model - Multi Adjacency\nrelationship Attention Graph Convolutional Networks(MA2GCN). This model\ntransformed vehicle trajectory data into graph structured data in grid form,\nand proposed a vehicle entry and exit matrix based on the mobility between\ndifferent grids. At the same time, in order to improve the performance of the\nmodel, this paper also built a new adaptive adjacency matrix generation method\nand adjacency matrix attention module. This model mainly used gated temporal\nconvolution and graph convolution to extract temporal and spatial information,\nrespectively. Compared with multiple baselines, our model achieved the best\nperformance on Shanghai taxi GPS trajectory dataset. 
The code is available at\nhttps://github.com/zachysun/Taxi_Traffic_Benchmark.\n","authors":["Zhengke Sun","Yuliang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.08727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11034v2","updated":"2024-01-18T05:20:27Z","published":"2023-12-18T09:09:52Z","title":"Partial Label Learning with a Partner","summary":" In partial label learning (PLL), each instance is associated with a set of\ncandidate labels among which only one is ground-truth. The majority of the\nexisting works focuses on constructing robust classifiers to estimate the\nlabeling confidence of candidate labels in order to identify the correct one.\nHowever, these methods usually struggle to rectify mislabeled samples. To help\nexisting PLL methods identify and rectify mislabeled samples, in this paper, we\nintroduce a novel partner classifier and propose a novel ``mutual supervision''\nparadigm. Specifically, we instantiate the partner classifier predicated on the\nimplicit fact that non-candidate labels of a sample should not be assigned to\nit, which is inherently accurate and has not been fully investigated in PLL.\nFurthermore, a novel collaborative term is formulated to link the base\nclassifier and the partner one. During each stage of mutual supervision, both\nclassifiers will blur each other's predictions through a blurring mechanism to\nprevent overconfidence in a specific label. Extensive experiments demonstrate\nthat the performance and disambiguation ability of several well-established\nstand-alone and deep-learning based PLL approaches can be significantly\nimproved by coupling with this learning paradigm.\n","authors":["Chongjie Si","Zekun Jiang","Xuehui Wang","Yan Wang","Xiaokang Yang","Wei Shen"],"pdf_url":"https://arxiv.org/pdf/2312.11034v2.pdf","comment":"2024, AAAI oral"},{"id":"http://arxiv.org/abs/2401.09728v1","updated":"2024-01-18T05:17:30Z","published":"2024-01-18T05:17:30Z","title":"Offline Imitation Learning by Controlling the Effective Planning Horizon","summary":" In offline imitation learning (IL), we generally assume only a handful of\nexpert trajectories and a supplementary offline dataset from suboptimal\nbehaviors to learn the expert policy. While it is now common to minimize the\ndivergence between state-action visitation distributions so that the agent also\nconsiders the future consequences of an action, a sampling error in an offline\ndataset may lead to erroneous estimates of state-action visitations in the\noffline case. In this paper, we investigate the effect of controlling the\neffective planning horizon (i.e., reducing the discount factor) as opposed to\nimposing an explicit regularizer, as previously studied. Unfortunately, it\nturns out that the existing algorithms suffer from magnified approximation\nerrors when the effective planning horizon is shortened, which results in a\nsignificant degradation in performance. We analyze the main cause of the\nproblem and provide the right remedies to correct the algorithm. 
We show that\nthe corrected algorithm improves on popular imitation learning benchmarks by\ncontrolling the effective planning horizon rather than an explicit\nregularization.\n","authors":["Hee-Jun Ahn","Seong-Woong Shim","Byung-Jun Lee"],"pdf_url":"https://arxiv.org/pdf/2401.09728v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2310.15141v2","updated":"2024-01-18T04:42:34Z","published":"2023-10-23T17:47:34Z","title":"SpecTr: Fast Speculative Decoding via Optimal Transport","summary":" Autoregressive sampling from large language models has led to\nstate-of-the-art results in several natural language tasks. However,\nautoregressive sampling generates tokens one at a time making it slow, and even\nprohibitive in certain tasks. One way to speed up sampling is\n$\\textit{speculative decoding}$: use a small model to sample a $\\textit{draft}$\n(block or sequence of tokens), and then score all tokens in the draft by the\nlarge language model in parallel. A subset of the tokens in the draft are\naccepted (and the rest rejected) based on a statistical method to guarantee\nthat the final output follows the distribution of the large model. In this\nwork, we provide a principled understanding of speculative decoding through the\nlens of optimal transport (OT) with $\\textit{membership cost}$. This framework\ncan be viewed as an extension of the well-known $\\textit{maximal-coupling}$\nproblem. This new formulation enables us to generalize the speculative decoding\nmethod to allow for a set of $k$ candidates at the token-level, which leads to\nan improved optimal membership cost. We show that the optimal draft selection\nalgorithm (transport plan) can be computed via linear programming, whose\nbest-known runtime is exponential in $k$. We then propose a valid draft\nselection algorithm whose acceptance probability is $(1-1/e)$-optimal\nmultiplicatively. Moreover, it can be computed in time almost linear with size\nof domain of a single token. Using this $new draft selection$ algorithm, we\ndevelop a new autoregressive sampling algorithm called $\\textit{SpecTr}$, which\nprovides speedup in decoding while ensuring that there is no quality\ndegradation in the decoded output. We experimentally demonstrate that for\nstate-of-the-art large language models, the proposed approach achieves a wall\nclock speedup of 2.13X, a further 1.37X speedup over speculative decoding on\nstandard benchmarks.\n","authors":["Ziteng Sun","Ananda Theertha Suresh","Jae Hun Ro","Ahmad Beirami","Himanshu Jain","Felix Yu"],"pdf_url":"https://arxiv.org/pdf/2310.15141v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.07202v2","updated":"2024-01-18T04:23:26Z","published":"2023-11-13T09:41:32Z","title":"Input Convex LSTM: A Convex Approach for Fast Lyapunov-Based Model\n Predictive Control","summary":" Leveraging Input Convex Neural Networks (ICNNs), ICNN-based Model Predictive\nControl (MPC) successfully attains globally optimal solutions by upholding\nconvexity within the MPC framework. However, current ICNN architectures\nencounter the issue of vanishing/exploding gradients, which limits their\nability to serve as deep neural networks for complex tasks. Additionally, the\ncurrent neural network-based MPC, including conventional neural network-based\nMPC and ICNN-based MPC, faces slower convergence speed when compared to MPC\nbased on first-principles models. 
In this study, we leverage the principles of\nICNNs to propose a novel Input Convex LSTM for Lyapunov-based MPC, with the\nspecific goal of reducing convergence time and mitigating the\nvanishing/exploding gradient problem while ensuring closed-loop stability. From\na simulation study of a nonlinear chemical reactor, we observed a mitigation of\nvanishing/exploding gradient problem and a reduction in convergence time, with\na percentage decrease of 46.7%, 31.3%, and 20.2% compared to baseline plain\nRNN, plain LSTM, and Input Convex Recurrent Neural Network, respectively.\n","authors":["Zihao Wang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2311.07202v2.pdf","comment":"Submitted to 6th Annual Learning for Dynamics & Control Conference\n (L4DC 2024)"},{"id":"http://arxiv.org/abs/2311.00964v2","updated":"2024-01-18T04:11:57Z","published":"2023-11-02T03:18:40Z","title":"On Finding Bi-objective Pareto-optimal Fraud Prevention Rule Sets for\n Fintech Applications","summary":" Rules are widely used in Fintech institutions to make fraud prevention\ndecisions, since rules are highly interpretable thanks to their intuitive\nif-then structure. In practice, a two-stage framework of fraud prevention\ndecision rule set mining is usually employed in large Fintech institutions.\nThis paper is concerned with finding high-quality rule subsets in a\nbi-objective space (such as precision and recall) from an initial pool of\nrules. To this end, we adopt the concept of Pareto optimality and aim to find a\nset of non-dominated rule subsets, which constitutes a Pareto front. We propose\na heuristic-based framework called PORS and we identify that the core of PORS\nis the problem of solution selection on the front (SSF). We provide a\nsystematic categorization of the SSF problem and a thorough empirical\nevaluation of various SSF methods on both public and proprietary datasets. We\nalso introduce a novel variant of sequential covering algorithm called\nSpectralRules to encourage the diversity of the initial rule set and we\nempirically find that SpectralRules further improves the quality of the found\nPareto front. On two real application scenarios within Alipay, we demonstrate\nthe advantages of our proposed methodology compared to existing work.\n","authors":["Chengyao Wen","Yin Lou"],"pdf_url":"https://arxiv.org/pdf/2311.00964v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.02860v2","updated":"2024-01-18T04:05:05Z","published":"2024-01-05T15:32:24Z","title":"Framework for Variable-lag Motif Following Relation Inference In Time\n Series using Matrix Profile analysis","summary":" Knowing who follows whom and what patterns they are following are crucial\nsteps to understand collective behaviors (e.g. a group of human, a school of\nfish, or a stock market). Time series is one of resources that can be used to\nget insight regarding following relations. However, the concept of following\npatterns or motifs and the solution to find them in time series are not\nobvious. In this work, we formalize a concept of following motifs between two\ntime series and present a framework to infer following patterns between two\ntime series. The framework utilizes one of efficient and scalable methods to\nretrieve motifs from time series called the Matrix Profile Method. We compare\nour proposed framework with several baselines. The framework performs better\nthan baselines in the simulation datasets. 
In the dataset of sound recording,\nthe framework is able to retrieve the following motifs within a pair of time\nseries that two singers sing following each other. In the cryptocurrency\ndataset, the framework is capable of capturing the following motifs within a\npair of time series from two digital currencies, which implies that the values\nof one currency follow the values of another currency patterns. Our framework\ncan be utilized in any field of time series to get insight regarding following\npatterns between time series.\n","authors":["Naaek Chinpattanakarn","Chainarong Amornbunchornvej"],"pdf_url":"https://arxiv.org/pdf/2401.02860v2.pdf","comment":"Revising based on an expert's comments in the research community"},{"id":"http://arxiv.org/abs/2309.07778v5","updated":"2024-01-18T03:55:30Z","published":"2023-09-14T15:09:35Z","title":"Virchow: A Million-Slide Digital Pathology Foundation Model","summary":" The use of artificial intelligence to enable precision medicine and decision\nsupport systems through the analysis of pathology images has the potential to\nrevolutionize the diagnosis and treatment of cancer. Such applications will\ndepend on models' abilities to capture the diverse patterns observed in\npathology images. To address this challenge, we present Virchow, a foundation\nmodel for computational pathology. Using self-supervised learning empowered by\nthe DINOv2 algorithm, Virchow is a vision transformer model with 632 million\nparameters trained on 1.5 million hematoxylin and eosin stained whole slide\nimages from diverse tissue and specimen types, which is orders of magnitude\nmore data than previous works. The Virchow model enables the development of a\npan-cancer detection system with 0.949 overall specimen-level AUC across 17\ndifferent cancer types, while also achieving 0.937 AUC on 7 rare cancer types.\nThe Virchow model sets the state-of-the-art on the internal and external image\ntile level benchmarks and slide level biomarker prediction tasks. The gains in\nperformance highlight the importance of training on massive pathology image\ndatasets, suggesting scaling up the data and network architecture can improve\nthe accuracy for many high-impact computational pathology applications where\nlimited amounts of training data are available.\n","authors":["Eugene Vorontsov","Alican Bozkurt","Adam Casson","George Shaikovski","Michal Zelechowski","Siqi Liu","Kristen Severson","Eric Zimmermann","James Hall","Neil Tenenholtz","Nicolo Fusi","Philippe Mathieu","Alexander van Eck","Donghun Lee","Julian Viret","Eric Robert","Yi Kan Wang","Jeremy D. Kunz","Matthew C. H. Lee","Jan Bernhard","Ran A. Godrich","Gerard Oakley","Ewan Millar","Matthew Hanna","Juan Retamero","William A. Moye","Razik Yousfi","Christopher Kanan","David Klimstra","Brandon Rothrock","Thomas J. Fuchs"],"pdf_url":"https://arxiv.org/pdf/2309.07778v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12469v2","updated":"2024-01-18T03:28:25Z","published":"2023-12-19T07:13:32Z","title":"Distilling Autoregressive Models to Obtain High-Performance\n Non-Autoregressive Solvers for Vehicle Routing Problems with Faster Inference\n Speed","summary":" Neural construction models have shown promising performance for Vehicle\nRouting Problems (VRPs) by adopting either the Autoregressive (AR) or\nNon-Autoregressive (NAR) learning approach. While AR models produce\nhigh-quality solutions, they generally have a high inference latency due to\ntheir sequential generation nature. 
Conversely, NAR models generate solutions\nin parallel with a low inference latency but generally exhibit inferior\nperformance. In this paper, we propose a generic Guided Non-Autoregressive\nKnowledge Distillation (GNARKD) method to obtain high-performance NAR models\nhaving a low inference latency. GNARKD removes the constraint of sequential\ngeneration in AR models while preserving the learned pivotal components in the\nnetwork architecture to obtain the corresponding NAR models through knowledge\ndistillation. We evaluate GNARKD by applying it to three widely adopted AR\nmodels to obtain NAR VRP solvers for both synthesized and real-world instances.\nThe experimental results demonstrate that GNARKD significantly reduces the\ninference time (4-5 times faster) with acceptable performance drop (2-3\\%). To\nthe best of our knowledge, this study is first-of-its-kind to obtain NAR VRP\nsolvers from AR ones through knowledge distillation.\n","authors":["Yubin Xiao","Di Wang","Boyang Li","Mingzhao Wang","Xuan Wu","Changliang Zhou","You Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.12469v2.pdf","comment":"11 pages, 5 figures, accepted by AAAI24"},{"id":"http://arxiv.org/abs/2401.09693v1","updated":"2024-01-18T02:48:06Z","published":"2024-01-18T02:48:06Z","title":"EfficientRec an unlimited user-item scale recommendation system based on\n clustering and users interaction embedding profile","summary":" Recommendation systems are highly interested in technology companies\nnowadays. The businesses are constantly growing users and products, causing the\nnumber of users and items to continuously increase over time, to very large\nnumbers. Traditional recommendation algorithms with complexity dependent on the\nnumber of users and items make them difficult to adapt to the industrial\nenvironment. In this paper, we introduce a new method applying graph neural\nnetworks with a contrastive learning framework in extracting user preferences.\nWe incorporate a soft clustering architecture that significantly reduces the\ncomputational cost of the inference process. Experiments show that the model is\nable to learn user preferences with low computational cost in both training and\nprediction phases. At the same time, the model gives a very good accuracy. We\ncall this architecture EfficientRec with the implication of model compactness\nand the ability to scale to unlimited users and products.\n","authors":["Vu Hong Quan","Le Hoang Ngan","Le Minh Duc","Nguyen Tran Ngoc Linh","Hoang Quynh-Le"],"pdf_url":"https://arxiv.org/pdf/2401.09693v1.pdf","comment":"Published in 14th Asian Conference on Intelligent Information and\n Database Systems (ACIIDS), 2022"},{"id":"http://arxiv.org/abs/2401.09691v1","updated":"2024-01-18T02:44:18Z","published":"2024-01-18T02:44:18Z","title":"Imitation Learning Inputting Image Feature to Each Layer of Neural\n Network","summary":" Imitation learning enables robots to learn and replicate human behavior from\ntraining data. Recent advances in machine learning enable end-to-end learning\napproaches that directly process high-dimensional observation data, such as\nimages. However, these approaches face a critical challenge when processing\ndata from multiple modalities, inadvertently ignoring data with a lower\ncorrelation to the desired output, especially when using short sampling\nperiods. 
This paper presents a useful method to address this challenge, which\namplifies the influence of data with a relatively low correlation to the output\nby inputting the data into each neural network layer. The proposed approach\neffectively incorporates diverse data sources into the learning process.\nThrough experiments using a simple pick-and-place operation with raw images and\njoint information as input, significant improvements in success rates are\ndemonstrated even when dealing with data from short sampling periods.\n","authors":["Koki Yamane","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2401.09691v1.pdf","comment":"IEEE The 18th International Workshop on Advanced Motion Control\n (AMC2024)"},{"id":"http://arxiv.org/abs/2310.13897v2","updated":"2024-01-18T02:31:31Z","published":"2023-10-21T03:26:39Z","title":"Masked Hard-Attention Transformers and Boolean RASP Recognize Exactly\n the Star-Free Languages","summary":" We consider transformer encoders with hard attention (in which all attention\nis focused on exactly one position) and strict future masking (in which each\nposition only attends to positions strictly to its left), and prove that the\nclass of languages recognized by these networks is exactly the star-free\nlanguages. Adding position embeddings increases the class of recognized\nlanguages to other well-studied classes. A key technique in these proofs is\nBoolean RASP, a variant of RASP that is restricted to Boolean values. Via the\nstar-free languages, we relate transformers to first-order logic, temporal\nlogic, and algebraic automata theory.\n","authors":["Dana Angluin","David Chiang","Andy Yang"],"pdf_url":"https://arxiv.org/pdf/2310.13897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17366v2","updated":"2024-01-18T02:27:11Z","published":"2023-09-28T10:05:37Z","title":"3D-Mol: A Novel Contrastive Learning Framework for Molecular Property\n Prediction with 3D Information","summary":" Molecular property prediction, crucial for early drug candidate screening and\noptimization, has seen advancements with deep learning-based methods. While\ndeep learning-based methods have advanced considerably, they often fall short\nin fully leveraging 3D spatial information. Specifically, current molecular\nencoding techniques tend to inadequately extract spatial information, leading\nto ambiguous representations where a single one might represent multiple\ndistinct molecules. Moreover, existing molecular modeling methods focus\npredominantly on the most stable 3D conformations, neglecting other viable\nconformations present in reality. To address these issues, we propose 3D-Mol, a\nnovel approach designed for more accurate spatial structure representation. It\ndeconstructs molecules into three hierarchical graphs to better extract\ngeometric information. Additionally, 3D-Mol leverages contrastive learning for\npretraining on 20 million unlabeled data, treating their conformations with\nidentical topological structures as weighted positive pairs and contrasting\nones as negatives, based on the similarity of their 3D conformation descriptors\nand fingerprints. 
We compare 3D-Mol with various state-of-the-art baselines on\n7 benchmarks and demonstrate our outstanding performance.\n","authors":["Taojie Kuang","Yiming Ren","Zhixiang Ren"],"pdf_url":"https://arxiv.org/pdf/2309.17366v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09682v1","updated":"2024-01-18T02:21:53Z","published":"2024-01-18T02:21:53Z","title":"Comparative Study on the Performance of Categorical Variable Encoders in\n Classification and Regression Tasks","summary":" Categorical variables often appear in datasets for classification and\nregression tasks, and they need to be encoded into numerical values before\ntraining. Since many encoders have been developed and can significantly impact\nperformance, choosing the appropriate encoder for a task becomes a\ntime-consuming yet important practical issue. This study broadly classifies\nmachine learning models into three categories: 1) ATI models that implicitly\nperform affine transformations on inputs, such as multi-layer perceptron neural\nnetwork; 2) Tree-based models that are based on decision trees, such as random\nforest; and 3) the rest, such as kNN. Theoretically, we prove that the one-hot\nencoder is the best choice for ATI models in the sense that it can mimic any\nother encoders by learning suitable weights from the data. We also explain why\nthe target encoder and its variants are the most suitable encoders for\ntree-based models. This study conducted comprehensive computational experiments\nto evaluate 14 encoders, including one-hot and target encoders, along with\neight common machine-learning models on 28 datasets. The computational results\nagree with our theoretical analysis. The findings in this study shed light on\nhow to select the suitable encoder for data scientists in fields such as fraud\ndetection, disease diagnosis, etc.\n","authors":["Wenbin Zhu","Runwen Qiu","Ying Fu"],"pdf_url":"https://arxiv.org/pdf/2401.09682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09681v1","updated":"2024-01-18T02:21:06Z","published":"2024-01-18T02:21:06Z","title":"Harnessing Density Ratios for Online Reinforcement Learning","summary":" The theories of offline and online reinforcement learning, despite having\nevolved in parallel, have begun to show signs of the possibility for a\nunification, with algorithms and analysis techniques for one setting often\nhaving natural counterparts in the other. However, the notion of density ratio\nmodeling, an emerging paradigm in offline RL, has been largely absent from\nonline RL, perhaps for good reason: the very existence and boundedness of\ndensity ratios relies on access to an exploratory dataset with good coverage,\nbut the core challenge in online RL is to collect such a dataset without having\none to start. In this work we show -- perhaps surprisingly -- that density\nratio-based algorithms have online counterparts. Assuming only the existence of\nan exploratory distribution with good coverage, a structural condition known as\ncoverability (Xie et al., 2023), we give a new algorithm (GLOW) that uses\ndensity ratio realizability and value function realizability to perform\nsample-efficient online exploration. GLOW addresses unbounded density ratios\nvia careful use of truncation, and combines this with optimism to guide\nexploration. GLOW is computationally inefficient; we complement it with a more\nefficient counterpart, HyGLOW, for the Hybrid RL setting (Song et al., 2022)\nwherein online RL is augmented with additional offline data. 
HyGLOW is derived\nas a special case of a more general meta-algorithm that provides a provable\nblack-box reduction from hybrid RL to offline RL, which may be of independent\ninterest.\n","authors":["Philip Amortila","Dylan J. Foster","Nan Jiang","Ayush Sekhari","Tengyang Xie"],"pdf_url":"https://arxiv.org/pdf/2401.09681v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.07231v3","updated":"2024-01-18T02:04:04Z","published":"2024-01-14T08:32:32Z","title":"Use of Prior Knowledge to Discover Causal Additive Models with\n Unobserved Variables and its Application to Time Series Data","summary":" This paper proposes two methods for causal additive models with unobserved\nvariables (CAM-UV). CAM-UV assumes that the causal functions take the form of\ngeneralized additive models and that latent confounders are present. First, we\npropose a method that leverages prior knowledge for efficient causal discovery.\nThen, we propose an extension of this method for inferring causality in time\nseries data. The original CAM-UV algorithm differs from other existing causal\nfunction models in that it does not seek the causal order between observed\nvariables, but rather aims to identify the causes for each observed variable.\nTherefore, the first proposed method in this paper utilizes prior knowledge,\nsuch as understanding that certain variables cannot be causes of specific\nothers. Moreover, by incorporating the prior knowledge that causes precedes\ntheir effects in time, we extend the first algorithm to the second method for\ncausal discovery in time series data. We validate the first proposed method by\nusing simulated data to demonstrate that the accuracy of causal discovery\nincreases as more prior knowledge is accumulated. Additionally, we test the\nsecond proposed method by comparing it with existing time series causal\ndiscovery methods, using both simulated data and real-world data.\n","authors":["Takashi Nicholas Maeda","Shohei Shimizu"],"pdf_url":"https://arxiv.org/pdf/2401.07231v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04273v3","updated":"2024-01-18T01:52:47Z","published":"2023-12-07T12:53:05Z","title":"Invariant Random Forest: Tree-Based Model Solution for OOD\n Generalization","summary":" Out-Of-Distribution (OOD) generalization is an essential topic in machine\nlearning. However, recent research is only focusing on the corresponding\nmethods for neural networks. This paper introduces a novel and effective\nsolution for OOD generalization of decision tree models, named Invariant\nDecision Tree (IDT). IDT enforces a penalty term with regard to the\nunstable/varying behavior of a split across different environments during the\ngrowth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is\nconstructed. Our proposed method is motivated by a theoretical result under\nmild conditions, and validated by numerical tests with both synthetic and real\ndatasets. 
The superior performance compared to non-OOD tree models implies that\nconsidering OOD generalization for tree models is absolutely necessary and\nshould be given more attention.\n","authors":["Yufan Liao","Qi Wu","Xing Yan"],"pdf_url":"https://arxiv.org/pdf/2312.04273v3.pdf","comment":"AAAI Conference on Artificial Intelligence, 2024 (Oral Presentation)"},{"id":"http://arxiv.org/abs/2401.09192v2","updated":"2024-01-18T01:41:29Z","published":"2024-01-17T13:04:14Z","title":"Preparing Lessons for Progressive Training on Language Models","summary":" The rapid progress of Transformers in artificial intelligence has come at the\ncost of increased resource consumption and greenhouse gas emissions due to\ngrowing model sizes. Prior work suggests using pretrained small models to\nimprove training efficiency, but this approach may not be suitable for new\nmodel structures. On the other hand, training from scratch can be slow, and\nprogressively stacking layers often fails to achieve significant acceleration.\nTo address these challenges, we propose a novel method called Apollo, which\nprep\\textbf{a}res lessons for ex\\textbf{p}anding \\textbf{o}perations by\n\\textbf{l}earning high-\\textbf{l}ayer functi\\textbf{o}nality during training of\nlow layers. Our approach involves low-value-prioritized sampling (LVPS) to\ntrain different depths and weight sharing to facilitate efficient expansion. We\nalso introduce an interpolation method for stable model depth extension.\nExperiments demonstrate that Apollo achieves state-of-the-art acceleration\nratios, even rivaling methods using pretrained models, making it a universal\nand efficient solution for training deep models while reducing time, financial,\nand environmental costs.\n","authors":["Yu Pan","Ye Yuan","Yichun Yin","Jiaxin Shi","Zenglin Xu","Ming Zhang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2401.09192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09673v1","updated":"2024-01-18T01:18:59Z","published":"2024-01-18T01:18:59Z","title":"Artwork Protection Against Neural Style Transfer Using Locally Adaptive\n Adversarial Color Attack","summary":" Neural style transfer (NST) is widely adopted in computer vision to generate\nnew images with arbitrary styles. This process leverages neural networks to\nmerge aesthetic elements of a style image with the structural aspects of a\ncontent image into a harmoniously integrated visual result. However,\nunauthorized NST can exploit artwork. Such misuse raises socio-technical\nconcerns regarding artists' rights and motivates the development of technical\napproaches for the proactive protection of original creations. Adversarial\nattack is a concept primarily explored in machine learning security. Our work\nintroduces this technique to protect artists' intellectual property. In this\npaper Locally Adaptive Adversarial Color Attack (LAACA), a method for altering\nimages in a manner imperceptible to the human eyes but disruptive to NST.\nSpecifically, we design perturbations targeting image areas rich in\nhigh-frequency content, generated by disrupting intermediate features. 
Our\nexperiments and user study confirm that by attacking NST using the proposed\nmethod results in visually worse neural style transfer, thus making it an\neffective solution for visual artwork protection.\n","authors":["Zhongliang Guo","Kaixuan Wang","Weiye Li","Yifei Qian","Ognjen Arandjelović","Lei Fang"],"pdf_url":"https://arxiv.org/pdf/2401.09673v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.09671v1","updated":"2024-01-18T01:07:00Z","published":"2024-01-18T01:07:00Z","title":"Towards Identifiable Unsupervised Domain Translation: A Diversified\n Distribution Matching Approach","summary":" Unsupervised domain translation (UDT) aims to find functions that convert\nsamples from one domain (e.g., sketches) to another domain (e.g., photos)\nwithout changing the high-level semantic meaning (also referred to as\n``content''). The translation functions are often sought by probability\ndistribution matching of the transformed source domain and target domain.\nCycleGAN stands as arguably the most representative approach among this line of\nwork. However, it was noticed in the literature that CycleGAN and variants\ncould fail to identify the desired translation functions and produce\ncontent-misaligned translations. This limitation arises due to the presence of\nmultiple translation functions -- referred to as ``measure-preserving\nautomorphism\" (MPA) -- in the solution space of the learning criteria. Despite\nawareness of such identifiability issues, solutions have remained elusive. This\nstudy delves into the core identifiability inquiry and introduces an MPA\nelimination theory. Our analysis shows that MPA is unlikely to exist, if\nmultiple pairs of diverse cross-domain conditional distributions are matched by\nthe learning function. Our theory leads to a UDT learner using distribution\nmatching over auxiliary variable-induced subsets of the domains -- other than\nover the entire data domains as in the classical approaches. The proposed\nframework is the first to rigorously establish translation identifiability\nunder reasonable UDT settings, to our best knowledge. Experiments corroborate\nwith our theoretical claims.\n","authors":["Sagar Shrestha","Xiao Fu"],"pdf_url":"https://arxiv.org/pdf/2401.09671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.14238v3","updated":"2024-01-18T00:57:57Z","published":"2020-11-29T00:00:20Z","title":"Approximate Cross-validated Mean Estimates for Bayesian Hierarchical\n Regression Models","summary":" We introduce a novel procedure for obtaining cross-validated predictive\nestimates for Bayesian hierarchical regression models (BHRMs). Bayesian\nhierarchical models are popular for their ability to model complex dependence\nstructures and provide probabilistic uncertainty estimates, but can be\ncomputationally expensive to run. Cross-validation (CV) is therefore not a\ncommon practice to evaluate the predictive performance of BHRMs. Our method\ncircumvents the need to re-run computationally costly estimation methods for\neach cross-validation fold and makes CV more feasible for large BHRMs. By\nconditioning on the variance-covariance parameters, we shift the CV problem\nfrom probability-based sampling to a simple and familiar optimization problem.\nIn many cases, this produces estimates which are equivalent to full CV. We\nprovide theoretical results and demonstrate its efficacy on publicly available\ndata and in simulations.\n","authors":["Amy X. Zhang","Le Bao","Changcheng Li","Michael J. 
Daniels"],"pdf_url":"https://arxiv.org/pdf/2011.14238v3.pdf","comment":"26 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.09665v1","updated":"2024-01-18T00:50:37Z","published":"2024-01-18T00:50:37Z","title":"Accelerating Distributed Stochastic Optimization via Self-Repellent\n Random Walks","summary":" We study a family of distributed stochastic optimization algorithms where\ngradients are sampled by a token traversing a network of agents in random-walk\nfashion. Typically, these random-walks are chosen to be Markov chains that\nasymptotically sample from a desired target distribution, and play a critical\nrole in the convergence of the optimization iterates. In this paper, we take a\nnovel approach by replacing the standard linear Markovian token by one which\nfollows a nonlinear Markov chain - namely the Self-Repellent Radom Walk (SRRW).\nDefined for any given 'base' Markov chain, the SRRW, parameterized by a\npositive scalar {\\alpha}, is less likely to transition to states that were\nhighly visited in the past, thus the name. In the context of MCMC sampling on a\ngraph, a recent breakthrough in Doshi et al. (2023) shows that the SRRW\nachieves O(1/{\\alpha}) decrease in the asymptotic variance for sampling. We\npropose the use of a 'generalized' version of the SRRW to drive token\nalgorithms for distributed stochastic optimization in the form of stochastic\napproximation, termed SA-SRRW. We prove that the optimization iterate errors of\nthe resulting SA-SRRW converge to zero almost surely and prove a central limit\ntheorem, deriving the explicit form of the resulting asymptotic covariance\nmatrix corresponding to iterate errors. This asymptotic covariance is always\nsmaller than that of an algorithm driven by the base Markov chain and decreases\nat rate O(1/{\\alpha}^2) - the performance benefit of using SRRW thereby\namplified in the stochastic optimization context. Empirical results support our\ntheoretical findings.\n","authors":["Jie Hu","Vishwaraj Doshi","Do Young Eun"],"pdf_url":"https://arxiv.org/pdf/2401.09665v1.pdf","comment":"Accepted for oral presentation at the Twelfth International\n Conference on Learning Representations (ICLR 2024)"},{"id":"http://arxiv.org/abs/2311.04938v2","updated":"2024-01-18T00:44:11Z","published":"2023-11-08T00:24:50Z","title":"Improved DDIM Sampling with Moment Matching Gaussian Mixtures","summary":" We propose using a Gaussian Mixture Model (GMM) as reverse transition\noperator (kernel) within the Denoising Diffusion Implicit Models (DDIM)\nframework, which is one of the most widely used approaches for accelerated\nsampling from pre-trained Denoising Diffusion Probabilistic Models (DDPM).\nSpecifically we match the first and second order central moments of the DDPM\nforward marginals by constraining the parameters of the GMM. We see that moment\nmatching is sufficient to obtain samples with equal or better quality than the\noriginal DDIM with Gaussian kernels. We provide experimental results with\nunconditional models trained on CelebAHQ and FFHQ and class-conditional models\ntrained on ImageNet datasets respectively. Our results suggest that using the\nGMM kernel leads to significant improvements in the quality of the generated\nsamples when the number of sampling steps is small, as measured by FID and IS\nmetrics. 
For example on ImageNet 256x256, using 10 sampling steps, we achieve a\nFID of 6.94 and IS of 207.85 with a GMM kernel compared to 10.15 and 196.73\nrespectively with a Gaussian kernel.\n","authors":["Prasad Gabbur"],"pdf_url":"https://arxiv.org/pdf/2311.04938v2.pdf","comment":"29 pages, 14 figures; Analysis of DDIM-GMM as a multimodal denoiser;\n Additional experiments on LSUN datasets and text-to-image generation with\n Stable Diffusion; Comparison with DPM-Solver; Ablations on GMM parameters;\n Updated equations with bold font for vectors and matrices"},{"id":"http://arxiv.org/abs/2210.15629v3","updated":"2024-01-18T00:43:41Z","published":"2022-10-27T17:20:50Z","title":"Language Control Diffusion: Efficiently Scaling through Space, Time, and\n Tasks","summary":" Training generalist agents is difficult across several axes, requiring us to\ndeal with high-dimensional inputs (space), long horizons (time), and\ngeneralization to novel tasks. Recent advances with architectures have allowed\nfor improved scaling along one or two of these axes, but are still\ncomputationally prohibitive to use. In this paper, we propose to address all\nthree axes by leveraging \\textbf{L}anguage to \\textbf{C}ontrol\n\\textbf{D}iffusion models as a hierarchical planner conditioned on language\n(LCD). We effectively and efficiently scale diffusion models for planning in\nextended temporal, state, and task dimensions to tackle long horizon control\nproblems conditioned on natural language instructions, as a step towards\ngeneralist agents. Comparing LCD with other state-of-the-art models on the\nCALVIN language robotics benchmark finds that LCD outperforms other SOTA\nmethods in multi-task success rates, whilst improving inference speed over\nother comparable diffusion models by 3.3x~15x. We show that LCD can\nsuccessfully leverage the unique strength of diffusion models to produce\ncoherent long range plans while addressing their weakness in generating\nlow-level details and control.\n","authors":["Edwin Zhang","Yujie Lu","William Wang","Amy Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.15629v3.pdf","comment":"ICLR 2024, Project and code available at\n https://github.com/ezhang7423/language-control-diffusion"},{"id":"http://arxiv.org/abs/2312.12849v2","updated":"2024-01-18T00:39:29Z","published":"2023-12-20T08:59:05Z","title":"Divergences induced by dual subtractive and divisive normalizations of\n exponential families and their convex deformations","summary":" Exponential families are statistical models which are the workhorses in\nstatistics, information theory, and machine learning among others. An\nexponential family can either be normalized subtractively by its cumulant or\nfree energy function or equivalently normalized divisively by its partition\nfunction. Both subtractive and divisive normalizers are strictly convex and\nsmooth functions inducing pairs of Bregman and Jensen divergences. It is\nwell-known that skewed Bhattacharryya distances between probability densities\nof an exponential family amounts to skewed Jensen divergences induced by the\ncumulant function between their corresponding natural parameters, and in limit\ncases that the sided Kullback-Leibler divergences amount to reverse-sided\nBregman divergences. In this paper, we first show that the $\\alpha$-divergences\nbetween unnormalized densities of an exponential family amounts to scaled\n$\\alpha$-skewed Jensen divergences induced by the partition function. 
We then\nshow how comparative convexity with respect to a pair of quasi-arithmetic means\nallows to deform both convex functions and their arguments, and thereby define\ndually flat spaces with corresponding divergences when ordinary convexity is\npreserved.\n","authors":["Frank Nielsen"],"pdf_url":"https://arxiv.org/pdf/2312.12849v2.pdf","comment":"19 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.00323v3","updated":"2024-01-18T00:10:32Z","published":"2023-06-01T03:43:41Z","title":"Thought Cloning: Learning to Think while Acting by Imitating Human\n Thinking","summary":" Language is often considered a key aspect of human thinking, providing us\nwith exceptional abilities to generalize, explore, plan, replan, and adapt to\nnew situations. However, Reinforcement Learning (RL) agents are far from\nhuman-level performance in any of these abilities. We hypothesize one reason\nfor such cognitive deficiencies is that they lack the benefits of thinking in\nlanguage and that we can improve AI agents by training them to think like\nhumans do. We introduce a novel Imitation Learning framework, Thought Cloning,\nwhere the idea is to not just clone the behaviors of human demonstrators, but\nalso the thoughts humans have as they perform these behaviors. While we expect\nThought Cloning to truly shine at scale on internet-sized datasets of humans\nthinking out loud while acting (e.g. online videos with transcripts), here we\nconduct experiments in a domain where the thinking and action data are\nsynthetically generated. Results reveal that Thought Cloning learns much faster\nthan Behavioral Cloning and its performance advantage grows the further out of\ndistribution test tasks are, highlighting its ability to better handle novel\nsituations. Thought Cloning also provides important benefits for AI Safety and\nInterpretability, and makes it easier to debug and improve AI. Because we can\nobserve the agent's thoughts, we can (1) more easily diagnose why things are\ngoing wrong, making it easier to fix the problem, (2) steer the agent by\ncorrecting its thinking, or (3) prevent it from doing unsafe things it plans to\ndo. Overall, by training agents how to think as well as behave, Thought Cloning\ncreates safer, more powerful agents.\n","authors":["Shengran Hu","Jeff Clune"],"pdf_url":"https://arxiv.org/pdf/2306.00323v3.pdf","comment":"Accepted to NeurIPS 2023 as a spotlight"},{"id":"http://arxiv.org/abs/2401.09656v1","updated":"2024-01-18T00:09:54Z","published":"2024-01-18T00:09:54Z","title":"Mobility Accelerates Learning: Convergence Analysis on Hierarchical\n Federated Learning in Vehicular Networks","summary":" Hierarchical federated learning (HFL) enables distributed training of models\nacross multiple devices with the help of several edge servers and a cloud edge\nserver in a privacy-preserving manner. In this paper, we consider HFL with\nhighly mobile devices, mainly targeting at vehicular networks. Through\nconvergence analysis, we show that mobility influences the convergence speed by\nboth fusing the edge data and shuffling the edge models. While mobility is\nusually considered as a challenge from the perspective of communication, we\nprove that it increases the convergence speed of HFL with edge-level\nheterogeneous data, since more diverse data can be incorporated. Furthermore,\nwe demonstrate that a higher speed leads to faster convergence, since it\naccelerates the fusion of data. 
Simulation results show that mobility increases\nthe model accuracy of HFL by up to 15.1% when training a convolutional neural\nnetwork on the CIFAR-10 dataset.\n","authors":["Tan Chen","Jintao Yan","Yuxuan Sun","Sheng Zhou","Deniz Gündüz","Zhisheng Niu"],"pdf_url":"https://arxiv.org/pdf/2401.09656v1.pdf","comment":"Submitted to IEEE for possible publication"},{"id":"http://arxiv.org/abs/2401.10419v1","updated":"2024-01-18T23:10:08Z","published":"2024-01-18T23:10:08Z","title":"M3BUNet: Mobile Mean Max UNet for Pancreas Segmentation on CT-Scans","summary":" Segmenting organs in CT scan images is a necessary process for multiple\ndownstream medical image analysis tasks. Currently, manual CT scan segmentation\nby radiologists is prevalent, especially for organs like the pancreas, which\nrequires a high level of domain expertise for reliable segmentation due to\nfactors like small organ size, occlusion, and varying shapes. When resorting to\nautomated pancreas segmentation, these factors translate to limited reliable\nlabeled data to train effective segmentation models. Consequently, the\nperformance of contemporary pancreas segmentation models is still not within\nacceptable ranges. To improve that, we propose M3BUNet, a fusion of MobileNet\nand U-Net neural networks, equipped with a novel Mean-Max (MM) attention that\noperates in two stages to gradually segment pancreas CT images from coarse to\nfine with mask guidance for object detection. This approach empowers the\nnetwork to surpass segmentation performance achieved by similar network\narchitectures and achieve results that are on par with complex state-of-the-art\nmethods, all while maintaining a low parameter count. Additionally, we\nintroduce external contour segmentation as a preprocessing step for the coarse\nstage to assist in the segmentation process through image standardization. For\nthe fine segmentation stage, we found that applying a wavelet decomposition\nfilter to create multi-input images enhances pancreas segmentation performance.\nWe extensively evaluate our approach on the widely known NIH pancreas dataset\nand MSD pancreas dataset. Our approach demonstrates a considerable performance\nimprovement, achieving an average Dice Similarity Coefficient (DSC) value of up\nto 89.53% and an Intersection Over Union (IOU) score of up to 81.16 for the NIH\npancreas dataset, and 88.60% DSC and 79.90% IOU for the MSD Pancreas dataset.\n","authors":["Juwita juwita","Ghulam Mubashar Hassan","Naveed Akhtar","Amitava Datta"],"pdf_url":"https://arxiv.org/pdf/2401.10419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01202v2","updated":"2024-01-18T22:48:17Z","published":"2023-10-02T13:42:36Z","title":"Unified Uncertainty Calibration","summary":" To build robust, fair, and safe AI systems, we would like our classifiers to\nsay ``I don't know'' when facing test examples that are difficult or fall\noutside of the training classes.The ubiquitous strategy to predict under\nuncertainty is the simplistic \\emph{reject-or-classify} rule: abstain from\nprediction if epistemic uncertainty is high, classify otherwise.Unfortunately,\nthis recipe does not allow different sources of uncertainty to communicate with\neach other, produces miscalibrated predictions, and it does not allow to\ncorrect for misspecifications in our uncertainty estimates. To address these\nthree issues, we introduce \\emph{unified uncertainty calibration (U2C)}, a\nholistic framework to combine aleatoric and epistemic uncertainties. 
U2C\nenables a clean learning-theoretical analysis of uncertainty estimation, and\noutperforms reject-or-classify across a variety of ImageNet benchmarks. Our\ncode is available at:\nhttps://github.com/facebookresearch/UnifiedUncertaintyCalibration\n","authors":["Kamalika Chaudhuri","David Lopez-Paz"],"pdf_url":"https://arxiv.org/pdf/2310.01202v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.10891v4","updated":"2024-01-18T22:43:56Z","published":"2021-11-21T20:11:33Z","title":"Active Restoration of Lost Audio Signals Using Machine Learning and\n Latent Information","summary":" Digital audio signal reconstruction of a lost or corrupt segment using deep\nlearning algorithms has been explored intensively in recent years.\nNevertheless, prior traditional methods with linear interpolation, phase coding\nand tone insertion techniques are still in vogue. However, we found no research\nwork on reconstructing audio signals with the fusion of dithering,\nsteganography, and machine learning regressors. Therefore, this paper proposes\nthe combination of steganography, halftoning (dithering), and state-of-the-art\nshallow and deep learning methods. The results (including comparing the SPAIN,\nAutoregressive, deep learning-based, graph-based, and other methods) are\nevaluated with three different metrics. The observations from the results show\nthat the proposed solution is effective and can enhance the reconstruction of\naudio signals performed by the side information (e.g., Latent representation)\nsteganography provides. Moreover, this paper proposes a novel framework for\nreconstruction from heavily compressed embedded audio data using halftoning\n(i.e., dithering) and machine learning, which we termed the HCR (halftone-based\ncompression and reconstruction). This work may trigger interest in optimising\nthis approach and/or transferring it to different domains (i.e., image\nreconstruction). Compared to existing methods, we show improvement in the\ninpainting performance in terms of signal-to-noise ratio (SNR), the objective\ndifference grade (ODG) and Hansen's audio quality metric. In particular, our\nproposed framework outperformed the learning-based methods (D2WGAN and SG) and\nthe traditional statistical algorithms (e.g., SPAIN, TDC, WCP).\n","authors":["Zohra Adila Cheddad","Abbas Cheddad"],"pdf_url":"https://arxiv.org/pdf/2111.10891v4.pdf","comment":"18 Pages, 2 Tables, 8 Figures"},{"id":"http://arxiv.org/abs/2309.11623v2","updated":"2024-01-18T22:37:48Z","published":"2023-09-20T20:21:13Z","title":"Leveraging Negative Signals with Self-Attention for Sequential Music\n Recommendation","summary":" Music streaming services heavily rely on their recommendation engines to\ncontinuously provide content to their consumers. Sequential recommendation\nconsequently has seen considerable attention in current literature, where state\nof the art approaches focus on self-attentive models leveraging contextual\ninformation such as long and short-term user history and item features;\nhowever, most of these studies focus on long-form content domains (retail,\nmovie, etc.) rather than short-form, such as music. Additionally, many do not\nexplore incorporating negative session-level feedback during training. In this\nstudy, we investigate the use of transformer-based self-attentive architectures\nto learn implicit session-level information for sequential music\nrecommendation. 
We additionally propose a contrastive learning task to\nincorporate negative feedback (e.g skipped tracks) to promote positive hits and\npenalize negative hits. This task is formulated as a simple loss term that can\nbe incorporated into a variety of deep learning architectures for sequential\nrecommendation. Our experiments show that this results in consistent\nperformance gains over the baseline architectures ignoring negative user\nfeedback.\n","authors":["Pavan Seshadri","Peter Knees"],"pdf_url":"https://arxiv.org/pdf/2309.11623v2.pdf","comment":"Accepted to the 1st Workshop on Music Recommender Systems, co-located\n with the 17th ACM Conference on Recommender Systems (MuRS @ RecSys 2023)"},{"id":"http://arxiv.org/abs/2312.13110v3","updated":"2024-01-18T22:28:06Z","published":"2023-12-20T15:30:15Z","title":"Pre-training of Molecular GNNs via Conditional Boltzmann Generator","summary":" Learning representations of molecular structures using deep learning is a\nfundamental problem in molecular property prediction tasks. Molecules\ninherently exist in the real world as three-dimensional structures;\nfurthermore, they are not static but in continuous motion in the 3D Euclidean\nspace, forming a potential energy surface. Therefore, it is desirable to\ngenerate multiple conformations in advance and extract molecular\nrepresentations using a 4D-QSAR model that incorporates multiple conformations.\nHowever, this approach is impractical for drug and material discovery tasks\nbecause of the computational cost of obtaining multiple conformations. To\naddress this issue, we propose a pre-training method for molecular GNNs using\nan existing dataset of molecular conformations to generate a latent vector\nuniversal to multiple conformations from a 2D molecular graph. Our method,\ncalled Boltzmann GNN, is formulated by maximizing the conditional marginal\nlikelihood of a conditional generative model for conformations generation. We\nshow that our model has a better prediction performance for molecular\nproperties than existing pre-training methods using molecular graphs and\nthree-dimensional molecular structures.\n","authors":["Daiki Koge","Naoaki Ono","Shigehiko Kanaya"],"pdf_url":"https://arxiv.org/pdf/2312.13110v3.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2401.10405v1","updated":"2024-01-18T22:26:31Z","published":"2024-01-18T22:26:31Z","title":"Differentially Private and Adversarially Robust Machine Learning: An\n Empirical Evaluation","summary":" Malicious adversaries can attack machine learning models to infer sensitive\ninformation or damage the system by launching a series of evasion attacks.\nAlthough various work addresses privacy and security concerns, they focus on\nindividual defenses, but in practice, models may undergo simultaneous attacks.\nThis study explores the combination of adversarial training and differentially\nprivate training to defend against simultaneous attacks. While\ndifferentially-private adversarial training, as presented in DP-Adv,\noutperforms the other state-of-the-art methods in performance, it lacks formal\nprivacy guarantees and empirical validation. Thus, in this work, we benchmark\nthe performance of this technique using a membership inference attack and\nempirically show that the resulting approach is as private as non-robust\nprivate models. 
This work also highlights the need to explore privacy\nguarantees in dynamic training paradigms.\n","authors":["Janvi Thakkar","Giulio Zizzo","Sergio Maffeis"],"pdf_url":"https://arxiv.org/pdf/2401.10405v1.pdf","comment":"Accepted at PPAI-24: The 5th AAAI Workshop on Privacy-Preserving\n Artificial Intelligence"},{"id":"http://arxiv.org/abs/2401.10396v1","updated":"2024-01-18T22:10:21Z","published":"2024-01-18T22:10:21Z","title":"Deep Dict: Deep Learning-based Lossy Time Series Compressor for IoT Data","summary":" We propose Deep Dict, a deep learning-based lossy time series compressor\ndesigned to achieve a high compression ratio while maintaining decompression\nerror within a predefined range. Deep Dict incorporates two essential\ncomponents: the Bernoulli transformer autoencoder (BTAE) and a distortion\nconstraint. BTAE extracts Bernoulli representations from time series data,\nreducing the size of the representations compared to conventional autoencoders.\nThe distortion constraint limits the prediction error of BTAE to the desired\nrange. Moreover, in order to address the limitations of common regression\nlosses such as L1/L2, we introduce a novel loss function called quantized\nentropy loss (QEL). QEL takes into account the specific characteristics of the\nproblem, enhancing robustness to outliers and alleviating optimization\nchallenges. Our evaluation of Deep Dict across ten diverse time series datasets\nfrom various domains reveals that Deep Dict outperforms state-of-the-art lossy\ncompressors in terms of compression ratio by a significant margin by up to\n53.66%.\n","authors":["Jinxin Liu","Petar Djukic","Michel Kulhandjian","Burak Kantarci"],"pdf_url":"https://arxiv.org/pdf/2401.10396v1.pdf","comment":"6 pages, 13 figures, IEEE International Conference on Communications\n (ICC) 2024"},{"id":"http://arxiv.org/abs/2303.02506v3","updated":"2024-01-18T22:09:40Z","published":"2023-03-04T21:22:47Z","title":"Prismer: A Vision-Language Model with Multi-Task Experts","summary":" Recent vision-language models have shown impressive multi-modal generation\ncapabilities. However, typically they require training huge models on massive\ndatasets. As a more scalable alternative, we introduce Prismer, a data- and\nparameter-efficient vision-language model that leverages an ensemble of\ntask-specific experts. Prismer only requires training of a small number of\ncomponents, with the majority of network weights inherited from multiple\nreadily-available, pre-trained experts, and kept frozen during training. By\nleveraging experts from a wide range of domains, we show Prismer can\nefficiently pool this expert knowledge and adapt it to various vision-language\nreasoning tasks. In our experiments, we show that Prismer achieves fine-tuned\nand few-shot learning performance which is competitive with current\nstate-of-the-arts, whilst requiring up to two orders of magnitude less training\ndata. Code is available at https://github.com/NVlabs/prismer.\n","authors":["Shikun Liu","Linxi Fan","Edward Johns","Zhiding Yu","Chaowei Xiao","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2303.02506v3.pdf","comment":"Published at TMLR 2024. 
Project Page:\n https://shikun.io/projects/prismer Code: https://github.com/NVlabs/prismer"},{"id":"http://arxiv.org/abs/2401.10394v1","updated":"2024-01-18T22:07:48Z","published":"2024-01-18T22:07:48Z","title":"Distribution Consistency based Self-Training for Graph Neural Networks\n with Sparse Labels","summary":" Few-shot node classification poses a significant challenge for Graph Neural\nNetworks (GNNs) due to insufficient supervision and potential distribution\nshifts between labeled and unlabeled nodes. Self-training has emerged as a\nwidely popular framework to leverage the abundance of unlabeled data, which\nexpands the training set by assigning pseudo-labels to selected unlabeled\nnodes. Efforts have been made to develop various selection strategies based on\nconfidence, information gain, etc. However, none of these methods takes into\naccount the distribution shift between the training and testing node sets. The\npseudo-labeling step may amplify this shift and even introduce new ones,\nhindering the effectiveness of self-training. Therefore, in this work, we\nexplore the potential of explicitly bridging the distribution shift between the\nexpanded training set and test set during self-training. To this end, we\npropose a novel Distribution-Consistent Graph Self-Training (DC-GST) framework\nto identify pseudo-labeled nodes that are both informative and capable of\nredeeming the distribution discrepancy and formulate it as a differentiable\noptimization task. A distribution-shift-aware edge predictor is further adopted\nto augment the graph and increase the model's generalizability in assigning\npseudo labels. We evaluate our proposed method on four publicly available\nbenchmark datasets and extensive experiments demonstrate that our framework\nconsistently outperforms state-of-the-art baselines.\n","authors":["Fali Wang","Tianxiang Zhao","Suhang Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10394v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2401.10393v1","updated":"2024-01-18T22:06:38Z","published":"2024-01-18T22:06:38Z","title":"Catastrophic Interference is Mitigated in Naturalistic Power-Law\n Learning Environments","summary":" Neural networks often suffer from catastrophic interference (CI): performance\non previously learned tasks drops off significantly when learning a new task.\nThis contrasts strongly with humans, who can sequentially learn new tasks\nwithout appreciably forgetting previous tasks. Prior work has explored various\ntechniques for mitigating CI such as regularization, rehearsal, generative\nreplay, and distillation methods. The current work takes a different approach,\none guided by cognitive science research showing that in naturalistic\nenvironments, the probability of encountering a task decreases as a power-law\nof the time since it was last performed. We argue that a realistic evaluation\nof techniques for the mitigation of CI should be performed in simulated\nnaturalistic learning environments. Thus, we evaluate the extent of mitigation\nof CI when training simple rehearsal-based methods in power-law environments\nsimilar to the ones humans face. Our work explores this novel rehearsal-based\napproach for a domain-incremental task: learning permutations in the MNIST\ntask. We compare our rehearsal environment with other baselines to show its\nefficacy in promoting continual learning. Additionally, we investigate whether\nthis environment shows forward facilitation, i.e., faster learning of later\ntasks. 
Next, we explore the robustness of our learning environment to the\nnumber of tasks, model size, and amount of data rehearsed after each task.\nNotably, our results show that the performance is comparable or superior to\nthat of models trained using popular regularization methods and also to\nrehearsals in non-power-law environments. The benefits of this training\nparadigm include simplicity and the lack of a need for extra neural circuitry.\nIn addition, because our method is orthogonal to other methods, future research\ncan combine training in power-law environments with other continual learning\nmechanisms.\n","authors":["Atith Gandhi","Raj Sanjay Shah","Vijay Marupudi","Sashank Varma"],"pdf_url":"https://arxiv.org/pdf/2401.10393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10386v1","updated":"2024-01-18T21:49:04Z","published":"2024-01-18T21:49:04Z","title":"Noninvasive Acute Compartment Syndrome Diagnosis Using Random Forest\n Machine Learning","summary":" Acute compartment syndrome (ACS) is an orthopedic emergency, caused by\nelevated pressure within a muscle compartment, that leads to permanent tissue\ndamage and eventually death. Diagnosis of ACS relies heavily on\npatient-reported symptoms, a method that is clinically unreliable and often\nsupplemented with invasive intracompartmental pressure measurements. This study\nproposes a continuous, objective, noninvasive diagnostic for ACS. The device\ndetects ACS through a random forest machine learning model that uses pressure\nreadings from force-sensitive resistors (FSRs) placed on the skin. The final\ndiagnosis is exported real-time to a web application via Bluetooth. To validate\nthe diagnostic, a data set containing FSR measurements and the corresponding\nsimulated intracompartmental pressure was created. The diagnostic achieved an\naccuracy, on par to the invasive gold standard, of 97%. The device excelled in\nkey performance metrics including precision, sensitivity, and F1 score.\nManufactured for 73 USD, our device may be an economic alternative to\nneedle-based diagnostics. These results demonstrate the potential of\nnoninvasive ACS diagnostics to meet clinical standards and enhance patient\ncare.\n","authors":["Zaina Abu Hweij","Florence Liang","Sophie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10385v1","updated":"2024-01-18T21:45:09Z","published":"2024-01-18T21:45:09Z","title":"Approximation of Solution Operators for High-dimensional PDEs","summary":" We propose a finite-dimensional control-based method to approximate solution\noperators for evolutional partial differential equations (PDEs), particularly\nin high-dimensions. By employing a general reduced-order model, such as a deep\nneural network, we connect the evolution of the model parameters with\ntrajectories in a corresponding function space. Using the computational\ntechnique of neural ordinary differential equation, we learn the control over\nthe parameter space such that from any initial starting point, the controlled\ntrajectories closely approximate the solutions to the PDE. Approximation\naccuracy is justified for a general class of second-order nonlinear PDEs.\nNumerical results are presented for several high-dimensional PDEs, including\nreal-world applications to solving Hamilton-Jacobi-Bellman equations. 
These are\ndemonstrated to show the accuracy and efficiency of the proposed method.\n","authors":["Nathan Gaby","Xiaojing Ye"],"pdf_url":"https://arxiv.org/pdf/2401.10385v1.pdf","comment":"14 pages, 4 page appendix, 4 figures"},{"id":"http://arxiv.org/abs/2401.10383v1","updated":"2024-01-18T21:36:17Z","published":"2024-01-18T21:36:17Z","title":"Cooperative Multi-Agent Graph Bandits: UCB Algorithm and Regret Analysis","summary":" In this paper, we formulate the multi-agent graph bandit problem as a\nmulti-agent extension of the graph bandit problem introduced by Zhang,\nJohansson, and Li [CISS 57, 1-6 (2023)]. In our formulation, $N$ cooperative\nagents travel on a connected graph $G$ with $K$ nodes. Upon arrival at each\nnode, agents observe a random reward drawn from a node-dependent probability\ndistribution. The reward of the system is modeled as a weighted sum of the\nrewards the agents observe, where the weights capture the decreasing marginal\nreward associated with multiple agents sampling the same node at the same time.\nWe propose an Upper Confidence Bound (UCB)-based learning algorithm,\nMulti-G-UCB, and prove that its expected regret over $T$ steps is bounded by\n$O(N\\log(T)[\\sqrt{KT} + DK])$, where $D$ is the diameter of graph $G$. Lastly,\nwe numerically test our algorithm by comparing it to alternative methods.\n","authors":["Phevos Paschalidis","Runyu Zhang","Na Li"],"pdf_url":"https://arxiv.org/pdf/2401.10383v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12871v2","updated":"2024-01-18T21:20:08Z","published":"2023-08-24T15:48:21Z","title":"IPA: Inference Pipeline Adaptation to Achieve High Accuracy and\n Cost-Efficiency","summary":" Efficiently optimizing multi-model inference pipelines for fast, accurate,\nand cost-effective inference is a crucial challenge in machine learning\nproduction systems, given their tight end-to-end latency requirements. To\nsimplify the exploration of the vast and intricate trade-off space of latency,\naccuracy, and cost in inference pipelines, providers frequently opt to consider\none of them. However, the challenge lies in reconciling latency, accuracy, and\ncost trade-offs. To address this challenge and propose a solution to\nefficiently manage model variants in inference pipelines, we present IPA, an\nonline deep learning Inference Pipeline Adaptation system that efficiently\nleverages model variants for each deep learning task. Model variants are\ndifferent versions of pre-trained models for the same deep learning task with\nvariations in resource requirements, latency, and accuracy. IPA dynamically\nconfigures batch size, replication, and model variants to optimize accuracy,\nminimize costs, and meet user-defined latency Service Level Agreements (SLAs)\nusing Integer Programming. It supports multi-objective settings for achieving\ndifferent trade-offs between accuracy and cost objectives while remaining\nadaptable to varying workloads and dynamic traffic patterns. Navigating a wider\nvariety of configurations allows \\namex{} to achieve better trade-offs between\ncost and accuracy objectives compared to existing methods. Extensive\nexperiments in a Kubernetes implementation with five real-world inference\npipelines demonstrate that IPA improves end-to-end accuracy by up to 21% with a\nminimal cost increase. 
The code and data for replications are available at\nhttps://github.com/reconfigurable-ml-pipeline/ipa.\n","authors":["Saeid Ghafouri","Kamran Razavi","Mehran Salmani","Alireza Sanaee","Tania Lorido-Botran","Lin Wang","Joseph Doyle","Pooyan Jamshidi"],"pdf_url":"https://arxiv.org/pdf/2308.12871v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04965v2","updated":"2024-01-18T21:17:04Z","published":"2023-10-08T01:51:17Z","title":"MULTISCRIPT: Multimodal Script Learning for Supporting Open Domain\n Everyday Tasks","summary":" Automatically generating scripts (i.e. sequences of key steps described in\ntext) from video demonstrations and reasoning about the subsequent steps are\ncrucial to the modern AI virtual assistants to guide humans to complete\neveryday tasks, especially unfamiliar ones. However, current methods for\ngenerative script learning rely heavily on well-structured preceding steps\ndescribed in text and/or images or are limited to a certain domain, resulting\nin a disparity with real-world user scenarios. To address these limitations, we\npresent a new benchmark challenge -- MultiScript, with two new tasks on\ntask-oriented multimodal script learning: (1) multimodal script generation, and\n(2) subsequent step prediction. For both tasks, the input consists of a target\ntask name and a video illustrating what has been done to complete the target\ntask, and the expected output is (1) a sequence of structured step descriptions\nin text based on the demonstration video, and (2) a single text description for\nthe subsequent step, respectively. Built from WikiHow, MultiScript covers\nmultimodal scripts in videos and text descriptions for over 6,655 human\neveryday tasks across 19 diverse domains. To establish baseline performance on\nMultiScript, we propose two knowledge-guided multimodal generative frameworks\nthat incorporate the task-related knowledge prompted from large language models\nsuch as Vicuna. Experimental results show that our proposed approaches\nsignificantly improve over the competitive baselines.\n","authors":["Jingyuan Qi","Minqian Liu","Ying Shen","Zhiyang Xu","Lifu Huang"],"pdf_url":"https://arxiv.org/pdf/2310.04965v2.pdf","comment":"Accepted by AAAI 2024. 11 pages, 9 figures, 4 tables"},{"id":"http://arxiv.org/abs/2212.00219v4","updated":"2024-01-18T21:07:17Z","published":"2022-12-01T01:59:28Z","title":"Are you using test log-likelihood correctly?","summary":" Test log-likelihood is commonly used to compare different models of the same\ndata or different approximate inference algorithms for fitting the same\nprobabilistic model. We present simple examples demonstrating how comparisons\nbased on test log-likelihood can contradict comparisons according to other\nobjectives. Specifically, our examples show that (i) approximate Bayesian\ninference algorithms that attain higher test log-likelihoods need not also\nyield more accurate posterior approximations and (ii) conclusions about\nforecast accuracy based on test log-likelihood comparisons may not agree with\nconclusions based on root mean squared error.\n","authors":["Sameer K. Deshpande","Soumya Ghosh","Tin D. Nguyen","Tamara Broderick"],"pdf_url":"https://arxiv.org/pdf/2212.00219v4.pdf","comment":"Presented at the ICBINB Workshop at NeurIPS 2022. 
This version\n accepted at TMLR, available at https://openreview.net/forum?id=n2YifD4Dxo"},{"id":"http://arxiv.org/abs/2312.00067v2","updated":"2024-01-18T20:58:50Z","published":"2023-11-29T19:52:53Z","title":"Predicting breast cancer with AI for individual risk-adjusted MRI\n screening and early detection","summary":" Women with an increased life-time risk of breast cancer undergo supplemental\nannual screening MRI. We propose to predict the risk of developing breast\ncancer within one year based on the current MRI, with the objective of reducing\nscreening burden and facilitating early detection. An AI algorithm was\ndeveloped on 53,858 breasts from 12,694 patients who underwent screening or\ndiagnostic MRI and accrued over 12 years, with 2,331 confirmed cancers. A first\nU-Net was trained to segment lesions and identify regions of concern. A second\nconvolutional network was trained to detect malignant cancer using features\nextracted by the U-Net. This network was then fine-tuned to estimate the risk\nof developing cancer within a year in cases that radiologists considered normal\nor likely benign. Risk predictions from this AI were evaluated with a\nretrospective analysis of 9,183 breasts from a high-risk screening cohort,\nwhich were not used for training. Statistical analysis focused on the tradeoff\nbetween number of omitted exams versus negative predictive value, and number of\npotential early detections versus positive predictive value. The AI algorithm\nidentified regions of concern that coincided with future tumors in 52% of\nscreen-detected cancers. Upon directed review, a radiologist found that 71.3%\nof cancers had a visible correlate on the MRI prior to diagnosis, 65% of these\ncorrelates were identified by the AI model. Reevaluating these regions in 10%\nof all cases with higher AI-predicted risk could have resulted in up to 33%\nearly detections by a radiologist. Additionally, screening burden could have\nbeen reduced in 16% of lower-risk cases by recommending a later follow-up\nwithout compromising current interval cancer rate. With increasing datasets and\nimproving image quality we expect this new AI-aided, adaptive screening to\nmeaningfully reduce screening burden and improve early detection.\n","authors":["Lukas Hirsch","Yu Huang","Hernan A. Makse","Danny F. Martinez","Mary Hughes","Sarah Eskreis-Winkler","Katja Pinker","Elizabeth Morris","Lucas C. Parra","Elizabeth J. Sutton"],"pdf_url":"https://arxiv.org/pdf/2312.00067v2.pdf","comment":"Major revisions and rewriting in progress"},{"id":"http://arxiv.org/abs/2401.10375v1","updated":"2024-01-18T20:56:42Z","published":"2024-01-18T20:56:42Z","title":"Vulnerabilities of Foundation Model Integrated Federated Learning Under\n Adversarial Threats","summary":" Federated Learning (FL) addresses critical issues in machine learning related\nto data privacy and security, yet suffering from data insufficiency and\nimbalance under certain circumstances. The emergence of foundation models (FMs)\noffers potential solutions to the limitations of existing FL frameworks, e.g.,\nby generating synthetic data for model initialization. However, due to the\ninherent safety concerns of FMs, integrating FMs into FL could introduce new\nrisks, which remains largely unexplored. To address this gap, we conduct the\nfirst investigation on the vulnerability of FM integrated FL (FM-FL) under\nadversarial threats. 
Based on a unified framework of FM-FL, we introduce a\nnovel attack strategy that exploits safety issues of FM to compromise FL client\nmodels. Through extensive experiments with well-known models and benchmark\ndatasets in both image and text domains, we reveal the high susceptibility of\nthe FM-FL to this new threat under various FL configurations. Furthermore, we\nfind that existing FL defense strategies offer limited protection against this\nnovel attack approach. This research highlights the critical need for enhanced\nsecurity measures in FL in the era of FMs.\n","authors":["Chen Wu","Xi Li","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10375v1.pdf","comment":"Chen Wu and Xi Li are equal contribution. The corresponding author is\n Jiaqi Wang"},{"id":"http://arxiv.org/abs/2401.10373v1","updated":"2024-01-18T20:43:43Z","published":"2024-01-18T20:43:43Z","title":"Harmonized Spatial and Spectral Learning for Robust and Generalized\n Medical Image Segmentation","summary":" Deep learning has demonstrated remarkable achievements in medical image\nsegmentation. However, prevailing deep learning models struggle with poor\ngeneralization due to (i) intra-class variations, where the same class appears\ndifferently in different samples, and (ii) inter-class independence, resulting\nin difficulties capturing intricate relationships between distinct objects,\nleading to higher false negative cases. This paper presents a novel approach\nthat synergies spatial and spectral representations to enhance\ndomain-generalized medical image segmentation. We introduce the innovative\nSpectral Correlation Coefficient objective to improve the model's capacity to\ncapture middle-order features and contextual long-range dependencies. This\nobjective complements traditional spatial objectives by incorporating valuable\nspectral information. Extensive experiments reveal that optimizing this\nobjective with existing architectures like UNet and TransUNet significantly\nenhances generalization, interpretability, and noise robustness, producing more\nconfident predictions. For instance, in cardiac segmentation, we observe a 0.81\npp and 1.63 pp (pp = percentage point) improvement in DSC over UNet and\nTransUNet, respectively. Our interpretability study demonstrates that, in most\ntasks, objectives optimized with UNet outperform even TransUNet by introducing\nglobal contextual information alongside local details. These findings\nunderscore the versatility and effectiveness of our proposed method across\ndiverse imaging modalities and medical domains.\n","authors":["Vandan Gorade","Sparsh Mittal","Debesh Jha","Rekha Singhal","Ulas Bagci"],"pdf_url":"https://arxiv.org/pdf/2401.10373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13486v2","updated":"2024-01-18T20:42:48Z","published":"2023-12-20T23:45:06Z","title":"Meta-Learning with Versatile Loss Geometries for Fast Adaptation Using\n Mirror Descent","summary":" Utilizing task-invariant prior knowledge extracted from related tasks,\nmeta-learning is a principled framework that empowers learning a new task\nespecially when data records are limited. A fundamental challenge in\nmeta-learning is how to quickly \"adapt\" the extracted prior in order to train a\ntask-specific model within a few optimization steps. Existing approaches deal\nwith this challenge using a preconditioner that enhances convergence of the\nper-task training process. 
Though effective in representing locally a quadratic\ntraining loss, these simple linear preconditioners can hardly capture complex\nloss geometries. The present contribution addresses this limitation by learning\na nonlinear mirror map, which induces a versatile distance metric to enable\ncapturing and optimizing a wide range of loss geometries, hence facilitating\nthe per-task training. Numerical tests on few-shot learning datasets\ndemonstrate the superior expressiveness and convergence of the advocated\napproach.\n","authors":["Yilang Zhang","Bingcong Li","Georgios B. Giannakis"],"pdf_url":"https://arxiv.org/pdf/2312.13486v2.pdf","comment":"Accepted by 2024 IEEE International Conference on Acoustics, Speech\n and Signal Processing (ICASSP-24)"},{"id":"http://arxiv.org/abs/2401.10371v1","updated":"2024-01-18T20:35:47Z","published":"2024-01-18T20:35:47Z","title":"Langevin Unlearning: A New Perspective of Noisy Gradient Descent for\n Machine Unlearning","summary":" Machine unlearning has raised significant interest with the adoption of laws\nensuring the ``right to be forgotten''. Researchers have provided a\nprobabilistic notion of approximate unlearning under a similar definition of\nDifferential Privacy (DP), where privacy is defined as statistical\nindistinguishability to retraining from scratch. We propose Langevin\nunlearning, an unlearning framework based on noisy gradient descent with\nprivacy guarantees for approximate unlearning problems. Langevin unlearning\nunifies the DP learning process and the privacy-certified unlearning process\nwith many algorithmic benefits. These include approximate certified unlearning\nfor non-convex problems, complexity saving compared to retraining, sequential\nand batch unlearning for multiple unlearning requests. We verify the\npracticality of Langevin unlearning by studying its privacy-utility-complexity\ntrade-off via experiments on benchmark datasets, and also demonstrate its\nsuperiority against gradient-decent-plus-output-perturbation based approximate\nunlearning.\n","authors":["Eli Chien","Haoyu Wang","Ziang Chen","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10370v1","updated":"2024-01-18T20:35:32Z","published":"2024-01-18T20:35:32Z","title":"Deep Generative Modeling for Financial Time Series with Application in\n VaR: A Comparative Review","summary":" In the financial services industry, forecasting the risk factor distribution\nconditional on the history and the current market environment is the key to\nmarket risk modeling in general and value at risk (VaR) model in particular. As\none of the most widely adopted VaR models in commercial banks, Historical\nsimulation (HS) uses the empirical distribution of daily returns in a\nhistorical window as the forecast distribution of risk factor returns in the\nnext day. The objectives for financial time series generation are to generate\nsynthetic data paths with good variety, and similar distribution and dynamics\nto the original historical data. In this paper, we apply multiple existing deep\ngenerative methods (e.g., CGAN, CWGAN, Diffusion, and Signature WGAN) for\nconditional time series generation, and propose and test two new methods for\nconditional multi-step time series generation, namely Encoder-Decoder CGAN and\nConditional TimeVAE. Furthermore, we introduce a comprehensive framework with a\nset of KPIs to measure the quality of the generated time series for financial\nmodeling. 
The KPIs cover distribution distance, autocorrelation and\nbacktesting. All models (HS, parametric and neural networks) are tested on both\nhistorical USD yield curve data and additional data simulated from GARCH and\nCIR processes. The study shows that top performing models are HS, GARCH and\nCWGAN models. Future research directions in this area are also discussed.\n","authors":["Lars Ericson","Xuejun Zhu","Xusi Han","Rao Fu","Shuang Li","Steve Guo","Ping Hu"],"pdf_url":"https://arxiv.org/pdf/2401.10370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10364v1","updated":"2024-01-18T20:14:10Z","published":"2024-01-18T20:14:10Z","title":"Using LLM such as ChatGPT for Designing and Implementing a RISC\n Processor: Execution,Challenges and Limitations","summary":" This paper discusses the feasibility of using Large Language Models LLM for\ncode generation with a particular application in designing an RISC. The paper\nalso reviews the associated steps such as parsing, tokenization, encoding,\nattention mechanism, sampling the tokens and iterations during code generation.\nThe generated code for the RISC components is verified through testbenches and\nhardware implementation on a FPGA board. Four metric parameters Correct output\non the first iteration, Number of errors embedded in the code, Number of trials\nrequired to achieve the code and Failure to generate the code after three\niterations, are used to compare the efficiency of using LLM in programming. In\nall the cases, the generated code had significant errors and human intervention\nwas always required to fix the bugs. LLM can therefore be used to complement a\nprogrammer code design.\n","authors":["Shadeeb Hossain","Aayush Gohil","Yizhou Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10822v2","updated":"2024-01-18T20:13:41Z","published":"2023-06-19T10:12:32Z","title":"Interpreting Deep Neural Networks with the Package innsight","summary":" The R package innsight offers a general toolbox for revealing variable-wise\ninterpretations of deep neural networks' predictions with so-called feature\nattribution methods. Aside from the unified and user-friendly framework, the\npackage stands out in three ways: It is generally the first R package\nimplementing feature attribution methods for neural networks. Secondly, it\noperates independently of the deep learning library allowing the interpretation\nof models from any R package, including keras, torch, neuralnet, and even\ncustom models. Despite its flexibility, innsight benefits internally from the\ntorch package's fast and efficient array calculations, which builds on LibTorch\n$-$ PyTorch's C++ backend $-$ without a Python dependency. Finally, it offers a\nvariety of visualization tools for tabular, signal, image data or a combination\nof these. Additionally, the plots can be rendered interactively using the\nplotly package.\n","authors":["Niklas Koenen","Marvin N. Wright"],"pdf_url":"https://arxiv.org/pdf/2306.10822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10361v1","updated":"2024-01-18T20:05:34Z","published":"2024-01-18T20:05:34Z","title":"Hierarchical Federated Learning in Multi-hop Cluster-Based VANETs","summary":" The usage of federated learning (FL) in Vehicular Ad hoc Networks (VANET) has\ngarnered significant interest in research due to the advantages of reducing\ntransmission overhead and protecting user privacy by communicating local\ndataset gradients instead of raw data. 
However, implementing FL in VANETs faces\nchallenges, including limited communication resources, high vehicle mobility,\nand the statistical diversity of data distributions. In order to tackle these\nissues, this paper introduces a novel framework for hierarchical federated\nlearning (HFL) over multi-hop clustering-based VANET. The proposed method\nutilizes a weighted combination of the average relative speed and cosine\nsimilarity of FL model parameters as a clustering metric to consider both data\ndiversity and high vehicle mobility. This metric ensures convergence with\nminimum changes in cluster heads while tackling the complexities associated\nwith non-independent and identically distributed (non-IID) data scenarios.\nAdditionally, the framework includes a novel mechanism to manage seamless\ntransitions of cluster heads (CHs), followed by transferring the most recent FL\nmodel parameter to the designated CH. Furthermore, the proposed approach\nconsiders the option of merging CHs, aiming to reduce their count and,\nconsequently, mitigate associated overhead. Through extensive simulations, the\nproposed hierarchical federated learning over clustered VANET has been\ndemonstrated to improve accuracy and convergence time significantly while\nmaintaining an acceptable level of packet overhead compared to previously\nproposed clustering algorithms and non-clustered VANET.\n","authors":["M. Saeid HaghighiFard","Sinem Coleri"],"pdf_url":"https://arxiv.org/pdf/2401.10361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.16031v3","updated":"2024-01-18T19:59:42Z","published":"2022-03-30T03:32:05Z","title":"How Deep is Your Art: An Experimental Study on the Limits of Artistic\n Understanding in a Single-Task, Single-Modality Neural Network","summary":" Computational modeling of artwork meaning is complex and difficult. This is\nbecause art interpretation is multidimensional and highly subjective. This\npaper experimentally investigated the degree to which a state-of-the-art Deep\nConvolutional Neural Network (DCNN), a popular Machine Learning approach, can\ncorrectly distinguish modern conceptual art work into the galleries devised by\nart curators. Two hypotheses were proposed to state that the DCNN model uses\nExhibited Properties for classification, like shape and color, but not\nNon-Exhibited Properties, such as historical context and artist intention. The\ntwo hypotheses were experimentally validated using a methodology designed for\nthis purpose. VGG-11 DCNN pre-trained on ImageNet dataset and discriminatively\nfine-tuned was trained on handcrafted datasets designed from real-world\nconceptual photography galleries. Experimental results supported the two\nhypotheses showing that the DCNN model ignores Non-Exhibited Properties and\nuses only Exhibited Properties for artwork classification. This work points to\ncurrent DCNN limitations, which should be addressed by future DNN models.\n","authors":["Mahan Agha Zahedi","Niloofar Gholamrezaei","Alex Doboli"],"pdf_url":"https://arxiv.org/pdf/2203.16031v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10360v1","updated":"2024-01-18T19:58:59Z","published":"2024-01-18T19:58:59Z","title":"Excuse me, sir? Your language model is leaking (information)","summary":" We introduce a cryptographic method to hide an arbitrary secret payload in\nthe response of a Large Language Model (LLM). 
A secret key is required to\nextract the payload from the model's response, and without the key it is\nprovably impossible to distinguish between the responses of the original LLM\nand the LLM that hides a payload. In particular, the quality of generated text\nis not affected by the payload. Our approach extends a recent result of Christ,\nGunn and Zamir (2023) who introduced an undetectable watermarking scheme for\nLLMs.\n","authors":["Or Zamir"],"pdf_url":"https://arxiv.org/pdf/2401.10360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10355v1","updated":"2024-01-18T19:48:53Z","published":"2024-01-18T19:48:53Z","title":"Intelligent Optimization and Machine Learning Algorithms for Structural\n Anomaly Detection using Seismic Signals","summary":" The lack of anomaly detection methods during mechanized tunnelling can cause\nfinancial loss and deficits in drilling time. On-site excavation requires hard\nobstacles to be recognized prior to drilling in order to avoid damaging the\ntunnel boring machine and to adjust the propagation velocity. The efficiency of\nthe structural anomaly detection can be increased with intelligent optimization\ntechniques and machine learning. In this research, the anomaly in a simple\nstructure is detected by comparing the experimental measurements of the\nstructural vibrations with numerical simulations using parameter estimation\nmethods.\n","authors":["Maximilian Trapp","Can Bogoclu","Tamara Nestorović","Dirk Roos"],"pdf_url":"https://arxiv.org/pdf/2401.10355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10354v1","updated":"2024-01-18T19:46:24Z","published":"2024-01-18T19:46:24Z","title":"Towards providing reliable job completion time predictions using PCS","summary":" In this paper we build a case for providing job completion time predictions\nto cloud users, similar to the delivery date of a package or arrival time of a\nbooked ride. Our analysis reveals that providing predictability can come at the\nexpense of performance and fairness. Existing cloud scheduling systems optimize\nfor extreme points in the trade-off space, making them either extremely\nunpredictable or impractical.\n To address this challenge, we present PCS, a new scheduling framework that\naims to provide predictability while balancing other traditional objectives.\nThe key idea behind PCS is to use Weighted-Fair-Queueing (WFQ) and find a\nsuitable configuration of different WFQ parameters (e.g., class weights) that\nmeets specific goals for predictability. It uses a simulation-aided search\nstrategy, to efficiently discover WFQ configurations that lie on the Pareto\nfront of the trade-off space between these objectives. We implement and\nevaluate PCS in the context of DNN job scheduling on GPUs. Our evaluation, on a\nsmall scale GPU testbed and larger-scale simulations, shows that PCS can\nprovide accurate completion time estimates while marginally compromising on\nperformance and fairness.\n","authors":["Abdullah Bin Faisal","Noah Martin","Hafiz Mohsin Bashir","Swaminathan Lamelas","Fahad R. Dogar"],"pdf_url":"https://arxiv.org/pdf/2401.10354v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03298v2","updated":"2024-01-18T19:45:02Z","published":"2023-10-05T03:56:09Z","title":"A Latent Variable Approach for Non-Hierarchical Multi-Fidelity Adaptive\n Sampling","summary":" Multi-fidelity (MF) methods are gaining popularity for enhancing surrogate\nmodeling and design optimization by incorporating data from various\nlow-fidelity (LF) models. 
While most existing MF methods assume a fixed\ndataset, adaptive sampling methods that dynamically allocate resources among\nfidelity models can achieve higher efficiency in the exploring and exploiting\nthe design space. However, most existing MF methods rely on the hierarchical\nassumption of fidelity levels or fail to capture the intercorrelation between\nmultiple fidelity levels and utilize it to quantify the value of the future\nsamples and navigate the adaptive sampling. To address this hurdle, we propose\na framework hinged on a latent embedding for different fidelity models and the\nassociated pre-posterior analysis to explicitly utilize their correlation for\nadaptive sampling. In this framework, each infill sampling iteration includes\ntwo steps: We first identify the location of interest with the greatest\npotential improvement using the high-fidelity (HF) model, then we search for\nthe next sample across all fidelity levels that maximize the improvement per\nunit cost at the location identified in the first step. This is made possible\nby a single Latent Variable Gaussian Process (LVGP) model that maps different\nfidelity models into an interpretable latent space to capture their\ncorrelations without assuming hierarchical fidelity levels. The LVGP enables us\nto assess how LF sampling candidates will affect HF response with pre-posterior\nanalysis and determine the next sample with the best benefit-to-cost ratio.\nThrough test cases, we demonstrate that the proposed method outperforms the\nbenchmark methods in both MF global fitting (GF) and Bayesian Optimization (BO)\nproblems in convergence rate and robustness. Moreover, the method offers the\nflexibility to switch between GF and BO by simply changing the acquisition\nfunction.\n","authors":["Yi-Ping Chen","Liwei Wang","Yigitcan Comlek","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2310.03298v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09726v2","updated":"2024-01-18T19:27:04Z","published":"2022-12-19T18:51:06Z","title":"Improving Faithfulness of Abstractive Summarization by Controlling\n Confounding Effect of Irrelevant Sentences","summary":" Lack of factual correctness is an issue that still plagues state-of-the-art\nsummarization systems despite their impressive progress on generating seemingly\nfluent summaries. In this paper, we show that factual inconsistency can be\ncaused by irrelevant parts of the input text, which act as confounders. To that\nend, we leverage information-theoretic measures of causal effects to quantify\nthe amount of confounding and precisely quantify how they affect the\nsummarization performance. Based on insights derived from our theoretical\nresults, we design a simple multi-task model to control such confounding by\nleveraging human-annotated relevant sentences when available. Crucially, we\ngive a principled characterization of data distributions where such confounding\ncan be large thereby necessitating the use of human annotated relevant\nsentences to generate factual summaries. Our approach improves faithfulness\nscores by 20\\% over strong baselines on AnswerSumm\n\\citep{fabbri2021answersumm}, a conversation summarization dataset where lack\nof faithfulness is a significant issue due to the subjective nature of the\ntask. 
Our best method achieves the highest faithfulness score while also\nachieving state-of-the-art results on standard metrics like ROUGE and METEOR.\nWe corroborate these improvements through human evaluation.\n","authors":["Asish Ghoshal","Arash Einolghozati","Ankit Arun","Haoran Li","Lili Yu","Vera Gor","Yashar Mehdad","Scott Wen-tau Yih","Asli Celikyilmaz"],"pdf_url":"https://arxiv.org/pdf/2212.09726v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.06826v3","updated":"2024-01-18T19:12:12Z","published":"2021-09-14T17:12:20Z","title":"Few-shot Quality-Diversity Optimization","summary":" In the past few years, a considerable amount of research has been dedicated\nto the exploitation of previous learning experiences and the design of Few-shot\nand Meta Learning approaches, in problem domains ranging from Computer Vision\nto Reinforcement Learning based control. A notable exception, where to the best\nof our knowledge, little to no effort has been made in this direction is\nQuality-Diversity (QD) optimization. QD methods have been shown to be effective\ntools in dealing with deceptive minima and sparse rewards in Reinforcement\nLearning. However, they remain costly due to their reliance on inherently\nsample inefficient evolutionary processes. We show that, given examples from a\ntask distribution, information about the paths taken by optimization in\nparameter space can be leveraged to build a prior population, which when used\nto initialize QD methods in unseen environments, allows for few-shot\nadaptation. Our proposed method does not require backpropagation. It is simple\nto implement and scale, and furthermore, it is agnostic to the underlying\nmodels that are being trained. Experiments carried in both sparse and dense\nreward settings using robotic manipulation and navigation benchmarks show that\nit considerably reduces the number of generations that are required for QD\noptimization in these environments.\n","authors":["Achkan Salehi","Alexandre Coninx","Stephane Doncieux"],"pdf_url":"https://arxiv.org/pdf/2109.06826v3.pdf","comment":"Accepted for publication in the IEEE Robotics and Automation Letters\n (RA-L) journal"},{"id":"http://arxiv.org/abs/2309.04452v2","updated":"2024-01-18T19:02:54Z","published":"2023-09-08T17:20:51Z","title":"Postprocessing of Ensemble Weather Forecasts Using Permutation-invariant\n Neural Networks","summary":" Statistical postprocessing is used to translate ensembles of raw numerical\nweather forecasts into reliable probabilistic forecast distributions. In this\nstudy, we examine the use of permutation-invariant neural networks for this\ntask. In contrast to previous approaches, which often operate on ensemble\nsummary statistics and dismiss details of the ensemble distribution, we propose\nnetworks that treat forecast ensembles as a set of unordered member forecasts\nand learn link functions that are by design invariant to permutations of the\nmember ordering. We evaluate the quality of the obtained forecast distributions\nin terms of calibration and sharpness and compare the models against classical\nand neural network-based benchmark methods. In case studies addressing the\npostprocessing of surface temperature and wind gust forecasts, we demonstrate\nstate-of-the-art prediction quality. 
To deepen the understanding of the learned\ninference process, we further propose a permutation-based importance analysis\nfor ensemble-valued predictors, which highlights specific aspects of the\nensemble forecast that are considered important by the trained postprocessing\nmodels. Our results suggest that most of the relevant information is contained\nin a few ensemble-internal degrees of freedom, which may impact the design of\nfuture ensemble forecasting and postprocessing systems.\n","authors":["Kevin Höhlein","Benedikt Schulz","Rüdiger Westermann","Sebastian Lerch"],"pdf_url":"https://arxiv.org/pdf/2309.04452v2.pdf","comment":"in press"},{"id":"http://arxiv.org/abs/2401.10338v1","updated":"2024-01-18T19:02:41Z","published":"2024-01-18T19:02:41Z","title":"MELODY: Robust Semi-Supervised Hybrid Model for Entity-Level Online\n Anomaly Detection with Multivariate Time Series","summary":" In large IT systems, software deployment is a crucial process in online\nservices as their code is regularly updated. However, a faulty code change may\ndegrade the target service's performance and cause cascading outages in\ndownstream services. Thus, software deployments should be comprehensively\nmonitored, and their anomalies should be detected timely. In this paper, we\nstudy the problem of anomaly detection for deployments. We begin by identifying\nthe challenges unique to this anomaly detection problem, which is at\nentity-level (e.g., deployments), relative to the more typical problem of\nanomaly detection in multivariate time series (MTS). The unique challenges\ninclude the heterogeneity of deployments, the low latency tolerance, the\nambiguous anomaly definition, and the limited supervision. To address them, we\npropose a novel framework, semi-supervised hybrid Model for Entity-Level Online\nDetection of anomalY (MELODY). MELODY first transforms the MTS of different\nentities to the same feature space by an online feature extractor, then uses a\nnewly proposed semi-supervised deep one-class model for detecting anomalous\nentities. We evaluated MELODY on real data of cloud services with 1.2M+ time\nseries. The relative F1 score improvement of MELODY over the state-of-the-art\nmethods ranges from 7.6% to 56.5%. The user evaluation suggests MELODY is\nsuitable for monitoring deployments in large online systems.\n","authors":["Jingchao Ni","Gauthier Guinet","Peihong Jiang","Laurent Callot","Andrey Kan"],"pdf_url":"https://arxiv.org/pdf/2401.10338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10337v1","updated":"2024-01-18T19:02:00Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. 
This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v1.pdf","comment":"accepted at EACL 2024, in ARR October 2023"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.09854v1","updated":"2024-01-18T10:10:25Z","published":"2024-01-18T10:10:25Z","title":"A Survey on Energy Consumption and Environmental Impact of Video\n Streaming","summary":" Climate change challenges require a notable decrease in worldwide greenhouse\ngas (GHG) emissions across technology sectors. Digital technologies, especially\nvideo streaming, accounting for most Internet traffic, make no exception. Video\nstreaming demand increases with remote working, multimedia communication\nservices (e.g., WhatsApp, Skype), video streaming content (e.g., YouTube,\nNetflix), video resolution (4K/8K, 50 fps/60 fps), and multi-view video, making\nenergy consumption and environmental footprint critical. This survey\ncontributes to a better understanding of sustainable and efficient video\nstreaming technologies by providing insights into the state-of-the-art and\npotential future directions for researchers, developers, and engineers, service\nproviders, hosting platforms, and consumers. We widen this survey's focus on\ncontent provisioning and content consumption based on the observation that\ncontinuously active network equipment underneath video streaming consumes\nsubstantial energy independent of the transmitted data type. We propose a\ntaxonomy of factors that affect the energy consumption in video streaming, such\nas encoding schemes, resource requirements, storage, content retrieval,\ndecoding, and display. We identify notable weaknesses in video streaming that\nrequire further research for improved energy efficiency: (1) fixed bitrate\nladders in HTTP live streaming; (2) inefficient hardware utilization of\nexisting video players; (3) lack of comprehensive open energy measurement\ndataset covering various device types and coding parameters for reproducible\nresearch.\n","authors":["Samira Afzal","Narges Mehran","Zoha Azimi Ourimi","Farzad Tashtarian","Hadi Amirpour","Radu Prodan","Christian Timmerer"],"pdf_url":"https://arxiv.org/pdf/2401.09854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12798v4","updated":"2024-01-18T09:03:24Z","published":"2023-10-19T14:52:58Z","title":"MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and\n Uni-Modal Adapter","summary":" Language Models (LMs) have demonstrated impressive molecule understanding\nability on various 1D text-related tasks. However, they inherently lack 2D\ngraph perception - a critical ability of human professionals in comprehending\nmolecules' topological structures. To bridge this gap, we propose MolCA:\nMolecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal\nAdapter. 
MolCA enables an LM (e.g., Galactica) to understand both text- and\ngraph-based molecular contents via the cross-modal projector. Specifically, the\ncross-modal projector is implemented as a Q-Former to connect a graph encoder's\nrepresentation space and an LM's text space. Further, MolCA employs a uni-modal\nadapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks.\nUnlike previous studies that couple an LM with a graph encoder via cross-modal\ncontrastive learning, MolCA retains the LM's ability of open-ended text\ngeneration and augments it with 2D graph information. To showcase its\neffectiveness, we extensively benchmark MolCA on tasks of molecule captioning,\nIUPAC name prediction, and molecule-text retrieval, on which MolCA\nsignificantly outperforms the baselines. Our codes and checkpoints can be found\nat https://github.com/acharkq/MolCA.\n","authors":["Zhiyuan Liu","Sihang Li","Yanchen Luo","Hao Fei","Yixin Cao","Kenji Kawaguchi","Xiang Wang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2310.12798v4.pdf","comment":"EMNLP main conference. 9 pages"},{"id":"http://arxiv.org/abs/2401.09774v1","updated":"2024-01-18T07:50:07Z","published":"2024-01-18T07:50:07Z","title":"On the Audio Hallucinations in Large Audio-Video Language Models","summary":" Large audio-video language models can generate descriptions for both video\nand audio. However, they sometimes ignore audio content, producing audio\ndescriptions solely reliant on visual information. This paper refers to this as\naudio hallucinations and analyzes them in large audio-video language models. We\ngather 1,000 sentences by inquiring about audio information and annotate them\nwhether they contain hallucinations. If a sentence is hallucinated, we also\ncategorize the type of hallucination. The results reveal that 332 sentences are\nhallucinated with distinct trends observed in nouns and verbs for each\nhallucination type. Based on this, we tackle a task of audio hallucination\nclassification using pre-trained audio-text models in the zero-shot and\nfine-tuning settings. Our experimental results reveal that the zero-shot models\nachieve higher performance (52.2% in F1) than the random (40.3%) and the\nfine-tuning models achieve 87.9%, outperforming the zero-shot models.\n","authors":["Taichi Nishimura","Shota Nakada","Masayoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2401.09774v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2311.18243v2","updated":"2024-01-18T07:47:24Z","published":"2023-11-30T04:21:10Z","title":"DKiS: Decay weight invertible image steganography with private key","summary":" Image steganography, defined as the practice of concealing information within\nanother image, traditionally encounters security challenges when its methods\nbecome publicly known or are under attack. To address this, a novel private\nkey-based image steganography technique has been introduced. This approach\nensures the security of the hidden information, as access requires a\ncorresponding private key, regardless of the public knowledge of the\nsteganography method. Experimental evidence has been presented, demonstrating\nthe effectiveness of our method and showcasing its real-world applicability.\nFurthermore, a critical challenge in the invertible image steganography process\nhas been identified by us: the transfer of non-essential, or `garbage',\ninformation from the secret to the host pipeline. 
To tackle this issue, the\ndecay weight has been introduced to control the information transfer,\neffectively filtering out irrelevant data and enhancing the performance of\nimage steganography. The code for this technique is publicly accessible at\nhttps://github.com/yanghangAI/DKiS, and a practical demonstration can be found\nat http://yanghang.site/hidekey.\n","authors":["Hang Yang","Yitian Xu","Xuhua Liu"],"pdf_url":"https://arxiv.org/pdf/2311.18243v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09759v1","updated":"2024-01-18T07:19:10Z","published":"2024-01-18T07:19:10Z","title":"SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech\n Recognition","summary":" Audio-visual speech recognition (AVSR) is a multimodal extension of automatic\nspeech recognition (ASR), using video as a complement to audio. In AVSR,\nconsiderable efforts have been directed at datasets for facial features such as\nlip-readings, while they often fall short in evaluating the image comprehension\ncapabilities in broader contexts. In this paper, we construct SlideAVSR, an\nAVSR dataset using scientific paper explanation videos. SlideAVSR provides a\nnew benchmark where models transcribe speech utterances with texts on the\nslides on the presentation recordings. As technical terminologies that are\nfrequent in paper explanations are notoriously challenging to transcribe\nwithout reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR\nproblems. As a simple yet effective baseline, we propose DocWhisper, an AVSR\nmodel that can refer to textual information from slides, and confirm its\neffectiveness on SlideAVSR.\n","authors":["Hao Wang","Shuhei Kurita","Shuichiro Shimizu","Daisuke Kawahara"],"pdf_url":"https://arxiv.org/pdf/2401.09759v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09725v1","updated":"2024-01-18T05:00:18Z","published":"2024-01-18T05:00:18Z","title":"Enhancing Image-Text Matching with Adaptive Feature Aggregation","summary":" Image-text matching aims to find matched cross-modal pairs accurately. While\ncurrent methods often rely on projecting cross-modal features into a common\nembedding space, they frequently suffer from imbalanced feature representations\nacross different modalities, leading to unreliable retrieval results. To\naddress these limitations, we introduce a novel Feature Enhancement Module that\nadaptively aggregates single-modal features for more balanced and robust\nimage-text retrieval. Additionally, we propose a new loss function that\novercomes the shortcomings of original triplet ranking loss, thereby\nsignificantly improving retrieval performance. The proposed model has been\nevaluated on two public datasets and achieves competitive retrieval performance\nwhen compared with several state-of-the-art models. Implementation codes can be\nfound here.\n","authors":["Zuhui Wang","Yunting Yin","I. V. 
Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2401.09725v1.pdf","comment":"Accepted by ICASSP 2024"}]},"2024-01-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.10882v1","updated":"2024-01-19T18:49:36Z","published":"2024-01-19T18:49:36Z","title":"Reinforcement learning for question answering in programming domain\n using public community scoring as a human feedback","summary":" In this study, we investigate the enhancement of the GPT Neo 125M performance\nin Community Question Answering (CQA) with a focus on programming, through the\nintegration of Reinforcement Learning from Human Feedback (RLHF) and the\nutilization of scores from Stack Overflow. Two distinct reward model training\nstrategies are employed for fine-tuning with Proximal Policy Optimization\n(PPO). Notably, the improvements in performance achieved through this method\nare comparable to those of GPT Neo 2.7B parameter variant. Additionally, an\nauxiliary scoring mechanism is introduced, which demonstrates the limitations\nof conventional linguistic metrics in evaluating responses in the programming\ndomain. Through accurate analysis, this paper looks at the divergence between\ntraditional linguistic metrics and our human-preferences-based reward model,\nunderscoring the imperative for domain-specific evaluation methods. By\nelucidating the complexities involved in applying RLHF to programming CQA and\naccentuating the significance of context-aware evaluation, this study\ncontributes to the ongoing efforts in refining Large Language Models through\nfocused human feedback.\n","authors":["Alexey Gorbatovski","Sergey Kovalchuk"],"pdf_url":"https://arxiv.org/pdf/2401.10882v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10862v1","updated":"2024-01-19T18:05:34Z","published":"2024-01-19T18:05:34Z","title":"Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs\n Without Fine-Tuning","summary":" Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type\nof attack that can coax these models into generating harmful and illegal\ncontent. In this paper, we show that pruning up to 20% of LLM parameters\nmarkedly increases their resistance to such attacks without additional training\nand without sacrificing their performance in standard benchmarks. Intriguingly,\nwe discovered that the enhanced safety observed post-pruning correlates to the\ninitial safety training level of the model, hinting that the effect of pruning\ncould be more general and may hold for other LLM behaviors beyond safety.\nAdditionally, we introduce a curated dataset of 225 harmful tasks across five\ncategories, inserted into ten different Jailbreaking prompts, showing that\npruning aids LLMs in concentrating attention on task-relevant tokens in\njailbreaking prompts. Lastly, our experiments reveal that the prominent chat\nmodels, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high\nsusceptibility to jailbreaking attacks, with some categories achieving nearly\n70-100% success rate. 
These insights underline the potential of pruning as a\ngeneralizable approach for improving LLM safety, reliability, and potentially\nother desired behaviors.\n","authors":["Adib Hasan","Ileana Rugina","Alex Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10850v1","updated":"2024-01-19T17:51:11Z","published":"2024-01-19T17:51:11Z","title":"Advancements in eHealth Data Analytics through Natural Language\n Processing and Deep Learning","summary":" The healthcare environment is commonly referred to as \"information-rich\" but\nalso \"knowledge poor\". Healthcare systems collect huge amounts of data from\nvarious sources: lab reports, medical letters, logs of medical tools or\nprograms, medical prescriptions, etc. These massive sets of data can provide\ngreat knowledge and information that can improve the medical services, and\noverall the healthcare domain, such as disease prediction by analyzing the\npatient's symptoms or disease prevention, by facilitating the discovery of\nbehavioral factors for diseases. Unfortunately, only a relatively small volume\nof the textual eHealth data is processed and interpreted, an important factor\nbeing the difficulty in efficiently performing Big Data operations. In the\nmedical field, detecting domain-specific multi-word terms is a crucial task as\nthey can define an entire concept with a few words. A term can be defined as a\nlinguistic structure or a concept, and it is composed of one or more words with\na specific meaning to a domain. All the terms of a domain create its\nterminology. This chapter offers a critical study of the current, most\nperformant solutions for analyzing unstructured (image and textual) eHealth\ndata. This study also provides a comparison of the current Natural Language\nProcessing and Deep Learning techniques in the eHealth context. Finally, we\nexamine and discuss some of the current issues, and we define a set of research\ndirections in this area.\n","authors":["Elena-Simona Apostol","Ciprian-Octavian Truică"],"pdf_url":"https://arxiv.org/pdf/2401.10850v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. 
This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2309.14393v2","updated":"2024-01-19T17:33:44Z","published":"2023-09-25T14:50:04Z","title":"LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language\n Models","summary":" The carbon footprint associated with large language models (LLMs) is a\nsignificant concern, encompassing emissions from their training, inference,\nexperimentation, and storage processes, including operational and embodied\ncarbon emissions. An essential aspect is accurately estimating the carbon\nimpact of emerging LLMs even before their training, which heavily relies on GPU\nusage. Existing studies have reported the carbon footprint of LLM training, but\nonly one tool, mlco2, can predict the carbon footprint of new neural networks\nprior to physical training. However, mlco2 has several serious limitations. It\ncannot extend its estimation to dense or mixture-of-experts (MoE) LLMs,\ndisregards critical architectural parameters, focuses solely on GPUs, and\ncannot model embodied carbon footprints. Addressing these gaps, we introduce\n\\textit{\\carb}, an end-to-end carbon footprint projection model designed for\nboth dense and MoE LLMs. Compared to mlco2, \\carb~significantly enhances the\naccuracy of carbon footprint estimations for various LLMs. The source code is\nreleased at \\url{https://github.com/SotaroKaneda/MLCarbon}.\n","authors":["Ahmad Faiz","Sotaro Kaneda","Ruhan Wang","Rita Osi","Prateek Sharma","Fan Chen","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.14393v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.10825v1","updated":"2024-01-19T17:21:05Z","published":"2024-01-19T17:21:05Z","title":"A survey on recent advances in named entity recognition","summary":" Named Entity Recognition seeks to extract substrings within a text that name\nreal-world objects and to determine their type (for example, whether they refer\nto persons or organizations). In this survey, we first present an overview of\nrecent popular approaches, but we also look at graph- and transformer- based\nmethods including Large Language Models (LLMs) that have not had much coverage\nin other surveys. Second, we focus on methods designed for datasets with scarce\nannotations. Third, we evaluate the performance of the main NER implementations\non a variety of datasets with differing characteristics (as regards their\ndomain, their size, and their number of classes). We thus provide a deep\ncomparison of algorithms that are never considered together. 
Our experiments\nshed some light on how the characteristics of datasets affect the behavior of\nthe methods that we compare.\n","authors":["Imed Keraghel","Stanislas Morbieu","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2401.10825v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2401.05273v2","updated":"2024-01-19T16:57:30Z","published":"2024-01-10T17:13:28Z","title":"INACIA: Integrating Large Language Models in Brazilian Audit Courts:\n Opportunities and Challenges","summary":" This paper introduces INACIA (Instru\\c{c}\\~ao Assistida com Intelig\\^encia\nArtificial), a groundbreaking system designed to integrate Large Language\nModels (LLMs) into the operational framework of Brazilian Federal Court of\nAccounts (TCU). The system automates various stages of case analysis, including\nbasic information extraction, admissibility examination, Periculum in mora and\nFumus boni iuris analyses, and recommendations generation. Through a series of\nexperiments, we demonstrate INACIA's potential in extracting relevant\ninformation from case documents, evaluating its legal plausibility, and\nformulating propositions for judicial decision-making. Utilizing a validation\ndataset alongside LLMs, our evaluation methodology presents an innovative\napproach to assessing system performance, correlating highly with human\njudgment. The results highlight INACIA's proficiency in handling complex legal\ntasks, indicating its suitability for augmenting efficiency and judicial\nfairness within legal systems. The paper also discusses potential enhancements\nand future applications, positioning INACIA as a model for worldwide AI\nintegration in legal domains.\n","authors":["Jayr Pereira","Andre Assumpcao","Julio Trecenti","Luiz Airosa","Caio Lente","Jhonatan Cléto","Guilherme Dobins","Rodrigo Nogueira","Luis Mitchell","Roberto Lotufo"],"pdf_url":"https://arxiv.org/pdf/2401.05273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08565v2","updated":"2024-01-19T16:48:59Z","published":"2023-09-15T17:33:24Z","title":"How Transferable are Attribute Controllers on Pretrained Multilingual\n Translation Models?","summary":" Customizing machine translation models to comply with fine-grained attributes\nsuch as formality has seen tremendous progress recently. However, current\napproaches mostly rely on at least some supervised data with attribute\nannotation. Data scarcity therefore remains a bottleneck to democratizing such\ncustomization possibilities to a wider range of languages, lower-resource ones\nin particular. Given recent progress in pretrained massively multilingual\ntranslation models, we use them as a foundation to transfer the attribute\ncontrolling capabilities to languages without supervised data. In this work, we\npresent a comprehensive analysis of transferring attribute controllers based on\na pretrained NLLB-200 model. We investigate both training- and inference-time\ncontrol techniques under various data scenarios, and uncover their relative\nstrengths and weaknesses in zero-shot performance and domain robustness. We\nshow that both paradigms are complementary, as shown by consistent improvements\non 5 zero-shot directions. Moreover, a human evaluation on a real low-resource\nlanguage, Bengali, confirms our findings on zero-shot transfer to new target\nlanguages. 
The code is\n$\\href{https://github.com/dannigt/attribute-controller-transfer}{\\text{here}}$.\n","authors":["Danni Liu","Jan Niehues"],"pdf_url":"https://arxiv.org/pdf/2309.08565v2.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2302.12190v2","updated":"2024-01-19T16:30:14Z","published":"2023-02-23T17:31:40Z","title":"MCWDST: a Minimum-Cost Weighted Directed Spanning Tree Algorithm for\n Real-Time Fake News Mitigation in Social Media","summary":" The widespread availability of internet access and handheld devices confers\nto social media a power similar to the one newspapers used to have. People seek\naffordable information on social media and can reach it within seconds. Yet\nthis convenience comes with dangers; any user may freely post whatever they\nplease and the content can stay online for a long period, regardless of its\ntruthfulness. A need to detect untruthful information, also known as fake news,\narises. In this paper, we present an end-to-end solution that accurately\ndetects fake news and immunizes network nodes that spread them in real-time. To\ndetect fake news, we propose two new stack deep learning architectures that\nutilize convolutional and bidirectional LSTM layers. To mitigate the spread of\nfake news, we propose a real-time network-aware strategy that (1) constructs a\nminimum-cost weighted directed spanning tree for a detected node, and (2)\nimmunizes nodes in that tree by scoring their harmfulness using a novel ranking\nfunction. We demonstrate the effectiveness of our solution on five real-world\ndatasets.\n","authors":["Ciprian-Octavian Truică","Elena-Simona Apostol","Radu-Cătălin Nicolescu","Panagiotis Karras"],"pdf_url":"https://arxiv.org/pdf/2302.12190v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07107v3","updated":"2024-01-19T16:01:28Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. 
Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v3.pdf","comment":"updated to version 2"},{"id":"http://arxiv.org/abs/2401.10774v1","updated":"2024-01-19T15:48:40Z","published":"2024-01-19T15:48:40Z","title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple\n Decoding Heads","summary":" The inference process in Large Language Models (LLMs) is often limited due to\nthe absence of parallelism in the auto-regressive decoding process, resulting\nin most operations being restricted by the memory bandwidth of accelerators.\nWhile methods such as speculative decoding have been suggested to address this\nissue, their implementation is impeded by the challenges associated with\nacquiring and maintaining a separate draft model. In this paper, we present\nMedusa, an efficient method that augments LLM inference by adding extra\ndecoding heads to predict multiple subsequent tokens in parallel. Using a\ntree-based attention mechanism, Medusa constructs multiple candidate\ncontinuations and verifies them simultaneously in each decoding step. By\nleveraging parallel processing, Medusa introduces only minimal overhead in\nterms of single-step latency while substantially reducing the number of\ndecoding steps required.\n We present two levels of fine-tuning procedures for Medusa to meet the needs\nof different use cases: Medusa-1: Medusa is directly fine-tuned on top of a\nfrozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa\nis fine-tuned together with the backbone LLM, enabling better prediction\naccuracy of Medusa heads and higher speedup but needing a special training\nrecipe that preserves the backbone model's capabilities.\n Moreover, we propose several extensions that improve or expand the utility of\nMedusa, including a self-distillation to handle situations where no training\ndata is available and a typical acceptance scheme to boost the acceptance rate\nwhile maintaining generation quality. We evaluate Medusa on models of various\nsizes and training procedures. Our experiments demonstrate that Medusa-1 can\nachieve over 2.2x speedup without compromising generation quality, while\nMedusa-2 further improves the speedup to 2.3-3.6x.\n","authors":["Tianle Cai","Yuhong Li","Zhengyang Geng","Hongwu Peng","Jason D. Lee","Deming Chen","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2401.10774v1.pdf","comment":"The code for this implementation is available at\n https://github.com/FasterDecoding/Medusa"},{"id":"http://arxiv.org/abs/2401.10768v1","updated":"2024-01-19T15:39:49Z","published":"2024-01-19T15:39:49Z","title":"Mitigating Hallucinations of Large Language Models via Knowledge\n Consistent Alignment","summary":" While Large Language Models (LLMs) have proven to be exceptional on a variety\nof tasks after alignment, they may still produce responses that contradict the\ncontext or world knowledge confidently, a phenomenon known as\n``hallucination''. In this paper, we demonstrate that reducing the\ninconsistency between the external knowledge encapsulated in the training data\nand the intrinsic knowledge inherited in the pretraining corpus could mitigate\nhallucination in alignment. 
Specifically, we introduce a novel knowledge\nconsistent alignment (KCA) approach, which involves automatically formulating\nexaminations based on external knowledge for accessing the comprehension of\nLLMs. For data encompassing knowledge inconsistency, KCA implements several\nsimple yet efficient strategies for processing. We illustrate the superior\nperformance of the proposed KCA approach in mitigating hallucinations across\nsix benchmarks using LLMs of different backbones and scales. Furthermore, we\nconfirm the correlation between knowledge inconsistency and hallucination,\nsignifying the effectiveness of reducing knowledge inconsistency in alleviating\nhallucinations. Our code, model weights, and data are public at\n\\url{https://github.com/fanqiwan/KCA}.\n","authors":["Fanqi Wan","Xinting Huang","Leyang Cui","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10768v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2306.16143v4","updated":"2024-01-19T15:05:14Z","published":"2023-06-28T12:17:45Z","title":"Generative User-Experience Research for Developing Domain-specific\n Natural Language Processing Applications","summary":" User experience (UX) is a part of human-computer interaction (HCI) research\nand focuses on increasing intuitiveness, transparency, simplicity, and trust\nfor the system users. Most UX research for machine learning (ML) or natural\nlanguage processing (NLP) focuses on a data-driven methodology. It engages\ndomain users mainly for usability evaluation. Moreover, more typical UX methods\ntailor the systems towards user usability, unlike learning about the user needs\nfirst. This paper proposes a new methodology for integrating generative UX\nresearch into developing domain NLP applications. Generative UX research\nemploys domain users at the initial stages of prototype development, i.e.,\nideation and concept evaluation, and the last stage for evaluating system\nusefulness and user utility. The methodology emerged from and is evaluated on a\ncase study about the full-cycle prototype development of a domain-specific\nsemantic search for daily operations in the process industry. A key finding of\nour case study is that involving domain experts increases their interest and\ntrust in the final NLP application. The combined UX+NLP research of the\nproposed method efficiently considers data- and user-driven opportunities and\nconstraints, which can be crucial for developing NLP applications.\n","authors":["Anastasia Zhukova","Lukas von Sperl","Christian E. Matt","Bela Gipp"],"pdf_url":"https://arxiv.org/pdf/2306.16143v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10716v1","updated":"2024-01-19T14:27:44Z","published":"2024-01-19T14:27:44Z","title":"Structured Code Representations Enable Data-Efficient Adaptation of Code\n Language Models","summary":" Current language models tailored for code tasks often adopt the\npre-training-then-fine-tuning paradigm from natural language processing,\nmodeling source code as plain text. This approach, however, overlooks the\nunambiguous structures inherent in programming languages. In this work, we\nexplore data-efficient adaptation of pre-trained code models by further\npre-training and fine-tuning them with program structures. Specifically, we\nrepresent programs as parse trees -- also known as concrete syntax trees (CSTs)\n-- and adapt pre-trained models on serialized CSTs. 
Although the models that we\nadapt have been pre-trained only on the surface form of programs, we find that\na small amount of continual pre-training and fine-tuning on CSTs without\nchanging the model architecture yields improvements over the baseline approach\nacross various code tasks. The improvements are found to be particularly\nsignificant when there are limited training examples, demonstrating the\neffectiveness of integrating program structures with plain-text representation\neven when working with backbone models that have not been pre-trained with\nstructures.\n","authors":["Mayank Agarwal","Yikang Shen","Bailin Wang","Yoon Kim","Jie Chen"],"pdf_url":"https://arxiv.org/pdf/2401.10716v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10712v1","updated":"2024-01-19T14:22:29Z","published":"2024-01-19T14:22:29Z","title":"Q&A Prompts: Discovering Rich Visual Clues through Mining\n Question-Answer Prompts for VQA requiring Diverse World Knowledge","summary":" With the breakthrough of multi-modal large language models, answering complex\nvisual questions that demand advanced reasoning abilities and world knowledge\nhas become a much more important testbed for developing AI models than ever.\nHowever, equipping AI models with robust cross-modality reasoning ability\nremains challenging since the cognition scheme of humans has not been\nunderstood systematically. In this paper, we believe that if we can collect\nvisual clues in the given image as much as possible, we will recognize the\nimage more accurately, understand the question better, recall relevant\nknowledge more easily, and finally reason out the answer. We discover these\nrich visual clues by mining question-answer pairs in images and sending them\ninto multi-modal large language models as prompts. We call the proposed method\nQ&A Prompts. Specifically, we first use the image-answer pairs and the\ncorresponding questions in the training set as inputs and outputs to train a\nvisual question generation model. Then, we use an image tagging model to\nidentify various instances and send packaged image-tag pairs into the visual\nquestion generation model to generate relevant questions with the extracted\nimage tags as answers. Finally, we encode these generated question-answer pairs\nas prompts with a visual-aware prompting module and send them into pre-trained\nmulti-modal large language models to reason out the final answers. Experimental\nresults show that, compared with state-of-the-art methods, our Q&A Prompts\nachieves substantial improvements on the challenging visual question answering\ndatasets requiring reasoning over diverse world knowledge, such as OK-VQA and\nA-OKVQA.\n","authors":["Haibi Wang","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10711v1","updated":"2024-01-19T14:21:46Z","published":"2024-01-19T14:21:46Z","title":"Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal\n Models for Video Question Answering","summary":" Video Question Answering (VideoQA) aims to answer natural language questions\nbased on the information observed in videos. Despite the recent success of\nLarge Multimodal Models (LMMs) in image-language understanding and reasoning,\nthey deal with VideoQA insufficiently by simply taking uniformly sampled frames\nas visual inputs, which ignores question-relevant visual clues. Moreover, there\nare no human annotations for question-critical timestamps in existing VideoQA\ndatasets. 
In light of this, we propose a novel weakly supervised framework to\nenforce the LMMs to reason out the answers with question-critical moments as\nvisual inputs. Specifically, we fuse the question and answer pairs as event\ndescriptions to find multiple keyframes as target moments, which will be\npseudo-labels. With these pseudo-labels as additionally weak supervision, we\ndevise a lightweight Gaussian-based Contrastive Grounding (GCG) module. GCG\nlearns multiple Gaussian functions to characterize the temporal structure of\nthe video, and sample question-critical frames as positive moments to be the\nvisual inputs of LMMs. Extensive experiments on several VideoQA benchmarks\nverify the effectiveness of our framework, and we achieve substantial\nimprovements compared to previous state-of-the-art methods.\n","authors":["Haibo Wang","Chenghang Lai","Yixuan Sun","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10695v1","updated":"2024-01-19T14:00:19Z","published":"2024-01-19T14:00:19Z","title":"LangBridge: Multilingual Reasoning Without Multilingual Supervision","summary":" We introduce LangBridge, a zero-shot approach to adapt language models for\nmultilingual reasoning tasks without multilingual supervision. LangBridge\noperates by bridging two models, each specialized in different aspects: (1) one\nspecialized in understanding multiple languages (e.g., mT5 encoder) and (2) one\nspecialized in reasoning (e.g., Orca 2). LangBridge connects the two models by\nintroducing minimal trainable parameters between them. Despite utilizing only\nEnglish data for training, LangBridge considerably enhances the performance of\nlanguage models on low-resource languages across mathematical reasoning,\ncoding, and logical reasoning. Our analysis suggests that the efficacy of\nLangBridge stems from the language-agnostic characteristics of multilingual\nrepresentations. We publicly release our code and models.\n","authors":["Dongkeun Yoon","Joel Jang","Sungdong Kim","Seungone Kim","Sheikh Shafayat","Minjoon Seo"],"pdf_url":"https://arxiv.org/pdf/2401.10695v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.09343v2","updated":"2024-01-19T13:33:22Z","published":"2024-01-17T17:08:36Z","title":"Efficient slot labelling","summary":" Slot labelling is an essential component of any dialogue system, aiming to\nfind important arguments in every user turn. Common approaches involve large\npre-trained language models (PLMs) like BERT or RoBERTa, but they face\nchallenges such as high computational requirements and dependence on\npre-training data. In this work, we propose a lightweight method which performs\non par or better than the state-of-the-art PLM-based methods, while having\nalmost 10x less trainable parameters. This makes it especially applicable for\nreal-life industry scenarios.\n","authors":["Vladimir Vlasov"],"pdf_url":"https://arxiv.org/pdf/2401.09343v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10444v3","updated":"2024-01-19T13:19:13Z","published":"2023-09-19T09:04:15Z","title":"Exploring Iterative Enhancement for Improving Learnersourced\n Multiple-Choice Question Explanations with Large Language Models","summary":" Large language models exhibit superior capabilities in processing and\nunderstanding language, yet their applications in educational contexts remain\nunderexplored. Learnersourcing enhances learning by engaging students in\ncreating their own educational content. 
When learnersourcing multiple-choice\nquestions, creating explanations for the solution of a question is a crucial\nstep; it helps other students understand the solution and promotes a deeper\nunderstanding of related concepts. However, it is often difficult for students\nto craft effective solution explanations, due to limited subject understanding.\nTo help scaffold the task of automated explanation generation, we present and\nevaluate a framework called \"ILearner-LLM\", that iteratively enhances the\ngenerated explanations for the given questions with large language models.\nComprising an explanation generation model and an explanation evaluation model,\nthe framework generates high-quality student-aligned explanations by\niteratively feeding the quality rating score from the evaluation model back\ninto the instruction prompt of the explanation generation model. Experimental\nresults demonstrate the effectiveness of our ILearner-LLM on LLaMA2-13B and\nGPT-4 to generate higher quality explanations that are closer to those written\nby students on five PeerWise datasets. Our findings represent a promising path\nto enrich the learnersourcing experience for students and to enhance the\ncapabilities of large language models for educational applications.\n","authors":["Qiming Bao","Juho Leinonen","Alex Yuxuan Peng","Wanjun Zhong","Gaël Gendron","Timothy Pistotti","Alice Huang","Paul Denny","Michael Witbrock","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2309.10444v3.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2306.00168v3","updated":"2024-01-19T13:05:04Z","published":"2023-05-31T20:25:08Z","title":"Measuring the Robustness of NLP Models to Domain Shifts","summary":" Existing research on Domain Robustness (DR) suffers from disparate setups,\nlack of task variety, and scarce research on recent models and capabilities\nsuch as few-shot learning. Furthermore, we claim that the common practice of\nmeasuring DR might further obscure the picture. Current research focuses on\nchallenge sets and relies solely on the Source Drop (SD): Using the source\nin-domain performance as a reference point for degradation. However, the Target\nDrop (TD) should be used as a complementary point of view. To understand the DR\nchallenge in modern NLP models, we developed a benchmark comprised of seven NLP\ntasks, including classification, QA, and generation. Our benchmark focuses on\nnatural topical domain shifts and enables measuring both the SD and the TD. Our\ncomprehensive study, involving over 14,000 domain shifts across 18 fine-tuned\nand few-shot models, shows that both models suffer from drops upon domain\nshifts. While fine-tuned models excel in-domain, few-shot LLMs often surpass\nthem cross-domain, showing better robustness. In addition, we found that a\nlarge SD can be explained by shifting to a harder domain rather than a genuine\nDR challenge. 
Thus, the TD is a more reliable metric.\n","authors":["Nitay Calderon","Naveh Porat","Eyal Ben-David","Alexander Chapanin","Zorik Gekhman","Nadav Oved","Vitaly Shalumov","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2306.00168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01185v2","updated":"2024-01-19T12:34:07Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address (SOTU) dataset from Kaggle\nto make some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2.\n While it is widely believed that BERT (and its variations) is most suitable\nfor NLP classification tasks, we find out that GPT-2 in conjunction with\nnonlinear dimension reduction methods such as UMAP provide better separation\nand stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In\nour case, no model fine-tuning is required, and the pre-trained out-of-the-box\nGPT-2 model is enough.\n We also used a fine-tuned DistilBERT model for classification detecting which\nPresident delivered which address, with very good results (accuracy 93\\% - 95\\%\ndepending on the run). An analogous task was performed to determine the year of\nwriting, and we were able to pin it down to about 4 years (which is a single\npresidential term).\n It is worth noting that SOTU addresses provide relatively small writing\nsamples (with about 8000 words on average, and varying widely from under 2000\nwords to more than 20000), and that the amount of authors is relatively large\n(we used SOTU addresses of 42 US presidents). This shows that the techniques\nemployed turn out to be rather efficient, while all the computations described\nin this note can be performed using a single GPU instance of Google Colab.\n The accompanying code is available on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v2.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2401.10660v1","updated":"2024-01-19T12:26:57Z","published":"2024-01-19T12:26:57Z","title":"A Simple Framework to Accelerate Multilingual Language Model for\n Monolingual Text Generation","summary":" Recent advancements in large language models have facilitated the execution\nof complex language tasks, not only in English but also in non-English\nlanguages. However, the tokenizers of most language models, such as Llama,\ntrained on English-centric corpora, tend to excessively fragment tokens in\nnon-English languages. This issue is especially pronounced in non-roman\nalphabetic languages, which are often divided at a character or even Unicode\nlevel, leading to slower text generation. To address this, our study introduces\na novel framework designed to expedite text generation in these languages. This\nframework predicts larger linguistic units than those of conventional\nmultilingual tokenizers and is specifically tailored to the target language,\nthereby reducing the number of decoding steps required. 
Our empirical results\ndemonstrate that the proposed framework increases the generation speed by a\nfactor of 1.9 compared to standard decoding while maintaining the performance\nof a pre-trained multilingual model on monolingual tasks.\n","authors":["Jimin Hong","Gibbeum Lee","Jaewoong Cho"],"pdf_url":"https://arxiv.org/pdf/2401.10660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10653v1","updated":"2024-01-19T11:59:13Z","published":"2024-01-19T11:59:13Z","title":"Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech\n Detection","summary":" With the recent surge and exponential growth of social media usage,\nscrutinizing social media content for the presence of any hateful content is of\nutmost importance. Researchers have been diligently working since the past\ndecade on distinguishing between content that promotes hatred and content that\ndoes not. Traditionally, the main focus has been on analyzing textual content.\nHowever, recent research attempts have also commenced into the identification\nof audio-based content. Nevertheless, studies have shown that relying solely on\naudio or text-based content may be ineffective, as recent upsurge indicates\nthat individuals often employ sarcasm in their speech and writing. To overcome\nthese challenges, we present an approach to identify whether a speech promotes\nhate or not utilizing both audio and textual representations. Our methodology\nis based on the Transformer framework that incorporates both audio and text\nsampling, accompanied by our very own layer called \"Attentive Fusion\". The\nresults of our study surpassed previous state-of-the-art techniques, achieving\nan impressive macro F1 score of 0.927 on the Test Set.\n","authors":["Atanu Mandal","Gargi Roy","Amit Barman","Indranil Dutta","Sudip Kumar Naskar"],"pdf_url":"https://arxiv.org/pdf/2401.10653v1.pdf","comment":"Accepted in 20th International Conference on Natural Language\n Processing (ICON)"},{"id":"http://arxiv.org/abs/2401.10647v1","updated":"2024-01-19T11:48:09Z","published":"2024-01-19T11:48:09Z","title":"Sowing the Wind, Reaping the Whirlwind: The Impact of Editing Language\n Models","summary":" In the rapidly advancing field of artificial intelligence, the concept of\nRed-Teaming or Jailbreaking large language models (LLMs) has emerged as a\ncrucial area of study. This approach is especially significant in terms of\nassessing and enhancing the safety and robustness of these models. This paper\ninvestigates the intricate consequences of such modifications through model\nediting, uncovering a complex relationship between enhancing model accuracy and\npreserving its ethical integrity. Our in-depth analysis reveals a striking\nparadox: while injecting accurate information is crucial for model reliability,\nit can paradoxically destabilize the model's foundational framework, resulting\nin unpredictable and potentially unsafe behaviors. Additionally, we propose a\nbenchmark dataset NicheHazardQA to investigate this unsafe behavior both within\nthe same and cross topical domain. This aspect of our research sheds light on\nhow the edits, impact the model's safety metrics and guardrails. 
Our findings\nshow that model editing serves as a cost-effective tool for topical red-teaming\nby methodically applying targeted edits and evaluating the resultant model\nbehavior\n","authors":["Rima Hazra","Sayan Layek","Somnath Banerjee","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2401.10647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13274v2","updated":"2024-01-19T10:06:50Z","published":"2023-11-22T09:51:53Z","title":"Enhancing Summarization Performance through Transformer-Based Prompt\n Engineering in Automated Medical Reporting","summary":" Customized medical prompts enable Large Language Models (LLM) to effectively\naddress medical dialogue summarization. The process of medical reporting is\noften time-consuming for healthcare professionals. Implementing medical\ndialogue summarization techniques presents a viable solution to alleviate this\ntime constraint by generating automated medical reports. The effectiveness of\nLLMs in this process is significantly influenced by the formulation of the\nprompt, which plays a crucial role in determining the quality and relevance of\nthe generated reports. In this research, we used a combination of two distinct\nprompting strategies, known as shot prompting and pattern prompting to enhance\nthe performance of automated medical reporting. The evaluation of the automated\nmedical reports is carried out using the ROUGE score and a human evaluation\nwith the help of an expert panel. The two-shot prompting approach in\ncombination with scope and domain context outperforms other methods and\nachieves the highest score when compared to the human reference set by a\ngeneral practitioner. However, the automated reports are approximately twice as\nlong as the human references, due to the addition of both redundant and\nrelevant statements that are added to the report.\n","authors":["Daphne van Zandvoort","Laura Wiersema","Tom Huibers","Sandra van Dulmen","Sjaak Brinkkemper"],"pdf_url":"https://arxiv.org/pdf/2311.13274v2.pdf","comment":"12 pages, 4 figures, to be presented at HEALTHINF 2024, author\n contributions: research conducted and written by Daphne van Zandvoort and\n Laura Wiersema, research suggested and used software created by Tom Huibers,\n data provided and feedback provided by Sandra van Dulmen, supervision and\n feedback provided by Sjaak Brinkkemper"},{"id":"http://arxiv.org/abs/2311.12399v3","updated":"2024-01-19T09:49:46Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. 
Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v3.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.10580v1","updated":"2024-01-19T09:46:08Z","published":"2024-01-19T09:46:08Z","title":"PHOENIX: Open-Source Language Adaption for Direct Preference\n Optimization","summary":" Large language models have gained immense importance in recent years and have\ndemonstrated outstanding results in solving various tasks. However, despite\nthese achievements, many questions remain unanswered in the context of large\nlanguage models. Besides the optimal use of the models for inference and the\nalignment of the results to the desired specifications, the transfer of models\nto other languages is still an underdeveloped area of research. The recent\npublication of models such as Llama-2 and Zephyr has provided new insights into\narchitectural improvements and the use of human feedback. However, insights\ninto adapting these techniques to other languages remain scarce. In this paper,\nwe build on latest improvements and apply the Direct Preference\nOptimization(DPO) approach to the German language. The model is available at\nhttps://huggingface.co/DRXD1000/Phoenix.\n","authors":["Matthias Uhlig","Sigurd Schacht","Sudarshan Kamath Barkur"],"pdf_url":"https://arxiv.org/pdf/2401.10580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10567v1","updated":"2024-01-19T09:13:28Z","published":"2024-01-19T09:13:28Z","title":"Self-training from Self-memory in Data-to-text Generation","summary":" This paper introduces a novel training model, self-training from self-memory\n(STSM) in data-to-text generation (DTG), allowing the model to self-train on\nsubsets, including self-memory as outputs inferred directly from the trained\nmodels and/or the new data. The quality of self-memory is validated by two\nmodels, data-to-text (D2T) and text-to-data (T2D), by two pre-defined\nconditions: (1) the appearance of all source values in the outputs of the D2T\nmodel and (2) the ability to convert back to source data in the outputs in the\nT2D model. We utilize a greedy algorithm to generate shorter D2T outputs if\nthey contain all source values. Subsequently, we use the T2D model to confirm\nthat these outputs can capture input relationships by demonstrating their\ncapacity to convert text back into data. With 30% of the dataset, we can train\nthe D2T model with a competitive performance compared to full training in the\nsame setup. We experiment with our model on two datasets, E2E NLG and DART.\nSTSM offers the D2T model a generalization capability from its subset memory\nwhile reducing training data volume. Ultimately, we anticipate that this paper\nwill contribute to continual learning solutions that adapt to new training\ndata, incorporating it as a form of self-memory in DTG tasks. 
The curated\ndataset is publicly available at: https://github.com/hoangthangta/STSM.\n","authors":["Hoang-Thang Ta"],"pdf_url":"https://arxiv.org/pdf/2401.10567v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2401.09566v2","updated":"2024-01-19T08:57:19Z","published":"2024-01-17T19:43:43Z","title":"Aligning Large Language Models with Counterfactual DPO","summary":" Advancements in large language models (LLMs) have demonstrated remarkable\ncapabilities across a diverse range of applications. These models excel in\ngenerating text completions that are contextually coherent and cover an\nextensive array of subjects. However, the vast datasets required for their\ntraining make aligning response styles during the pretraining and instruction\ntuning phases challenging. Consequently, an additional alignment phase is\ntypically employed, wherein the model is further trained with human preference\ndata to better align its outputs with human expectations. While this process\ndoesn't introduce new capabilities per se, it does accentuate generation styles\ninnate to the model. This paper explores the utilization of counterfactual\nprompting within the framework of Direct Preference Optimization (DPO) to align\nthe model's style without relying on human intervention. We demonstrate that\nthis method effectively instils desirable behaviour, mitigates undesirable\nones, and encourages the model to disregard inappropriate instructions. Our\nfindings suggest that counterfactual prompting with DPO presents a low-resource\nway to fine-tune LLMs to meet the demands for responsible and ethically aligned\nAI systems.\n","authors":["Bradley Butcher"],"pdf_url":"https://arxiv.org/pdf/2401.09566v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10559v1","updated":"2024-01-19T08:50:54Z","published":"2024-01-19T08:50:54Z","title":"OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy","summary":" We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel\nmulti-adapter method, OrchMoE, which capitalizes on modular skill architecture\nfor enhanced forward transfer in neural networks. Unlike prior models that\ndepend on explicit task identification inputs, OrchMoE automatically discerns\ntask categories, streamlining the learning process. This is achieved through an\nintegrated mechanism comprising an Automatic Task Classification module and a\nTask-Skill Allocation module, which collectively deduce task-specific\nclassifications and tailor skill allocation matrices. Our extensive evaluations\non the 'Super Natural Instructions' dataset, featuring 1,600 diverse\ninstructional tasks, indicate that OrchMoE substantially outperforms comparable\nmulti-adapter baselines in terms of both performance and sample utilization\nefficiency, all while operating within the same parameter constraints. 
These\nfindings suggest that OrchMoE offers a significant leap forward in multi-task\nlearning efficiency.\n","authors":["Haowen Wang","Tao Sun","Kaixiang Ji","Jian Wang","Cong Fan","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2401.10559v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.08326v2","updated":"2024-01-19T08:48:37Z","published":"2024-01-16T12:45:15Z","title":"RoTBench: A Multi-Level Benchmark for Evaluating the Robustness of Large\n Language Models in Tool Learning","summary":" Tool learning has generated widespread interest as a vital means of\ninteraction between Large Language Models (LLMs) and the physical world.\nCurrent research predominantly emphasizes LLMs' capacity to utilize tools in\nwell-structured environments while overlooking their stability when confronted\nwith the inevitable noise of the real world. To bridge this gap, we introduce\nRoTBench, a multi-level benchmark for evaluating the robustness of LLMs in tool\nlearning. Specifically, we establish five external environments, each featuring\nvarying levels of noise (i.e., Clean, Slight, Medium, Heavy, and Union),\nproviding an in-depth analysis of the model's resilience across three critical\nphases: tool selection, parameter identification, and content filling.\nExperiments involving six widely-used models underscore the urgent necessity\nfor enhancing the robustness of LLMs in tool learning. For instance, the\nperformance of GPT-4 even drops significantly from 80.00 to 58.10 when there is\nno substantial change in manual accuracy. More surprisingly, the noise\ncorrection capability inherent in the GPT family paradoxically impedes its\nadaptability in the face of mild noise. In light of these findings, we propose\nRoTTuning, a strategy that enriches the diversity of training environments to\nbolster the robustness of LLMs in tool learning. The code and data are\navailable at https://github.com/Junjie-Ye/RoTBench.\n","authors":["Junjie Ye","Yilong Wu","Songyang Gao","Caishuang Huang","Sixian Li","Guanyu Li","Xiaoran Fan","Qi Zhang","Tao Gui","Xuanjing Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10543v1","updated":"2024-01-19T08:02:37Z","published":"2024-01-19T08:02:37Z","title":"Multilingual acoustic word embeddings for zero-resource languages","summary":" This research addresses the challenge of developing speech applications for\nzero-resource languages that lack labelled data. It specifically uses acoustic\nword embedding (AWE) -- fixed-dimensional representations of variable-duration\nspeech segments -- employing multilingual transfer, where labelled data from\nseveral well-resourced languages are used for pertaining. The study introduces\na new neural network that outperforms existing AWE models on zero-resource\nlanguages. 
It explores the impact of the choice of well-resourced languages.\nAWEs are applied to a keyword-spotting system for hate speech detection in\nSwahili radio broadcasts, demonstrating robustness in real-world scenarios.\nAdditionally, novel semantic AWE models improve semantic query-by-example\nsearch.\n","authors":["Christiaan Jacobs","Herman Kamper"],"pdf_url":"https://arxiv.org/pdf/2401.10543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.14995v2","updated":"2024-01-19T07:47:01Z","published":"2023-07-27T16:45:33Z","title":"TransNormerLLM: A Faster and Better Large Language Model with Improved\n TransNormer","summary":" We present TransNormerLLM, the first linear attention-based Large Language\nModel (LLM) that outperforms conventional softmax attention-based models in\nterms of both accuracy and efficiency. TransNormerLLM evolves from the previous\nlinear attention architecture TransNormer by making advanced modifications that\ninclude positional embedding, linear attention acceleration, gating mechanisms,\ntensor normalization, and inference acceleration and stabilization.\nSpecifically, we use LRPE together with an exponential decay to avoid attention\ndilution issues while allowing the model to retain global interactions between\ntokens. Additionally, we propose Lightning Attention, a cutting-edge technique\nthat accelerates linear attention by more than twice in runtime and reduces\nmemory usage by a remarkable four times. To further enhance the performance of\nTransNormer, we leverage a gating mechanism for smooth training and a new\ntensor normalization scheme to accelerate the model, resulting in an impressive\nacceleration of over $20\\%$. Furthermore, we develop a robust inference\nalgorithm that ensures numerical stability and consistent inference speed,\nregardless of the sequence length, showcasing superior efficiency during both\ntraining and inference stages. We also implement an efficient model parallel\nschema for TransNormerLLM, enabling seamless deployment on large-scale clusters\nand facilitating expansion to even more extensive models, i.e., LLMs with 175B\nparameters. We validate our model design through a series of ablations and\ntrain models with sizes of 385M, 1B, and 7B on our self-collected corpus.\nBenchmark results demonstrate that our models not only match the performance of\nstate-of-the-art LLMs with Transformer but are also significantly faster. Code\nis released at: https://github.com/OpenNLPLab/TransnormerLLM.\n","authors":["Zhen Qin","Dong Li","Weigao Sun","Weixuan Sun","Xuyang Shen","Xiaodong Han","Yunshen Wei","Baohong Lv","Xiao Luo","Yu Qiao","Yiran Zhong"],"pdf_url":"https://arxiv.org/pdf/2307.14995v2.pdf","comment":"Technical Report. Yiran Zhong is the corresponding author. Zhen Qin,\n Dong Li, Weigao Sun, Weixuan Sun, Xuyang Shen contribute equally to this\n paper. Code is released at: https://github.com/OpenNLPLab/TransnormerLLM"},{"id":"http://arxiv.org/abs/2401.10536v1","updated":"2024-01-19T07:30:57Z","published":"2024-01-19T07:30:57Z","title":"Speech Swin-Transformer: Exploring a Hierarchical Transformer with\n Shifted Windows for Speech Emotion Recognition","summary":" Swin-Transformer has demonstrated remarkable success in computer vision by\nleveraging its hierarchical feature representation based on Transformer. In\nspeech signals, emotional information is distributed across different scales of\nspeech features, e.\\,g., word, phrase, and utterance. 
Drawing above\ninspiration, this paper presents a hierarchical speech Transformer with shifted\nwindows to aggregate multi-scale emotion features for speech emotion\nrecognition (SER), called Speech Swin-Transformer. Specifically, we first\ndivide the speech spectrogram into segment-level patches in the time domain,\ncomposed of multiple frame patches. These segment-level patches are then\nencoded using a stack of Swin blocks, in which a local window Transformer is\nutilized to explore local inter-frame emotional information across frame\npatches of each segment patch. After that, we also design a shifted window\nTransformer to compensate for patch correlations near the boundaries of segment\npatches. Finally, we employ a patch merging operation to aggregate\nsegment-level emotional features for hierarchical speech representation by\nexpanding the receptive field of Transformer from frame-level to segment-level.\nExperimental results demonstrate that our proposed Speech Swin-Transformer\noutperforms the state-of-the-art methods.\n","authors":["Yong Wang","Cheng Lu","Hailun Lian","Yan Zhao","Björn Schuller","Yuan Zong","Wenming Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.10536v1.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.10535v1","updated":"2024-01-19T07:21:45Z","published":"2024-01-19T07:21:45Z","title":"The \"Colonial Impulse\" of Natural Language Processing: An Audit of\n Bengali Sentiment Analysis Tools and Their Identity-based Biases","summary":" While colonization has sociohistorically impacted people's identities across\nvarious dimensions, those colonial values and biases continue to be perpetuated\nby sociotechnical systems. One category of sociotechnical systems--sentiment\nanalysis tools--can also perpetuate colonial values and bias, yet less\nattention has been paid to how such tools may be complicit in perpetuating\ncoloniality, although they are often used to guide various practices (e.g.,\ncontent moderation). In this paper, we explore potential bias in sentiment\nanalysis tools in the context of Bengali communities that have experienced and\ncontinue to experience the impacts of colonialism. Drawing on identity\ncategories most impacted by colonialism amongst local Bengali communities, we\nfocused our analytic attention on gender, religion, and nationality. We\nconducted an algorithmic audit of all sentiment analysis tools for Bengali,\navailable on the Python package index (PyPI) and GitHub. Despite similar\nsemantic content and structure, our analyses showed that in addition to\ninconsistencies in output from different tools, Bengali sentiment analysis\ntools exhibit bias between different identity categories and respond\ndifferently to different ways of identity expression. Connecting our findings\nwith colonially shaped sociocultural structures of Bengali communities, we\ndiscuss the implications of downstream bias of sentiment analysis tools.\n","authors":["Dipto Das","Shion Guha","Jed Brubaker","Bryan Semaan"],"pdf_url":"https://arxiv.org/pdf/2401.10535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. 
However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of cooccurring behaviors, and the\ncompounding impact of behavioral hallucinations. Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10521v1","updated":"2024-01-19T06:54:39Z","published":"2024-01-19T06:54:39Z","title":"Cross-lingual Editing in Multilingual Language Models","summary":" The training of large language models (LLMs) necessitates substantial data\nand computational resources, and updating outdated LLMs entails significant\nefforts and resources. While numerous model editing techniques (METs) have\nemerged to efficiently update model outputs without retraining, their\neffectiveness in multilingual LLMs, where knowledge is stored in diverse\nlanguages, remains an underexplored research area. This research paper\nintroduces the cross-lingual model editing (\\textbf{XME}) paradigm, wherein a\nfact is edited in one language, and the subsequent update propagation is\nobserved across other languages. To investigate the XME paradigm, we conducted\nexperiments using BLOOM, mBERT, and XLM-RoBERTa using the two writing scripts:\n\\textit{Latin} (English, French, and Spanish) and \\textit{Indic} (Hindi,\nGujarati, and Bengali). The results reveal notable performance limitations of\nstate-of-the-art METs under the XME setting, mainly when the languages involved\nbelong to two distinct script families. 
These findings highlight the need for\nfurther research and development of XME techniques to address these challenges.\nFor more comprehensive information, the dataset used in this research and the\nassociated code are publicly available at the following\nURL\\url{https://github.com/lingo-iitgn/XME}.\n","authors":["Himanshu Beniwal","Kowsik Nandagopan D","Mayank Singh"],"pdf_url":"https://arxiv.org/pdf/2401.10521v1.pdf","comment":"Accepted at EACL 2024"},{"id":"http://arxiv.org/abs/2312.15880v2","updated":"2024-01-19T06:42:16Z","published":"2023-12-26T04:22:56Z","title":"KnowledgeNavigator: Leveraging Large Language Models for Enhanced\n Reasoning over Knowledge Graph","summary":" Large language model (LLM) has achieved outstanding performance on various\ndownstream tasks with its powerful natural language understanding and zero-shot\ncapability, but LLM still suffers from knowledge limitation. Especially in\nscenarios that require long logical chains or complex reasoning, the\nhallucination and knowledge limitation of LLM limit its performance in question\nanswering (QA). In this paper, we propose a novel framework KnowledgeNavigator\nto address these challenges by efficiently and accurately retrieving external\nknowledge from knowledge graph and using it as a key factor to enhance LLM\nreasoning. Specifically, KnowledgeNavigator first mines and enhances the\npotential constraints of the given question to guide the reasoning. Then it\nretrieves and filters external knowledge that supports answering through\niterative reasoning on knowledge graph with the guidance of LLM and the\nquestion. Finally, KnowledgeNavigator constructs the structured knowledge into\neffective prompts that are friendly to LLM to help its reasoning. We evaluate\nKnowledgeNavigator on multiple public KGQA benchmarks, the experiments show the\nframework has great effectiveness and generalization, outperforming previous\nknowledge graph enhanced LLM methods and is comparable to the fully supervised\nmodels.\n","authors":["Tiezheng Guo","Qingwen Yang","Chen Wang","Yanyi Liu","Pan Li","Jiawei Tang","Dapeng Li","Yingyou Wen"],"pdf_url":"https://arxiv.org/pdf/2312.15880v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05492v3","updated":"2024-01-19T06:06:46Z","published":"2023-10-09T07:56:16Z","title":"How Abilities in Large Language Models are Affected by Supervised\n Fine-tuning Data Composition","summary":" Large language models (LLMs) with enormous pre-training tokens and parameters\nemerge diverse abilities, including math reasoning, code generation, and\ninstruction following. These abilities are further enhanced by supervised\nfine-tuning (SFT). While the open-source community has explored ad-hoc SFT for\nenhancing individual capabilities, proprietary LLMs exhibit versatility across\nvarious skills. Therefore, understanding the facilitation of multiple abilities\nvia SFT is paramount. In this study, we specifically focuses on the interplay\nof data composition between mathematical reasoning, code generation, and\ngeneral human-aligning abilities during SFT. We propose four intriguing\nresearch questions to explore the association between model performance and\nvarious factors including data amount, composition ratio, model size and SFT\nstrategies. 
Our experiments reveal that distinct capabilities scale differently\nand larger models generally show superior performance with same amount of data.\nMathematical reasoning and code generation consistently improve with increasing\ndata amount, whereas general abilities plateau after roughly a thousand\nsamples. Moreover, we observe data composition appears to enhance various\nabilities under limited data conditions, yet can lead to performance conflicts\nwhen data is plentiful. Our findings also suggest the amount of composition\ndata influences performance more than the composition ratio. In analysis of SFT\nstrategies, we find that sequentially learning multiple skills risks\ncatastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT)\nstrategy offers a promising solution to learn multiple abilities with different\nscaling patterns.\n","authors":["Guanting Dong","Hongyi Yuan","Keming Lu","Chengpeng Li","Mingfeng Xue","Dayiheng Liu","Wei Wang","Zheng Yuan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10510v1","updated":"2024-01-19T05:58:30Z","published":"2024-01-19T05:58:30Z","title":"A match made in consistency heaven: when large language models meet\n evolutionary algorithms","summary":" Pre-trained large language models (LLMs) have powerful capabilities for\ngenerating creative natural text. Evolutionary algorithms (EAs) can discover\ndiverse solutions to complex real-world problems. Motivated by the common\ncollective and directionality of text sequence generation and evolution, this\npaper illustrates the strong consistency of LLMs and EAs, which includes\nmultiple one-to-one key characteristics: token embedding and genotype-phenotype\nmapping, position encoding and fitness shaping, position embedding and\nselection, attention and crossover, feed-forward neural network and mutation,\nmodel training and parameter update, and multi-task learning and\nmulti-objective optimization. Based on this consistency perspective, existing\ncoupling studies are analyzed, including evolutionary fine-tuning and\nLLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap\nfor future research in coupling LLMs and EAs, while highlighting key challenges\nalong the way. The consistency not only reveals the evolution mechanism behind\nLLMs but also facilitates the development of evolved artificial agents that\napproach or surpass biological organisms.\n","authors":["Wang Chao","Jiaxuan Zhao","Licheng Jiao","Lingling Li","Fang Liu","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10510v1.pdf","comment":"A perspective article under review"},{"id":"http://arxiv.org/abs/2401.10506v1","updated":"2024-01-19T05:48:07Z","published":"2024-01-19T05:48:07Z","title":"FinSQL: Model-Agnostic LLMs-based Text-to-SQL Framework for Financial\n Analysis","summary":" Text-to-SQL, which provides zero-code interface for operating relational\ndatabases, has gained much attention in financial analysis; because, financial\nprofessionals may not well-skilled in SQL programming. However, until now,\nthere is no practical Text-to-SQL benchmark dataset for financial analysis, and\nexisting Text-to-SQL methods have not considered the unique characteristics of\ndatabases in financial applications, such as commonly existing wide tables. 
To\naddress these issues, we collect a practical Text-to-SQL benchmark dataset and\npropose a model-agnostic Large Language Model (LLMs)-based Text-to-SQL\nframework for financial analysis. The benchmark dataset, BULL, is collected\nfrom the practical financial analysis business of Hundsun Technologies Inc.,\nincluding databases for fund, stock, and macro economy. Besides, the proposed\nLLMs-based Text-to-SQL framework, FinSQL, provides a systematic treatment for\nfinancial Text-to-SQL from the perspectives of prompt construction,\nparameter-efficient fine-tuning and output calibration. Extensive experimental\nresults on BULL demonstrate that FinSQL achieves the state-of-the-art\nText-to-SQL performance at a small cost; furthermore, FinSQL can bring up to\n36.64% performance improvement in scenarios requiring few-shot cross-database\nmodel transfer.\n","authors":["Chao Zhang","Yuren Mao","Yijiang Fan","Yu Mi","Yunjun Gao","Lu Chen","Dongfang Lou","Jinshu Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10506v1.pdf","comment":"13 pages, 13 figures"},{"id":"http://arxiv.org/abs/2401.00368v2","updated":"2024-01-19T05:16:20Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":" In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across nearly 100 languages. We then\nfine-tune open-source decoder-only LLMs on the synthetic data using standard\ncontrastive loss. Experiments demonstrate that our method achieves strong\nperformance on highly competitive text embedding benchmarks without using any\nlabeled data. Furthermore, when fine-tuned with a mixture of synthetic and\nlabeled data, our model sets new state-of-the-art results on the BEIR and MTEB\nbenchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v2.pdf","comment":"20 pages, 15 tables"},{"id":"http://arxiv.org/abs/2401.10491v1","updated":"2024-01-19T05:02:46Z","published":"2024-01-19T05:02:46Z","title":"Knowledge Fusion of Large Language Models","summary":" While training large language models (LLMs) from scratch can generate models\nwith distinct functionalities and strengths, it comes at significant costs and\nmay result in redundant capabilities. Alternatively, a cost-effective and\ncompelling approach is to merge existing pre-trained LLMs into a more potent\nmodel. However, due to the varying architectures of these LLMs, directly\nblending their weights is impractical. In this paper, we introduce the notion\nof knowledge fusion for LLMs, aimed at combining the capabilities of existing\nLLMs and transferring them into a single LLM. By leveraging the generative\ndistributions of source LLMs, we externalize their collective knowledge and\nunique strengths, thereby potentially elevating the capabilities of the target\nmodel beyond those of any individual source LLM. 
We validate our approach using\nthree popular LLMs with different architectures--Llama-2, MPT, and\nOpenLLaMA--across various benchmarks and tasks. Our findings confirm that the\nfusion of LLMs can improve the performance of the target model across a range\nof capabilities such as reasoning, commonsense, and code generation. Our code,\nmodel weights, and data are public at\n\\url{https://github.com/fanqiwan/FuseLLM}.\n","authors":["Fanqi Wan","Xinting Huang","Deng Cai","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10491v1.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.09972v2","updated":"2024-01-19T04:29:42Z","published":"2024-01-18T13:41:08Z","title":"Better Explain Transformers by Illuminating Important Information","summary":" Transformer-based models excel in various natural language processing (NLP)\ntasks, attracting countless efforts to explain their inner workings. Prior\nmethods explain Transformers by focusing on the raw gradient and attention as\ntoken attribution scores, where non-relevant information is often considered\nduring explanation computation, resulting in confusing results. In this work,\nwe propose highlighting the important information and eliminating irrelevant\ninformation by a refined information flow on top of the layer-wise relevance\npropagation (LRP) method. Specifically, we consider identifying syntactic and\npositional heads as important attention heads and focus on the relevance\nobtained from these important heads. Experimental results demonstrate that\nirrelevant information does distort output attribution scores and then should\nbe masked during explanation computation. Compared to eight baselines on both\nclassification and question-answering datasets, our method consistently\noutperforms with over 3\\% to 33\\% improvement on explanation metrics, providing\nsuperior explanation performance. Our anonymous code repository is available\nat: https://github.com/LinxinS97/Mask-LRP\n","authors":["Linxin Song","Yan Cui","Ao Luo","Freddy Lecue","Irene Li"],"pdf_url":"https://arxiv.org/pdf/2401.09972v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10487v1","updated":"2024-01-19T04:24:07Z","published":"2024-01-19T04:24:07Z","title":"Generative Dense Retrieval: Memory Can Be a Burden","summary":" Generative Retrieval (GR), autoregressively decoding relevant document\nidentifiers given a query, has been shown to perform well under the setting of\nsmall-scale corpora. By memorizing the document corpus with model parameters,\nGR implicitly achieves deep interaction between query and document. However,\nsuch a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for\nfine-grained features of documents; (2) Memory confusion gets worse as the\ncorpus size increases; (3) Huge memory update costs for new documents. To\nalleviate these problems, we propose the Generative Dense Retrieval (GDR)\nparadigm. Specifically, GDR first uses the limited memory volume to achieve\ninter-cluster matching from query to relevant document clusters.\nMemorizing-free matching mechanism from Dense Retrieval (DR) is then introduced\nto conduct fine-grained intra-cluster matching from clusters to relevant\ndocuments. The coarse-to-fine process maximizes the advantages of GR's deep\ninteraction and DR's scalability. 
Besides, we design a cluster identifier\nconstructing strategy to facilitate corpus memory and a cluster-adaptive\nnegative sampling strategy to enhance the intra-cluster mapping ability.\nEmpirical results show that GDR obtains an average of 3.0 R@100 improvement on\nNQ dataset under multiple settings and has better scalability.\n","authors":["Peiwen Yuan","Xinglin Wang","Shaoxiong Feng","Boyuan Pan","Yiwei Li","Heda Wang","Xupeng Miao","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10487v1.pdf","comment":"EACL 2024 main"},{"id":"http://arxiv.org/abs/2401.10480v1","updated":"2024-01-19T04:03:59Z","published":"2024-01-19T04:03:59Z","title":"Escape Sky-high Cost: Early-stopping Self-Consistency for Multi-step\n Reasoning","summary":" Self-consistency (SC) has been a widely used decoding strategy for\nchain-of-thought reasoning. Despite bringing significant performance\nimprovements across a variety of multi-step reasoning tasks, it is a high-cost\nmethod that requires multiple sampling with the preset size. In this paper, we\npropose a simple and scalable sampling process, \\textbf{E}arly-Stopping\n\\textbf{S}elf-\\textbf{C}onsistency (ESC), to greatly reduce the cost of SC\nwithout sacrificing performance. On this basis, one control scheme for ESC is\nfurther derivated to dynamically choose the performance-cost balance for\ndifferent tasks and models. To demonstrate ESC's effectiveness, we conducted\nextensive experiments on three popular categories of reasoning tasks:\narithmetic, commonsense and symbolic reasoning over language models with\nvarying scales. The empirical results show that ESC reduces the average number\nof sampling of chain-of-thought reasoning by a significant margin on six\nbenchmarks, including MATH (-33.8%), GSM8K (-80.1%), StrategyQA (-76.8%),\nCommonsenseQA (-78.5%), Coin Flip (-84.2%) and Last Letters (-67.4%), while\nattaining comparable performances.\n","authors":["Yiwei Li","Peiwen Yuan","Shaoxiong Feng","Boyuan Pan","Xinglin Wang","Bin Sun","Heda Wang","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10480v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10472v1","updated":"2024-01-19T03:49:28Z","published":"2024-01-19T03:49:28Z","title":"Name Tagging Under Domain Shift via Metric Learning for Life Sciences","summary":" Name tagging is a key component of Information Extraction (IE), particularly\nin scientific domains such as biomedicine and chemistry, where large language\nmodels (LLMs), e.g., ChatGPT, fall short. We investigate the applicability of\ntransfer learning for enhancing a name tagging model trained in the biomedical\ndomain (the source domain) to be used in the chemical domain (the target\ndomain). A common practice for training such a model in a few-shot learning\nsetting is to pretrain the model on the labeled source data, and then, to\nfinetune it on a hand-full of labeled target examples. In our experiments we\nobserved that such a model is prone to mis-labeling the source entities, which\ncan often appear in the text, as the target entities. To alleviate this\nproblem, we propose a model to transfer the knowledge from the source domain to\nthe target domain, however, at the same time, to project the source entities\nand target entities into separate regions of the feature space. This diminishes\nthe risk of mis-labeling the source entities as the target entities. 
Our model\nconsists of two stages: 1) entity grouping in the source domain, which\nincorporates knowledge from annotated events to establish relations between\nentities, and 2) entity discrimination in the target domain, which relies on\npseudo labeling and contrastive learning to enhance discrimination between the\nentities in the two domains. We carry out our extensive experiments across\nthree source and three target datasets, and demonstrate that our method\noutperforms the baselines, in some scenarios by 5\\% absolute value.\n","authors":["Hongyi Liu","Qingyun Wang","Payam Karisani","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2401.10472v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2401.10471v1","updated":"2024-01-19T03:48:27Z","published":"2024-01-19T03:48:27Z","title":"DeepEdit: Knowledge Editing as Decoding with Constraints","summary":" We develop a new perspective of knowledge editing for large language models\n(LLMs) as decoding with constraints. We propose DeepEdit (Depth-first Search\nbased Progressive Decoding for Knowledge Editing), a neuro-symbolic method that\nimproves knowledge editing with better coherence of reasoning, relevance to the\nquestion, and awareness of updated knowledge. DeepEdit can be flexibly applied\nto all black-box LLMs: it does not require any access to the model parameters,\nrepresentations, or output vocabulary distributions. DeepEdit progressively\nproduces the high-quality reasoning steps towards effective knowledge editing.\nIt utilizes a depth-first search to revise the LLMs' output, which improves the\noutput's informativeness to the input question and awareness of the updated\nknowledge. Qualitatively, DeepEdit effectively controls LLMs to produce more\nsuccinct reasoning in accord with knowledge editing. Quantitatively, DeepEdit\nyields significant gains on MQuaKE, a challenging multi-hop question-answering\ndataset with knowledge editing. We release the source code at\nhttps://github.com/wangywUST/DeepEdit.\n","authors":["Yiwei Wang","Muhao Chen","Nanyun Peng","Kai-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2401.10471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10465v1","updated":"2024-01-19T03:37:27Z","published":"2024-01-19T03:37:27Z","title":"Data-driven grapheme-to-phoneme representations for a lexicon-free\n text-to-speech","summary":" Grapheme-to-Phoneme (G2P) is an essential first step in any modern,\nhigh-quality Text-to-Speech (TTS) system. Most of the current G2P systems rely\non carefully hand-crafted lexicons developed by experts. This poses a two-fold\nproblem. Firstly, the lexicons are generated using a fixed phoneme set,\nusually, ARPABET or IPA, which might not be the most optimal way to represent\nphonemes for all languages. Secondly, the man-hours required to produce such an\nexpert lexicon are very high. In this paper, we eliminate both of these issues\nby using recent advances in self-supervised learning to obtain data-driven\nphoneme representations instead of fixed representations. We compare our\nlexicon-free approach against strong baselines that utilize a well-crafted\nlexicon. Furthermore, we show that our data-driven lexicon-free method performs\nas good or even marginally better than the conventional rule-based or\nlexicon-based neural G2Ps in terms of Mean Opinion Score (MOS) while using no\nprior language lexicon or phoneme set, i.e. 
no linguistic expertise.\n","authors":["Abhinav Garg","Jiyeon Kim","Sushil Khyalia","Chanwoo Kim","Dhananjaya Gowda"],"pdf_url":"https://arxiv.org/pdf/2401.10465v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.10463v1","updated":"2024-01-19T03:24:36Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language models training dynamics. We develop a grokking\nconfiguration to reproduce grokking on simplistic language models stably by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking across\nsample-wise and model-wise, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.03279v2","updated":"2024-01-19T02:26:38Z","published":"2023-08-07T03:39:52Z","title":"UniversalNER: Targeted Distillation from Large Language Models for Open\n Named Entity Recognition","summary":" Large language models (LLMs) have demonstrated remarkable generalizability,\nsuch as understanding arbitrary entities and relations. Instruction tuning has\nproven effective for distilling LLMs into more cost-efficient models such as\nAlpaca and Vicuna. Yet such student models still trail the original LLMs by\nlarge margins in downstream applications. In this paper, we explore targeted\ndistillation with mission-focused instruction tuning to train student models\nthat can excel in a broad application class such as open information\nextraction. Using named entity recognition (NER) for case study, we show how\nChatGPT can be distilled into much smaller UniversalNER models for open NER.\nFor evaluation, we assemble the largest NER benchmark to date, comprising 43\ndatasets across 9 diverse domains such as biomedicine, programming, social\nmedia, law, finance. Without using any direct supervision, UniversalNER attains\nremarkable NER accuracy across tens of thousands of entity types, outperforming\ngeneral instruction-tuned models such as Alpaca and Vicuna by over 30 absolute\nF1 points in average. With a tiny fraction of parameters, UniversalNER not only\nacquires ChatGPT's capability in recognizing arbitrary entity types, but also\noutperforms its NER accuracy by 7-9 absolute F1 points in average. Remarkably,\nUniversalNER even outperforms by a large margin state-of-the-art multi-task\ninstruction-tuned systems such as InstructUIE, which uses supervised NER\nexamples. We also conduct thorough ablation studies to assess the impact of\nvarious components in our distillation approach. 
We release the distillation\nrecipe, data, and UniversalNER models to facilitate future research on targeted\ndistillation.\n","authors":["Wenxuan Zhou","Sheng Zhang","Yu Gu","Muhao Chen","Hoifung Poon"],"pdf_url":"https://arxiv.org/pdf/2308.03279v2.pdf","comment":"Accepted at ICLR 2024. Project page: https://universal-ner.github.io/"},{"id":"http://arxiv.org/abs/2401.10449v1","updated":"2024-01-19T01:36:07Z","published":"2024-01-19T01:36:07Z","title":"Contextualized Automatic Speech Recognition with Attention-Based Bias\n Phrase Boosted Beam Search","summary":" End-to-end (E2E) automatic speech recognition (ASR) methods exhibit\nremarkable performance. However, since the performance of such methods is\nintrinsically linked to the context present in the training data, E2E-ASR\nmethods do not perform as desired for unseen user contexts (e.g., technical\nterms, personal names, and playlists). Thus, E2E-ASR methods must be easily\ncontextualized by the user or developer. This paper proposes an attention-based\ncontextual biasing method that can be customized using an editable phrase list\n(referred to as a bias list). The proposed method can be trained effectively by\ncombining a bias phrase index loss and special tokens to detect the bias\nphrases in the input speech data. In addition, to improve the contextualization\nperformance during inference further, we propose a bias phrase boosted (BPB)\nbeam search algorithm based on the bias phrase index probability. Experimental\nresults demonstrate that the proposed method consistently improves the word\nerror rate and the character error rate of the target phrases in the bias list\non both the Librispeech-960 (English) and our in-house (Japanese) dataset,\nrespectively.\n","authors":["Yui Sudo","Muhammad Shakeel","Yosuke Fukumoto","Yifan Peng","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2401.10449v1.pdf","comment":"accepted by ICASSP20224"},{"id":"http://arxiv.org/abs/2401.10447v1","updated":"2024-01-19T01:30:16Z","published":"2024-01-19T01:30:16Z","title":"Investigating Training Strategies and Model Robustness of Low-Rank\n Adaptation for Language Modeling in Speech Recognition","summary":" The use of low-rank adaptation (LoRA) with frozen pretrained language models\n(PLMs) has become increasing popular as a mainstream, resource-efficient\nmodeling approach for memory-constrained hardware. In this study, we first\nexplore how to enhance model performance by introducing various LoRA training\nstrategies, achieving relative word error rate reductions of 3.50\\% on the\npublic Librispeech dataset and of 3.67\\% on an internal dataset in the\nmessaging domain. To further characterize the stability of LoRA-based\nsecond-pass speech recognition models, we examine robustness against input\nperturbations. These perturbations are rooted in homophone replacements and a\nnovel metric called N-best Perturbation-based Rescoring Robustness (NPRR), both\ndesigned to measure the relative degradation in the performance of rescoring\nmodels. 
Our experimental results indicate that while advanced variants of LoRA,\nsuch as dynamic rank-allocated LoRA, lead to performance degradation in\n$1$-best perturbation, they alleviate the degradation in $N$-best perturbation.\nThis finding is in comparison to fully-tuned models and vanilla LoRA tuning\nbaselines, suggesting that a comprehensive selection is needed when using\nLoRA-based adaptation for compute-cost savings and robust language modeling.\n","authors":["Yu Yu","Chao-Han Huck Yang","Tuan Dinh","Sungho Ryu","Jari Kolehmainen","Roger Ren","Denis Filimonov","Prashanth G. Shivakumar","Ankur Gandhe","Ariya Rastow","Jia Xu","Ivan Bulyko","Andreas Stolcke"],"pdf_url":"https://arxiv.org/pdf/2401.10447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10446v1","updated":"2024-01-19T01:29:27Z","published":"2024-01-19T01:29:27Z","title":"Large Language Models are Efficient Learners of Noise-Robust Speech\n Recognition","summary":" Recent advances in large language models (LLMs) have promoted generative\nerror correction (GER) for automatic speech recognition (ASR), which leverages\nthe rich linguistic knowledge and powerful reasoning ability of LLMs to improve\nrecognition results. The latest work proposes a GER benchmark with HyPoradise\ndataset to learn the mapping from ASR N-best hypotheses to ground-truth\ntranscription by efficient LLM finetuning, which shows great effectiveness but\nlacks specificity on noise-robust ASR. In this work, we extend the benchmark to\nnoisy conditions and investigate if we can teach LLMs to perform denoising for\nGER just like what robust ASR do}, where one solution is introducing noise\ninformation as a conditioner into LLM. However, directly incorporating noise\nembeddings from audio encoder could harm the LLM tuning due to cross-modality\ngap. To this end, we propose to extract a language-space noise embedding from\nthe N-best list to represent the noise conditions of source speech, which can\npromote the denoising process in GER. Furthermore, in order to enhance its\nrepresentation ability of audio noise, we design a knowledge distillation (KD)\napproach via mutual information estimation to distill the real noise\ninformation in audio embeddings to our language embedding. Experiments on\nvarious latest LLMs demonstrate our approach achieves a new breakthrough with\nup to 53.9% correction improvement in terms of word error rate while with\nlimited training data. Analysis shows that our language-space noise embedding\ncan well represent the noise conditions of source speech, under which\noff-the-shelf LLMs show strong ability of language-space denoising.\n","authors":["Yuchen Hu","Chen Chen","Chao-Han Huck Yang","Ruizhe Li","Chao Zhang","Pin-Yu Chen","EnSiong Chng"],"pdf_url":"https://arxiv.org/pdf/2401.10446v1.pdf","comment":"Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be\n open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license"},{"id":"http://arxiv.org/abs/2401.10440v1","updated":"2024-01-19T01:07:50Z","published":"2024-01-19T01:07:50Z","title":"Breaking the Curse of Multilinguality with Cross-lingual Expert Language\n Models","summary":" Despite their popularity in non-English NLP, multilingual language models\noften underperform monolingual ones due to inter-language competition for model\nparameters. We propose Cross-lingual Expert Language Models (X-ELM), which\nmitigate this competition by independently training language models on subsets\nof the multilingual corpus. 
This process specializes X-ELMs to different\nlanguages while remaining effective as a multilingual ensemble. Our experiments\nshow that when given the same compute budget, X-ELM outperforms jointly trained\nmultilingual models across all considered languages and that these gains\ntransfer to downstream tasks. X-ELM provides additional benefits over\nperformance improvements: new experts can be iteratively added, adapting X-ELM\nto new languages without catastrophic forgetting. Furthermore, training is\nasynchronous, reducing the hardware requirements for multilingual training and\ndemocratizing multilingual modeling.\n","authors":["Terra Blevins","Tomasz Limisiewicz","Suchin Gururangan","Margaret Li","Hila Gonen","Noah A. Smith","Luke Zettlemoyer"],"pdf_url":"https://arxiv.org/pdf/2401.10440v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04398v2","updated":"2024-01-19T01:05:05Z","published":"2024-01-09T07:46:26Z","title":"Chain-of-Table: Evolving Tables in the Reasoning Chain for Table\n Understanding","summary":" Table-based reasoning with large language models (LLMs) is a promising\ndirection to tackle many table understanding tasks, such as table-based\nquestion answering and fact verification. Compared with generic reasoning,\ntable-based reasoning requires the extraction of underlying semantics from both\nfree-form questions and semi-structured tabular data. Chain-of-Thought and its\nsimilar approaches incorporate the reasoning chain in the form of textual\ncontext, but it is still an open question how to effectively leverage tabular\ndata in the reasoning chain. We propose the Chain-of-Table framework, where\ntabular data is explicitly used in the reasoning chain as a proxy for\nintermediate thoughts. Specifically, we guide LLMs using in-context learning to\niteratively generate operations and update the table to represent a tabular\nreasoning chain. LLMs can therefore dynamically plan the next operation based\non the results of the previous ones. This continuous evolution of the table\nforms a chain, showing the reasoning process for a given tabular problem. The\nchain carries structured information of the intermediate results, enabling more\naccurate and reliable predictions. Chain-of-Table achieves new state-of-the-art\nperformance on WikiTQ, FeTaQA, and TabFact benchmarks across multiple LLM\nchoices.\n","authors":["Zilong Wang","Hao Zhang","Chun-Liang Li","Julian Martin Eisenschlos","Vincent Perot","Zifeng Wang","Lesly Miculicich","Yasuhisa Fujii","Jingbo Shang","Chen-Yu Lee","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2401.04398v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11052v1","updated":"2024-01-19T23:00:31Z","published":"2024-01-19T23:00:31Z","title":"Mining experimental data from Materials Science literature with Large\n Language Models","summary":" This study is dedicated to evaluating the capabilities of advanced large\nlanguage models (LLMs) such as GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo in the\nextraction of structured information from scientific documents within the field\nof materials science. We introduce a novel methodology for the comparative\nanalysis of intricate material expressions, emphasising the standardisation of\nchemical formulas to tackle the complexities inherent in materials science\ninformation assessment. 
To this end, we primarily focus on two critical tasks\nof information extraction: (i) a named entity recognition (NER) of studied\nmaterials and physical properties and (ii) a relation extraction (RE) between\nthese entities. The performance of LLMs in executing these tasks is benchmarked\nagainst traditional models based on the BERT architecture and rule-based\napproaches. For NER, LLMs fail to outperform the baseline with zero-shot\nprompting and exhibit only limited improvement with few-shot prompting.\nHowever, for RE, a GPT-3.5-Turbo fine-tuned with the appropriate strategy\noutperforms all models, including the baseline. Without any fine-tuning, GPT-4\nand GPT-4-Turbo display remarkable reasoning and relationship extraction\ncapabilities after being provided with merely a couple of examples, surpassing\nthe baseline. Overall, the results suggest that although LLMs demonstrate\nrelevant reasoning skills in connecting concepts, for tasks requiring\nextracting complex domain-specific entities like materials, specialised models\nare currently a better choice.\n","authors":["Luca Foppiano","Guillaume Lambard","Toshiyuki Amagasa","Masashi Ishii"],"pdf_url":"https://arxiv.org/pdf/2401.11052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11048v1","updated":"2024-01-19T22:24:39Z","published":"2024-01-19T22:24:39Z","title":"PubTator 3.0: an AI-powered Literature Resource for Unlocking Biomedical\n Knowledge","summary":" PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a\nbiomedical literature resource using state-of-the-art AI techniques to offer\nsemantic and relation searches for key concepts like proteins, genetic\nvariants, diseases, and chemicals. It currently provides over one billion\nentity and relation annotations across approximately 36 million PubMed\nabstracts and 6 million full-text articles from the PMC open access subset,\nupdated weekly. PubTator 3.0's online interface and API utilize these\nprecomputed entity relations and synonyms to provide advanced search\ncapabilities and enable large-scale analyses, streamlining many complex\ninformation needs. We showcase the retrieval quality of PubTator 3.0 using a\nseries of entity pair queries, demonstrating that PubTator 3.0 retrieves a\ngreater number of articles than either PubMed or Google Scholar, with higher\nprecision in the top 20 results. We further show that integrating ChatGPT\n(GPT-4) with PubTator APIs dramatically improves the factuality and\nverifiability of its responses. In summary, PubTator 3.0 offers a comprehensive\nset of features and tools that allow researchers to navigate the ever-expanding\nwealth of biomedical literature, expediting research and unlocking valuable\ninsights for scientific discovery.\n","authors":["Chih-Hsuan Wei","Alexis Allot","Po-Ting Lai","Robert Leaman","Shubo Tian","Ling Luo","Qiao Jin","Zhizheng Wang","Qingyu Chen","Zhiyong Lu"],"pdf_url":"https://arxiv.org/pdf/2401.11048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11033v1","updated":"2024-01-19T21:21:02Z","published":"2024-01-19T21:21:02Z","title":"FAIR Enough: How Can We Develop and Assess a FAIR-Compliant Dataset for\n Large Language Models' Training?","summary":" Advancements in Large Language Models (LLMs) highlight the need for ethical\npractices and data integrity. We introduce a framework that embeds FAIR\n(Findable, Accessible, Interoperable, Reusable) data principles into LLM\ntraining. This approach marks a shift towards practices compliant with FAIR\nstandards. 
Our framework presents guidelines for integrating FAIR data\nprinciples into LLM training. This initiative includes a checklist for\nresearchers and developers. We also demonstrate its practical application\nthrough a case study focused on bias identification and mitigation in our\nFAIR-compliant dataset. This work is a significant contribution to AI ethics\nand data science, advocating for balanced and ethical training methods in LLMs.\n","authors":["Shaina Raza","Shardul Ghuge","Chen Ding","Deval Pandya"],"pdf_url":"https://arxiv.org/pdf/2401.11033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11021v1","updated":"2024-01-19T20:40:23Z","published":"2024-01-19T20:40:23Z","title":"Analysis and Detection of Multilingual Hate Speech Using Transformer\n Based Deep Learning","summary":" Hate speech is harmful content that directly attacks or promotes hatred\nagainst members of groups or individuals based on actual or perceived aspects\nof identity, such as racism, religion, or sexual orientation. This can affect\nsocial life on social media platforms as hateful content shared through social\nmedia can harm both individuals and communities. As the prevalence of hate\nspeech increases online, the demand for automated detection as an NLP task is\nincreasing. In this work, the proposed method is using transformer-based model\nto detect hate speech in social media, like twitter, Facebook, WhatsApp,\nInstagram, etc. The proposed model is independent of languages and has been\ntested on Italian, English, German, Bengali. The Gold standard datasets were\ncollected from renowned researcher Zeerak Talat, Sara Tonelli, Melanie Siegel,\nand Rezaul Karim. The success rate of the proposed model for hate speech\ndetection is higher than the existing baseline and state-of-the-art models with\naccuracy in Bengali dataset is 89%, in English: 91%, in German dataset 91% and\nin Italian dataset it is 77%. The proposed algorithm shows substantial\nimprovement to the benchmark method.\n","authors":["Arijit Das","Somashree Nandy","Rupam Saha","Srijan Das","Diganta Saha"],"pdf_url":"https://arxiv.org/pdf/2401.11021v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.10995v1","updated":"2024-01-19T19:23:37Z","published":"2024-01-19T19:23:37Z","title":"The Radiation Oncology NLP Database","summary":" We present the Radiation Oncology NLP Database (ROND), the first dedicated\nNatural Language Processing (NLP) dataset for radiation oncology, an important\nmedical specialty that has received limited attention from the NLP community in\nthe past. With the advent of Artificial General Intelligence (AGI), there is an\nincreasing need for specialized datasets and benchmarks to facilitate research\nand development. ROND is specifically designed to address this gap in the\ndomain of radiation oncology, a field that offers many opportunities for NLP\nexploration. It encompasses various NLP tasks including Logic Reasoning, Text\nClassification, Named Entity Recognition (NER), Question Answering (QA), Text\nSummarization, and Patient-Clinician Conversations, each with a distinct focus\non radiation oncology concepts and application cases. In addition, we have\ndeveloped an instruction-tuning dataset consisting of over 20k instruction\npairs (based on ROND) and trained a large language model, CancerChat. This\nserves to demonstrate the potential of instruction-tuning large language models\nwithin a highly-specialized medical domain. 
The evaluation results in this\nstudy could serve as baseline results for future research. ROND aims to\nstimulate advancements in radiation oncology and clinical NLP by offering a\nplatform for testing and improving algorithms and models in a domain-specific\ncontext. The ROND dataset is a joint effort of multiple U.S. health\ninstitutions. The data is available at\nhttps://github.com/zl-liu/Radiation-Oncology-NLP-Database.\n","authors":["Zhengliang Liu","Jason Holmes","Wenxiong Liao","Chenbin Liu","Lian Zhang","Hongying Feng","Peilong Wang","Muhammad Ali Elahi","Hongmin Cai","Lichao Sun","Quanzheng Li","Xiang Li","Tianming Liu","Jiajian Shen","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10995v1.pdf","comment":"10 pages, 7 figures, 6 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.10891v1","updated":"2024-01-19T18:59:52Z","published":"2024-01-19T18:59:52Z","title":"Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data","summary":" This work presents Depth Anything, a highly practical solution for robust\nmonocular depth estimation. Without pursuing novel technical modules, we aim to\nbuild a simple yet powerful foundation model dealing with any images under any\ncircumstances. To this end, we scale up the dataset by designing a data engine\nto collect and automatically annotate large-scale unlabeled data (~62M), which\nsignificantly enlarges the data coverage and thus is able to reduce the\ngeneralization error. We investigate two simple yet effective strategies that\nmake data scaling-up promising. First, a more challenging optimization target\nis created by leveraging data augmentation tools. It compels the model to\nactively seek extra visual knowledge and acquire robust representations.\nSecond, an auxiliary supervision is developed to enforce the model to inherit\nrich semantic priors from pre-trained encoders. We evaluate its zero-shot\ncapabilities extensively, including six public datasets and randomly captured\nphotos. It demonstrates impressive generalization ability. Further, through\nfine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs\nare set. Our better depth model also results in a better depth-conditioned\nControlNet. Our models are released at\nhttps://github.com/LiheYoung/Depth-Anything.\n","authors":["Lihe Yang","Bingyi Kang","Zilong Huang","Xiaogang Xu","Jiashi Feng","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10891v1.pdf","comment":"Project page: https://depth-anything.github.io"},{"id":"http://arxiv.org/abs/2401.10890v1","updated":"2024-01-19T18:59:37Z","published":"2024-01-19T18:59:37Z","title":"Event detection from novel data sources: Leveraging satellite imagery\n alongside GPS traces","summary":" Rapid identification and response to breaking events, particularly those that\npose a threat to human life such as natural disasters or conflicts, is of\nparamount importance. The prevalence of mobile devices and the ubiquity of\nnetwork connectivity has generated a massive amount of temporally- and\nspatially-stamped data. Numerous studies have used mobile data to derive\nindividual human mobility patterns for various applications. Similarly, the\nincreasing number of orbital satellites has made it easier to gather\nhigh-resolution images capturing a snapshot of a geographical area in sub-daily\ntemporal frequency. 
We propose a novel data fusion methodology integrating\nsatellite imagery with privacy-enhanced mobile data to augment the event\ninference task, whether in real-time or historical. In the absence of boots on\nthe ground, mobile data is able to give an approximation of human mobility,\nproximity to one another, and the built environment. On the other hand,\nsatellite imagery can provide visual information on physical changes to the\nbuilt and natural environment. The expected use cases for our methodology\ninclude small-scale disaster detection (i.e., tornadoes, wildfires, and floods)\nin rural regions, search and rescue operation augmentation for lost hikers in\nremote wilderness areas, and identification of active conflict areas and\npopulation displacement in war-torn states. Our implementation is open-source\non GitHub: https://github.com/ekinugurel/SatMobFusion.\n","authors":["Ekin Ugurel","Steffen Coenen","Minda Zhou Chen","Cynthia Chen"],"pdf_url":"https://arxiv.org/pdf/2401.10890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10889v1","updated":"2024-01-19T18:59:11Z","published":"2024-01-19T18:59:11Z","title":"Synthesizing Moving People with 3D Control","summary":" In this paper, we present a diffusion model-based framework for animating\npeople from a single image for a given target 3D motion sequence. Our approach\nhas two core components: a) learning priors about invisible parts of the human\nbody and clothing, and b) rendering novel body poses with proper clothing and\ntexture. For the first part, we learn an in-filling diffusion model to\nhallucinate unseen parts of a person given a single image. We train this model\non texture map space, which makes it more sample-efficient since it is\ninvariant to pose and viewpoint. Second, we develop a diffusion-based rendering\npipeline, which is controlled by 3D human poses. This produces realistic\nrenderings of novel poses of the person, including clothing, hair, and\nplausible in-filling of unseen regions. This disentangled approach allows our\nmethod to generate a sequence of images that are faithful to the target motion\nin the 3D pose and, to the input image in terms of visual similarity. In\naddition to that, the 3D control allows various synthetic camera trajectories\nto render a person. Our experiments show that our method is resilient in\ngenerating prolonged motions and varied challenging and complex poses compared\nto prior methods. Please check our website for more details:\nhttps://boyiliee.github.io/3DHM.github.io/.\n","authors":["Boyi Li","Jathushan Rajasegaran","Yossi Gandelsman","Alexei A. Efros","Jitendra Malik"],"pdf_url":"https://arxiv.org/pdf/2401.10889v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10886v1","updated":"2024-01-19T18:57:46Z","published":"2024-01-19T18:57:46Z","title":"SCENES: Subpixel Correspondence Estimation With Epipolar Supervision","summary":" Extracting point correspondences from two or more views of a scene is a\nfundamental computer vision problem with particular importance for relative\ncamera pose estimation and structure-from-motion. Existing local feature\nmatching approaches, trained with correspondence supervision on large-scale\ndatasets, obtain highly-accurate matches on the test sets. However, they do not\ngeneralise well to new datasets with different characteristics to those they\nwere trained on, unlike classic feature extractors. 
Instead, they require\nfinetuning, which assumes that ground-truth correspondences or ground-truth\ncamera poses and 3D structure are available. We relax this assumption by\nremoving the requirement of 3D structure, e.g., depth maps or point clouds, and\nonly require camera pose information, which can be obtained from odometry. We\ndo so by replacing correspondence losses with epipolar losses, which encourage\nputative matches to lie on the associated epipolar line. While weaker than\ncorrespondence supervision, we observe that this cue is sufficient for\nfinetuning existing models on new data. We then further relax the assumption of\nknown camera poses by using pose estimates in a novel bootstrapping approach.\nWe evaluate on highly challenging datasets, including an indoor drone dataset\nand an outdoor smartphone camera dataset, and obtain state-of-the-art results\nwithout strong supervision.\n","authors":["Dominik A. Kloepfer","João F. Henriques","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2401.10886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20685v2","updated":"2024-01-19T18:53:13Z","published":"2023-10-31T17:49:48Z","title":"NeRF Revisited: Fixing Quadrature Instability in Volume Rendering","summary":" Neural radiance fields (NeRF) rely on volume rendering to synthesize novel\nviews. Volume rendering requires evaluating an integral along each ray, which\nis numerically approximated with a finite sum that corresponds to the exact\nintegral along the ray under piecewise constant volume density. As a\nconsequence, the rendered result is unstable w.r.t. the choice of samples along\nthe ray, a phenomenon that we dub quadrature instability. We propose a\nmathematically principled solution by reformulating the sample-based rendering\nequation so that it corresponds to the exact integral under piecewise linear\nvolume density. This simultaneously resolves multiple issues: conflicts between\nsamples along different rays, imprecise hierarchical sampling, and\nnon-differentiability of quantiles of ray termination distances w.r.t. model\nparameters. We demonstrate several benefits over the classical sample-based\nrendering equation, such as sharper textures, better geometric reconstruction,\nand stronger depth supervision. Our proposed formulation can be also be used as\na drop-in replacement to the volume rendering equation of existing NeRF-based\nmethods. Our project page can be found at pl-nerf.github.io.\n","authors":["Mikaela Angelina Uy","Kiyohiro Nakayama","Guandao Yang","Rahul Krishna Thomas","Leonidas Guibas","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2310.20685v2.pdf","comment":"Neurips 2023"},{"id":"http://arxiv.org/abs/2401.10877v1","updated":"2024-01-19T18:41:53Z","published":"2024-01-19T18:41:53Z","title":"The Cadaver in the Machine: The Social Practices of Measurement and\n Validation in Motion Capture Technology","summary":" Motion capture systems, used across various domains, make body\nrepresentations concrete through technical processes. We argue that the\nmeasurement of bodies and the validation of measurements for motion capture\nsystems can be understood as social practices. By analyzing the findings of a\nsystematic literature review (N=278) through the lens of social practice\ntheory, we show how these practices, and their varying attention to errors,\nbecome ingrained in motion capture design and innovation over time. Moreover,\nwe show how contemporary motion capture systems perpetuate assumptions about\nhuman bodies and their movements. 
We suggest that social practices of\nmeasurement and validation are ubiquitous in the development of data- and\nsensor-driven systems more broadly, and provide this work as a basis for\ninvestigating hidden design assumptions and their potential negative\nconsequences in human-computer interaction.\n","authors":["Emma Harvey","Hauke Sandhaus","Abigail Z. Jacobs","Emanuel Moss","Mona Sloane"],"pdf_url":"https://arxiv.org/pdf/2401.10877v1.pdf","comment":"34 pages, 9 figures. To appear in the 2024 ACM CHI Conference on\n Human Factors in Computing Systems (CHI '24)"},{"id":"http://arxiv.org/abs/2306.08251v2","updated":"2024-01-19T18:35:54Z","published":"2023-06-14T05:34:02Z","title":"GBSD: Generative Bokeh with Stage Diffusion","summary":" The bokeh effect is an artistic technique that blurs out-of-focus areas in a\nphotograph and has gained interest due to recent developments in text-to-image\nsynthesis and the ubiquity of smart-phone cameras and photo-sharing apps. Prior\nwork on rendering bokeh effects have focused on post hoc image manipulation to\nproduce similar blurring effects in existing photographs using classical\ncomputer graphics or neural rendering techniques, but have either depth\ndiscontinuity artifacts or are restricted to reproducing bokeh effects that are\npresent in the training data. More recent diffusion based models can synthesize\nimages with an artistic style, but either require the generation of\nhigh-dimensional masks, expensive fine-tuning, or affect global image\ncharacteristics. In this paper, we present GBSD, the first generative\ntext-to-image model that synthesizes photorealistic images with a bokeh style.\nMotivated by how image synthesis occurs progressively in diffusion models, our\napproach combines latent diffusion models with a 2-stage conditioning algorithm\nto render bokeh effects on semantically defined objects. Since we can focus the\neffect on objects, this semantic bokeh effect is more versatile than classical\nrendering techniques. We evaluate GBSD both quantitatively and qualitatively\nand demonstrate its ability to be applied in both text-to-image and\nimage-to-image settings.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2306.08251v2.pdf","comment":"Short Version is accepted by International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP) 2024"},{"id":"http://arxiv.org/abs/2303.05015v2","updated":"2024-01-19T18:23:19Z","published":"2023-03-09T03:33:56Z","title":"Smooth and Stepwise Self-Distillation for Object Detection","summary":" Distilling the structured information captured in feature maps has\ncontributed to improved results for object detection tasks, but requires\ncareful selection of baseline architectures and substantial pre-training.\nSelf-distillation addresses these limitations and has recently achieved\nstate-of-the-art performance for object detection despite making several\nsimplifying architectural assumptions. Building on this work, we propose Smooth\nand Stepwise Self-Distillation (SSSD) for object detection. Our SSSD\narchitecture forms an implicit teacher from object labels and a feature pyramid\nnetwork backbone to distill label-annotated feature maps using Jensen-Shannon\ndistance, which is smoother than distillation losses used in prior work. We\nadditionally add a distillation coefficient that is adaptively configured based\non the learning rate. 
We extensively benchmark SSSD against a baseline and two\nstate-of-the-art object detector architectures on the COCO dataset by varying\nthe coefficients and backbone and detector networks. We demonstrate that SSSD\nachieves higher average precision in most experimental settings, is robust to a\nwide range of coefficients, and benefits from our stepwise distillation\nprocedure.\n","authors":["Jieren Deng","Xin Zhou","Hao Tian","Zhihong Pan","Derek Aguiar"],"pdf_url":"https://arxiv.org/pdf/2303.05015v2.pdf","comment":"Accepted by International Conference on Image Processing (ICIP) 2023"},{"id":"http://arxiv.org/abs/2401.10857v1","updated":"2024-01-19T18:00:52Z","published":"2024-01-19T18:00:52Z","title":"Motion Consistency Loss for Monocular Visual Odometry with\n Attention-Based Deep Learning","summary":" Deep learning algorithms have driven expressive progress in many complex\ntasks. The loss function is a core component of deep learning techniques,\nguiding the learning process of neural networks. This paper contributes by\nintroducing a consistency loss for visual odometry with deep learning-based\napproaches. The motion consistency loss explores repeated motions that appear\nin consecutive overlapped video clips. Experimental results show that our\napproach increased the performance of a model on the KITTI odometry benchmark.\n","authors":["André O. Françani","Marcos R. O. A. Maximo"],"pdf_url":"https://arxiv.org/pdf/2401.10857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10848v1","updated":"2024-01-19T17:48:05Z","published":"2024-01-19T17:48:05Z","title":"Source-Free and Image-Only Unsupervised Domain Adaptation for Category\n Level Object Pose Estimation","summary":" We consider the problem of source-free unsupervised category-level pose\nestimation from only RGB images to a target domain without any access to source\ndomain data or 3D annotations during adaptation. Collecting and annotating\nreal-world 3D data and corresponding images is laborious, expensive, yet\nunavoidable process, since even 3D pose domain adaptation methods require 3D\ndata in the target domain. We introduce 3DUDA, a method capable of adapting to\na nuisance-ridden target domain without 3D or depth data. Our key insight stems\nfrom the observation that specific object subparts remain stable across\nout-of-domain (OOD) scenarios, enabling strategic utilization of these\ninvariant subcomponents for effective model updates. We represent object\ncategories as simple cuboid meshes, and harness a generative model of neural\nfeature activations modeled at each mesh vertex learnt using differential\nrendering. We focus on individual locally robust mesh vertex features and\niteratively update them based on their proximity to corresponding features in\nthe target domain even when the global pose is not correct. Our model is then\ntrained in an EM fashion, alternating between updating the vertex features and\nthe feature extractor. We show that our method simulates fine-tuning on a\nglobal pseudo-labeled dataset under mild assumptions, which converges to the\ntarget domain asymptotically. 
Through extensive empirical validation, including\na complex extreme UDA setup which combines real nuisances, synthetic noise, and\nocclusion, we demonstrate the potency of our simple approach in addressing the\ndomain shift challenge and significantly improving pose estimation accuracy.\n","authors":["Prakhar Kaushik","Aayush Mishra","Adam Kortylewski","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2401.10848v1.pdf","comment":"36 pages, 9 figures, 50 tables; ICLR 2024 (Poster)"},{"id":"http://arxiv.org/abs/2401.10831v1","updated":"2024-01-19T17:27:21Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we demonstrate\nthat VTCDcan be used to improve model performance for fine-grained tasks.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10822v1","updated":"2024-01-19T17:16:16Z","published":"2024-01-19T17:16:16Z","title":"ActAnywhere: Subject-Aware Video Background Generation","summary":" Generating video background that tailors to foreground subject motion is an\nimportant problem for the movie industry and visual effects community. This\ntask involves synthesizing background that aligns with the motion and\nappearance of the foreground subject, while also complies with the artist's\ncreative intention. We introduce ActAnywhere, a generative model that automates\nthis process which traditionally requires tedious manual efforts. Our model\nleverages the power of large-scale video diffusion models, and is specifically\ntailored for this task. ActAnywhere takes a sequence of foreground subject\nsegmentation as input and an image that describes the desired scene as\ncondition, to produce a coherent video with realistic foreground-background\ninteractions while adhering to the condition frame. We train our model on a\nlarge-scale dataset of human-scene interaction videos. Extensive evaluations\ndemonstrate the superior performance of our model, significantly outperforming\nbaselines. Moreover, we show that ActAnywhere generalizes to diverse\nout-of-distribution samples, including non-human subjects. 
Please visit our\nproject webpage at https://actanywhere.github.io.\n","authors":["Boxiao Pan","Zhan Xu","Chun-Hao Paul Huang","Krishna Kumar Singh","Yang Zhou","Leonidas J. Guibas","Jimei Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10815v1","updated":"2024-01-19T17:02:17Z","published":"2024-01-19T17:02:17Z","title":"RAD-DINO: Exploring Scalable Medical Image Encoders Beyond Text\n Supervision","summary":" Language-supervised pre-training has proven to be a valuable method for\nextracting semantically meaningful features from images, serving as a\nfoundational element in multimodal systems within the computer vision and\nmedical imaging domains. However, resulting features are limited by the\ninformation contained within the text. This is particularly problematic in\nmedical imaging, where radiologists' written findings focus on specific\nobservations; a challenge compounded by the scarcity of paired imaging-text\ndata due to concerns over leakage of personal health information. In this work,\nwe fundamentally challenge the prevailing reliance on language supervision for\nlearning general purpose biomedical imaging encoders. We introduce RAD-DINO, a\nbiomedical image encoder pre-trained solely on unimodal biomedical imaging data\nthat obtains similar or greater performance than state-of-the-art biomedical\nlanguage supervised models on a diverse range of benchmarks. Specifically, the\nquality of learned representations is evaluated on standard imaging tasks\n(classification and semantic segmentation), and a vision-language alignment\ntask (text report generation from images). To further demonstrate the drawback\nof language supervision, we show that features from RAD-DINO correlate with\nother medical records (e.g., sex or age) better than language-supervised\nmodels, which are generally not mentioned in radiology reports. Finally, we\nconduct a series of ablations determining the factors in RAD-DINO's\nperformance; notably, we observe that RAD-DINO's downstream performance scales\nwell with the quantity and diversity of training data, demonstrating that\nimage-only supervision is a scalable approach for training a foundational\nbiomedical image encoder.\n","authors":["Fernando Pérez-García","Harshita Sharma","Sam Bond-Taylor","Kenza Bouzid","Valentina Salvatelli","Maximilian Ilse","Shruthi Bannur","Daniel C. Castro","Anton Schwaighofer","Matthew P. Lungren","Maria Wetscherek","Noel Codella","Stephanie L. Hyland","Javier Alvarez-Valle","Ozan Oktay"],"pdf_url":"https://arxiv.org/pdf/2401.10815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10805v1","updated":"2024-01-19T16:48:49Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" In this work, we introduce the novel concept of visually Connecting Actions\nand Their Effects (CATE) in video understanding. CATE can have applications in\nareas like task planning and learning from demonstration. We propose different\nCATE-based task formulations, such as action selection and action\nspecification, where video understanding models connect actions and effects at\nsemantic and fine-grained levels. We observe that different formulations\nproduce representations capturing intuitive action properties. We also design\nvarious baseline models for action selection and action specification. Despite\nthe intuitive nature of the task, we observe that models struggle, and humans\noutperform them by a large margin. 
The study aims to establish a foundation for\nfuture efforts, showcasing the flexibility and versatility of connecting\nactions and effects in video understanding, with the hope of inspiring advanced\nformulations and models.\n","authors":["Eric Peh","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10790v1","updated":"2024-01-19T16:21:55Z","published":"2024-01-19T16:21:55Z","title":"Measuring the Impact of Scene Level Objects on Object Detection: Towards\n Quantitative Explanations of Detection Decisions","summary":" Although accuracy and other common metrics can provide a useful window into\nthe performance of an object detection model, they lack a deeper view of the\nmodel's decision process. Regardless of the quality of the training data and\nprocess, the features that an object detection model learns cannot be\nguaranteed. A model may learn a relationship between certain background\ncontext, i.e., scene level objects, and the presence of the labeled classes.\nFurthermore, standard performance verification and metrics would not identify\nthis phenomenon. This paper presents a new black box explainability method for\nadditional verification of object detection models by finding the impact of\nscene level objects on the identification of the objects within the image. By\ncomparing the accuracies of a model on test data with and without certain scene\nlevel objects, the contributions of these objects to the model's performance\nbecomes clearer. The experiment presented here will assess the impact of\nbuildings and people in image context on the detection of emergency road\nvehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the\npresence of a scene level object will indicate the model's reliance on that\nobject to make its detections. The results of this research lead to providing a\nquantitative explanation of the object detection model's decision process,\nenabling a deeper understanding of the model's performance.\n","authors":["Lynn Vonder Haar","Timothy Elvira","Luke Newcomb","Omar Ochoa"],"pdf_url":"https://arxiv.org/pdf/2401.10790v1.pdf","comment":"9 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.10786v1","updated":"2024-01-19T16:15:37Z","published":"2024-01-19T16:15:37Z","title":"Sat2Scene: 3D Urban Scene Generation from Satellite Images with\n Diffusion","summary":" Directly generating scenes from satellite imagery offers exciting\npossibilities for integration into applications like games and map services.\nHowever, challenges arise from significant view changes and scene scale.\nPrevious efforts mainly focused on image or video generation, lacking\nexploration into the adaptability of scene generation for arbitrary views.\nExisting 3D generation works either operate at the object level or are\ndifficult to utilize the geometry obtained from satellite imagery. To overcome\nthese limitations, we propose a novel architecture for direct 3D scene\ngeneration by introducing diffusion models into 3D sparse representations and\ncombining them with neural rendering techniques. Specifically, our approach\ngenerates texture colors at the point level for a given geometry using a 3D\ndiffusion model first, which is then transformed into a scene representation in\na feed-forward manner. The representation can be utilized to render arbitrary\nviews which would excel in both single-frame quality and inter-frame\nconsistency. 
Experiments in two city-scale datasets show that our model\ndemonstrates proficiency in generating photo-realistic street-view image\nsequences and cross-view urban scenes from satellite imagery.\n","authors":["Zuoyue Li","Zhenqiang Li","Zhaopeng Cui","Marc Pollefeys","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2401.10786v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.09495v2","updated":"2024-01-19T16:11:28Z","published":"2024-01-17T01:33:40Z","title":"IPR-NeRF: Ownership Verification meets Neural Radiance Field","summary":" Neural Radiance Field (NeRF) models have gained significant attention in the\ncomputer vision community in the recent past with state-of-the-art visual\nquality and produced impressive demonstrations. Since then, technopreneurs have\nsought to leverage NeRF models into a profitable business. Therefore, NeRF\nmodels make it worth the risk of plagiarizers illegally copying,\nre-distributing, or misusing those models. This paper proposes a comprehensive\nintellectual property (IP) protection framework for the NeRF model in both\nblack-box and white-box settings, namely IPR-NeRF. In the black-box setting, a\ndiffusion-based solution is introduced to embed and extract the watermark via a\ntwo-stage optimization process. In the white-box setting, a designated digital\nsignature is embedded into the weights of the NeRF model by adopting the sign\nloss objective. Our extensive experiments demonstrate that not only does our\napproach maintain the fidelity (\\ie, the rendering quality) of IPR-NeRF models,\nbut it is also robust against both ambiguity and removal attacks compared to\nprior arts.\n","authors":["Win Kent Ong","Kam Woh Ng","Chee Seng Chan","Yi Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2401.09495v2.pdf","comment":"Error on the paper"},{"id":"http://arxiv.org/abs/2401.10777v1","updated":"2024-01-19T15:51:34Z","published":"2024-01-19T15:51:34Z","title":"Determination of efficiency indicators of the stand for intelligent\n control of manual operations in industrial production","summary":" Systems of intelligent control of manual operations in industrial production\nare being implemented in many industries nowadays. Such systems use\nhigh-resolution cameras and computer vision algorithms to automatically track\nthe operator's manipulations and prevent technological errors in the assembly\nprocess. At the same time compliance with safety regulations in the workspace\nis monitored. As a result, the defect rate of manufactured products and the\nnumber of accidents during the manual assembly of any device are decreased.\nBefore implementing an intelligent control system into a real production it is\nnecessary to calculate its efficiency. In order to do it experiments on the\nstand for manual operations control systems were carried out. This paper\nproposes the methodology for calculating the efficiency indicators. This\nmathematical approach is based on the IoU calculation of real- and\npredicted-time intervals between assembly stages. 
The results show high\nprecision in tracking the validity of manual assembly and do not depend on the\nduration of the assembly process.\n","authors":["Anton Sergeev","Victor Minchenkov","Aleksei Soldatov"],"pdf_url":"https://arxiv.org/pdf/2401.10777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01984v2","updated":"2024-01-19T15:51:32Z","published":"2024-01-03T21:24:44Z","title":"AUPIMO: Redefining Visual Anomaly Detection Benchmarks with High Speed\n and Low Tolerance","summary":" Recent advances in visual anomaly detection research have seen AUROC and\nAUPRO scores on public benchmark datasets such as MVTec and VisA converge\ntowards perfect recall, giving the impression that these benchmarks are\nnear-solved. However, high AUROC and AUPRO scores do not always reflect\nqualitative performance, which limits the validity of these metrics in\nreal-world applications. We argue that the artificial ceiling imposed by the\nlack of an adequate evaluation metric restrains progression of the field, and\nit is crucial that we revisit the evaluation metrics used to rate our\nalgorithms. In response, we introduce Per-IMage Overlap (PIMO), a novel metric\nthat addresses the shortcomings of AUROC and AUPRO. PIMO retains the\nrecall-based nature of the existing metrics but introduces two distinctions:\nthe assignment of curves (and respective area under the curve) is per-image,\nand its X-axis relies solely on normal images. Measuring recall per image\nsimplifies instance score indexing and is more robust to noisy annotations. As\nwe show, it also accelerates computation and enables the usage of statistical\ntests to compare models. By imposing low tolerance for false positives on\nnormal images, PIMO provides an enhanced model validation procedure and\nhighlights performance variations across datasets. Our experiments demonstrate\nthat PIMO offers practical advantages and nuanced performance insights that\nredefine anomaly detection benchmarks -- notably challenging the perception\nthat MVTec AD and VisA datasets have been solved by contemporary models.\nAvailable on GitHub: https://github.com/jpcbertoldo/aupimo.\n","authors":["Joao P. C. Bertoldo","Dick Ameln","Ashwin Vaidya","Samet Akçay"],"pdf_url":"https://arxiv.org/pdf/2401.01984v2.pdf","comment":"This research has been conducted during Google Summer of Code 2023\n (GSoC 2023) at OpenVINO (Intel). GSoC 2023 page:\n https://summerofcode.withgoogle.com/archive/2023/projects/SPMopugd"},{"id":"http://arxiv.org/abs/2401.10761v1","updated":"2024-01-19T15:33:46Z","published":"2024-01-19T15:33:46Z","title":"NN-VVC: Versatile Video Coding boosted by self-supervisedly learned\n image coding for machines","summary":" The recent progress in artificial intelligence has led to an ever-increasing\nusage of images and videos by machine analysis algorithms, mainly neural\nnetworks. Nonetheless, compression, storage and transmission of media have\ntraditionally been designed considering human beings as the viewers of the\ncontent. Recent research on image and video coding for machine analysis has\nprogressed mainly in two almost orthogonal directions. The first is represented\nby end-to-end (E2E) learned codecs which, while offering high performance on\nimage coding, are not yet on par with state-of-the-art conventional video\ncodecs and lack interoperability. 
The second direction considers using the\nVersatile Video Coding (VVC) standard or any other conventional video codec\n(CVC) together with pre- and post-processing operations targeting machine\nanalysis. While the CVC-based methods benefit from interoperability and broad\nhardware and software support, the machine task performance is often lower than\nthe desired level, particularly in low bitrates. This paper proposes a hybrid\ncodec for machines called NN-VVC, which combines the advantages of an\nE2E-learned image codec and a CVC to achieve high performance in both image and\nvideo coding for machines. Our experiments show that the proposed system\nachieved up to -43.20% and -26.8% Bj{\\o}ntegaard Delta rate reduction over VVC\nfor image and video data, respectively, when evaluated on multiple different\ndatasets and machine vision tasks. To the best of our knowledge, this is the\nfirst research paper showing a hybrid video codec that outperforms VVC on\nmultiple datasets and multiple machine vision tasks.\n","authors":["Jukka I. Ahonen","Nam Le","Honglei Zhang","Antti Hallapuro","Francesco Cricri","Hamed Rezazadegan Tavakoli","Miska M. Hannuksela","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2401.10761v1.pdf","comment":"ISM 2023 Best paper award winner version"},{"id":"http://arxiv.org/abs/2212.08044v3","updated":"2024-01-19T15:29:34Z","published":"2022-12-15T18:52:03Z","title":"Benchmarking Robustness of Multimodal Image-Text Models under\n Distribution Shift","summary":" Multimodal image-text models have shown remarkable performance in the past\nfew years. However, evaluating robustness against distribution shifts is\ncrucial before adopting them in real-world applications. In this work, we\ninvestigate the robustness of 12 popular open-sourced image-text models under\ncommon perturbations on five tasks (image-text retrieval, visual reasoning,\nvisual entailment, image captioning, and text-to-image generation). In\nparticular, we propose several new multimodal robustness benchmarks by applying\n17 image perturbation and 16 text perturbation techniques on top of existing\ndatasets. We observe that multimodal models are not robust to image and text\nperturbations, especially to image perturbations. Among the tested perturbation\nmethods, character-level perturbations constitute the most severe distribution\nshift for text, and zoom blur is the most severe shift for image data. We also\nintroduce two new robustness metrics (\\textbf{MMI} for MultiModal Impact score\nand \\textbf{MOR} for Missing Object Rate) for proper evaluations of multimodal\nmodels. We hope our extensive study sheds light on new directions for the\ndevelopment of robust multimodal models. More details can be found on the\nproject webpage: \\url{https://MMRobustness.github.io}.\n","authors":["Jielin Qiu","Yi Zhu","Xingjian Shi","Florian Wenzel","Zhiqiang Tang","Ding Zhao","Bo Li","Mu Li"],"pdf_url":"https://arxiv.org/pdf/2212.08044v3.pdf","comment":"Accepted by Journal of Data-centric Machine Learning Research (DMLR)\n 2024"},{"id":"http://arxiv.org/abs/2401.10752v1","updated":"2024-01-19T15:21:51Z","published":"2024-01-19T15:21:51Z","title":"HiCD: Change Detection in Quality-Varied Images via Hierarchical\n Correlation Distillation","summary":" Advanced change detection techniques primarily target image pairs of equal\nand high quality. However, variations in imaging conditions and platforms\nfrequently lead to image pairs with distinct qualities: one image being\nhigh-quality, while the other being low-quality. 
These disparities in image\nquality present significant challenges for understanding image pairs\nsemantically and extracting change features, ultimately resulting in a notable\ndecline in performance. To tackle this challenge, we introduce an innovative\ntraining strategy grounded in knowledge distillation. The core idea revolves\naround leveraging task knowledge acquired from high-quality image pairs to\nguide the model's learning process when dealing with image pairs that exhibit\ndifferences in quality. Additionally, we develop a hierarchical correlation\ndistillation approach (involving self-correlation, cross-correlation, and\nglobal correlation). This approach compels the student model to replicate the\ncorrelations inherent in the teacher model, rather than focusing solely on\nindividual features. This ensures effective knowledge transfer while\nmaintaining the student model's training flexibility.\n","authors":["Chao Pang","Xingxing Weng","Jiang Wu","Qiang Wang","Gui-Song Xia"],"pdf_url":"https://arxiv.org/pdf/2401.10752v1.pdf","comment":"accepted by TGRS"},{"id":"http://arxiv.org/abs/2401.10741v1","updated":"2024-01-19T14:59:26Z","published":"2024-01-19T14:59:26Z","title":"Character Recognition in Byzantine Seals with Deep Neural Networks","summary":" Seals are small coin-shaped artifacts, mostly made of lead, held with strings\nto seal letters. This work presents the first attempt towards automatic reading\nof text on Byzantine seal images.Byzantine seals are generally decorated with\niconography on the obverse side and Greek text on the reverse side. Text may\ninclude the sender's name, position in the Byzantine aristocracy, and elements\nof prayers. Both text and iconography are precious literary sources that wait\nto be exploited electronically, so the development of computerized systems for\ninterpreting seals images is of paramount importance. This work's contribution\nis hence a deep, two-stages, character reading pipeline for transcribing\nByzantine seal images. A first deep convolutional neural network (CNN) detects\ncharacters in the seal (character localization). A second convolutional network\nreads the localized characters (character classification). Finally, a\ndiplomatic transcription of the seal is provided by post-processing the two\nnetwork outputs. We provide an experimental evaluation of each CNN in isolation\nand both CNNs in combination. All performances are evaluated by\ncross-validation. Character localization achieves a mean average precision\n(mAP@0.5) greater than 0.9. Classification of characters cropped from ground\ntruth bounding boxes achieves Top-1 accuracy greater than 0.92. End-to-end\nevaluation shows the efficiency of the proposed approach when compared to the\nSoTA for similar tasks.\n","authors":["Théophile Rageau","Laurence Likforman-Sulem","Attilio Fiandrotti","Victoria Eyharabide","Béatrice Caseau","Jean-Claude Cheynet"],"pdf_url":"https://arxiv.org/pdf/2401.10741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10732v1","updated":"2024-01-19T14:49:56Z","published":"2024-01-19T14:49:56Z","title":"Bridging the gap between image coding for machines and humans","summary":" Image coding for machines (ICM) aims at reducing the bitrate required to\nrepresent an image while minimizing the drop in machine vision analysis\naccuracy. 
In many use cases, such as surveillance, it is also important that\nthe visual quality is not drastically deteriorated by the compression process.\nRecent works on using neural network (NN) based ICM codecs have shown\nsignificant coding gains against traditional methods; however, the decompressed\nimages, especially at low bitrates, often contain checkerboard artifacts. We\npropose an effective decoder finetuning scheme based on adversarial training to\nsignificantly enhance the visual quality of ICM codecs, while preserving the\nmachine analysis accuracy, without adding extra bitcost or parameters at the\ninference phase. The results show complete removal of the checkerboard\nartifacts at the negligible cost of -1.6% relative change in task performance\nscore. In the cases where some amount of artifacts is tolerable, such as when\nmachine consumption is the primary target, this technique can enhance both\npixel-fidelity and feature-fidelity scores without losing task performance.\n","authors":["Nam Le","Honglei Zhang","Francesco Cricri","Ramin G. Youvalari","Hamed Rezazadegan Tavakoli","Emre Aksu","Miska M. Hannuksela","Esa Rahtu"],"pdf_url":"https://arxiv.org/pdf/2401.10732v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10731v1","updated":"2024-01-19T14:49:42Z","published":"2024-01-19T14:49:42Z","title":"Removal and Selection: Improving RGB-Infrared Object Detection via\n Coarse-to-Fine Fusion","summary":" Object detection in visible (RGB) and infrared (IR) images has been widely\napplied in recent years. Leveraging the complementary characteristics of RGB\nand IR images, the object detector provides reliable and robust object\nlocalization from day to night. Existing fusion strategies directly inject RGB\nand IR images into convolution neural networks, leading to inferior detection\nperformance. Since the RGB and IR features have modality-specific noise, these\nstrategies will worsen the fused features along with the propagation. Inspired\nby the mechanism of human brain processing multimodal information, this work\nintroduces a new coarse-to-fine perspective to purify and fuse two modality\nfeatures. Specifically, following this perspective, we design a Redundant\nSpectrum Removal module to coarsely remove interfering information within each\nmodality and a Dynamic Feature Selection module to finely select the desired\nfeatures for feature fusion. To verify the effectiveness of the coarse-to-fine\nfusion strategy, we construct a new object detector called Removal and\nSelection Detector (RSDet). Extensive experiments on three RGB-IR object\ndetection datasets verify the superior performance of our method.\n","authors":["Tianyi Zhao","Maoxun Yuan","Xingxing Wei"],"pdf_url":"https://arxiv.org/pdf/2401.10731v1.pdf","comment":"9pages, 7figures"},{"id":"http://arxiv.org/abs/2401.10727v1","updated":"2024-01-19T14:44:37Z","published":"2024-01-19T14:44:37Z","title":"Tool-LMM: A Large Multi-Modal Model for Tool Agent Learning","summary":" Recently, the astonishing performance of large language models (LLMs) in\nnatural language comprehension and generation tasks triggered lots of\nexploration of using them as central controllers to build agent systems.\nMultiple studies focus on bridging the LLMs to external tools to extend the\napplication scenarios. However, the current LLMs' perceiving tool-use ability\nis limited to a single text query, which may result in ambiguity in\nunderstanding the users' real intentions. 
LLMs are expected to eliminate that\nby perceiving the visual- or auditory-grounded instructions' information.\nTherefore, in this paper, we propose Tool-LMM, a system incorporating\nopen-source LLMs and multi-modal encoders so that the learnt LLMs can be\nconscious of multi-modal input instruction and then select the function-matched\ntool correctly. To facilitate the evaluation of the model's capability, we\ncollect a dataset featured by consisting of multi-modal input tools from\nHuggingFace. Another important feature of our dataset is that our dataset also\ncontains multiple potential choices for the same instruction due to the\nexistence of identical functions and synonymous functions, which provides more\npotential solutions for the same query. The experiments reveal that our LMM is\ncapable of recommending appropriate tools for multi-modal instructions. Codes\nand data are available at https://github.com/Tool-LMM/Tool-LMM.\n","authors":["Chenyu Wang","Weixin Luo","Qianyu Chen","Haonan Mai","Jindi Guo","Sixun Dong"," Xiaohua"," Xuan","Zhengxin Li","Lin Ma","Shenghua Gao"],"pdf_url":"https://arxiv.org/pdf/2401.10727v1.pdf","comment":"21 pages, 9 figures, 10 tables"},{"id":"http://arxiv.org/abs/2103.10702v4","updated":"2024-01-19T14:43:57Z","published":"2021-03-19T09:31:08Z","title":"ClawCraneNet: Leveraging Object-level Relation for Text-based Video\n Segmentation","summary":" Text-based video segmentation is a challenging task that segments out the\nnatural language referred objects in videos. It essentially requires semantic\ncomprehension and fine-grained video understanding. Existing methods introduce\nlanguage representation into segmentation models in a bottom-up manner, which\nmerely conducts vision-language interaction within local receptive fields of\nConvNets. We argue that such interaction is not fulfilled since the model can\nbarely construct region-level relationships given partial observations, which\nis contrary to the description logic of natural language/referring expressions.\nIn fact, people usually describe a target object using relations with other\nobjects, which may not be easily understood without seeing the whole video. To\naddress the issue, we introduce a novel top-down approach by imitating how we\nhuman segment an object with the language guidance. We first figure out all\ncandidate objects in videos and then choose the refereed one by parsing\nrelations among those high-level objects. Three kinds of object-level relations\nare investigated for precise relationship understanding, i.e., positional\nrelation, text-guided semantic relation, and temporal relation. Extensive\nexperiments on A2D Sentences and J-HMDB Sentences show our method outperforms\nstate-of-the-art methods by a large margin. 
Qualitative results also show our\nresults are more explainable.\n","authors":["Chen Liang","Yu Wu","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2103.10702v4.pdf","comment":"Extended version published in\n https://ieeexplore.ieee.org/abstract/document/10083244"},{"id":"http://arxiv.org/abs/2401.10712v1","updated":"2024-01-19T14:22:29Z","published":"2024-01-19T14:22:29Z","title":"Q&A Prompts: Discovering Rich Visual Clues through Mining\n Question-Answer Prompts for VQA requiring Diverse World Knowledge","summary":" With the breakthrough of multi-modal large language models, answering complex\nvisual questions that demand advanced reasoning abilities and world knowledge\nhas become a much more important testbed for developing AI models than ever.\nHowever, equipping AI models with robust cross-modality reasoning ability\nremains challenging since the cognition scheme of humans has not been\nunderstood systematically. In this paper, we believe that if we can collect\nvisual clues in the given image as much as possible, we will recognize the\nimage more accurately, understand the question better, recall relevant\nknowledge more easily, and finally reason out the answer. We discover these\nrich visual clues by mining question-answer pairs in images and sending them\ninto multi-modal large language models as prompts. We call the proposed method\nQ&A Prompts. Specifically, we first use the image-answer pairs and the\ncorresponding questions in the training set as inputs and outputs to train a\nvisual question generation model. Then, we use an image tagging model to\nidentify various instances and send packaged image-tag pairs into the visual\nquestion generation model to generate relevant questions with the extracted\nimage tags as answers. Finally, we encode these generated question-answer pairs\nas prompts with a visual-aware prompting module and send them into pre-trained\nmulti-modal large language models to reason out the final answers. Experimental\nresults show that, compared with state-of-the-art methods, our Q&A Prompts\nachieves substantial improvements on the challenging visual question answering\ndatasets requiring reasoning over diverse world knowledge, such as OK-VQA and\nA-OKVQA.\n","authors":["Haibi Wang","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10712v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10711v1","updated":"2024-01-19T14:21:46Z","published":"2024-01-19T14:21:46Z","title":"Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal\n Models for Video Question Answering","summary":" Video Question Answering (VideoQA) aims to answer natural language questions\nbased on the information observed in videos. Despite the recent success of\nLarge Multimodal Models (LMMs) in image-language understanding and reasoning,\nthey deal with VideoQA insufficiently by simply taking uniformly sampled frames\nas visual inputs, which ignores question-relevant visual clues. Moreover, there\nare no human annotations for question-critical timestamps in existing VideoQA\ndatasets. In light of this, we propose a novel weakly supervised framework to\nenforce the LMMs to reason out the answers with question-critical moments as\nvisual inputs. Specifically, we fuse the question and answer pairs as event\ndescriptions to find multiple keyframes as target moments, which will be\npseudo-labels. With these pseudo-labels as additionally weak supervision, we\ndevise a lightweight Gaussian-based Contrastive Grounding (GCG) module. 
GCG\nlearns multiple Gaussian functions to characterize the temporal structure of\nthe video, and sample question-critical frames as positive moments to be the\nvisual inputs of LMMs. Extensive experiments on several VideoQA benchmarks\nverify the effectiveness of our framework, and we achieve substantial\nimprovements compared to previous state-of-the-art methods.\n","authors":["Haibo Wang","Chenghang Lai","Yixuan Sun","Weifeng Ge"],"pdf_url":"https://arxiv.org/pdf/2401.10711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10709v1","updated":"2024-01-19T14:14:26Z","published":"2024-01-19T14:14:26Z","title":"Dense 3D Reconstruction Through Lidar: A Comparative Study on Ex-vivo\n Porcine Tissue","summary":" New sensing technologies and more advanced processing algorithms are\ntransforming computer-integrated surgery. While researchers are actively\ninvestigating depth sensing and 3D reconstruction for vision-based surgical\nassistance, it remains difficult to achieve real-time, accurate, and robust 3D\nrepresentations of the abdominal cavity for minimally invasive surgery. Thus,\nthis work uses quantitative testing on fresh ex-vivo porcine tissue to\nthoroughly characterize the quality with which a 3D laser-based time-of-flight\nsensor (lidar) can perform anatomical surface reconstruction. Ground-truth\nsurface shapes are captured with a commercial laser scanner, and the resulting\nsigned error fields are analyzed using rigorous statistical tools. When\ncompared to modern learning-based stereo matching from endoscopic images,\ntime-of-flight sensing demonstrates higher precision, lower processing delay,\nhigher frame rate, and superior robustness against sensor distance and poor\nillumination. Furthermore, we report on the potential negative effect of\nnear-infrared light penetration on the accuracy of lidar measurements across\ndifferent tissue samples, identifying a significant measured depth offset for\nmuscle in contrast to fat and liver. Our findings highlight the potential of\nlidar for intraoperative 3D perception and point toward new methods that\ncombine complementary time-of-flight and spectral imaging.\n","authors":["Guido Caccianiga","Julian Nubert","Marco Hutter","Katherine J. Kuchenbecker"],"pdf_url":"https://arxiv.org/pdf/2401.10709v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.11795v2","updated":"2024-01-19T14:08:38Z","published":"2022-10-21T08:18:49Z","title":"PoseScript: Linking 3D Human Poses and Natural Language","summary":" Natural language plays a critical role in many computer vision applications,\nsuch as image captioning, visual question answering, and cross-modal retrieval,\nto provide fine-grained semantic information. Unfortunately, while human pose\nis key to human understanding, current 3D human pose datasets lack detailed\nlanguage descriptions. To address this issue, we have introduced the PoseScript\ndataset. This dataset pairs more than six thousand 3D human poses from AMASS\nwith rich human-annotated descriptions of the body parts and their spatial\nrelationships. Additionally, to increase the size of the dataset to a scale\nthat is compatible with data-hungry learning algorithms, we have proposed an\nelaborate captioning process that generates automatic synthetic descriptions in\nnatural language from given 3D keypoints. This process extracts low-level pose\ninformation, known as \"posecodes\", using a set of simple but generic rules on\nthe 3D keypoints. 
These posecodes are then combined into higher level textual\ndescriptions using syntactic rules. With automatic annotations, the amount of\navailable data significantly scales up (100k), making it possible to\neffectively pretrain deep models for finetuning on human captions. To showcase\nthe potential of annotated poses, we present three multi-modal learning tasks\nthat utilize the PoseScript dataset. Firstly, we develop a pipeline that maps\n3D poses and textual descriptions into a joint embedding space, allowing for\ncross-modal retrieval of relevant poses from large-scale datasets. Secondly, we\nestablish a baseline for a text-conditioned model generating 3D poses. Thirdly,\nwe present a learned process for generating pose descriptions. These\napplications demonstrate the versatility and usefulness of annotated poses in\nvarious tasks and pave the way for future research in the field.\n","authors":["Ginger Delmas","Philippe Weinzaepfel","Thomas Lucas","Francesc Moreno-Noguer","Grégory Rogez"],"pdf_url":"https://arxiv.org/pdf/2210.11795v2.pdf","comment":"Extended version of the ECCV 2022 paper"},{"id":"http://arxiv.org/abs/2106.01061v2","updated":"2024-01-19T13:44:46Z","published":"2021-06-02T10:26:13Z","title":"Rethinking Cross-modal Interaction from a Top-down Perspective for\n Referring Video Object Segmentation","summary":" Referring video object segmentation (RVOS) aims to segment video objects with\nthe guidance of natural language reference. Previous methods typically tackle\nRVOS through directly grounding linguistic reference over the image lattice.\nSuch bottom-up strategy fails to explore object-level cues, easily leading to\ninferior results. In this work, we instead put forward a two-stage, top-down\nRVOS solution. First, an exhaustive set of object tracklets is constructed by\npropagating object masks detected from several sampled frames to the entire\nvideo. Second, a Transformer-based tracklet-language grounding module is\nproposed, which models instance-level visual relations and cross-modal\ninteractions simultaneously and efficiently. Our model ranks first place on\nCVPR2021 Referring Youtube-VOS challenge.\n","authors":["Chen Liang","Yu Wu","Tianfei Zhou","Wenguan Wang","Zongxin Yang","Yunchao Wei","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2106.01061v2.pdf","comment":"Champion solution in YouTube-VOS 2021 Track 3. Extended version\n published in https://ieeexplore.ieee.org/abstract/document/10083244"},{"id":"http://arxiv.org/abs/2301.13359v4","updated":"2024-01-19T13:25:03Z","published":"2023-01-31T01:24:45Z","title":"IM-IAD: Industrial Image Anomaly Detection Benchmark in Manufacturing","summary":" Image anomaly detection (IAD) is an emerging and vital computer vision task\nin industrial manufacturing (IM). Recently, many advanced algorithms have been\nreported, but their performance deviates considerably with various IM settings.\nWe realize that the lack of a uniform IM benchmark is hindering the development\nand usage of IAD methods in real-world applications. In addition, it is\ndifficult for researchers to analyze IAD algorithms without a uniform\nbenchmark. To solve this problem, we propose a uniform IM benchmark, for the\nfirst time, to assess how well these algorithms perform, which includes various\nlevels of supervision (unsupervised versus fully supervised), learning\nparadigms (few-shot, continual and noisy label), and efficiency (memory usage\nand inference speed). 
Then, we construct a comprehensive image anomaly\ndetection benchmark (IM-IAD), which includes 19 algorithms on seven major\ndatasets with a uniform setting. Extensive experiments (17,017 total) on IM-IAD\nprovide in-depth insights into IAD algorithm redesign or selection. Moreover,\nthe proposed IM-IAD benchmark challenges existing algorithms and suggests\nfuture research directions. To foster reproducibility and accessibility, the\nsource code of IM-IAD is uploaded on the website,\nhttps://github.com/M-3LAB/IM-IAD.\n","authors":["Guoyang Xie","Jinbao Wang","Jiaqi Liu","Jiayi Lyu","Yong Liu","Chengjie Wang","Feng Zheng","Yaochu Jin"],"pdf_url":"https://arxiv.org/pdf/2301.13359v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13310v2","updated":"2024-01-19T13:03:04Z","published":"2023-05-22T17:59:43Z","title":"Matcher: Segment Anything with One Shot Using All-Purpose Feature\n Matching","summary":" Powered by large-scale pre-training, vision foundation models exhibit\nsignificant potential in open-world image understanding. However, unlike large\nlanguage models that excel at directly tackling various language tasks, vision\nfoundation models require a task-specific model structure followed by\nfine-tuning on specific tasks. In this work, we present Matcher, a novel\nperception paradigm that utilizes off-the-shelf vision foundation models to\naddress various perception tasks. Matcher can segment anything by using an\nin-context example without training. Additionally, we design three effective\ncomponents within the Matcher framework to collaborate with these foundation\nmodels and unleash their full potential in diverse perception tasks. Matcher\ndemonstrates impressive generalization performance across various segmentation\ntasks, all without training. For example, it achieves 52.7% mIoU on COCO-20$^i$\nwith one example, surpassing the state-of-the-art specialist model by 1.6%. In\naddition, Matcher achieves 33.0% mIoU on the proposed LVIS-92$^i$ for one-shot\nsemantic segmentation, outperforming the state-of-the-art generalist model by\n14.4%. Our visualization results further showcase the open-world generality and\nflexibility of Matcher when applied to images in the wild. Our code can be\nfound at https://github.com/aim-uofa/Matcher.\n","authors":["Yang Liu","Muzhi Zhu","Hengtao Li","Hao Chen","Xinlong Wang","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2305.13310v2.pdf","comment":"Accepted to ICLR2024"},{"id":"http://arxiv.org/abs/2203.09773v2","updated":"2024-01-19T13:01:44Z","published":"2022-03-18T07:35:26Z","title":"Local-Global Context Aware Transformer for Language-Guided Video\n Segmentation","summary":" We explore the task of language-guided video segmentation (LVS). Previous\nalgorithms mostly adopt 3D CNNs to learn video representation, struggling to\ncapture long-term context and easily suffering from visual-linguistic\nmisalignment. In light of this, we present Locater (local-global context aware\nTransformer), which augments the Transformer architecture with a finite memory\nso as to query the entire video with the language expression in an efficient\nmanner. The memory is designed to involve two components -- one for\npersistently preserving global video content, and one for dynamically gathering\nlocal temporal context and segmentation history. Based on the memorized\nlocal-global context and the particular content of each frame, Locater\nholistically and flexibly comprehends the expression as an adaptive query\nvector for each frame. 
The vector is used to query the corresponding frame for\nmask generation. The memory also allows Locater to process videos with linear\ntime complexity and constant size memory, while Transformer-style\nself-attention computation scales quadratically with sequence length. To\nthoroughly examine the visual grounding capability of LVS models, we contribute\na new LVS dataset, A2D-S+, which is built upon A2D-S dataset but poses\nincreased challenges in disambiguating among similar objects. Experiments on\nthree LVS datasets and our A2D-S+ show that Locater outperforms previous\nstate-of-the-arts. Further, we won the 1st place in the Referring Video Object\nSegmentation Track of the 3rd Large-scale Video Object Segmentation Challenge,\nwhere Locater served as the foundation for the winning solution. Our code and\ndataset are available at: https://github.com/leonnnop/Locater\n","authors":["Chen Liang","Wenguan Wang","Tianfei Zhou","Jiaxu Miao","Yawei Luo","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2203.09773v2.pdf","comment":"Accepted by TPAMI. Code, data: https://github.com/leonnnop/Locater"},{"id":"http://arxiv.org/abs/2401.10666v1","updated":"2024-01-19T12:40:54Z","published":"2024-01-19T12:40:54Z","title":"MixNet: Towards Effective and Efficient UHD Low-Light Image Enhancement","summary":" With the continuous advancement of imaging devices, the prevalence of\nUltra-High-Definition (UHD) images is rising. Although many image restoration\nmethods have achieved promising results, they are not directly applicable to\nUHD images on devices with limited computational resources due to the\ninherently high computational complexity of UHD images. In this paper, we focus\non the task of low-light image enhancement (LLIE) and propose a novel LLIE\nmethod called MixNet, which is designed explicitly for UHD images. To capture\nthe long-range dependency of features without introducing excessive\ncomputational complexity, we present the Global Feature Modulation Layer\n(GFML). GFML associates features from different views by permuting the feature\nmaps, enabling efficient modeling of long-range dependency. In addition, we\nalso design the Local Feature Modulation Layer (LFML) and Feed-forward Layer\n(FFL) to capture local features and transform features into a compact\nrepresentation. This way, our MixNet achieves effective LLIE with few model\nparameters and low computational complexity. We conducted extensive experiments\non both synthetic and real-world datasets, and the comprehensive results\ndemonstrate that our proposed method surpasses the performance of current\nstate-of-the-art methods. The code will be available at\n\\url{https://github.com/zzr-idam/MixNet}.\n","authors":["Chen Wu","Zhuoran Zheng","Xiuyi Jia","Wenqi Ren"],"pdf_url":"https://arxiv.org/pdf/2401.10666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15420v2","updated":"2024-01-19T12:34:42Z","published":"2023-11-26T21:04:28Z","title":"Data-Driven Modelling for Harmonic Current Emission in Low-Voltage Grid\n Using MCReSANet with Interpretability Analysis","summary":" Even though the use of power electronics PE loads offers enhanced electrical\nenergy conversion efficiency and control, they remain the primary sources of\nharmonics in grids. When diverse loads are connected in the distribution\nsystem, their interactions complicate establishing analytical models for the\nrelationship between harmonic voltages and currents. 
To solve this, our paper\npresents a data-driven model using MCReSANet to construct the highly nonlinear\nbetween harmonic voltage and current. Two datasets from PCCs in Finland and\nGermany are utilized, which demonstrates that MCReSANet is capable of\nestablishing accurate nonlinear mappings, even in the presence of various\nnetwork characteristics for selected Finland and Germany datasets. The model\nbuilt by MCReSANet can improve the MAE by 10% and 14% compared to the CNN, and\nby 8% and 17% compared to the MLP for both Finnish and German datasets, also\nshowing much lower model uncertainty than others. This is a crucial\nprerequisite for more precise SHAP value-based feature importance analysis,\nwhich is a method for the model interpretability analysis in this paper. The\nresults by feature importance analysis show the detailed relationships between\neach order of harmonic voltage and current in the distribution system. There is\nan interactive impact on each order of harmonic current, but some orders of\nharmonic voltages have a dominant influence on harmonic current emissions:\npositive sequence and zero sequence harmonics have the dominant importance in\nthe Finnish and German networks, respectively, which conforms to the pattern of\nconnected load types in two selected Finnish and German datasets. This paper\nenhances the potential for understanding and predicting harmonic current\nemissions by diverse PE loads in distribution systems, which is beneficial to\nmore effective management for optimizing power quality in diverse grid\nenvironments.\n","authors":["Jieyu Yao","Hao Yu","Paul Judge","Jiabin Jia","Sasa Djokic","Verner Püvi","Matti Lehtonen","Jan Meyer"],"pdf_url":"https://arxiv.org/pdf/2311.15420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16516v2","updated":"2024-01-19T12:29:47Z","published":"2023-12-27T10:49:19Z","title":"ConstScene: Dataset and Model for Advancing Robust Semantic Segmentation\n in Construction Environments","summary":" The increasing demand for autonomous machines in construction environments\nnecessitates the development of robust object detection algorithms that can\nperform effectively across various weather and environmental conditions. This\npaper introduces a new semantic segmentation dataset specifically tailored for\nconstruction sites, taking into account the diverse challenges posed by adverse\nweather and environmental conditions. The dataset is designed to enhance the\ntraining and evaluation of object detection models, fostering their\nadaptability and reliability in real-world construction applications. Our\ndataset comprises annotated images captured under a wide range of different\nweather conditions, including but not limited to sunny days, rainy periods,\nfoggy atmospheres, and low-light situations. Additionally, environmental\nfactors such as the existence of dirt/mud on the camera lens are integrated\ninto the dataset through actual captures and synthetic generation to simulate\nthe complex conditions prevalent in construction sites. We also generate\nsynthetic images of the annotations including precise semantic segmentation\nmasks for various objects commonly found in construction environments, such as\nwheel loader machines, personnel, cars, and structural elements. To demonstrate\nthe dataset's utility, we evaluate state-of-the-art object detection algorithms\non our proposed benchmark. 
The results highlight the dataset's success in\nadversarial training models across diverse conditions, showcasing its efficacy\ncompared to existing datasets that lack such environmental variability.\n","authors":["Maghsood Salimi","Mohammad Loni","Sara Afshar","Antonio Cicchetti","Marjan Sirjani"],"pdf_url":"https://arxiv.org/pdf/2312.16516v2.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2401.10659v1","updated":"2024-01-19T12:26:51Z","published":"2024-01-19T12:26:51Z","title":"BadODD: Bangladeshi Autonomous Driving Object Detection Dataset","summary":" We propose a comprehensive dataset for object detection in diverse driving\nenvironments across 9 districts in Bangladesh. The dataset, collected\nexclusively from smartphone cameras, provided a realistic representation of\nreal-world scenarios, including day and night conditions. Most existing\ndatasets lack suitable classes for autonomous navigation on Bangladeshi roads,\nmaking it challenging for researchers to develop models that can handle the\nintricacies of road scenarios. To address this issue, the authors proposed a\nnew set of classes based on characteristics rather than local vehicle names.\nThe dataset aims to encourage the development of models that can handle the\nunique challenges of Bangladeshi road scenarios for the effective deployment of\nautonomous vehicles. The dataset did not consist of any online images to\nsimulate real-world conditions faced by autonomous vehicles. The classification\nof vehicles is challenging because of the diverse range of vehicles on\nBangladeshi roads, including those not found elsewhere in the world. The\nproposed classification system is scalable and can accommodate future vehicles,\nmaking it a valuable resource for researchers in the autonomous vehicle sector.\n","authors":["Mirza Nihal Baig","Rony Hajong","Mahdi Murshed Patwary","Mohammad Shahidur Rahman","Husne Ara Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2401.10659v1.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2312.08010v2","updated":"2024-01-19T12:19:48Z","published":"2023-12-13T09:33:08Z","title":"EZ-CLIP: Efficient Zeroshot Video Action Recognition","summary":" Recent advancements in large-scale pre-training of visual-language models on\npaired image-text data have demonstrated impressive generalization capabilities\nfor zero-shot tasks. Building on this success, efforts have been made to adapt\nthese image-based visual-language models, such as CLIP, for videos extending\ntheir zero-shot capabilities to the video domain. While these adaptations have\nshown promising results, they come at a significant computational cost and\nstruggle with effectively modeling the crucial temporal aspects inherent to the\nvideo domain. In this study, we present EZ-CLIP, a simple and efficient\nadaptation of CLIP that addresses these challenges. EZ-CLIP leverages temporal\nvisual prompting for seamless temporal adaptation, requiring no fundamental\nalterations to the core CLIP architecture while preserving its remarkable\ngeneralization abilities. Moreover, we introduce a novel learning objective\nthat guides the temporal visual prompts to focus on capturing motion, thereby\nenhancing its learning capabilities from video data. 
We conducted extensive\nexperiments on five different benchmark datasets, thoroughly evaluating EZ-CLIP\nfor zero-shot learning and base-to-novel video action recognition, and also\ndemonstrating its potential for few-shot generalization.Impressively, with a\nmere 5.2 million learnable parameters (as opposed to the 71.1 million in the\nprior best model), EZ-CLIP can be efficiently trained on a single GPU,\noutperforming existing approaches in several evaluations.\n","authors":["Shahzad Ahmad","Sukalpa Chanda","Yogesh S Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07823v4","updated":"2024-01-19T12:18:28Z","published":"2023-12-13T01:16:50Z","title":"Semantic Lens: Instance-Centric Semantic Alignment for Video\n Super-Resolution","summary":" As a critical clue of video super-resolution (VSR), inter-frame alignment\nsignificantly impacts overall performance. However, accurate pixel-level\nalignment is a challenging task due to the intricate motion interweaving in the\nvideo. In response to this issue, we introduce a novel paradigm for VSR named\nSemantic Lens, predicated on semantic priors drawn from degraded videos.\nSpecifically, video is modeled as instances, events, and scenes via a Semantic\nExtractor. Those semantics assist the Pixel Enhancer in understanding the\nrecovered contents and generating more realistic visual results. The distilled\nglobal semantics embody the scene information of each frame, while the\ninstance-specific semantics assemble the spatial-temporal contexts related to\neach instance. Furthermore, we devise a Semantics-Powered Attention\nCross-Embedding (SPACE) block to bridge the pixel-level features with semantic\nknowledge, composed of a Global Perspective Shifter (GPS) and an\nInstance-Specific Semantic Embedding Encoder (ISEE). Concretely, the GPS module\ngenerates pairs of affine transformation parameters for pixel-level feature\nmodulation conditioned on global semantics. After that, the ISEE module\nharnesses the attention mechanism to align the adjacent frames in the\ninstance-centric semantic space. In addition, we incorporate a simple yet\neffective pre-alignment module to alleviate the difficulty of model training.\nExtensive experiments demonstrate the superiority of our model over existing\nstate-of-the-art VSR methods.\n","authors":["Qi Tang","Yao Zhao","Meiqin Liu","Jian Jin","Chao Yao"],"pdf_url":"https://arxiv.org/pdf/2312.07823v4.pdf","comment":"Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.10643v1","updated":"2024-01-19T11:45:10Z","published":"2024-01-19T11:45:10Z","title":"A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification:\n Models, Data Sets and Challenges","summary":" Vehicle re-identification (ReID) endeavors to associate vehicle images\ncollected from a distributed network of cameras spanning diverse traffic\nenvironments. This task assumes paramount importance within the spectrum of\nvehicle-centric technologies, playing a pivotal role in deploying Intelligent\nTransportation Systems (ITS) and advancing smart city initiatives. Rapid\nadvancements in deep learning have significantly propelled the evolution of\nvehicle ReID technologies in recent years. Consequently, undertaking a\ncomprehensive survey of methodologies centered on deep learning for vehicle\nre-identification has become imperative and inescapable. This paper extensively\nexplores deep learning techniques applied to vehicle ReID. 
It outlines the\ncategorization of these methods, encompassing supervised and unsupervised\napproaches, delves into existing research within these categories, introduces\ndatasets and evaluation criteria, and delineates forthcoming challenges and\npotential research directions. This comprehensive assessment examines the\nlandscape of deep learning in vehicle ReID and establishes a foundation and\nstarting point for future works. It aims to serve as a complete reference by\nhighlighting challenges and emerging trends, fostering advancements and\napplications in vehicle ReID utilizing deep learning models.\n","authors":["Ali Amiri","Aydin Kaya","Ali Seydi Keceli"],"pdf_url":"https://arxiv.org/pdf/2401.10643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10640v1","updated":"2024-01-19T11:35:52Z","published":"2024-01-19T11:35:52Z","title":"A comprehensive study on fidelity metrics for XAI","summary":" The use of eXplainable Artificial Intelligence (XAI) systems has introduced a\nset of challenges that need resolution. Herein, we focus on how to correctly\nselect an XAI method, an open questions within the field. The inherent\ndifficulty of this task is due to the lack of a ground truth. Several authors\nhave proposed metrics to approximate the fidelity of different XAI methods.\nThese metrics lack verification and have concerning disagreements. In this\nstudy, we proposed a novel methodology to verify fidelity metrics, using a\nwell-known transparent model, namely a decision tree. This model allowed us to\nobtain explanations with perfect fidelity. Our proposal constitutes the first\nobjective benchmark for these metrics, facilitating a comparison of existing\nproposals, and surpassing existing methods. We applied our benchmark to assess\nthe existing fidelity metrics in two different experiments, each using public\ndatasets comprising 52,000 images. The images from these datasets had a size a\n128 by 128 pixels and were synthetic data that simplified the training process.\nAll metric values, indicated a lack of fidelity, with the best one showing a 30\n\\% deviation from the expected values for perfect explanation. Our\nexperimentation led us to conclude that the current fidelity metrics are not\nreliable enough to be used in real scenarios. From this finding, we deemed it\nnecessary to development new metrics, to avoid the detected problems, and we\nrecommend the usage of our proposal as a benchmark within the scientific\ncommunity to address these limitations.\n","authors":["Miquel Miró-Nicolau","Antoni Jaume-i-Capó","Gabriel Moyà-Alcover"],"pdf_url":"https://arxiv.org/pdf/2401.10640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10637v1","updated":"2024-01-19T11:35:07Z","published":"2024-01-19T11:35:07Z","title":"Towards Universal Unsupervised Anomaly Detection in Medical Imaging","summary":" The increasing complexity of medical imaging data underscores the need for\nadvanced anomaly detection methods to automatically identify diverse\npathologies. Current methods face challenges in capturing the broad spectrum of\nanomalies, often limiting their use to specific lesion types in brain scans. To\naddress this challenge, we introduce a novel unsupervised approach, termed\n\\textit{Reversed Auto-Encoders (RA)}, designed to create realistic\npseudo-healthy reconstructions that enable the detection of a wider range of\npathologies. 
We evaluate the proposed method across various imaging modalities,\nincluding magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray,\nand chest X-ray, and demonstrate superior performance in detecting anomalies\ncompared to existing state-of-the-art methods. Our unsupervised anomaly\ndetection approach may enhance diagnostic accuracy in medical imaging by\nidentifying a broader range of unknown pathologies. Our code is publicly\navailable at: \\url{https://github.com/ci-ber/RA}.\n","authors":["Cosmin I. Bercea","Benedikt Wiestler","Daniel Rueckert","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2401.10637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10620v1","updated":"2024-01-19T10:52:57Z","published":"2024-01-19T10:52:57Z","title":"Polytopic Autoencoders with Smooth Clustering for Reduced-order\n Modelling of Flows","summary":" With the advancement of neural networks, there has been a notable increase,\nboth in terms of quantity and variety, in research publications concerning the\napplication of autoencoders to reduced-order models. We propose a polytopic\nautoencoder architecture that includes a lightweight nonlinear encoder, a\nconvex combination decoder, and a smooth clustering network. Supported by\nseveral proofs, the model architecture ensures that all reconstructed states\nlie within a polytope, accompanied by a metric indicating the quality of the\nconstructed polytopes, referred to as polytope error. Additionally, it offers a\nminimal number of convex coordinates for polytopic linear-parameter varying\nsystems while achieving acceptable reconstruction errors compared to proper\northogonal decomposition (POD). To validate our proposed model, we conduct\nsimulations involving two flow scenarios with the incompressible Navier-Stokes\nequation. Numerical results demonstrate the guaranteed properties of the model,\nlow reconstruction errors compared to POD, and the improvement in error using a\nclustering network.\n","authors":["Jan Heiland","Yongho Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10620v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2401.10608v1","updated":"2024-01-19T10:37:27Z","published":"2024-01-19T10:37:27Z","title":"M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics\n Prediction from Histopathology Images","summary":" The advancement of Spatial Transcriptomics (ST) has facilitated the\nspatially-aware profiling of gene expressions based on histopathology images.\nAlthough ST data offers valuable insights into the micro-environment of tumors,\nits acquisition cost remains expensive. Therefore, directly predicting the ST\nexpressions from digital pathology images is desired. Current methods usually\nadopt existing regression backbones for this task, which ignore the inherent\nmulti-scale hierarchical data structure of digital pathology images. To address\nthis limit, we propose M2ORT, a many-to-one regression Transformer that can\naccommodate the hierarchical structure of the pathology images through a\ndecoupled multi-scale feature extractor. Different from traditional models that\nare trained with one-to-one image-label pairs, M2ORT accepts multiple pathology\nimages of different magnifications at a time to jointly predict the gene\nexpressions at their corresponding common ST spot, aiming at learning a\nmany-to-one relationship through training. 
We have tested M2ORT on three public\nST datasets and the experimental results show that M2ORT can achieve\nstate-of-the-art performance with fewer parameters and floating-point\noperations (FLOPs). The code is available at:\nhttps://github.com/Dootmaan/M2ORT/.\n","authors":["Hongyi Wang","Xiuju Du","Jing Liu","Shuyi Ouyang","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10191v2","updated":"2024-01-19T10:01:36Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v2.pdf","comment":"Accepted for ICLR 2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2401.10588v1","updated":"2024-01-19T09:58:06Z","published":"2024-01-19T09:58:06Z","title":"DGL: Dynamic Global-Local Prompt Tuning for Text-Video Retrieval","summary":" Text-video retrieval is a critical multi-modal task to find the most relevant\nvideo for a text query. Although pretrained models like CLIP have demonstrated\nimpressive potential in this area, the rising cost of fully finetuning these\nmodels due to increasing model size continues to pose a problem. To address\nthis challenge, prompt tuning has emerged as an alternative. However, existing\nworks still face two problems when adapting pretrained image-text models to\ndownstream video-text tasks: (1) The visual encoder could only encode\nframe-level features and failed to extract global-level general video\ninformation. (2) Equipping the visual and text encoder with separated prompts\nfailed to mitigate the visual-text modality gap. To this end, we propose DGL, a\ncross-modal Dynamic prompt tuning method with Global-Local video attention. In\ncontrast to previous prompt tuning methods, we employ the shared latent space\nto generate local-level text and frame prompts that encourage inter-modal\ninteraction. Furthermore, we propose modeling video in a global-local attention\nmechanism to capture global video information from the perspective of prompt\ntuning. 
Extensive experiments reveal that when only 0.67% parameters are tuned,\nour cross-modal prompt tuning strategy DGL outperforms or is comparable to\nfully finetuning methods on MSR-VTT, VATEX, LSMDC, and ActivityNet datasets.\nCode will be available at https://github.com/knightyxp/DGL\n","authors":["Xiangpeng Yang","Linchao Zhu","Xiaohan Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10588v1.pdf","comment":"AAAI2024, Code will be available at https://github.com/knightyxp/DGL"},{"id":"http://arxiv.org/abs/2303.06088v6","updated":"2024-01-19T09:45:02Z","published":"2023-03-10T17:09:04Z","title":"Towards domain-invariant Self-Supervised Learning with Batch Styles\n Standardization","summary":" In Self-Supervised Learning (SSL), models are typically pretrained,\nfine-tuned, and evaluated on the same domains. However, they tend to perform\npoorly when evaluated on unseen domains, a challenge that Unsupervised Domain\nGeneralization (UDG) seeks to address. Current UDG methods rely on domain\nlabels, which are often challenging to collect, and domain-specific\narchitectures that lack scalability when confronted with numerous domains,\nmaking the current methodology impractical and rigid. Inspired by\ncontrastive-based UDG methods that mitigate spurious correlations by\nrestricting comparisons to examples from the same domain, we hypothesize that\neliminating style variability within a batch could provide a more convenient\nand flexible way to reduce spurious correlations without requiring domain\nlabels. To verify this hypothesis, we introduce Batch Styles Standardization\n(BSS), a relatively simple yet powerful Fourier-based method to standardize the\nstyle of images in a batch specifically designed for integration with SSL\nmethods to tackle UDG. Combining BSS with existing SSL methods offers serious\nadvantages over prior UDG methods: (1) It eliminates the need for domain labels\nor domain-specific network components to enhance domain-invariance in SSL\nrepresentations, and (2) offers flexibility as BSS can be seamlessly integrated\nwith diverse contrastive-based but also non-contrastive-based SSL methods.\nExperiments on several UDG datasets demonstrate that it significantly improves\ndownstream task performances on unseen domains, often outperforming or rivaling\nwith UDG methods. Finally, this work clarifies the underlying mechanisms\ncontributing to BSS's effectiveness in improving domain-invariance in SSL\nrepresentations and performances on unseen domain.\n","authors":["Marin Scalbert","Maria Vakalopoulou","Florent Couzinié-Devy"],"pdf_url":"https://arxiv.org/pdf/2303.06088v6.pdf","comment":"Accepted at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10578v1","updated":"2024-01-19T09:41:09Z","published":"2024-01-19T09:41:09Z","title":"3D Shape Completion on Unseen Categories:A Weakly-supervised Approach","summary":" 3D shapes captured by scanning devices are often incomplete due to occlusion.\n3D shape completion methods have been explored to tackle this limitation.\nHowever, most of these methods are only trained and tested on a subset of\ncategories, resulting in poor generalization to unseen categories. In this\npaper, we introduce a novel weakly-supervised framework to reconstruct the\ncomplete shapes from unseen categories. We first propose an end-to-end\nprior-assisted shape learning network that leverages data from the seen\ncategories to infer a coarse shape. Specifically, we construct a prior bank\nconsisting of representative shapes from the seen categories. 
Then, we design a\nmulti-scale pattern correlation module for learning the complete shape of the\ninput by analyzing the correlation between local patterns within the input and\nthe priors at various scales. In addition, we propose a self-supervised shape\nrefinement model to further refine the coarse shape. Considering the shape\nvariability of 3D objects across categories, we construct a category-specific\nprior bank to facilitate shape refinement. Then, we devise a voxel-based\npartial matching loss and leverage the partial scans to drive the refinement\nprocess. Extensive experimental results show that our approach is superior to\nstate-of-the-art methods by a large margin.\n","authors":["Lintai Wu","Junhui Hou","Linqi Song","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2401.10578v1.pdf","comment":"13 pages,8 figures"},{"id":"http://arxiv.org/abs/2401.10564v1","updated":"2024-01-19T09:01:20Z","published":"2024-01-19T09:01:20Z","title":"Dream360: Diverse and Immersive Outdoor Virtual Scene Creation via\n Transformer-Based 360 Image Outpainting","summary":" 360 images, with a field-of-view (FoV) of 180x360, provide immersive and\nrealistic environments for emerging virtual reality (VR) applications, such as\nvirtual tourism, where users desire to create diverse panoramic scenes from a\nnarrow FoV photo they take from a viewpoint via portable devices. It thus\nbrings us to a technical challenge: `How to allow the users to freely create\ndiverse and immersive virtual scenes from a narrow FoV image with a specified\nviewport?' To this end, we propose a transformer-based 360 image outpainting\nframework called Dream360, which can generate diverse, high-fidelity, and\nhigh-resolution panoramas from user-selected viewports, considering the\nspherical properties of 360 images. Compared with existing methods, e.g., [3],\nwhich primarily focus on inputs with rectangular masks and central locations\nwhile overlooking the spherical property of 360 images, our Dream360 offers\nhigher outpainting flexibility and fidelity based on the spherical\nrepresentation. Dream360 comprises two key learning stages: (I) codebook-based\npanorama outpainting via Spherical-VQGAN (S-VQGAN), and (II) frequency-aware\nrefinement with a novel frequency-aware consistency loss. Specifically, S-VQGAN\nlearns a sphere-specific codebook from spherical harmonic (SH) values,\nproviding a better representation of spherical data distribution for scene\nmodeling. The frequency-aware refinement matches the resolution and further\nimproves the semantic consistency and visual fidelity of the generated results.\nOur Dream360 achieves significantly lower Frechet Inception Distance (FID)\nscores and better visual fidelity than existing methods. 
We also conducted a\nuser study involving 15 participants to interactively evaluate the quality of\nthe generated results in VR, demonstrating the flexibility and superiority of\nour Dream360 framework.\n","authors":["Hao Ai","Zidong Cao","Haonan Lu","Chen Chen","Jian Ma","Pengyuan Zhou","Tae-Kyun Kim","Pan Hui","Lin Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10564v1.pdf","comment":"11 pages, accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.10561v1","updated":"2024-01-19T08:54:54Z","published":"2024-01-19T08:54:54Z","title":"MAEDiff: Masked Autoencoder-enhanced Diffusion Models for Unsupervised\n Anomaly Detection in Brain Images","summary":" Unsupervised anomaly detection has gained significant attention in the field\nof medical imaging due to its capability of relieving the costly pixel-level\nannotation. To achieve this, modern approaches usually utilize generative\nmodels to produce healthy references of the diseased images and then identify\nthe abnormalities by comparing the healthy references and the original diseased\nimages. Recently, diffusion models have exhibited promising potential for\nunsupervised anomaly detection in medical images for their good mode coverage\nand high sample quality. However, the intrinsic characteristics of the medical\nimages, e.g. the low contrast, and the intricate anatomical structure of the\nhuman body make the reconstruction challenging. Besides, the global information\nof medical images often remain underutilized. To address these two issues, we\npropose a novel Masked Autoencoder-enhanced Diffusion Model (MAEDiff) for\nunsupervised anomaly detection in brain images. The MAEDiff involves a\nhierarchical patch partition. It generates healthy images by overlapping\nupper-level patches and implements a mechanism based on the masked autoencoders\noperating on the sub-level patches to enhance the condition on the unnoised\nregions. Extensive experiments on data of tumors and multiple sclerosis lesions\ndemonstrate the effectiveness of our method.\n","authors":["Rui Xu","Yunke Wang","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2401.10561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10560v1","updated":"2024-01-19T08:52:24Z","published":"2024-01-19T08:52:24Z","title":"360ORB-SLAM: A Visual SLAM System for Panoramic Images with Depth\n Completion Network","summary":" To enhance the performance and effect of AR/VR applications and visual\nassistance and inspection systems, visual simultaneous localization and mapping\n(vSLAM) is a fundamental task in computer vision and robotics. However,\ntraditional vSLAM systems are limited by the camera's narrow field-of-view,\nresulting in challenges such as sparse feature distribution and lack of dense\ndepth information. To overcome these limitations, this paper proposes a\n360ORB-SLAM system for panoramic images that combines with a depth completion\nnetwork. The system extracts feature points from the panoramic image, utilizes\na panoramic triangulation module to generate sparse depth information, and\nemploys a depth completion network to obtain a dense panoramic depth map.\nExperimental results on our novel panoramic dataset constructed based on Carla\ndemonstrate that the proposed method achieves superior scale accuracy compared\nto existing monocular SLAM methods and effectively addresses the challenges of\nfeature association and scale ambiguity. 
The integration of the depth\ncompletion network enhances system stability and mitigates the impact of\ndynamic elements on SLAM performance.\n","authors":["Yichen Chen","Yiqi Pan","Ruyu Liu","Haoyu Zhang","Guodao Zhang","Bo Sun","Jianhua Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10560v1.pdf","comment":"6 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.02119v3","updated":"2024-01-19T08:50:28Z","published":"2023-09-05T10:52:21Z","title":"Hierarchical Masked 3D Diffusion Model for Video Outpainting","summary":" Video outpainting aims to adequately complete missing areas at the edges of\nvideo frames. Compared to image outpainting, it presents an additional\nchallenge as the model should maintain the temporal consistency of the filled\narea. In this paper, we introduce a masked 3D diffusion model for video\noutpainting. We use the technique of mask modeling to train the 3D diffusion\nmodel. This allows us to use multiple guide frames to connect the results of\nmultiple video clip inferences, thus ensuring temporal consistency and reducing\njitter between adjacent frames. Meanwhile, we extract the global frames of the\nvideo as prompts and guide the model to obtain information other than the\ncurrent video clip using cross-attention. We also introduce a hybrid\ncoarse-to-fine inference pipeline to alleviate the artifact accumulation\nproblem. The existing coarse-to-fine pipeline only uses the infilling strategy,\nwhich brings degradation because the time interval of the sparse frames is too\nlarge. Our pipeline benefits from bidirectional learning of the mask modeling\nand thus can employ a hybrid strategy of infilling and interpolation when\ngenerating sparse frames. Experiments show that our method achieves\nstate-of-the-art results in video outpainting tasks. More results and codes are\nprovided at our https://fanfanda.github.io/M3DDM/.\n","authors":["Fanda Fan","Chaoxu Guo","Litong Gong","Biao Wang","Tiezheng Ge","Yuning Jiang","Chunjie Luo","Jianfeng Zhan"],"pdf_url":"https://arxiv.org/pdf/2309.02119v3.pdf","comment":"Accepted to ACM MM 2023"},{"id":"http://arxiv.org/abs/2401.10556v1","updated":"2024-01-19T08:44:52Z","published":"2024-01-19T08:44:52Z","title":"Symbol as Points: Panoptic Symbol Spotting via Point-based\n Representation","summary":" This work studies the problem of panoptic symbol spotting, which is to spot\nand parse both countable object instances (windows, doors, tables, etc.) and\nuncountable stuff (wall, railing, etc.) from computer-aided design (CAD)\ndrawings. Existing methods typically involve either rasterizing the vector\ngraphics into images and using image-based methods for symbol spotting, or\ndirectly building graphs and using graph neural networks for symbol\nrecognition. In this paper, we take a different approach, which treats graphic\nprimitives as a set of 2D points that are locally connected and use point cloud\nsegmentation methods to tackle it. Specifically, we utilize a point transformer\nto extract the primitive features and append a mask2former-like spotting head\nto predict the final output. To better use the local connection information of\nprimitives and enhance their discriminability, we further propose the attention\nwith connection module (ACM) and contrastive connection learning scheme (CCL).\nFinally, we propose a KNN interpolation mechanism for the mask attention module\nof the spotting head to better handle primitive mask downsampling, which is\nprimitive-level in contrast to pixel-level for the image. 
Our approach, named\nSymPoint, is simple yet effective, outperforming recent state-of-the-art method\nGAT-CADNet by an absolute increase of 9.6% PQ and 10.4% RQ on the FloorPlanCAD\ndataset. The source code and models will be available at\nhttps://github.com/nicehuster/SymPoint.\n","authors":["Wenlong Liu","Tianyu Yang","Yuhan Wang","Qizhi Yu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10556v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2309.02773v2","updated":"2024-01-19T08:01:15Z","published":"2023-09-06T06:31:08Z","title":"Diffusion Model is Secretly a Training-free Open Vocabulary Semantic\n Segmenter","summary":" Pre-trained text-image discriminative models, such as CLIP, have been\nexplored for open-vocabulary semantic segmentation with unsatisfactory results\ndue to the loss of crucial localization information and awareness of object\nshapes. Recently, there has been a growing interest in expanding the\napplication of generative models from generation tasks to semantic\nsegmentation. These approaches utilize generative models either for generating\nannotated data or extracting features to facilitate semantic segmentation. This\ntypically involves generating a considerable amount of synthetic data or\nrequiring additional mask annotations. To this end, we uncover the potential of\ngenerative text-to-image diffusion models (e.g., Stable Diffusion) as highly\nefficient open-vocabulary semantic segmenters, and introduce a novel\ntraining-free approach named DiffSegmenter. The insight is that to generate\nrealistic objects that are semantically faithful to the input text, both the\ncomplete object shapes and the corresponding semantics are implicitly learned\nby diffusion models. We discover that the object shapes are characterized by\nthe self-attention maps while the semantics are indicated through the\ncross-attention maps produced by the denoising U-Net, forming the basis of our\nsegmentation results. Additionally, we carefully design effective textual\nprompts and a category filtering mechanism to further enhance the segmentation\nresults. Extensive experiments on three benchmark datasets show that the\nproposed DiffSegmenter achieves impressive results for open-vocabulary semantic\nsegmentation.\n","authors":["Jinglong Wang","Xiawei Li","Jing Zhang","Qingyuan Xu","Qin Zhou","Qian Yu","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2309.02773v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10541v1","updated":"2024-01-19T07:44:32Z","published":"2024-01-19T07:44:32Z","title":"I-SplitEE: Image classification in Split Computing DNNs with Early Exits","summary":" The recent advances in Deep Neural Networks (DNNs) stem from their\nexceptional performance across various domains. However, their inherent large\nsize hinders deploying these networks on resource-constrained devices like\nedge, mobile, and IoT platforms. Strategies have emerged, from partial cloud\ncomputation offloading (split computing) to integrating early exits within DNN\nlayers. Our work presents an innovative unified approach merging early exits\nand split computing. We determine the 'splitting layer', the optimal depth in\nthe DNN for edge device computations, and whether to infer on the edge device or\noffload to the cloud for inference, considering accuracy, computational\nefficiency, and communication costs. Also, image classification faces diverse\nenvironmental distortions, influenced by factors like time of day, lighting,\nand weather.
To adapt to these distortions, we introduce I-SplitEE, an online\nunsupervised algorithm ideal for scenarios lacking ground truths and with\nsequential data. Experimental validation using Caltech-256 and Cifar-10\ndatasets subjected to varied distortions showcases I-SplitEE's ability to\nreduce costs by a minimum of 55% with marginal performance degradation of at\nmost 5%.\n","authors":["Divya Jyoti Bajpai","Aastha Jaiswal","Manjesh Kumar Hanawal"],"pdf_url":"https://arxiv.org/pdf/2401.10541v1.pdf","comment":"To appear in proceedings of IEEE International Conference on\n Communications 2024"},{"id":"http://arxiv.org/abs/2401.10537v1","updated":"2024-01-19T07:31:44Z","published":"2024-01-19T07:31:44Z","title":"Learning Position-Aware Implicit Neural Network for Real-World Face\n Inpainting","summary":" Face inpainting requires the model to have a precise global understanding of\nthe facial position structure. Benefiting from the powerful capabilities of\ndeep learning backbones, recent works in face inpainting have achieved decent\nperformance in the ideal setting (square shape with $512px$). However, existing\nmethods often produce a visually unpleasant result, especially in the\nposition-sensitive details (e.g., eyes and nose), when directly applied to\narbitrary-shaped images in real-world scenarios. The visually unpleasant\nposition-sensitive details indicate the shortcomings of existing methods in\nterms of position information processing capability. In this paper, we propose\nan \\textbf{I}mplicit \\textbf{N}eural \\textbf{I}npainting \\textbf{N}etwork\n(IN$^2$) to handle arbitrary-shape face images in real-world scenarios by\nexplicitly modeling position information. Specifically, a downsample\nprocessing encoder is proposed to reduce information loss while obtaining the\nglobal semantic feature. A neighbor hybrid attention block is proposed with a\nhybrid attention mechanism to improve the facial understanding ability of the\nmodel without restricting the shape of the input. Finally, an implicit neural\npyramid decoder is introduced to explicitly model position information and\nbridge the gap between low-resolution features and high-resolution output.\nExtensive experiments demonstrate the superiority of the proposed method in\nthe real-world face inpainting task.\n","authors":["Bo Zhao","Huan Yang","Jianlong Fu"],"pdf_url":"https://arxiv.org/pdf/2401.10537v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.16451v3","updated":"2024-01-19T07:27:18Z","published":"2023-12-27T07:35:17Z","title":"Domain Generalization with Vital Phase Augmentation","summary":" Deep neural networks have shown remarkable performance in image\nclassification. However, their performance significantly deteriorates with\ncorrupted input data. Domain generalization methods have been proposed to train\nrobust models against out-of-distribution data. Data augmentation in the\nfrequency domain is one such approach that enables a model to learn phase\nfeatures to establish domain-invariant representations. This approach changes\nthe amplitudes of the input data while preserving the phases. However, using\nfixed phases leads to susceptibility to phase fluctuations because amplitude\nand phase fluctuations commonly occur in out-of-distribution data. In this study, to\naddress this problem, we introduce an approach using finite variation of the\nphases of input data rather than maintaining fixed phases.
Based on the\nassumption that the degree of domain-invariant features varies for each phase,\nwe propose a method to distinguish phases based on this degree. In addition, we\npropose a method called vital phase augmentation (VIPAug) that applies the\nvariation to the phases differently according to the degree of domain-invariant\nfeatures of given phases. The model depends more on the vital phases that\ncontain more domain-invariant features for attaining robustness to amplitude\nand phase fluctuations. We present experimental evaluations of our proposed\napproach, which exhibited improved performance for both clean and corrupted\ndata. VIPAug achieved SOTA performance on the benchmark CIFAR-10 and CIFAR-100\ndatasets, as well as near-SOTA performance on the ImageNet-100 and ImageNet\ndatasets. Our code is available at https://github.com/excitedkid/vipaug.\n","authors":["Ingyun Lee","Wooju Lee","Hyun Myung"],"pdf_url":"https://arxiv.org/pdf/2312.16451v3.pdf","comment":"Accepted by AAAI-24"},{"id":"http://arxiv.org/abs/2309.06023v4","updated":"2024-01-19T07:22:30Z","published":"2023-09-12T07:50:54Z","title":"Learning from History: Task-agnostic Model Contrastive Learning for\n Image Restoration","summary":" Contrastive learning has emerged as a prevailing paradigm for high-level\nvision tasks, which, by introducing properly negative samples, has also been\nexploited for low-level vision tasks to achieve a compact optimization space to\naccount for their ill-posed nature. However, existing methods rely on manually\npredefined and task-oriented negatives, which often exhibit pronounced\ntask-specific biases. To address this challenge, our paper introduces an\ninnovative method termed 'learning from history', which dynamically generates\nnegative samples from the target model itself. Our approach, named Model\nContrastive paradigm for Image Restoration (MCIR), rejuvenates latency models\nas negative models, making it compatible with diverse image restoration tasks.\nWe propose the Self-Prior guided Negative loss (SPN) to enable it. This\napproach significantly enhances existing models when retrained with the\nproposed model contrastive paradigm. The results show significant improvements\nin image restoration across various tasks and architectures. For example,\nmodels retrained with SPN outperform the original FFANet and DehazeFormer by\n3.41 dB and 0.57 dB on the RESIDE indoor dataset for image dehazing. Similarly,\nthey achieve notable improvements of 0.47 dB on SPA-Data over IDT for image\nderaining and 0.12 dB on Manga109 for a 4x scale super-resolution over\nlightweight SwinIR, respectively. Code and retrained models are available at\nhttps://github.com/Aitical/MCIR.\n","authors":["Gang Wu","Junjun Jiang","Kui Jiang","Xianming Liu"],"pdf_url":"https://arxiv.org/pdf/2309.06023v4.pdf","comment":"Camera Ready Version. Accepted to The 38th Annual AAAI Conference on\n Artificial Intelligence (AAAI 2024)"},{"id":"http://arxiv.org/abs/2401.10530v1","updated":"2024-01-19T07:12:36Z","published":"2024-01-19T07:12:36Z","title":"NWPU-MOC: A Benchmark for Fine-grained Multi-category Object Counting in\n Aerial Images","summary":" Object counting is a hot topic in computer vision, which aims to estimate the\nnumber of objects in a given image. However, most methods only count objects of\na single category for an image, which cannot be applied to scenes that need to\ncount objects with multiple categories simultaneously, especially in aerial\nscenes. 
To this end, this paper introduces a Multi-category Object Counting\n(MOC) task to estimate the numbers of different objects (cars, buildings,\nships, etc.) in an aerial image. Considering the absence of a dataset for this\ntask, a large-scale dataset (NWPU-MOC) is collected, consisting of 3,416 scenes\nwith a resolution of 1024 $\\times$ 1024 pixels, and well-annotated using 14\nfine-grained object categories. Besides, each scene contains RGB and Near\nInfrared (NIR) images, where the NIR spectrum provides richer\ncharacterization information than the RGB spectrum alone. Based on\nNWPU-MOC, the paper presents a multi-spectrum, multi-category object counting\nframework, which employs a dual-attention module to fuse the features of RGB\nand NIR and subsequently regress multi-channel density maps corresponding to\neach object category. In addition, to model the dependency between different\nchannels in the density map associated with each object category, a spatial contrast loss\nis designed as a penalty for overlapping predictions at the same spatial\nposition. Experimental results demonstrate that the proposed method achieves\nstate-of-the-art performance compared with some mainstream counting algorithms.\nThe dataset, code and models are publicly available at\nhttps://github.com/lyongo/NWPU-MOC.\n","authors":["Junyu Gao","Liangliang Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2401.10530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of co-occurring behaviors, and the\ncompounding impact of behavioral hallucinations.
Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10526v1","updated":"2024-01-19T07:06:58Z","published":"2024-01-19T07:06:58Z","title":"On mitigating stability-plasticity dilemma in CLIP-guided image morphing\n via geodesic distillation loss","summary":" Large-scale language-vision pre-training models, such as CLIP, have achieved\nremarkable text-guided image morphing results by leveraging several\nunconditional generative models. However, existing CLIP-guided image morphing\nmethods encounter difficulties when morphing photorealistic images.\nSpecifically, existing guidance fails to provide detailed explanations of the\nmorphing regions within the image, leading to misguidance. In this paper, we\nobserved that such misguidance could be effectively mitigated by simply using a\nproper regularization loss. Our approach comprises two key components: 1) a\ngeodesic cosine similarity loss that minimizes inter-modality features (i.e.,\nimage and text) on a projected subspace of CLIP space, and 2) a latent\nregularization loss that minimizes intra-modality features (i.e., image and\nimage) on the image manifold. By replacing the na\\\"ive directional CLIP loss in\na drop-in replacement manner, our method achieves superior morphing results on\nboth images and videos for various benchmarks, including CLIP-inversion.\n","authors":["Yeongtak Oh","Saehyung Lee","Uiwon Hwang","Sungroh Yoon"],"pdf_url":"https://arxiv.org/pdf/2401.10526v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07567v2","updated":"2024-01-19T07:04:56Z","published":"2024-01-15T09:59:43Z","title":"Bias-Conflict Sample Synthesis and Adversarial Removal Debias Strategy\n for Temporal Sentence Grounding in Video","summary":" Temporal Sentence Grounding in Video (TSGV) is troubled by dataset bias\nissue, which is caused by the uneven temporal distribution of the target\nmoments for samples with similar semantic components in input videos or query\ntexts. Existing methods resort to utilizing prior knowledge about bias to\nartificially break this uneven distribution, which only removes a limited\namount of significant language biases. In this work, we propose the\nbias-conflict sample synthesis and adversarial removal debias strategy\n(BSSARD), which dynamically generates bias-conflict samples by explicitly\nleveraging potentially spurious correlations between single-modality features\nand the temporal position of the target moments. Through adversarial training,\nits bias generators continuously introduce biases and generate bias-conflict\nsamples to deceive its grounding model. Meanwhile, the grounding model\ncontinuously eliminates the introduced biases, which requires it to model\nmulti-modality alignment information. BSSARD will cover most kinds of coupling\nrelationships and disrupt language and visual biases simultaneously. Extensive\nexperiments on Charades-CD and ActivityNet-CD demonstrate the promising\ndebiasing capability of BSSARD. 
Source codes are available at\nhttps://github.com/qzhb/BSSARD.\n","authors":["Zhaobo Qi","Yibo Yuan","Xiaowen Ruan","Shuhui Wang","Weigang Zhang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2401.07567v2.pdf","comment":"accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.10525v1","updated":"2024-01-19T07:01:07Z","published":"2024-01-19T07:01:07Z","title":"Focaler-IoU: More Focused Intersection over Union Loss","summary":" Bounding box regression plays a crucial role in the field of object\ndetection, and the positioning accuracy of object detection largely depends on\nthe loss function of bounding box regression. Existing research improves\nregression performance by utilizing the geometric relationship between bounding\nboxes, while ignoring the impact of difficult and easy sample distribution on\nbounding box regression. In this article, we analyze the impact of difficult\nand easy sample distribution on regression results, and then propose\nFocaler-IoU, which can improve detector performance in different detection\ntasks by focusing on different regression samples. Finally, comparative\nexperiments were conducted using existing advanced detectors and regression\nmethods for different detection tasks, and the detection performance was\nfurther improved by using the method proposed in this paper. Code is available\nat \\url{https://github.com/malagoutou/Focaler-IoU}.\n","authors":["Hao Zhang","Shuaijie Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10525v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2312.17663"},{"id":"http://arxiv.org/abs/2401.10512v1","updated":"2024-01-19T06:04:48Z","published":"2024-01-19T06:04:48Z","title":"Exploring Color Invariance through Image-Level Ensemble Learning","summary":" In the field of computer vision, the persistent presence of color bias,\nresulting from fluctuations in real-world lighting and camera conditions,\npresents a substantial challenge to the robustness of models. This issue is\nparticularly pronounced in complex wide-area surveillance scenarios, such as\nperson re-identification and industrial dust segmentation, where models often\nexperience a decline in performance due to overfitting on color information\nduring training, given the presence of environmental variations. Consequently,\nthere is a need to effectively adapt models to cope with the complexities of\ncamera conditions. To address this challenge, this study introduces a learning\nstrategy named Random Color Erasing, which draws inspiration from ensemble\nlearning. This strategy selectively erases partial or complete color\ninformation in the training data without disrupting the original image\nstructure, thereby achieving a balanced weighting of color features and other\nfeatures within the neural network. This approach mitigates the risk of\noverfitting and enhances the model's ability to handle color variation, thereby\nimproving its overall robustness. The approach we propose serves as an ensemble\nlearning strategy, characterized by robust interpretability. A comprehensive\nanalysis of this methodology is presented in this paper. Across various tasks\nsuch as person re-identification and semantic segmentation, our approach\nconsistently improves strong baseline methods. Notably, in comparison to\nexisting methods that prioritize color robustness, our strategy significantly\nenhances performance in cross-domain scenarios.
The code available at\n\\url{https://github.com/layumi/Person\\_reID\\_baseline\\_pytorch/blob/master/random\\_erasing.py}\nor \\url{https://github.com/finger-monkey/Data-Augmentation}.\n","authors":["Yunpeng Gong","Jiaquan Li","Lifei Chen","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.10512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10511v1","updated":"2024-01-19T06:03:01Z","published":"2024-01-19T06:03:01Z","title":"GMC-IQA: Exploiting Global-correlation and Mean-opinion Consistency for\n No-reference Image Quality Assessment","summary":" Due to the subjective nature of image quality assessment (IQA), assessing\nwhich image has better quality among a sequence of images is more reliable than\nassigning an absolute mean opinion score for an image. Thus, IQA models are\nevaluated by global correlation consistency (GCC) metrics like PLCC and SROCC,\nrather than mean opinion consistency (MOC) metrics like MAE and MSE. However,\nmost existing methods adopt MOC metrics to define their loss functions, due to\nthe infeasible computation of GCC metrics during training. In this work, we\nconstruct a novel loss function and network to exploit Global-correlation and\nMean-opinion Consistency, forming a GMC-IQA framework. Specifically, we propose\na novel GCC loss by defining a pairwise preference-based rank estimation to\nsolve the non-differentiable problem of SROCC and introducing a queue mechanism\nto reserve previous data to approximate the global results of the whole data.\nMoreover, we propose a mean-opinion network, which integrates diverse opinion\nfeatures to alleviate the randomness of weight learning and enhance the model\nrobustness. Experiments indicate that our method outperforms SOTA methods on\nmultiple authentic datasets with higher accuracy and generalization. We also\nadapt the proposed loss to various networks, which brings better performance\nand more stable training.\n","authors":["Zewen Chen","Juan Wang","Bing Li","Chunfeng Yuan","Weiming Hu","Junxian Liu","Peng Li","Yan Wang","Youqun Zhang","Congxuan Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05594v3","updated":"2024-01-19T05:50:58Z","published":"2024-01-10T23:55:16Z","title":"Wasserstein Distance-based Expansion of Low-Density Latent Regions for\n Unknown Class Detection","summary":" This paper addresses the significant challenge in open-set object detection\n(OSOD): the tendency of state-of-the-art detectors to erroneously classify\nunknown objects as known categories with high confidence. We present a novel\napproach that effectively identifies unknown objects by distinguishing between\nhigh and low-density regions in latent space. Our method builds upon the\nOpen-Det (OD) framework, introducing two new elements to the loss function.\nThese elements enhance the known embedding space's clustering and expand the\nunknown space's low-density regions. The first addition is the Class\nWasserstein Anchor (CWA), a new function that refines the classification\nboundaries. The second is a spectral normalisation step, improving the\nrobustness of the model. Together, these augmentations to the existing\nContrastive Feature Learner (CFL) and Unknown Probability Learner (UPL) loss\nfunctions significantly improve OSOD performance. 
Our proposed OpenDet-CWA\n(OD-CWA) method demonstrates: a) a reduction in open-set errors by\napproximately 17%-22%, b) an enhancement in novelty detection capability by\n1.5%-16%, and c) a decrease in the wilderness index by 2%-20% across various\nopen-set scenarios. These results represent a substantial advancement in the\nfield, showcasing the potential of our approach in managing the complexities of\nopen-set object detection.\n","authors":["Prakash Mallick","Feras Dayoub","Jamie Sherrah"],"pdf_url":"https://arxiv.org/pdf/2401.05594v3.pdf","comment":"8 Full length pages, followed by 2 supplementary pages, total of 9\n Figures"},{"id":"http://arxiv.org/abs/2208.09424v3","updated":"2024-01-19T05:32:54Z","published":"2022-08-19T16:16:59Z","title":"Hierarchical Compositional Representations for Few-shot Action\n Recognition","summary":" Recently action recognition has received more and more attention for its\ncomprehensive and practical applications in intelligent surveillance and\nhuman-computer interaction. However, few-shot action recognition has not been\nwell explored and remains challenging because of data scarcity. In this paper,\nwe propose a novel hierarchical compositional representations (HCR) learning\napproach for few-shot action recognition. Specifically, we divide a complicated\naction into several sub-actions by carefully designed hierarchical clustering\nand further decompose the sub-actions into more fine-grained spatially\nattentional sub-actions (SAS-actions). Although there exist large differences\nbetween base classes and novel classes, they can share similar patterns in\nsub-actions or SAS-actions. Furthermore, we adopt the Earth Mover's Distance in\nthe transportation problem to measure the similarity between video samples in\nterms of sub-action representations. It computes the optimal matching flows\nbetween sub-actions as distance metric, which is favorable for comparing\nfine-grained patterns. Extensive experiments show our method achieves the\nstate-of-the-art results on HMDB51, UCF101 and Kinetics datasets.\n","authors":["Changzhen Li","Jie Zhang","Shuzhe Wu","Xin Jin","Shiguang Shan"],"pdf_url":"https://arxiv.org/pdf/2208.09424v3.pdf","comment":"Accepted by Computer Vision and Image Understanding"},{"id":"http://arxiv.org/abs/2401.10501v1","updated":"2024-01-19T05:28:51Z","published":"2024-01-19T05:28:51Z","title":"Enhancing medical vision-language contrastive learning via\n inter-matching relation modelling","summary":" Medical image representations can be learned through medical vision-language\ncontrastive learning (mVLCL) where medical imaging reports are used as weak\nsupervision through image-text alignment. These learned image representations\ncan be transferred to and benefit various downstream medical vision tasks such\nas disease classification and segmentation. Recent mVLCL methods attempt to\nalign image sub-regions and the report keywords as local-matchings. However,\nthese methods aggregate all local-matchings via simple pooling operations while\nignoring the inherent relations between them. These methods therefore fail to\nreason between local-matchings that are semantically related, e.g.,\nlocal-matchings that correspond to the disease word and the location word\n(semantic-relations), and also fail to differentiate such clinically important\nlocal-matchings from others that correspond to less meaningful words, e.g.,\nconjunction words (importance-relations). 
Hence, we propose a mVLCL method that\nmodels the inter-matching relations between local-matchings via a\nrelation-enhanced contrastive learning framework (RECLF). In RECLF, we\nintroduce a semantic-relation reasoning module (SRM) and an importance-relation\nreasoning module (IRM) to enable more fine-grained report supervision for image\nrepresentation learning. We evaluated our method using four public benchmark\ndatasets on four downstream tasks, including segmentation, zero-shot\nclassification, supervised classification, and cross-modal retrieval. Our\nresults demonstrated the superiority of our RECLF over the state-of-the-art\nmVLCL methods with consistent improvements across single-modal and cross-modal\ntasks. These results suggest that our RECLF, by modelling the inter-matching\nrelations, can learn improved medical image representations with better\ngeneralization capabilities.\n","authors":["Mingjian Li","Mingyuan Meng","Michael Fulham","David Dagan Feng","Lei Bi","Jinman Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10501v1.pdf","comment":"11 pages, 5 figures. Under review"},{"id":"http://arxiv.org/abs/2401.09895v2","updated":"2024-01-19T05:27:15Z","published":"2024-01-18T11:14:32Z","title":"Skeleton-Guided Instance Separation for Fine-Grained Segmentation in\n Microscopy","summary":" One of the fundamental challenges in microscopy (MS) image analysis is\ninstance segmentation (IS), particularly when segmenting cluster regions where\nmultiple objects of varying sizes and shapes may be connected or even\noverlapped in arbitrary orientations. Existing IS methods usually fail in\nhandling such scenarios, as they rely on coarse instance representations such\nas keypoints and horizontal bounding boxes (h-bboxes). In this paper, we\npropose a novel one-stage framework named A2B-IS to address this challenge and\nenhance the accuracy of IS in MS images. Our approach represents each instance\nwith a pixel-level mask map and a rotated bounding box (r-bbox). Unlike\ntwo-stage methods that use box proposals for segmentations, our method\ndecouples mask and box predictions, enabling simultaneous processing to\nstreamline the model pipeline. Additionally, we introduce a Gaussian skeleton\nmap to aid the IS task in two key ways: (1) It guides anchor placement,\nreducing computational costs while improving the model's capacity to learn\nRoI-aware features by filtering out noise from background regions. (2) It\nensures accurate isolation of densely packed instances by rectifying erroneous\nbox predictions near instance boundaries. To further enhance the performance,\nwe integrate two modules into the framework: (1) An Atrous Attention Block\n(A2B) designed to extract high-resolution feature maps with fine-grained\nmultiscale information, and (2) A Semi-Supervised Learning (SSL) strategy that\nleverages both labeled and unlabeled images for model training. 
Our method has\nbeen thoroughly validated on two large-scale MS datasets, demonstrating its\nsuperiority over most state-of-the-art approaches.\n","authors":["Jun Wang","Chengfeng Zhou","Zhaoyan Ming","Lina Wei","Xudong Jiang","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2401.09895v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.12653v2","updated":"2024-01-19T04:37:18Z","published":"2023-12-19T22:53:32Z","title":"Diagnosis Of Takotsubo Syndrome By Robust Feature Selection From The\n Complex Latent Space Of DL-based Segmentation Network","summary":" Researchers have shown significant correlations between segmented objects in\nvarious medical imaging modalities and disease-related pathologies. Several\nstudies showed that using hand-crafted features for disease prediction neglects\nthe immense potential of latent features from deep learning (DL) models,\nwhich may reduce the overall accuracy of differential diagnosis. However,\ndirectly using classification or segmentation models on medical images to learn latent\nfeatures forgoes robust feature selection and may lead to overfitting. To fill\nthis gap, we propose a novel feature selection technique using the latent space\nof a segmentation model that can aid diagnosis. We evaluated our method in\ndifferentiating a rare cardiac disease, Takotsubo Syndrome (TTS), from\nST-elevation myocardial infarction (STEMI) using echocardiogram videos (echo). TTS\ncan mimic clinical features of STEMI in echo and is extremely hard to distinguish.\nOur approach shows promising results in the differential diagnosis of TTS with 82%\ndiagnosis accuracy, beating the previous state-of-the-art (SOTA) approach.\nMoreover, the robust feature selection technique using the LASSO algorithm shows\ngreat potential in reducing redundant features and creates a robust\npipeline for short- and long-term disease prognoses in the downstream analysis.\n","authors":["Fahim Ahmed Zaman","Wahidul Alam","Tarun Kanti Roy","Amanda Chang","Kan Liu","Xiaodong Wu"],"pdf_url":"https://arxiv.org/pdf/2312.12653v2.pdf","comment":"5 pages, 3 figures, conference"},{"id":"http://arxiv.org/abs/2401.10150v2","updated":"2024-01-19T04:27:05Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during the inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process.
Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos.\n","authors":["Changgu Chen","Junwei Shu","Lianggangxu Chen","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.09721v2","updated":"2024-01-19T04:07:33Z","published":"2024-01-18T04:51:41Z","title":"Fast graph-based denoising for point cloud color information","summary":" Point clouds are utilized in various 3D applications such as cross-reality\n(XR) and realistic 3D displays. In some applications, e.g., for live streaming\nusing a 3D point cloud, real-time point cloud denoising methods are required to\nenhance the visual quality. However, conventional high-precision denoising\nmethods cannot be executed in real time for large-scale point clouds owing to\nthe complexity of graph constructions with K nearest neighbors and noise level\nestimation. This paper proposes a fast graph-based denoising (FGBD) for a\nlarge-scale point cloud. First, high-speed graph construction is achieved by\nscanning a point cloud in various directions and searching adjacent\nneighborhoods on the scanning lines. Second, we propose a fast noise level\nestimation method using eigenvalues of the covariance matrix on a graph.\nFinally, we also propose a new low-cost filter selection method to enhance\ndenoising accuracy to compensate for the degradation caused by the acceleration\nalgorithms. In our experiments, we succeeded in reducing the processing time\ndramatically while maintaining accuracy relative to conventional denoising\nmethods. Denoising was performed at 30fps, with frames containing approximately\n1 million points.\n","authors":["Ryosuke Watanabe","Keisuke Nonaka","Eduardo Pavez","Tatsuya Kobayashi","Antonio Ortega"],"pdf_url":"https://arxiv.org/pdf/2401.09721v2.pdf","comment":"Published in the proceeding of 2024 IEEE International Conference on\n Acoustics, Speech and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.10475v1","updated":"2024-01-19T03:54:58Z","published":"2024-01-19T03:54:58Z","title":"CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short\n Video Search Scenarios","summary":" Vision-Language Models pre-trained on large-scale image-text datasets have\nshown superior performance in downstream tasks such as image retrieval. Most of\nthe images for pre-training are presented in the form of open domain\ncommon-sense visual elements. Differently, video covers in short video search\nscenarios are presented as user-originated contents that provide important\nvisual summaries of videos. In addition, a portion of the video covers come\nwith manually designed cover texts that provide semantic complements. In order\nto fill in the gaps in short video cover data, we establish the first\nlarge-scale cover-text benchmark for Chinese short video search scenarios.\nSpecifically, we release two large-scale datasets CBVS-5M/10M to provide short\nvideo covers, and the manual fine-labeling dataset CBVS-20K to provide real\nuser queries, which serves as an image-text benchmark test in the Chinese short\nvideo search field. To integrate the semantics of cover text in the case of\nmodality missing, we propose UniCLIP where cover texts play a guiding role\nduring training, however are not relied upon by inference. Extensive evaluation\non CBVS-20K demonstrates the excellent performance of our proposal. 
UniCLIP has\nbeen deployed to Tencent's online video search systems with hundreds of\nmillions of visits and achieved significant gains. The complete dataset, code\nand checkpoints will be available upon release.\n","authors":["Xiangshuo Qiao","Xianxin Li","Xiaozhe Qu","Jie Zhang","Yang Liu","Yu Luo","Cihang Jin","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2401.10475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10474v1","updated":"2024-01-19T03:50:19Z","published":"2024-01-19T03:50:19Z","title":"LDReg: Local Dimensionality Regularized Self-Supervised Learning","summary":" Representations learned via self-supervised learning (SSL) can be susceptible\nto dimensional collapse, where the learned representation subspace is of\nextremely low dimensionality and thus fails to represent the full data\ndistribution and modalities. Dimensional collapse also known as the\n\"underfilling\" phenomenon is one of the major causes of degraded performance on\ndownstream tasks. Previous work has investigated the dimensional collapse\nproblem of SSL at a global level. In this paper, we demonstrate that\nrepresentations can span over high dimensional space globally, but collapse\nlocally. To address this, we propose a method called $\\textit{local\ndimensionality regularization (LDReg)}$. Our formulation is based on the\nderivation of the Fisher-Rao metric to compare and optimize local distance\ndistributions at an asymptotically small radius for each data point. By\nincreasing the local intrinsic dimensionality, we demonstrate through a range\nof experiments that LDReg improves the representation quality of SSL. The\nresults also show that LDReg can regularize dimensionality at both local and\nglobal levels.\n","authors":["Hanxun Huang","Ricardo J. G. B. Campello","Sarah Monazam Erfani","Xingjun Ma","Michael E. Houle","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2401.10474v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2309.09466v2","updated":"2024-01-19T03:37:57Z","published":"2023-09-18T04:01:25Z","title":"Progressive Text-to-Image Diffusion with Soft Latent Direction","summary":" In spite of the rapidly evolving landscape of text-to-image generation, the\nsynthesis and manipulation of multiple entities while adhering to specific\nrelational constraints pose enduring challenges. This paper introduces an\ninnovative progressive synthesis and editing operation that systematically\nincorporates entities into the target image, ensuring their adherence to\nspatial and relational constraints at each sequential step. Our key insight\nstems from the observation that while a pre-trained text-to-image diffusion\nmodel adeptly handles one or two entities, it often falters when dealing with a\ngreater number. To address this limitation, we propose harnessing the\ncapabilities of a Large Language Model (LLM) to decompose intricate and\nprotracted text descriptions into coherent directives adhering to stringent\nformats. To facilitate the execution of directives involving distinct semantic\noperations-namely insertion, editing, and erasing-we formulate the Stimulus,\nResponse, and Fusion (SRF) framework. Within this framework, latent regions are\ngently stimulated in alignment with each operation, followed by the fusion of\nthe responsive latent components to achieve cohesive entity manipulation. 
Our\nproposed framework yields notable advancements in object synthesis,\nparticularly when confronted with intricate and lengthy textual inputs.\nConsequently, it establishes a new benchmark for text-to-image generation\ntasks, further elevating the field's performance standards.\n","authors":["YuTeng Ye","Jiale Cai","Hang Zhou","Guanwen Li","Youjia Zhang","Zikai Song","Chenxing Gao","Junqing Yu","Wei Yang"],"pdf_url":"https://arxiv.org/pdf/2309.09466v2.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2401.10090v2","updated":"2024-01-19T03:31:49Z","published":"2024-01-18T15:56:23Z","title":"Cross-Modality Perturbation Synergy Attack for Person Re-identification","summary":" In recent years, there has been significant research focusing on addressing\nsecurity concerns in single-modal person re-identification (ReID) systems that\nare based on RGB images. However, the safety of cross-modality scenarios, which\nare more commonly encountered in practical applications involving images\ncaptured by infrared cameras, has not received adequate attention. The main\nchallenge in cross-modality ReID lies in effectively dealing with visual\ndifferences between different modalities. For instance, infrared images are\ntypically grayscale, unlike visible images that contain color information.\nExisting attack methods have primarily focused on the characteristics of the\nvisible image modality, overlooking the features of other modalities and the\nvariations in data distribution among different modalities. This oversight can\npotentially undermine the effectiveness of these methods in image retrieval\nacross diverse modalities. This study represents the first exploration into the\nsecurity of cross-modality ReID models and proposes a universal perturbation\nattack specifically designed for cross-modality ReID. This attack optimizes\nperturbations by leveraging gradients from diverse modality data, thereby\ndisrupting the discriminator and reinforcing the differences between\nmodalities. We conducted experiments on two widely used cross-modality\ndatasets, namely RegDB and SYSU, which not only demonstrated the effectiveness\nof our method but also provided insights for future enhancements in the\nrobustness of cross-modality ReID systems.\n","authors":["Yunpeng Gong","Zhun Zhong","Zhiming Luo","Yansong Qu","Rongrong Ji","Min Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.10090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10461v1","updated":"2024-01-19T03:01:07Z","published":"2024-01-19T03:01:07Z","title":"Learning to Robustly Reconstruct Low-light Dynamic Scenes from Spike\n Streams","summary":" As a neuromorphic sensor with high temporal resolution, spike camera can\ngenerate continuous binary spike streams to capture per-pixel light intensity.\nWe can use reconstruction methods to restore scene details in high-speed\nscenarios. However, due to limited information in spike streams, low-light\nscenes are difficult to effectively reconstruct. In this paper, we propose a\nbidirectional recurrent-based reconstruction framework, including a\nLight-Robust Representation (LR-Rep) and a fusion module, to better handle such\nextreme conditions. LR-Rep is designed to aggregate temporal information in\nspike streams, and a fusion module is utilized to extract temporal features.\nAdditionally, we have developed a reconstruction benchmark for high-speed\nlow-light scenes. Light sources in the scenes are carefully aligned to\nreal-world conditions. 
Experimental results demonstrate the superiority of our\nmethod, which also generalizes well to real spike streams. Related codes and\nproposed datasets will be released after publication.\n","authors":["Liwen Hu","Ziluo Ding","Mianzhi Liu","Lei Ma","Tiejun Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14197v2","updated":"2024-01-19T02:46:00Z","published":"2023-10-22T06:16:16Z","title":"Diffusion-based Data Augmentation for Nuclei Image Segmentation","summary":" Nuclei segmentation is a fundamental but challenging task in the quantitative\nanalysis of histopathology images. Although fully-supervised deep\nlearning-based methods have made significant progress, a large number of\nlabeled images are required to achieve great segmentation performance.\nConsidering that manually labeling all nuclei instances for a dataset is\ninefficient, obtaining a large-scale human-annotated dataset is time-consuming\nand labor-intensive. Therefore, augmenting a dataset with only a few labeled\nimages to improve the segmentation performance is of significant research and\napplication value. In this paper, we introduce the first diffusion-based\naugmentation method for nuclei segmentation. The idea is to synthesize a large\nnumber of labeled images to facilitate training the segmentation model. To\nachieve this, we propose a two-step strategy. In the first step, we train an\nunconditional diffusion model to synthesize the Nuclei Structure that is\ndefined as the representation of pixel-level semantic and distance transform.\nEach synthetic nuclei structure will serve as a constraint on histopathology\nimage synthesis and is further post-processed to be an instance map. In the\nsecond step, we train a conditioned diffusion model to synthesize\nhistopathology images based on nuclei structures. The synthetic histopathology\nimages paired with synthetic instance maps will be added to the real dataset\nfor training the segmentation model. The experimental results show that by\naugmenting 10% labeled real dataset with synthetic samples, one can achieve\ncomparable segmentation results with the fully-supervised baseline. The code is\nreleased in: https://github.com/lhaof/Nudiff\n","authors":["Xinyi Yu","Guanbin Li","Wei Lou","Siqi Liu","Xiang Wan","Yan Chen","Haofeng Li"],"pdf_url":"https://arxiv.org/pdf/2310.14197v2.pdf","comment":"MICCAI 2023, released code: https://github.com/lhaof/Nudiff"},{"id":"http://arxiv.org/abs/2311.15497v3","updated":"2024-01-19T02:45:44Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed improvements of up to 1.6% in test data, while maintaining the same\ninference time, and a substantial 1.0% points performance gain in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.06551v4","updated":"2024-01-19T02:42:20Z","published":"2022-08-13T02:50:35Z","title":"Exploiting Multiple Sequence Lengths in Fast End to End Training for\n Image Captioning","summary":" We introduce a method called the Expansion mechanism that processes the input\nunconstrained by the number of elements in the sequence. By doing so, the model\ncan learn more effectively compared to traditional attention-based approaches.\nTo support this claim, we design a novel architecture ExpansionNet v2 that\nachieved strong results on the MS COCO 2014 Image Captioning challenge and the\nState of the Art in its respective category, with a score of 143.7 CIDErD in\nthe offline test split, 140.8 CIDErD in the online evaluation server and 72.9\nAllCIDEr on the nocaps validation set. Additionally, we introduce an End to End\ntraining algorithm up to 2.8 times faster than established alternatives. Source\ncode available at: https://github.com/jchenghu/ExpansionNet_v2\n","authors":["Jia Cheng Hu","Roberto Cavicchioli","Alessandro Capotondi"],"pdf_url":"https://arxiv.org/pdf/2208.06551v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10110v2","updated":"2024-01-19T02:31:02Z","published":"2024-01-18T16:27:09Z","title":"VIPTR: A Vision Permutable Extractor for Fast and Efficient Scene Text\n Recognition","summary":" Scene Text Recognition (STR) is a challenging task that involves recognizing\ntext within images of natural scenes. Although current state-of-the-art models\nfor STR exhibit high performance, they typically suffer from low inference\nefficiency due to their reliance on hybrid architectures comprised of visual\nencoders and sequence decoders. In this work, we propose the VIsion Permutable\nextractor for fast and efficient scene Text Recognition (VIPTR), which achieves\nan impressive balance between high performance and rapid inference speeds in\nthe domain of STR. Specifically, VIPTR leverages a visual-semantic extractor\nwith a pyramid structure, characterized by multiple self-attention layers,\nwhile eschewing the traditional sequence decoder. This design choice results in\na lightweight and efficient model capable of handling inputs of varying sizes.\nExtensive experimental results on various standard datasets for both Chinese\nand English scene text recognition validate the superiority of VIPTR. Notably,\nthe VIPTR-T (Tiny) variant delivers highly competitive accuracy on par with\nother lightweight models and achieves SOTA inference speeds. Meanwhile, the\nVIPTR-L (Large) variant attains greater recognition accuracy, while maintaining\na low parameter count and favorable inference speed. Our proposed method\nprovides a compelling solution for the STR challenge, which blends high\naccuracy with efficiency and greatly benefits real-world applications requiring\nfast and reliable text recognition. 
The code is publicly available at\nhttps://github.com/cxfyxl/VIPTR.\n","authors":["Xianfu Cheng","Weixiao Zhou","Xiang Li","Xiaoming Chen","Jian Yang","Tongliang Li","Zhoujun Li"],"pdf_url":"https://arxiv.org/pdf/2401.10110v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2205.00159 by other authors"},{"id":"http://arxiv.org/abs/2312.06946v2","updated":"2024-01-19T02:08:07Z","published":"2023-12-12T02:55:14Z","title":"WaterHE-NeRF: Water-ray Tracing Neural Radiance Fields for Underwater\n Scene Reconstruction","summary":" Neural Radiance Field (NeRF) technology demonstrates immense potential in\nnovel viewpoint synthesis tasks, due to its physics-based volumetric rendering\nprocess, which is particularly promising in underwater scenes. Addressing the\nlimitations of existing underwater NeRF methods in handling light attenuation\ncaused by the water medium and the lack of real Ground Truth (GT) supervision,\nthis study proposes WaterHE-NeRF. We develop a new water-ray tracing field by\nRetinex theory that precisely encodes color, density, and illuminance\nattenuation in three-dimensional space. WaterHE-NeRF, through its illuminance\nattenuation mechanism, generates both degraded and clear multi-view images and\noptimizes image restoration by combining reconstruction loss with Wasserstein\ndistance. Additionally, the use of histogram equalization (HE) as pseudo-GT\nenhances the network's accuracy in preserving original details and color\ndistribution. Extensive experiments on real underwater datasets and synthetic\ndatasets validate the effectiveness of WaterHE-NeRF. Our code will be made\npublicly available.\n","authors":["Jingchun Zhou","Tianyu Liang","Dehuan Zhang","Zongxin He"],"pdf_url":"https://arxiv.org/pdf/2312.06946v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18999v3","updated":"2024-01-19T01:57:15Z","published":"2023-10-29T12:55:53Z","title":"DynPoint: Dynamic Neural Point For View Synthesis","summary":" The introduction of neural radiance fields has greatly improved the\neffectiveness of view synthesis for monocular videos. However, existing\nalgorithms face difficulties when dealing with uncontrolled or lengthy\nscenarios, and require extensive training time specific to each new scenario.\nTo tackle these limitations, we propose DynPoint, an algorithm designed to\nfacilitate the rapid synthesis of novel views for unconstrained monocular\nvideos. Rather than encoding the entirety of the scenario information into a\nlatent representation, DynPoint concentrates on predicting the explicit 3D\ncorrespondence between neighboring frames to realize information aggregation.\nSpecifically, this correspondence prediction is achieved through the estimation\nof consistent depth and scene flow information across frames. Subsequently, the\nacquired correspondence is utilized to aggregate information from multiple\nreference frames to a target frame, by constructing hierarchical neural point\nclouds. The resulting framework enables swift and accurate view synthesis for\ndesired views of target frames. The experimental results obtained demonstrate\nthe considerable acceleration of training time achieved - typically an order of\nmagnitude - by our proposed method while yielding comparable outcomes compared\nto prior approaches. 
Furthermore, our method exhibits strong robustness in\nhandling long-duration videos without learning a canonical representation of\nvideo content.\n","authors":["Kaichen Zhou","Jia-Xing Zhong","Sangyun Shin","Kai Lu","Yiyuan Yang","Andrew Markham","Niki Trigoni"],"pdf_url":"https://arxiv.org/pdf/2310.18999v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.10766v2","updated":"2024-01-19T01:51:45Z","published":"2023-01-25T18:59:15Z","title":"On the Adversarial Robustness of Camera-based 3D Object Detection","summary":" In recent years, camera-based 3D object detection has gained widespread\nattention for its ability to achieve high performance with low computational\ncost. However, the robustness of these methods to adversarial attacks has not\nbeen thoroughly examined, especially when considering their deployment in\nsafety-critical domains like autonomous driving. In this study, we conduct the\nfirst comprehensive investigation of the robustness of leading camera-based 3D\nobject detection approaches under various adversarial conditions. We\nsystematically analyze the resilience of these models under two attack\nsettings: white-box and black-box; focusing on two primary objectives:\nclassification and localization. Additionally, we delve into two types of\nadversarial attack techniques: pixel-based and patch-based. Our experiments\nyield four interesting findings: (a) bird's-eye-view-based representations\nexhibit stronger robustness against localization attacks; (b)\ndepth-estimation-free approaches have the potential to show stronger\nrobustness; (c) accurate depth estimation effectively improves robustness for\ndepth-estimation-based methods; (d) incorporating multi-frame benign inputs can\neffectively mitigate adversarial attacks. We hope our findings can steer the\ndevelopment of future camera-based object detection models with enhanced\nadversarial robustness.\n","authors":["Shaoyuan Xie","Zichao Li","Zeyu Wang","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2301.10766v2.pdf","comment":"Transactions on Machine Learning Research, 2024. ISSN 2835-8856"},{"id":"http://arxiv.org/abs/2312.06955v2","updated":"2024-01-19T01:47:22Z","published":"2023-12-12T03:26:04Z","title":"IA2U: A Transfer Plugin with Multi-Prior for In-Air Model to Underwater","summary":" In underwater environments, variations in suspended particle concentration\nand turbidity cause severe image degradation, posing significant challenges to\nimage enhancement (IE) and object detection (OD) tasks. Currently, in-air image\nenhancement and detection methods have made notable progress, but their\napplication in underwater conditions is limited due to the complexity and\nvariability of these environments. Fine-tuning in-air models saves high\noverhead and has more optional reference work than building an underwater model\nfrom scratch. To address these issues, we design a transfer plugin with\nmultiple priors for converting in-air models to underwater applications, named\nIA2U. IA2U enables efficient application in underwater scenarios, thereby\nimproving performance in Underwater IE and OD. IA2U integrates three types of\nunderwater priors: the water type prior that characterizes the degree of image\ndegradation, such as color and visibility; the degradation prior, focusing on\ndifferences in details and textures; and the sample prior, considering the\nenvironmental conditions at the time of capture and the characteristics of the\nphotographed object. 
Utilizing a Transformer-like structure, IA2U employs these\npriors as query conditions and a joint task loss function to achieve\nhierarchical enhancement of task-level underwater image features, therefore\nconsidering the requirements of two different tasks, IE and OD. Experimental\nresults show that IA2U combined with an in-air model can achieve superior\nperformance in underwater image enhancement and object detection tasks. The\ncode will be made publicly available.\n","authors":["Jingchun Zhou","Qilin Gai","Kin-man Lam","Xianping Fu"],"pdf_url":"https://arxiv.org/pdf/2312.06955v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.06999v2","updated":"2024-01-19T01:46:49Z","published":"2023-12-12T06:07:21Z","title":"DGNet: Dynamic Gradient-guided Network with Noise Suppression for\n Underwater Image Enhancement","summary":" Underwater image enhancement (UIE) is a challenging task due to the complex\ndegradation caused by underwater environments. To solve this issue, previous\nmethods often idealize the degradation process, and neglect the impact of\nmedium noise and object motion on the distribution of image features, limiting\nthe generalization and adaptability of the model. Previous methods use the\nreference gradient that is constructed from original images and synthetic\nground-truth images. This may cause the network performance to be influenced by\nsome low-quality training data. Our approach utilizes predicted images to\ndynamically update pseudo-labels, adding a dynamic gradient to optimize the\nnetwork's gradient space. This process improves image quality and avoids local\noptima. Moreover, we propose a Feature Restoration and Reconstruction module\n(FRR) based on a Channel Combination Inference (CCI) strategy and a Frequency\nDomain Smoothing module (FRS). These modules decouple other degradation\nfeatures while reducing the impact of various types of noise on network\nperformance. Experiments on multiple public datasets demonstrate the\nsuperiority of our method over existing state-of-the-art approaches, especially\nin achieving performance milestones: PSNR of 25.6dB and SSIM of 0.93 on the\nUIEB dataset. Its efficiency in terms of parameter size and inference time\nfurther attests to its broad practicality. The code will be made publicly\navailable.\n","authors":["Jingchun Zhou","Zongxin He","Dehuan Zhang","Kin-man Lam","Xianping Fu","Yi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.06999v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10442v1","updated":"2024-01-19T01:11:44Z","published":"2024-01-19T01:11:44Z","title":"Path Choice Matters for Clear Attribution in Path Methods","summary":" Rigorousness and clarity are both essential for interpretations of DNNs to\nengender human trust. Path methods are commonly employed to generate rigorous\nattributions that satisfy three axioms. However, the meaning of attributions\nremains ambiguous due to distinct path choices. To address the ambiguity, we\nintroduce \\textbf{Concentration Principle}, which centrally allocates high\nattributions to indispensable features, thereby endowing aesthetic and\nsparsity. We then present \\textbf{SAMP}, a model-agnostic interpreter, which\nefficiently searches the near-optimal path from a pre-defined set of\nmanipulation paths. Moreover, we propose the infinitesimal constraint (IC) and\nmomentum strategy (MS) to improve the rigorousness and optimality.\nVisualizations show that SAMP can precisely reveal DNNs by pinpointing salient\nimage pixels. 
We also perform quantitative experiments and observe that our\nmethod significantly outperforms the counterparts. Code:\nhttps://github.com/zbr17/SAMP.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.10442v1.pdf","comment":"ICLR 2024 accepted"},{"id":"http://arxiv.org/abs/2304.00746v3","updated":"2024-01-19T00:42:13Z","published":"2023-04-03T06:40:52Z","title":"OTS: A One-shot Learning Approach for Text Spotting in Historical\n Manuscripts","summary":" In the field of historical manuscript research, scholars frequently encounter\nnovel symbols in ancient texts, investing considerable effort in their\nidentification and documentation. Although some object detection methods have\nachieved impressive performance, they primarily excel at detecting categories\nincluded in training datasets, often failing to recognize novel symbols without\nretraining. To overcome this limitation, we propose a novel One-shot\nlearning-based Text Spotting (OTS) approach that accurately and reliably spots\nnovel characters with just one annotated support sample. Drawing inspiration\nfrom cognitive research, we introduce a spatial alignment module that finds,\nfocuses on, and learns the most discriminative spatial regions in the query\nimage based on one support image. Especially, since the low-resource spotting\ntask often faces the problem of example imbalance, we propose a novel loss\nfunction called torus loss which can make the embedding space of distance\nmetric more discriminative. Our approach is highly efficient and requires only\na few training samples while exhibiting the remarkable ability to handle novel\ncharacters and symbols. To enhance dataset diversity, a new manuscript dataset\nthat contains the ancient Dongba hieroglyphics (DBH) is created, a script\nassociated with China and developed by the ancestors of the Naxi minority. We\nconduct experiments on publicly available DBH, EGY, VML-HD, TKH, and NC\ndatasets. The experimental results demonstrate that OTS outperforms the\nstate-of-the-art methods in one-shot text spotting. Overall, our proposed\nmethod offers promising applications in text spotting in historical\nmanuscripts.\n","authors":["Wenbo Hu","Hongjian Zhan","Cong Liu","Bing Yin","Yue Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00746v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00110v3","updated":"2024-01-19T00:35:35Z","published":"2023-12-30T01:24:25Z","title":"Diffusion Model with Perceptual Loss","summary":" Diffusion models trained with mean squared error loss tend to generate\nunrealistic samples. Current state-of-the-art models rely on classifier-free\nguidance to improve sample quality, yet its surprising effectiveness is not\nfully understood. In this paper, we show that the effectiveness of\nclassifier-free guidance partly originates from it being a form of implicit\nperceptual guidance. As a result, we can directly incorporate perceptual loss\nin diffusion training to improve sample quality. Since the score matching\nobjective used in diffusion training strongly resembles the denoising\nautoencoder objective used in unsupervised training of perceptual networks, the\ndiffusion model itself is a perceptual network and can be used to generate\nmeaningful perceptual loss. We propose a novel self-perceptual objective that\nresults in diffusion models capable of generating more realistic samples. 
For\nconditional generation, our method only improves sample quality without\nentanglement with the conditional input and therefore does not sacrifice sample\ndiversity. Our method can also improve sample quality for unconditional\ngeneration, which was not possible with classifier-free guidance before.\n","authors":["Shanchuan Lin","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00110v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2312.09631v2","updated":"2024-01-19T17:07:40Z","published":"2023-12-15T09:21:11Z","title":"Context-Driven Interactive Query Simulations Based on Generative Large\n Language Models","summary":" Simulating user interactions enables a more user-oriented evaluation of\ninformation retrieval (IR) systems. While user simulations are cost-efficient\nand reproducible, many approaches often lack fidelity regarding real user\nbehavior. Most notably, current user models neglect the user's context, which\nis the primary driver of perceived relevance and the interactions with the\nsearch results. To this end, this work introduces the simulation of\ncontext-driven query reformulations. The proposed query generation methods\nbuild upon recent Large Language Model (LLM) approaches and consider the user's\ncontext throughout the simulation of a search session. Compared to simple\ncontext-free query generation approaches, these methods show better\neffectiveness and allow the simulation of more efficient IR sessions.\nSimilarly, our evaluations consider more interaction context than current\nsession-based measures and reveal interesting complementary insights in\naddition to the established evaluation protocols. 
We conclude with directions\nfor future work and provide an entirely open experimental setup.\n","authors":["Björn Engelmann","Timo Breuer","Jana Isabelle Friese","Philipp Schaer","Norbert Fuhr"],"pdf_url":"https://arxiv.org/pdf/2312.09631v2.pdf","comment":"Accepted at ECIR 2024 (Full Paper)"},{"id":"http://arxiv.org/abs/2308.07107v3","updated":"2024-01-19T16:01:28Z","published":"2023-08-14T12:47:22Z","title":"Large Language Models for Information Retrieval: A Survey","summary":" As a primary means of information acquisition, information retrieval (IR)\nsystems, such as search engines, have integrated themselves into our daily\nlives. These systems also serve as components of dialogue, question-answering,\nand recommender systems. The trajectory of IR has evolved dynamically from its\norigins in term-based methods to its integration with advanced neural models.\nWhile the neural models excel at capturing complex contextual signals and\nsemantic nuances, thereby reshaping the IR landscape, they still face\nchallenges such as data scarcity, interpretability, and the generation of\ncontextually plausible yet potentially inaccurate responses. This evolution\nrequires a combination of both traditional methods (such as term-based sparse\nretrieval methods with rapid response) and modern neural architectures (such as\nlanguage models with powerful language understanding capacity). Meanwhile, the\nemergence of large language models (LLMs), typified by ChatGPT and GPT-4, has\nrevolutionized natural language processing due to their remarkable language\nunderstanding, generation, generalization, and reasoning abilities.\nConsequently, recent research has sought to leverage LLMs to improve IR\nsystems. Given the rapid evolution of this research trajectory, it is necessary\nto consolidate existing methodologies and provide nuanced insights through a\ncomprehensive overview. In this survey, we delve into the confluence of LLMs\nand IR systems, including crucial aspects such as query rewriters, retrievers,\nrerankers, and readers. Additionally, we explore promising directions, such as\nsearch agents, within this expanding field.\n","authors":["Yutao Zhu","Huaying Yuan","Shuting Wang","Jiongnan Liu","Wenhan Liu","Chenlong Deng","Haonan Chen","Zhicheng Dou","Ji-Rong Wen"],"pdf_url":"https://arxiv.org/pdf/2308.07107v3.pdf","comment":"updated to version 2"},{"id":"http://arxiv.org/abs/2401.10733v1","updated":"2024-01-19T14:50:22Z","published":"2024-01-19T14:50:22Z","title":"Dynamic Q&A of Clinical Documents with Large Language Models","summary":" Electronic health records (EHRs) house crucial patient data in clinical\nnotes. As these notes grow in volume and complexity, manual extraction becomes\nchallenging. This work introduces a natural language interface using large\nlanguage models (LLMs) for dynamic question-answering on clinical notes. Our\nchatbot, powered by Langchain and transformer-based LLMs, allows users to query\nin natural language, receiving relevant answers from clinical notes.\nExperiments, utilizing various embedding models and advanced LLMs, show Wizard\nVicuna's superior accuracy, albeit with high compute demands. Model\noptimization, including weight quantization, improves latency by approximately\n48 times. Promising results indicate potential, yet challenges such as model\nhallucinations and limited diverse medical case evaluations remain. 
Addressing\nthese gaps is crucial for unlocking the value in clinical notes and advancing\nAI-driven clinical decision-making.\n","authors":["Ran Elgedawy","Sudarshan Srinivasan","Ioana Danciu"],"pdf_url":"https://arxiv.org/pdf/2401.10733v1.pdf","comment":"8 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.10690v1","updated":"2024-01-19T13:41:08Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10634v1","updated":"2024-01-19T11:22:04Z","published":"2024-01-19T11:22:04Z","title":"Automatic Construction of Multi-faceted User Profiles using Text\n Clustering and its Application to Expert Recommendation and Filtering\n Problems","summary":" In the information age we are living in today, not only are we interested in\naccessing multimedia objects such as documents, videos, etc. but also in\nsearching for professional experts, people or celebrities, possibly for\nprofessional needs or just for fun. Information access systems need to be able\nto extract and exploit various sources of information (usually in text format)\nabout such individuals, and to represent them in a suitable way usually in the\nform of a profile. In this article, we tackle the problems of profile-based\nexpert recommendation and document filtering from a machine learning\nperspective by clustering expert textual sources to build profiles and capture\nthe different hidden topics in which the experts are interested. The experts\nwill then be represented by means of multi-faceted profiles. Our experiments\nshow that this is a valid technique to improve the performance of expert\nfinding and document filtering.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. 
Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10617v1","updated":"2024-01-19T10:49:31Z","published":"2024-01-19T10:49:31Z","title":"LDA-based Term Profiles for Expert Finding in a Political Setting","summary":" A common task in many political institutions (i.e. Parliament) is to find\npoliticians who are experts in a particular field. In order to tackle this\nproblem, the first step is to obtain politician profiles which include their\ninterests, and these can be automatically learned from their speeches. As a\npolitician may have various areas of expertise, one alternative is to use a set\nof subprofiles, each of which covers a different subject. In this study, we\npropose a novel approach for this task by using latent Dirichlet allocation\n(LDA) to determine the main underlying topics of each political speech, and to\ndistribute the related terms among the different topic-based subprofiles. With\nthis objective, we propose the use of fifteen distance and similarity measures\nto automatically determine the optimal number of topics discussed in a\ndocument, and to demonstrate that every measure converges into five strategies:\nEuclidean, Dice, Sorensen, Cosine and Overlap. Our experimental results showed\nthat the scores of the different accuracy metrics of the proposed strategies\ntended to be higher than those of the baselines for expert recommendation\ntasks, and that the use of an appropriate number of topics has proved relevant.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10611v1","updated":"2024-01-19T10:42:29Z","published":"2024-01-19T10:42:29Z","title":"Publication venue recommendation using profiles based on clustering","summary":" In this paper we study the venue recommendation problem in order to help\nresearchers to identify a journal or conference to submit a given paper. A\ncommon approach to tackle this problem is to build profiles defining the scope\nof each venue. Then, these profiles are compared against the target paper. In\nour approach we will study how clustering techniques can be used to construct\ntopic-based profiles and use an Information Retrieval based approach to obtain\nthe final recommendations. Additionally, we will explore how the use of\nauthorship, representing a complementary piece of information, helps to improve\nthe recommendations.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10611v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10607v1","updated":"2024-01-19T10:32:28Z","published":"2024-01-19T10:32:28Z","title":"Use of topical and temporal profiles and their hybridisation for\n content-based recommendation","summary":" In the context of content-based recommender systems, the aim of this paper is\nto determine how better profiles can be built and how these affect the\nrecommendation process based on the incorporation of temporality, i.e. the\ninclusion of time in the recommendation process, and topicality, i.e. the\nrepresentation of texts associated with users and items using topics and their\ncombination. The main contribution of the paper is to present two different\nways of hybridising these two dimensions and to evaluate and compare them with\nother alternatives.\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. 
Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10607v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10545v1","updated":"2024-01-19T08:09:20Z","published":"2024-01-19T08:09:20Z","title":"Understanding Biases in ChatGPT-based Recommender Systems: Provider\n Fairness, Temporal Stability, and Recency","summary":" This study explores the nuanced capabilities and inherent biases of\nRecommender Systems using Large Language Models (RecLLMs), with a focus on\nChatGPT-based systems. It studies into the contrasting behaviors of generative\nmodels and traditional collaborative filtering models in movie recommendations.\nThe research primarily investigates prompt design strategies and their impact\non various aspects of recommendation quality, including accuracy, provider\nfairness, diversity, stability, genre dominance, and temporal freshness\n(recency).\n Our experimental analysis reveals that the introduction of specific 'system\nroles' and 'prompt strategies' in RecLLMs significantly influences their\nperformance. For instance, role-based prompts enhance fairness and diversity in\nrecommendations, mitigating popularity bias. We find that while GPT-based\nmodels do not always match the performance of CF baselines, they exhibit a\nunique tendency to recommend newer and more diverse movie genres. Notably,\nGPT-based models tend to recommend more recent films, particularly those\nreleased post-2000, and show a preference for genres like \\sq{Drama} and\nComedy, and Romance (compared to CF Action, Adventure) presumably due to the\nRecLLMs' training on varied data sets, which allows them to capture recent\ntrends and discussions more effectively than CF models. Interestingly, our\nresults demonstrate that the 'Simple' and 'Chain of Thought (COT)' paradigms\nyield the highest accuracy. These findings imply the potential of combining\nthese strategies with scenarios that favor more recent content, thereby\noffering a more balanced and up-to-date recommendation experience. This study\ncontributes significantly to the understanding of emerging RecLLMs,\nparticularly in the context of harms and biases within these systems.\n","authors":["Yashar Deldjoo"],"pdf_url":"https://arxiv.org/pdf/2401.10545v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04971v2","updated":"2024-01-19T07:52:57Z","published":"2024-01-10T07:31:26Z","title":"A Survey on Cross-Domain Sequential Recommendation","summary":" Cross-domain sequential recommendation (CDSR) shifts the modeling of user\npreferences from flat to stereoscopic by integrating and learning interaction\ninformation from multiple domains at different granularities (ranging from\ninter-sequence to intra-sequence and from single-domain to cross-domain). In\nthis survey, we first define the CDSR problem using a four-dimensional tensor\nand then analyze its multi-type input representations under multidirectional\ndimensionality reductions. Following that, we provide a systematic overview\nfrom both macro and micro views. From a macro view, we abstract the multi-level\nfusion structures of various models across domains and discuss their bridges\nfor fusion. From a micro view, focusing on the existing models, we specifically\ndiscuss the basic technologies and then explain the auxiliary learning\ntechnologies. 
Finally, we exhibit the available public datasets and the\nrepresentative experimental results as well as provide some insights into\nfuture directions for research in CDSR.\n","authors":["Shu Chen","Zitao Xu","Weike Pan","Qiang Yang","Zhong Ming"],"pdf_url":"https://arxiv.org/pdf/2401.04971v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09885v2","updated":"2024-01-19T07:23:04Z","published":"2024-01-18T10:56:27Z","title":"Source Code Clone Detection Using Unsupervised Similarity Measures","summary":" Assessing similarity in source code has gained significant attention in\nrecent years due to its importance in software engineering tasks such as clone\ndetection and code search and recommendation. This work presents a comparative\nanalysis of unsupervised similarity measures for identifying source code clone\ndetection. The goal is to overview the current state-of-the-art techniques,\ntheir strengths, and weaknesses. To do that, we compile the existing\nunsupervised strategies and evaluate their performance on a benchmark dataset\nto guide software engineers in selecting appropriate methods for their specific\nuse cases. The source code of this study is available at\nhttps://github.com/jorge-martinez-gil/codesim\n","authors":["Jorge Martinez-Gil"],"pdf_url":"https://arxiv.org/pdf/2401.09885v2.pdf","comment":"Accepted for publication as Full Paper in the Software Quality Days\n 2024, Vienna, Austria"},{"id":"http://arxiv.org/abs/2401.00368v2","updated":"2024-01-19T05:16:20Z","published":"2023-12-31T02:13:18Z","title":"Improving Text Embeddings with Large Language Models","summary":" In this paper, we introduce a novel and simple method for obtaining\nhigh-quality text embeddings using only synthetic data and less than 1k\ntraining steps. Unlike existing methods that often depend on multi-stage\nintermediate pre-training with billions of weakly-supervised text pairs,\nfollowed by fine-tuning with a few labeled datasets, our method does not\nrequire building complex training pipelines or relying on manually collected\ndatasets that are often constrained by task diversity and language coverage. We\nleverage proprietary LLMs to generate diverse synthetic data for hundreds of\nthousands of text embedding tasks across nearly 100 languages. We then\nfine-tune open-source decoder-only LLMs on the synthetic data using standard\ncontrastive loss. Experiments demonstrate that our method achieves strong\nperformance on highly competitive text embedding benchmarks without using any\nlabeled data. Furthermore, when fine-tuned with a mixture of synthetic and\nlabeled data, our model sets new state-of-the-art results on the BEIR and MTEB\nbenchmarks.\n","authors":["Liang Wang","Nan Yang","Xiaolong Huang","Linjun Yang","Rangan Majumder","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2401.00368v2.pdf","comment":"20 pages, 15 tables"},{"id":"http://arxiv.org/abs/2401.10487v1","updated":"2024-01-19T04:24:07Z","published":"2024-01-19T04:24:07Z","title":"Generative Dense Retrieval: Memory Can Be a Burden","summary":" Generative Retrieval (GR), autoregressively decoding relevant document\nidentifiers given a query, has been shown to perform well under the setting of\nsmall-scale corpora. By memorizing the document corpus with model parameters,\nGR implicitly achieves deep interaction between query and document. 
However,\nsuch a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for\nfine-grained features of documents; (2) Memory confusion gets worse as the\ncorpus size increases; (3) Huge memory update costs for new documents. To\nalleviate these problems, we propose the Generative Dense Retrieval (GDR)\nparadigm. Specifically, GDR first uses the limited memory volume to achieve\ninter-cluster matching from query to relevant document clusters.\nMemorizing-free matching mechanism from Dense Retrieval (DR) is then introduced\nto conduct fine-grained intra-cluster matching from clusters to relevant\ndocuments. The coarse-to-fine process maximizes the advantages of GR's deep\ninteraction and DR's scalability. Besides, we design a cluster identifier\nconstructing strategy to facilitate corpus memory and a cluster-adaptive\nnegative sampling strategy to enhance the intra-cluster mapping ability.\nEmpirical results show that GDR obtains an average of 3.0 R@100 improvement on\nNQ dataset under multiple settings and has better scalability.\n","authors":["Peiwen Yuan","Xinglin Wang","Shaoxiong Feng","Boyuan Pan","Yiwei Li","Heda Wang","Xupeng Miao","Kan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10487v1.pdf","comment":"EACL 2024 main"},{"id":"http://arxiv.org/abs/2401.10484v1","updated":"2024-01-19T04:17:50Z","published":"2024-01-19T04:17:50Z","title":"Enhancing Scalability in Recommender Systems through Lottery Ticket\n Hypothesis and Knowledge Distillation-based Neural Network Pruning","summary":" This study introduces an innovative approach aimed at the efficient pruning\nof neural networks, with a particular focus on their deployment on edge\ndevices. Our method involves the integration of the Lottery Ticket Hypothesis\n(LTH) with the Knowledge Distillation (KD) framework, resulting in the\nformulation of three distinct pruning models. These models have been developed\nto address scalability issue in recommender systems, whereby the complexities\nof deep learning models have hindered their practical deployment. With\njudicious application of the pruning techniques, we effectively curtail the\npower consumption and model dimensions without compromising on accuracy.\nEmpirical evaluation has been performed using two real world datasets from\ndiverse domains against two baselines. Gratifyingly, our approaches yielded a\nGPU computation-power reduction of up to 66.67%. Notably, our study contributes\nto the field of recommendation system by pioneering the application of LTH and\nKD.\n","authors":["Rajaram R","Manoj Bharadhwaj","Vasan VS","Nargis Pervin"],"pdf_url":"https://arxiv.org/pdf/2401.10484v1.pdf","comment":"Accepted in WITS 2023 as a workshop paper"},{"id":"http://arxiv.org/abs/2401.10963v1","updated":"2024-01-19T11:50:26Z","published":"2024-01-19T11:50:26Z","title":"On the selection of the correct number of terms for profile\n construction: theoretical and empirical analysis","summary":" In this paper, we examine the problem of building a user profile from a set\nof documents. This profile will consist of a subset of the most representative\nterms in the documents that best represent user preferences or interests.\nInspired by the discrete concentration theory we have conducted an axiomatic\nstudy of seven properties that a selection function should fulfill: the minimum\nand maximum uncertainty principle, invariant to adding zeros, invariant to\nscale transformations, principle of nominal increase, transfer principle and\nthe richest get richer inequality. 
We also present a novel selection function\nbased on the use of similarity metrics, and more specifically the cosine\nmeasure which is commonly used in information retrieval, and demonstrate that\nthis verifies six of the properties in addition to a weaker variant of the\ntransfer principle, thereby representing a good selection approach. The\ntheoretical study was complemented with an empirical study to compare the\nperformance of different selection criteria (weight- and unweight-based) using\nreal data in a parliamentary setting. In this study, we analyze the performance\nof the different functions focusing on the two main factors affecting the\nselection process: profile size (number of terms) and weight distribution.\nThese profiles are then used in a document filtering task to show that our\nsimilarity-based approach performs well in terms not only of recommendation\naccuracy but also efficiency (we obtain smaller profiles and consequently\nfaster recommendations).\n","authors":["Luis M. de Campos","Juan M. Fernández-Luna","Juan F. Huete"],"pdf_url":"https://arxiv.org/pdf/2401.10963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10961v1","updated":"2024-01-19T11:14:37Z","published":"2024-01-19T11:14:37Z","title":"Positive unlabeled learning for building recommender systems in a\n parliamentary setting","summary":" Our goal is to learn about the political interests and preferences of the\nMembers of Parliament by mining their parliamentary activity, in order to\ndevelop a recommendation/filtering system that, given a stream of documents to\nbe distributed among them, is able to decide which documents should receive\neach Member of Parliament. We propose to use positive unlabeled learning to\ntackle this problem, because we only have information about relevant documents\n(the own interventions of each Member of Parliament in the debates) but not\nabout irrelevant documents, so that we cannot use standard binary classifiers\ntrained with positive and negative examples. We have also developed a new\nalgorithm of this type, which compares favourably with: a) the baseline\napproach assuming that all the interventions of other Members of Parliament are\nirrelevant, b) another well-known positive unlabeled learning method and c) an\napproach based on information retrieval methods that matches documents and\nlegislators' representations. The experiments have been carried out with data\nfrom the regional Andalusian Parliament at Spain.\n","authors":["Luis M. de Camposa","Juan M. Fernández-Luna","Juan F. Huete","Luis Redondo-Expósito"],"pdf_url":"https://arxiv.org/pdf/2401.10961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10956v1","updated":"2024-01-19T05:54:35Z","published":"2024-01-19T05:54:35Z","title":"AI Revolution on Chat Bot: Evidence from a Randomized Controlled\n Experiment","summary":" In recent years, generative AI has undergone major advancements,\ndemonstrating significant promise in augmenting human productivity. Notably,\nlarge language models (LLM), with ChatGPT-4 as an example, have drawn\nconsiderable attention. Numerous articles have examined the impact of LLM-based\ntools on human productivity in lab settings and designed tasks or in\nobservational studies. Despite recent advances, field experiments applying\nLLM-based tools in realistic settings are limited. 
This paper presents the\nfindings of a field randomized controlled trial assessing the effectiveness of\nLLM-based tools in providing unmonitored support services for information\nretrieval.\n","authors":["Sida Peng","Wojciech Swiatek","Allen Gao","Paul Cullivan","Haoge Chang"],"pdf_url":"https://arxiv.org/pdf/2401.10956v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.10886v1","updated":"2024-01-19T18:57:46Z","published":"2024-01-19T18:57:46Z","title":"SCENES: Subpixel Correspondence Estimation With Epipolar Supervision","summary":" Extracting point correspondences from two or more views of a scene is a\nfundamental computer vision problem with particular importance for relative\ncamera pose estimation and structure-from-motion. Existing local feature\nmatching approaches, trained with correspondence supervision on large-scale\ndatasets, obtain highly-accurate matches on the test sets. However, they do not\ngeneralise well to new datasets with different characteristics to those they\nwere trained on, unlike classic feature extractors. Instead, they require\nfinetuning, which assumes that ground-truth correspondences or ground-truth\ncamera poses and 3D structure are available. We relax this assumption by\nremoving the requirement of 3D structure, e.g., depth maps or point clouds, and\nonly require camera pose information, which can be obtained from odometry. We\ndo so by replacing correspondence losses with epipolar losses, which encourage\nputative matches to lie on the associated epipolar line. While weaker than\ncorrespondence supervision, we observe that this cue is sufficient for\nfinetuning existing models on new data. We then further relax the assumption of\nknown camera poses by using pose estimates in a novel bootstrapping approach.\nWe evaluate on highly challenging datasets, including an indoor drone dataset\nand an outdoor smartphone camera dataset, and obtain state-of-the-art results\nwithout strong supervision.\n","authors":["Dominik A. Kloepfer","João F. Henriques","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2401.10886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10874v1","updated":"2024-01-19T18:33:52Z","published":"2024-01-19T18:33:52Z","title":"Applications of flow models to the generation of correlated lattice QCD\n ensembles","summary":" Machine-learned normalizing flows can be used in the context of lattice\nquantum field theory to generate statistically correlated ensembles of lattice\ngauge fields at different action parameters. This work demonstrates how these\ncorrelations can be exploited for variance reduction in the computation of\nobservables. Three different proof-of-concept applications are demonstrated\nusing a novel residual flow architecture: continuum limits of gauge theories,\nthe mass dependence of QCD observables, and hadronic matrix elements based on\nthe Feynman-Hellmann approach. In all three cases, it is shown that statistical\nuncertainties are significantly reduced when machine-learned flows are\nincorporated as compared with the same calculations performed with uncorrelated\nensembles or direct reweighting.\n","authors":["Ryan Abbott","Aleksandar Botev","Denis Boyda","Daniel C. Hackett","Gurtej Kanwar","Sébastien Racanière","Danilo J. Rezende","Fernando Romero-López","Phiala E. Shanahan","Julian M. 
Urban"],"pdf_url":"https://arxiv.org/pdf/2401.10874v1.pdf","comment":"11 pages, 2 tables, 5 figures"},{"id":"http://arxiv.org/abs/2306.00119v2","updated":"2024-01-19T18:30:27Z","published":"2023-05-31T18:48:16Z","title":"Optimal Sets and Solution Paths of ReLU Networks","summary":" We develop an analytical framework to characterize the set of optimal ReLU\nneural networks by reformulating the non-convex training problem as a convex\nprogram. We show that the global optima of the convex parameterization are\ngiven by a polyhedral set and then extend this characterization to the optimal\nset of the non-convex training objective. Since all stationary points of the\nReLU training problem can be represented as optima of sub-sampled convex\nprograms, our work provides a general expression for all critical points of the\nnon-convex objective. We then leverage our results to provide an optimal\npruning algorithm for computing minimal networks, establish conditions for the\nregularization path of ReLU networks to be continuous, and develop sensitivity\nresults for minimal ReLU networks.\n","authors":["Aaron Mishkin","Mert Pilanci"],"pdf_url":"https://arxiv.org/pdf/2306.00119v2.pdf","comment":"Minor updates and corrections to clarify the role of merge/split\n symmetries in formation of ReLU optimal set and add missing sufficient\n conditions for all minimal models to have the same cardinality"},{"id":"http://arxiv.org/abs/2401.10862v1","updated":"2024-01-19T18:05:34Z","published":"2024-01-19T18:05:34Z","title":"Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs\n Without Fine-Tuning","summary":" Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type\nof attack that can coax these models into generating harmful and illegal\ncontent. In this paper, we show that pruning up to 20% of LLM parameters\nmarkedly increases their resistance to such attacks without additional training\nand without sacrificing their performance in standard benchmarks. Intriguingly,\nwe discovered that the enhanced safety observed post-pruning correlates to the\ninitial safety training level of the model, hinting that the effect of pruning\ncould be more general and may hold for other LLM behaviors beyond safety.\nAdditionally, we introduce a curated dataset of 225 harmful tasks across five\ncategories, inserted into ten different Jailbreaking prompts, showing that\npruning aids LLMs in concentrating attention on task-relevant tokens in\njailbreaking prompts. Lastly, our experiments reveal that the prominent chat\nmodels, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high\nsusceptibility to jailbreaking attacks, with some categories achieving nearly\n70-100% success rate. These insights underline the potential of pruning as a\ngeneralizable approach for improving LLM safety, reliability, and potentially\nother desired behaviors.\n","authors":["Adib Hasan","Ileana Rugina","Alex Wang"],"pdf_url":"https://arxiv.org/pdf/2401.10862v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10859v1","updated":"2024-01-19T18:03:21Z","published":"2024-01-19T18:03:21Z","title":"Ensembler: Combating model inversion attacks using model ensemble during\n collaborative inference","summary":" Deep learning models have exhibited remarkable performance across various\ndomains. Nevertheless, the burgeoning model sizes compel edge devices to\noffload a significant portion of the inference process to the cloud. 
While this\npractice offers numerous advantages, it also raises critical concerns regarding\nuser data privacy. In scenarios where the cloud server's trustworthiness is in\nquestion, the need for a practical and adaptable method to safeguard data\nprivacy becomes imperative. In this paper, we introduce Ensembler, an\nextensible framework designed to substantially increase the difficulty of\nconducting model inversion attacks for adversarial parties. Ensembler leverages\nmodel ensembling on the adversarial server, running in parallel with existing\napproaches that introduce perturbations to sensitive data during colloborative\ninference. Our experiments demonstrate that when combined with even basic\nGaussian noise, Ensembler can effectively shield images from reconstruction\nattacks, achieving recognition levels that fall below human performance in some\nstrict settings, significantly outperforming baseline methods lacking the\nEnsembler framework.\n","authors":["Dancheng Liu","Jinjun Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.10859v1.pdf","comment":"in submission"},{"id":"http://arxiv.org/abs/2401.10841v1","updated":"2024-01-19T17:40:50Z","published":"2024-01-19T17:40:50Z","title":"Using LLMs to discover emerging coded antisemitic hate-speech emergence\n in extremist social media","summary":" Online hate speech proliferation has created a difficult problem for social\nmedia platforms. A particular challenge relates to the use of coded language by\ngroups interested in both creating a sense of belonging for its users and\nevading detection. Coded language evolves quickly and its use varies over time.\nThis paper proposes a methodology for detecting emerging coded hate-laden\nterminology. The methodology is tested in the context of online antisemitic\ndiscourse. The approach considers posts scraped from social media platforms,\noften used by extremist users. The posts are scraped using seed expressions\nrelated to previously known discourse of hatred towards Jews. The method begins\nby identifying the expressions most representative of each post and calculating\ntheir frequency in the whole corpus. It filters out grammatically incoherent\nexpressions as well as previously encountered ones so as to focus on emergent\nwell-formed terminology. This is followed by an assessment of semantic\nsimilarity to known antisemitic terminology using a fine-tuned large language\nmodel, and subsequent filtering out of the expressions that are too distant\nfrom known expressions of hatred. Emergent antisemitic expressions containing\nterms clearly relating to Jewish topics are then removed to return only coded\nexpressions of hatred.\n","authors":["Dhanush Kikkisetti","Raza Ul Mustafa","Wendy Melillo","Roberto Corizzo","Zois Boukouvalas","Jeff Gill","Nathalie Japkowicz"],"pdf_url":"https://arxiv.org/pdf/2401.10841v1.pdf","comment":"9 pages, 4 figures, 2 algorithms, 3 tables"},{"id":"http://arxiv.org/abs/2309.14393v2","updated":"2024-01-19T17:33:44Z","published":"2023-09-25T14:50:04Z","title":"LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language\n Models","summary":" The carbon footprint associated with large language models (LLMs) is a\nsignificant concern, encompassing emissions from their training, inference,\nexperimentation, and storage processes, including operational and embodied\ncarbon emissions. An essential aspect is accurately estimating the carbon\nimpact of emerging LLMs even before their training, which heavily relies on GPU\nusage. 
Existing studies have reported the carbon footprint of LLM training, but\nonly one tool, mlco2, can predict the carbon footprint of new neural networks\nprior to physical training. However, mlco2 has several serious limitations. It\ncannot extend its estimation to dense or mixture-of-experts (MoE) LLMs,\ndisregards critical architectural parameters, focuses solely on GPUs, and\ncannot model embodied carbon footprints. Addressing these gaps, we introduce\n\\textit{\\carb}, an end-to-end carbon footprint projection model designed for\nboth dense and MoE LLMs. Compared to mlco2, \\carb~significantly enhances the\naccuracy of carbon footprint estimations for various LLMs. The source code is\nreleased at \\url{https://github.com/SotaroKaneda/MLCarbon}.\n","authors":["Ahmad Faiz","Sotaro Kaneda","Ruhan Wang","Rita Osi","Prateek Sharma","Fan Chen","Lei Jiang"],"pdf_url":"https://arxiv.org/pdf/2309.14393v2.pdf","comment":"15 pages, 8 figures"},{"id":"http://arxiv.org/abs/2211.13350v2","updated":"2024-01-19T17:33:36Z","published":"2022-11-23T23:31:14Z","title":"Choreographer: Learning and Adapting Skills in Imagination","summary":" Unsupervised skill learning aims to learn a rich repertoire of behaviors\nwithout external supervision, providing artificial agents with the ability to\ncontrol and influence the environment. However, without appropriate knowledge\nand exploration, skills may provide control only over a restricted area of the\nenvironment, limiting their applicability. Furthermore, it is unclear how to\nleverage the learned skill behaviors for adapting to downstream tasks in a\ndata-efficient manner. We present Choreographer, a model-based agent that\nexploits its world model to learn and adapt skills in imagination. Our method\ndecouples the exploration and skill learning processes, being able to discover\nskills in the latent state space of the model. During adaptation, the agent\nuses a meta-controller to evaluate and adapt the learned skills efficiently by\ndeploying them in parallel in imagination. Choreographer is able to learn\nskills both from offline data, and by collecting data simultaneously with an\nexploration policy. The skills can be used to effectively adapt to downstream\ntasks, as we show in the URL benchmark, where we outperform previous approaches\nfrom both pixels and states inputs. The learned skills also explore the\nenvironment thoroughly, finding sparse rewards more frequently, as shown in\ngoal-reaching tasks from the DMC Suite and Meta-World. Website and code:\nhttps://skillchoreographer.github.io/\n","authors":["Pietro Mazzaglia","Tim Verbelen","Bart Dhoedt","Alexandre Lacoste","Sai Rajeswar"],"pdf_url":"https://arxiv.org/pdf/2211.13350v2.pdf","comment":"Accepted at ICLR 2023 (notable top 25%)"},{"id":"http://arxiv.org/abs/2401.10831v1","updated":"2024-01-19T17:27:21Z","published":"2024-01-19T17:27:21Z","title":"Understanding Video Transformers via Universal Concept Discovery","summary":" This paper studies the problem of concept-based interpretability of\ntransformer representations for videos. Concretely, we seek to explain the\ndecision-making process of video transformers based on high-level,\nspatiotemporal concepts that are automatically discovered. Prior research on\nconcept-based interpretability has concentrated solely on image-level tasks.\nComparatively, video models deal with the added temporal dimension, increasing\ncomplexity and posing challenges in identifying dynamic concepts over time. 
In\nthis work, we systematically address these challenges by introducing the first\nVideo Transformer Concept Discovery (VTCD) algorithm. To this end, we propose\nan efficient approach for unsupervised identification of units of video\ntransformer representations - concepts, and ranking their importance to the\noutput of a model. The resulting concepts are highly interpretable, revealing\nspatio-temporal reasoning mechanisms and object-centric representations in\nunstructured video models. Performing this analysis jointly over a diverse set\nof supervised and self-supervised representations, we discover that some of\nthese mechanism are universal in video transformers. Finally, we demonstrate\nthat VTCDcan be used to improve model performance for fine-grained tasks.\n","authors":["Matthew Kowal","Achal Dave","Rares Ambrus","Adrien Gaidon","Konstantinos G. Derpanis","Pavel Tokmakov"],"pdf_url":"https://arxiv.org/pdf/2401.10831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10825v1","updated":"2024-01-19T17:21:05Z","published":"2024-01-19T17:21:05Z","title":"A survey on recent advances in named entity recognition","summary":" Named Entity Recognition seeks to extract substrings within a text that name\nreal-world objects and to determine their type (for example, whether they refer\nto persons or organizations). In this survey, we first present an overview of\nrecent popular approaches, but we also look at graph- and transformer- based\nmethods including Large Language Models (LLMs) that have not had much coverage\nin other surveys. Second, we focus on methods designed for datasets with scarce\nannotations. Third, we evaluate the performance of the main NER implementations\non a variety of datasets with differing characteristics (as regards their\ndomain, their size, and their number of classes). We thus provide a deep\ncomparison of algorithms that are never considered together. Our experiments\nshed some light on how the characteristics of datasets affect the behavior of\nthe methods that we compare.\n","authors":["Imed Keraghel","Stanislas Morbieu","Mohamed Nadif"],"pdf_url":"https://arxiv.org/pdf/2401.10825v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2310.12955v2","updated":"2024-01-19T17:12:23Z","published":"2023-10-19T17:54:39Z","title":"Towards Robust Offline Reinforcement Learning under Diverse Data\n Corruption","summary":" Offline reinforcement learning (RL) presents a promising approach for\nlearning reinforced policies from offline datasets without the need for costly\nor unsafe interactions with the environment. However, datasets collected by\nhumans in real-world environments are often noisy and may even be maliciously\ncorrupted, which can significantly degrade the performance of offline RL. In\nthis work, we first investigate the performance of current offline RL\nalgorithms under comprehensive data corruption, including states, actions,\nrewards, and dynamics. Our extensive experiments reveal that implicit\nQ-learning (IQL) demonstrates remarkable resilience to data corruption among\nvarious offline RL algorithms. Furthermore, we conduct both empirical and\ntheoretical analyses to understand IQL's robust performance, identifying its\nsupervised policy learning scheme as the key factor. Despite its relative\nrobustness, IQL still suffers from heavy-tail targets of Q functions under\ndynamics corruption. 
To tackle this challenge, we draw inspiration from robust\nstatistics to employ the Huber loss to handle the heavy-tailedness and utilize\nquantile estimators to balance penalization for corrupted data and learning\nstability. By incorporating these simple yet effective modifications into IQL,\nwe propose a more robust offline RL approach named Robust IQL (RIQL). Extensive\nexperiments demonstrate that RIQL exhibits highly robust performance when\nsubjected to diverse data corruption scenarios.\n","authors":["Rui Yang","Han Zhong","Jiawei Xu","Amy Zhang","Chongjie Zhang","Lei Han","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.12955v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10819v1","updated":"2024-01-19T17:09:32Z","published":"2024-01-19T17:09:32Z","title":"Optimisation in Neurosymbolic Learning Systems","summary":" Neurosymbolic AI aims to integrate deep learning with symbolic AI. This\nintegration has many promises, such as decreasing the amount of data required\nto train a neural network, improving the explainability and interpretability of\nanswers given by models and verifying the correctness of trained systems. We\nstudy neurosymbolic learning, where we have both data and background knowledge\nexpressed using symbolic languages. How do we connect the symbolic and neural\ncomponents to communicate this knowledge? One option is fuzzy reasoning, which\nstudies degrees of truth. For example, being tall is not a binary concept.\nInstead, probabilistic reasoning studies the probability that something is true\nor will happen. Our first research question studies how different forms of\nfuzzy reasoning combine with learning. We find surprising results like a\nconnection to the Raven paradox stating we confirm \"ravens are black\" when we\nobserve a green apple. In this study, we did not use the background knowledge\nwhen we deployed our models after training. In our second research question, we\nstudied how to use background knowledge in deployed models. We developed a new\nneural network layer based on fuzzy reasoning. Probabilistic reasoning is a\nnatural fit for neural networks, which we usually train to be probabilistic.\nHowever, they are expensive to compute and do not scale well to large tasks. In\nour third research question, we study how to connect probabilistic reasoning\nwith neural networks by sampling to estimate averages, while in the final\nresearch question, we study scaling probabilistic neurosymbolic learning to\nmuch larger problems than before. Our insight is to train a neural network with\nsynthetic data to predict the result of probabilistic reasoning.\n","authors":["Emile van Krieken"],"pdf_url":"https://arxiv.org/pdf/2401.10819v1.pdf","comment":"PhD dissertation"},{"id":"http://arxiv.org/abs/2401.10816v1","updated":"2024-01-19T17:03:37Z","published":"2024-01-19T17:03:37Z","title":"Co-Pilot for Health: Personalized Algorithmic AI Nudging to Improve\n Health Outcomes","summary":" The ability to shape health behaviors of large populations automatically,\nacross wearable types and disease conditions at scale has tremendous potential\nto improve global health outcomes. We designed and implemented an AI driven\nplatform for digital algorithmic nudging, enabled by a Graph-Neural Network\n(GNN) based Recommendation System, and granular health behavior data from\nwearable fitness devices. 
Here we describe the efficacy results of this\nplatform with its capabilities of personalized and contextual nudging to\n$n=84,764$ individuals over a 12-week period in Singapore. We statistically\nvalidated that participants in the target group who received such AI optimized\ndaily nudges increased daily physical activity like step count by 6.17% ($p =\n3.09\\times10^{-4}$) and weekly minutes of Moderate to Vigorous Physical\nActivity (MVPA) by 7.61% ($p = 1.16\\times10^{-2}$), compared to matched\nparticipants in control group who did not receive any nudges. Further, such\nnudges were very well received, with a 13.1% of nudges sent being opened (open\nrate), and 11.7% of the opened nudges rated useful compared to 1.9% rated as\nnot useful thereby demonstrating significant improvement in population level\nengagement metrics.\n","authors":["Jodi Chiam","Aloysius Lim","Cheryl Nott","Nicholas Mark","Ankur Teredesai","Sunil Shinde"],"pdf_url":"https://arxiv.org/pdf/2401.10816v1.pdf","comment":"19 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.10811v1","updated":"2024-01-19T16:56:11Z","published":"2024-01-19T16:56:11Z","title":"Simulation Based Bayesian Optimization","summary":" Bayesian Optimization (BO) is a powerful method for optimizing black-box\nfunctions by combining prior knowledge with ongoing function evaluations. BO\nconstructs a probabilistic surrogate model of the objective function given the\ncovariates, which is in turn used to inform the selection of future evaluation\npoints through an acquisition function. For smooth continuous search spaces,\nGaussian Processes (GPs) are commonly used as the surrogate model as they offer\nanalytical access to posterior predictive distributions, thus facilitating the\ncomputation and optimization of acquisition functions. However, in complex\nscenarios involving optimizations over categorical or mixed covariate spaces,\nGPs may not be ideal.\n This paper introduces Simulation Based Bayesian Optimization (SBBO) as a\nnovel approach to optimizing acquisition functions that only requires\n\\emph{sampling-based} access to posterior predictive distributions. SBBO allows\nthe use of surrogate probabilistic models tailored for combinatorial spaces\nwith discrete variables. Any Bayesian model in which posterior inference is\ncarried out through Markov chain Monte Carlo can be selected as the surrogate\nmodel in SBBO. In applications involving combinatorial optimization, we\ndemonstrate empirically the effectiveness of SBBO method using various choices\nof surrogate models.\n","authors":["Roi Naveiro","Becky Tang"],"pdf_url":"https://arxiv.org/pdf/2401.10811v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10809v1","updated":"2024-01-19T16:52:53Z","published":"2024-01-19T16:52:53Z","title":"Neglected Hessian component explains mysteries in Sharpness\n regularization","summary":" Recent work has shown that methods like SAM which either explicitly or\nimplicitly penalize second order information can improve generalization in deep\nlearning. Seemingly similar methods like weight noise and gradient penalties\noften fail to provide such benefits. We show that these differences can be\nexplained by the structure of the Hessian of the loss. First, we show that a\ncommon decomposition of the Hessian can be quantitatively interpreted as\nseparating the feature exploitation from feature exploration. 
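For reference, the Hessian decomposition alluded to above is usually the Gauss-Newton split; written in generic notation (our reconstruction, not the paper's exact formulation), for a per-example loss $\ell(f(x_i;\theta), y_i)$ summed over examples:

```latex
\nabla^2_{\theta} \mathcal{L}
  = \underbrace{\sum_i J_i^{\top} \frac{\partial^2 \ell}{\partial f^2} J_i}_{\text{Gauss--Newton term (feature exploitation)}}
  + \underbrace{\sum_i \sum_k \frac{\partial \ell}{\partial f_k} \nabla^2_{\theta} f_k(x_i;\theta)}_{\text{NME (feature exploration)}},
  \qquad J_i = \frac{\partial f(x_i;\theta)}{\partial \theta}.
```

The second term vanishes at interpolation because $\partial \ell / \partial f = 0$ on perfectly fitted points, which is why it is often neglected.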
The feature\nexploration, which can be described by the Nonlinear Modeling Error matrix\n(NME), is commonly neglected in the literature since it vanishes at\ninterpolation. Our work shows that the NME is in fact important as it can\nexplain why gradient penalties are sensitive to the choice of activation\nfunction. Using this insight we design interventions to improve performance. We\nalso provide evidence that challenges the long held equivalence of weight noise\nand gradient penalties. This equivalence relies on the assumption that the NME\ncan be ignored, which we find does not hold for modern networks since they\ninvolve significant feature learning. We find that regularizing feature\nexploitation but not feature exploration yields performance similar to gradient\npenalties.\n","authors":["Yann N. Dauphin","Atish Agarwala","Hossein Mobahi"],"pdf_url":"https://arxiv.org/pdf/2401.10809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.07626v3","updated":"2024-01-19T16:52:27Z","published":"2022-08-16T09:24:47Z","title":"Algorithmic Assistance with Recommendation-Dependent Preferences","summary":" When an algorithm provides risk assessments, we typically think of them as\nhelpful inputs to human decisions, such as when risk scores are presented to\njudges or doctors. However, a decision-maker may not only react to the\ninformation provided by the algorithm. The decision-maker may also view the\nalgorithmic recommendation as a default action, making it costly for them to\ndeviate, such as when a judge is reluctant to overrule a high-risk assessment\nfor a defendant or a doctor fears the consequences of deviating from\nrecommended procedures. To address such unintended consequences of algorithmic\nassistance, we propose a principal-agent model of joint human-machine\ndecision-making. Within this model, we consider the effect and design of\nalgorithmic recommendations when they affect choices not just by shifting\nbeliefs, but also by altering preferences. We motivate this assumption from\ninstitutional factors, such as a desire to avoid audits, as well as from\nwell-established models in behavioral science that predict loss aversion\nrelative to a reference point, which here is set by the algorithm. We show that\nrecommendation-dependent preferences create inefficiencies where the\ndecision-maker is overly responsive to the recommendation. As a potential\nremedy, we discuss algorithms that strategically withhold recommendations, and\nshow how they can improve the quality of final decisions.\n","authors":["Bryce McLaughlin","Jann Spiess"],"pdf_url":"https://arxiv.org/pdf/2208.07626v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10805v1","updated":"2024-01-19T16:48:49Z","published":"2024-01-19T16:48:49Z","title":"Learning to Visually Connect Actions and their Effects","summary":" In this work, we introduce the novel concept of visually Connecting Actions\nand Their Effects (CATE) in video understanding. CATE can have applications in\nareas like task planning and learning from demonstration. We propose different\nCATE-based task formulations, such as action selection and action\nspecification, where video understanding models connect actions and effects at\nsemantic and fine-grained levels. We observe that different formulations\nproduce representations capturing intuitive action properties. We also design\nvarious baseline models for action selection and action specification. 
Despite\nthe intuitive nature of the task, we observe that models struggle, and humans\noutperform them by a large margin. The study aims to establish a foundation for\nfuture efforts, showcasing the flexibility and versatility of connecting\nactions and effects in video understanding, with the hope of inspiring advanced\nformulations and models.\n","authors":["Eric Peh","Paritosh Parmar","Basura Fernando"],"pdf_url":"https://arxiv.org/pdf/2401.10805v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10800v1","updated":"2024-01-19T16:36:27Z","published":"2024-01-19T16:36:27Z","title":"Estimation of AMOC transition probabilities using a machine learning\n based rare-event algorithm","summary":" The Atlantic Meridional Overturning Circulation (AMOC) is an important\ncomponent of the global climate, known to be a tipping element, as it could\ncollapse under global warming. The main objective of this study is to compute\nthe probability that the AMOC collapses within a specified time window, using a\nrare-event algorithm called Trajectory-Adaptive Multilevel Splitting (TAMS).\nHowever, the efficiency and accuracy of TAMS depend on the choice of the score\nfunction. Although the definition of the optimal score function, called\n``committor function\" is known, it is impossible in general to compute it a\npriori. Here, we combine TAMS with a Next-Generation Reservoir Computing\ntechnique that estimates the committor function from the data generated by the\nrare-event algorithm. We test this technique in a stochastic box model of the\nAMOC for which two types of transition exist, the so-called F(ast)-transitions\nand S(low)-transitions. Results for the F-transtions compare favorably with\nthose in the literature where a physically-informed score function was used. We\nshow that coupling a rare-event algorithm with machine learning allows for a\ncorrect estimation of transition probabilities, transition times, and even\ntransition paths for a wide range of model parameters. We then extend these\nresults to the more difficult problem of S-transitions in the same model. In\nboth cases of F- and S-transitions, we also show how the Next-Generation\nReservoir Computing technique can be interpreted to retrieve an analytical\nestimate of the committor function.\n","authors":["Valérian Jacques-Dumas","René M. van Westen","Henk A. Dijkstra"],"pdf_url":"https://arxiv.org/pdf/2401.10800v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10799v1","updated":"2024-01-19T16:34:37Z","published":"2024-01-19T16:34:37Z","title":"Novel Representation Learning Technique using Graphs for Performance\n Analytics","summary":" The performance analytics domain in High Performance Computing (HPC) uses\ntabular data to solve regression problems, such as predicting the execution\ntime. Existing Machine Learning (ML) techniques leverage the correlations among\nfeatures given tabular datasets, not leveraging the relationships between\nsamples directly. Moreover, since high-quality embeddings from raw features\nimprove the fidelity of the downstream predictive models, existing methods rely\non extensive feature engineering and pre-processing steps, costing time and\nmanual effort. To fill these two gaps, we propose a novel idea of transforming\ntabular performance data into graphs to leverage the advancement of Graph\nNeural Network-based (GNN) techniques in capturing complex relationships\nbetween features and samples. 
In contrast to other ML application domains, such\nas social networks, the graph is not given; instead, we need to build it. To\naddress this gap, we propose graph-building methods where nodes represent\nsamples, and the edges are automatically inferred iteratively based on the\nsimilarity between the features in the samples. We evaluate the effectiveness\nof the generated embeddings from GNNs based on how well they make even a simple\nfeed-forward neural network perform for regression tasks compared to other\nstate-of-the-art representation learning techniques. Our evaluation\ndemonstrates that even with up to 25% random missing values for each dataset,\nour method outperforms commonly used graph and Deep Neural Network (DNN)-based\napproaches and achieves up to 61.67% & 78.56% improvement in MSE loss over the\nDNN baseline respectively for HPC dataset and Machine Learning Datasets.\n","authors":["Tarek Ramadan","Ankur Lahiry","Tanzima Z. Islam"],"pdf_url":"https://arxiv.org/pdf/2401.10799v1.pdf","comment":"This paper has been accepted at 22nd International Conference on\n Machine Learning and Applications (ICMLA2023)"},{"id":"http://arxiv.org/abs/2201.05158v3","updated":"2024-01-19T16:26:46Z","published":"2022-01-13T16:35:45Z","title":"Towards Quantum Graph Neural Networks: An Ego-Graph Learning Approach","summary":" Quantum machine learning is a fast-emerging field that aims to tackle machine\nlearning using quantum algorithms and quantum computing. Due to the lack of\nphysical qubits and an effective means to map real-world data from Euclidean\nspace to Hilbert space, most of these methods focus on quantum analogies or\nprocess simulations rather than devising concrete architectures based on\nqubits. In this paper, we propose a novel hybrid quantum-classical algorithm\nfor graph-structured data, which we refer to as the Ego-graph based Quantum\nGraph Neural Network (egoQGNN). egoQGNN implements the GNN theoretical\nframework using the tensor product and unity matrix representation, which\ngreatly reduces the number of model parameters required. When controlled by a\nclassical computer, egoQGNN can accommodate arbitrarily sized graphs by\nprocessing ego-graphs from the input graph using a modestly-sized quantum\ndevice. The architecture is based on a novel mapping from real-world data to\nHilbert space. This mapping maintains the distance relations present in the\ndata and reduces information loss. Experimental results show that the proposed\nmethod outperforms competitive state-of-the-art models with only 1.68\\%\nparameters compared to those models.\n","authors":["Xing Ai","Zhihong Zhang","Luzhe Sun","Junchi Yan","Edwin Hancock"],"pdf_url":"https://arxiv.org/pdf/2201.05158v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10794v1","updated":"2024-01-19T16:26:35Z","published":"2024-01-19T16:26:35Z","title":"Deep Reinforcement Learning Empowered Activity-Aware Dynamic Health\n Monitoring Systems","summary":" In smart healthcare, health monitoring utilizes diverse tools and\ntechnologies to analyze patients' real-time biosignal data, enabling immediate\nactions and interventions. Existing monitoring approaches were designed on the\npremise that medical devices track several health metrics concurrently,\ntailored to their designated functional scope. This means that they report all\nrelevant health values within that scope, which can result in excess resource\nuse and the gathering of extraneous data due to monitoring irrelevant health\nmetrics. 
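As a concrete (if simplified) stand-in for the graph-building step described above, one can connect each tabular sample to its nearest neighbours in feature space. The paper infers edges iteratively from feature similarity, so the k-NN construction below is only an assumed approximation; the feature matrix and the value of k are made up for illustration.

```python
import numpy as np
from sklearn.neighbors import kneighbors_graph

# toy tabular performance data: rows are samples (e.g. runs), columns are features
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 8))

# sparse adjacency matrix: each sample is linked to its 5 most similar samples,
# which can then be fed to a GNN together with the node (sample) features
adj = kneighbors_graph(X, n_neighbors=5, mode="connectivity", include_self=False)
print(adj.shape, adj.nnz)
```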
In this context, we propose Dynamic Activity-Aware Health Monitoring\nstrategy (DActAHM) for striking a balance between optimal monitoring\nperformance and cost efficiency, a novel framework based on Deep Reinforcement\nLearning (DRL) and SlowFast Model to ensure precise monitoring based on users'\nactivities. Specifically, with the SlowFast Model, DActAHM efficiently\nidentifies individual activities and captures these results for enhanced\nprocessing. Subsequently, DActAHM refines health metric monitoring in response\nto the identified activity by incorporating a DRL framework. Extensive\nexperiments comparing DActAHM against three state-of-the-art approaches\ndemonstrate it achieves 27.3% higher gain than the best-performing baseline\nthat fixes monitoring actions over timeline.\n","authors":["Ziqiaing Ye","Yulan Gao","Yue Xiao","Zehui Xiong","Dusit Niyato"],"pdf_url":"https://arxiv.org/pdf/2401.10794v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10791v1","updated":"2024-01-19T16:23:53Z","published":"2024-01-19T16:23:53Z","title":"Early alignment in two-layer networks training is a two-edged sword","summary":" Training neural networks with first order optimisation methods is at the core\nof the empirical success of deep learning. The scale of initialisation is a\ncrucial factor, as small initialisations are generally associated to a feature\nlearning regime, for which gradient descent is implicitly biased towards simple\nsolutions. This work provides a general and quantitative description of the\nearly alignment phase, originally introduced by Maennel et al. (2018) . For\nsmall initialisation and one hidden ReLU layer networks, the early stage of the\ntraining dynamics leads to an alignment of the neurons towards key directions.\nThis alignment induces a sparse representation of the network, which is\ndirectly related to the implicit bias of gradient flow at convergence. This\nsparsity inducing alignment however comes at the expense of difficulties in\nminimising the training objective: we also provide a simple data example for\nwhich overparameterised networks fail to converge towards global minima and\nonly converge to a spurious stationary point instead.\n","authors":["Etienne Boursier","Nicolas Flammarion"],"pdf_url":"https://arxiv.org/pdf/2401.10791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10790v1","updated":"2024-01-19T16:21:55Z","published":"2024-01-19T16:21:55Z","title":"Measuring the Impact of Scene Level Objects on Object Detection: Towards\n Quantitative Explanations of Detection Decisions","summary":" Although accuracy and other common metrics can provide a useful window into\nthe performance of an object detection model, they lack a deeper view of the\nmodel's decision process. Regardless of the quality of the training data and\nprocess, the features that an object detection model learns cannot be\nguaranteed. A model may learn a relationship between certain background\ncontext, i.e., scene level objects, and the presence of the labeled classes.\nFurthermore, standard performance verification and metrics would not identify\nthis phenomenon. This paper presents a new black box explainability method for\nadditional verification of object detection models by finding the impact of\nscene level objects on the identification of the objects within the image. By\ncomparing the accuracies of a model on test data with and without certain scene\nlevel objects, the contributions of these objects to the model's performance\nbecomes clearer. 
The experiment presented here will assess the impact of\nbuildings and people in image context on the detection of emergency road\nvehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the\npresence of a scene level object will indicate the model's reliance on that\nobject to make its detections. The results of this research lead to providing a\nquantitative explanation of the object detection model's decision process,\nenabling a deeper understanding of the model's performance.\n","authors":["Lynn Vonder Haar","Timothy Elvira","Luke Newcomb","Omar Ochoa"],"pdf_url":"https://arxiv.org/pdf/2401.10790v1.pdf","comment":"9 pages, 4 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.07961v2","updated":"2024-01-19T15:55:16Z","published":"2024-01-15T20:57:50Z","title":"Solution of the Probabilistic Lambert Problem: Connections with Optimal\n Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs","summary":" Lambert's problem concerns with transferring a spacecraft from a given\ninitial to a given terminal position within prescribed flight time via velocity\ncontrol subject to a gravitational force field. We consider a probabilistic\nvariant of the Lambert problem where the knowledge of the endpoint constraints\nin position vectors are replaced by the knowledge of their respective joint\nprobability density functions. We show that the Lambert problem with endpoint\njoint probability density constraints is a generalized optimal mass transport\n(OMT) problem, thereby connecting this classical astrodynamics problem with a\nburgeoning area of research in modern stochastic control and stochastic machine\nlearning. This newfound connection allows us to rigorously establish the\nexistence and uniqueness of solution for the probabilistic Lambert problem. The\nsame connection also helps to numerically solve the probabilistic Lambert\nproblem via diffusion regularization, i.e., by leveraging further connection of\nthe OMT with the Schr\\\"odinger bridge problem (SBP). This also shows that the\nprobabilistic Lambert problem with additive dynamic process noise is in fact a\ngeneralized SBP, and can be solved numerically using the so-called\nSchr\\\"odinger factors, as we do in this work. We explain how the resulting\nanalysis leads to solving a boundary-coupled system of reaction-diffusion PDEs\nwhere the nonlinear gravitational potential appears as the reaction rate. We\npropose novel algorithms for the same, and present illustrative numerical\nresults. Our analysis and the algorithmic framework are nonparametric, i.e., we\nmake neither statistical (e.g., Gaussian, first few moments, mixture or\nexponential family, finite dimensionality of the sufficient statistic) nor\ndynamical (e.g., Taylor series) approximations.\n","authors":["Alexis M. H. 
Teter","Iman Nodozi","Abhishek Halder"],"pdf_url":"https://arxiv.org/pdf/2401.07961v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10774v1","updated":"2024-01-19T15:48:40Z","published":"2024-01-19T15:48:40Z","title":"Medusa: Simple LLM Inference Acceleration Framework with Multiple\n Decoding Heads","summary":" The inference process in Large Language Models (LLMs) is often limited due to\nthe absence of parallelism in the auto-regressive decoding process, resulting\nin most operations being restricted by the memory bandwidth of accelerators.\nWhile methods such as speculative decoding have been suggested to address this\nissue, their implementation is impeded by the challenges associated with\nacquiring and maintaining a separate draft model. In this paper, we present\nMedusa, an efficient method that augments LLM inference by adding extra\ndecoding heads to predict multiple subsequent tokens in parallel. Using a\ntree-based attention mechanism, Medusa constructs multiple candidate\ncontinuations and verifies them simultaneously in each decoding step. By\nleveraging parallel processing, Medusa introduces only minimal overhead in\nterms of single-step latency while substantially reducing the number of\ndecoding steps required.\n We present two levels of fine-tuning procedures for Medusa to meet the needs\nof different use cases: Medusa-1: Medusa is directly fine-tuned on top of a\nfrozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa\nis fine-tuned together with the backbone LLM, enabling better prediction\naccuracy of Medusa heads and higher speedup but needing a special training\nrecipe that preserves the backbone model's capabilities.\n Moreover, we propose several extensions that improve or expand the utility of\nMedusa, including a self-distillation to handle situations where no training\ndata is available and a typical acceptance scheme to boost the acceptance rate\nwhile maintaining generation quality. We evaluate Medusa on models of various\nsizes and training procedures. Our experiments demonstrate that Medusa-1 can\nachieve over 2.2x speedup without compromising generation quality, while\nMedusa-2 further improves the speedup to 2.3-3.6x.\n","authors":["Tianle Cai","Yuhong Li","Zhengyang Geng","Hongwu Peng","Jason D. Lee","Deming Chen","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2401.10774v1.pdf","comment":"The code for this implementation is available at\n https://github.com/FasterDecoding/Medusa"},{"id":"http://arxiv.org/abs/2401.10765v1","updated":"2024-01-19T15:37:11Z","published":"2024-01-19T15:37:11Z","title":"Starlit: Privacy-Preserving Federated Learning to Enhance Financial\n Fraud Detection","summary":" Federated Learning (FL) is a data-minimization approach enabling\ncollaborative model training across diverse clients with local data, avoiding\ndirect data exchange. However, state-of-the-art FL solutions to identify\nfraudulent financial transactions exhibit a subset of the following\nlimitations. 
They (1) lack a formal security definition and proof, (2) assume\nprior freezing of suspicious customers' accounts by financial institutions\n(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$\ncomputationally expensive modular exponentiation (where $n$ is the total number\nof financial institutions) or highly inefficient fully homomorphic encryption,\n(4) assume the parties have already completed the identity alignment phase,\nhence excluding it from the implementation, performance evaluation, and\nsecurity analysis, and (5) struggle to resist clients' dropouts. This work\nintroduces Starlit, a novel scalable privacy-preserving FL mechanism that\novercomes these limitations. It has various applications, such as enhancing\nfinancial fraud detection, mitigating terrorism, and enhancing digital health.\nWe implemented Starlit and conducted a thorough performance analysis using\nsynthetic data from a key player in global financial transactions. The\nevaluation indicates Starlit's scalability, efficiency, and accuracy.\n","authors":["Aydin Abadi","Bradley Doyle","Francesco Gini","Kieron Guinamard","Sasi Kumar Murakonda","Jack Liddell","Paul Mellor","Steven J. Murdoch","Mohammad Naseri","Hector Page","George Theodorakopoulos","Suzanne Weller"],"pdf_url":"https://arxiv.org/pdf/2401.10765v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17046v2","updated":"2024-01-19T15:33:12Z","published":"2023-03-29T22:18:47Z","title":"Have it your way: Individualized Privacy Assignment for DP-SGD","summary":" When training a machine learning model with differential privacy, one sets a\nprivacy budget. This budget represents a maximal privacy violation that any\nuser is willing to face by contributing their data to the training set. We\nargue that this approach is limited because different users may have different\nprivacy expectations. Thus, setting a uniform privacy budget across all points\nmay be overly conservative for some users or, conversely, not sufficiently\nprotective for others. In this paper, we capture these preferences through\nindividualized privacy budgets. To demonstrate their practicality, we introduce\na variant of Differentially Private Stochastic Gradient Descent (DP-SGD) which\nsupports such individualized budgets. DP-SGD is the canonical approach to\ntraining models with differential privacy. We modify its data sampling and\ngradient noising mechanisms to arrive at our approach, which we call\nIndividualized DP-SGD (IDP-SGD). Because IDP-SGD provides privacy guarantees\ntailored to the preferences of individual users and their data points, we find\nit empirically improves privacy-utility trade-offs.\n","authors":["Franziska Boenisch","Christopher Mühl","Adam Dziedzic","Roy Rinberg","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2303.17046v2.pdf","comment":"Published at NeurIPS'2024"},{"id":"http://arxiv.org/abs/2205.14102v3","updated":"2024-01-19T15:30:04Z","published":"2022-05-27T17:12:26Z","title":"Group-level Brain Decoding with Deep Learning","summary":" Decoding brain imaging data are gaining popularity, with applications in\nbrain-computer interfaces and the study of neural representations. Decoding is\ntypically subject-specific and does not generalise well over subjects, due to\nhigh amounts of between-subject variability. Techniques that overcome this will\nnot only provide richer neuroscientific insights but also make it possible for\ngroup-level models to out-perform subject-specific models. 
Here, we propose a\nmethod that uses subject embedding, analogous to word embedding in natural\nlanguage processing, to learn and exploit the structure in between-subject\nvariability as part of a decoding model, our adaptation of the WaveNet\narchitecture for classification. We apply this to magnetoencephalography data,\nwhere 15 subjects viewed 118 different images, with 30 examples per image; to\nclassify images using the entire 1 s window following image presentation. We\nshow that the combination of deep learning and subject embedding is crucial to\nclosing the performance gap between subject- and group-level decoding models.\nImportantly, group models outperform subject models on low-accuracy subjects\n(although slightly impair high-accuracy subjects) and can be helpful for\ninitialising subject models. While we have not generally found\ngroup-level models to perform better than subject-level models, the performance\nof group modelling is expected to be even higher with bigger datasets. In order\nto provide physiological interpretation at the group level, we make use of\npermutation feature importance. This provides insights into the spatiotemporal\nand spectral information encoded in the models. All code is available on GitHub\n(https://github.com/ricsinaruto/MEG-group-decode).\n","authors":["Richard Csaky","Mats Van Es","Oiwi Parker Jones","Mark Woolrich"],"pdf_url":"https://arxiv.org/pdf/2205.14102v3.pdf","comment":"Published in Human Brain Mapping"},{"id":"http://arxiv.org/abs/2401.10754v1","updated":"2024-01-19T15:25:09Z","published":"2024-01-19T15:25:09Z","title":"Data Augmentation for Traffic Classification","summary":" Data Augmentation (DA) -- enriching training data by adding synthetic samples\n-- is a technique widely adopted in Computer Vision (CV) and Natural Language\nProcessing (NLP) tasks to improve model performance. Yet, DA has struggled to\ngain traction in networking contexts, particularly in Traffic Classification\n(TC) tasks. In this work, we fill this gap by benchmarking 18 augmentation\nfunctions applied to 3 TC datasets using packet time series as input\nrepresentation and considering a variety of training conditions. Our results\nshow that (i) DA can reap benefits previously unexplored with (ii)\naugmentations acting on time series sequence order and masking being better\nsuited to TC and (iii) simple latent space analysis can provide hints about why\naugmentations have positive or negative effects.\n","authors":["Chao Wang","Alessandro Finamore","Pietro Michiardi","Massimo Gallo","Dario Rossi"],"pdf_url":"https://arxiv.org/pdf/2401.10754v1.pdf","comment":"to appear at Passive and Active Measurements (PAM), 2024"},{"id":"http://arxiv.org/abs/2401.10753v1","updated":"2024-01-19T15:22:28Z","published":"2024-01-19T15:22:28Z","title":"BoolGebra: Attributed Graph-learning for Boolean Algebraic Manipulation","summary":" Boolean algebraic manipulation is at the core of logic synthesis in\nElectronic Design Automation (EDA) design flow. Existing methods struggle to\nfully exploit optimization opportunities, and often suffer from an explosive\nsearch space and limited scalability efficiency. This work presents BoolGebra,\na novel attributed graph-learning approach for Boolean algebraic manipulation\nthat aims to improve fundamental logic synthesis. BoolGebra incorporates Graph\nNeural Networks (GNNs) and takes initial feature embeddings from both\nstructural and functional information as inputs. 
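To make the "masking on packet time series" family of augmentations concrete, here is a minimal sketch that zeroes a random contiguous chunk of a per-packet size sequence; the mask fraction and the toy series are assumptions for illustration, not one of the 18 benchmarked functions verbatim.

```python
import numpy as np

def mask_augment(packet_series, mask_frac=0.1, rng=None):
    """Zero out one random contiguous segment of a packet time series."""
    if rng is None:
        rng = np.random.default_rng()
    x = np.asarray(packet_series, dtype=float).copy()
    width = max(1, int(len(x) * mask_frac))
    start = rng.integers(0, len(x) - width + 1)
    x[start:start + width] = 0.0
    return x

series = [1500, 52, 1500, 1500, 40, 1500, 620, 52, 1500, 40]  # packet sizes
print(mask_augment(series, mask_frac=0.3, rng=np.random.default_rng(0)))
```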
A fully connected neural\nnetwork is employed as the predictor for direct optimization result\npredictions, significantly reducing the search space and efficiently locating\nthe optimization space. The experiments involve training the BoolGebra model\nw.r.t design-specific and cross-design inferences using the trained model,\nwhere BoolGebra demonstrates generalizability for cross-design inference and\nits potential to scale from small, simple training datasets to large, complex\ninference datasets. Finally, BoolGebra is integrated with existing synthesis\ntool ABC to perform end-to-end logic minimization evaluation w.r.t SOTA\nbaselines.\n","authors":["Yingjie Li","Anthony Agnesina","Yanqing Zhang","Haoxing Ren","Cunxi Yu"],"pdf_url":"https://arxiv.org/pdf/2401.10753v1.pdf","comment":"DATE 2024 extended version. arXiv admin note: text overlap with\n arXiv:2310.07846"},{"id":"http://arxiv.org/abs/2310.13384v2","updated":"2024-01-19T15:19:54Z","published":"2023-10-20T09:53:55Z","title":"Salted Inference: Enhancing Privacy while Maintaining Efficiency of\n Split Inference in Mobile Computing","summary":" In split inference, a deep neural network (DNN) is partitioned to run the\nearly part of the DNN at the edge and the later part of the DNN in the cloud.\nThis meets two key requirements for on-device machine learning: input privacy\nand computation efficiency. Still, an open question in split inference is\noutput privacy, given that the outputs of the DNN are observable in the cloud.\nWhile encrypted computing can protect output privacy too, homomorphic\nencryption requires substantial computation and communication resources from\nboth edge and cloud devices. In this paper, we introduce Salted DNNs: a novel\napproach that enables clients at the edge, who run the early part of the DNN,\nto control the semantic interpretation of the DNN's outputs at inference time.\nOur proposed Salted DNNs maintain classification accuracy and computation\nefficiency very close to the standard DNN counterparts. Experimental\nevaluations conducted on both images and wearable sensor data demonstrate that\nSalted DNNs attain classification accuracy very close to standard DNNs,\nparticularly when the Salted Layer is positioned within the early part to meet\nthe requirements of split inference. Our approach is general and can be applied\nto various types of DNNs. As a benchmark for future studies, we open-source our\ncode.\n","authors":["Mohammad Malekzadeh","Fahim Kawsar"],"pdf_url":"https://arxiv.org/pdf/2310.13384v2.pdf","comment":"To be appeared in the 25th International Workshop on Mobile Computing\n Systems and Applications (HotMobile 2024)"},{"id":"http://arxiv.org/abs/2305.03077v2","updated":"2024-01-19T15:16:37Z","published":"2023-05-04T18:00:01Z","title":"Explaining dark matter halo density profiles with neural networks","summary":" We use explainable neural networks to connect the evolutionary history of\ndark matter halos with their density profiles. The network captures independent\nfactors of variation in the density profiles within a low-dimensional\nrepresentation, which we physically interpret using mutual information. Without\nany prior knowledge of the halos' evolution, the network recovers the known\nrelation between the early time assembly and the inner profile, and discovers\nthat the profile beyond the virial radius is described by a single parameter\ncapturing the most recent mass accretion rate. 
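The output-scrambling idea behind Salted DNNs can be illustrated with a class-index permutation derived from a client-held salt, so that only the edge device can map the cloud's output positions back to real labels. This is only a loose sketch of the concept (the paper inserts a dedicated Salted Layer inside the early part of the network); the hashing scheme, salt string, and class count are arbitrary assumptions.

```python
import hashlib
import numpy as np

def salted_permutation(num_classes, salt):
    """Derive a deterministic class-index permutation from a client-held salt."""
    seed = int.from_bytes(hashlib.sha256(salt.encode()).digest()[:4], "big")
    return np.random.default_rng(seed).permutation(num_classes)

# assume training labels were remapped as y -> perm[y], so output slot perm[y] scores class y
perm = salted_permutation(10, "client-secret")            # kept on the edge device
cloud_logits = np.random.default_rng(0).normal(size=10)   # semantics hidden from the cloud
client_logits = cloud_logits[perm]                        # client restores true class order
print(int(np.argmax(client_logits)))
```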
The results illustrate the\npotential for machine-assisted scientific discovery in complicated\nastrophysical datasets.\n","authors":["Luisa Lucie-Smith","Hiranya V. Peiris","Andrew Pontzen"],"pdf_url":"https://arxiv.org/pdf/2305.03077v2.pdf","comment":"7 pages, 5 figures. Minor changes to match version accepted for\n publication in PRL"},{"id":"http://arxiv.org/abs/2401.10746v1","updated":"2024-01-19T15:13:30Z","published":"2024-01-19T15:13:30Z","title":"A Systematic Evaluation of Euclidean Alignment with Deep Learning for\n EEG Decoding","summary":" Electroencephalography (EEG) signals are frequently used for various\nBrain-Computer Interface (BCI) tasks. While Deep Learning (DL) techniques have\nshown promising results, they are hindered by the substantial data\nrequirements. By leveraging data from multiple subjects, transfer learning\nenables more effective training of DL models. A technique that is gaining\npopularity is Euclidean Alignment (EA) due to its ease of use, low\ncomputational complexity, and compatibility with Deep Learning models. However,\nfew studies evaluate its impact on the training performance of shared and\nindividual DL models. In this work, we systematically evaluate the effect of EA\ncombined with DL for decoding BCI signals. We used EA to train shared models\nwith data from multiple subjects and evaluated its transferability to new\nsubjects. Our experimental results show that it improves decoding in the target\nsubject by 4.33% and decreases convergence time by more than 70%. We also\ntrained individual models for each subject to use as a majority-voting ensemble\nclassifier. In this scenario, using EA improved the 3-model ensemble accuracy\nby 3.7%. However, when compared to the shared model with EA, the ensemble\naccuracy was 3.62% lower.\n","authors":["Bruna Junqueira","Bruno Aristimunha","Sylvain Chevallier","Raphael Y. de Camargo"],"pdf_url":"https://arxiv.org/pdf/2401.10746v1.pdf","comment":"14 pages and 10 figures"},{"id":"http://arxiv.org/abs/2401.09796v2","updated":"2024-01-19T15:09:45Z","published":"2024-01-18T08:33:09Z","title":"A Fast, Performant, Secure Distributed Training Framework For Large\n Language Model","summary":" The distributed (federated) LLM is an important method for co-training the\ndomain-specific LLM using siloed data. However, maliciously stealing model\nparameters and data from the server or client side has become an urgent problem\nto be solved. In this paper, we propose a secure distributed LLM based on model\nslicing. In this case, we deploy the Trusted Execution Environment (TEE) on\nboth the client and server side, and put the fine-tuned structure (LoRA or\nembedding of P-tuning v2) into the TEE. Then, secure communication is executed\nin the TEE and general environments through lightweight encryption. In order to\nfurther reduce the equipment cost as well as increase the model performance and\naccuracy, we propose a split fine-tuning scheme. In particular, we split the\nLLM by layers and place the latter layers in a server-side TEE (the client does\nnot need a TEE). We then combine the proposed Sparsification Parameter\nFine-tuning (SPF) with the LoRA part to improve the accuracy of the downstream\ntask. 
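Euclidean Alignment itself is compact enough to sketch: each subject's trials are whitened by the inverse square root of their mean spatial covariance, making data from different subjects more comparable before training a shared model. Below is a minimal NumPy version of this standard recipe; the array shapes and random data are chosen arbitrarily for the example.

```python
import numpy as np

def euclidean_alignment(trials):
    """Align EEG trials of shape (n_trials, n_channels, n_times) by whitening
    with the inverse square root of the mean spatial covariance."""
    covs = np.stack([x @ x.T / x.shape[1] for x in trials])
    r_mean = covs.mean(axis=0)
    vals, vecs = np.linalg.eigh(r_mean)  # r_mean is symmetric positive semi-definite
    r_inv_sqrt = vecs @ np.diag(1.0 / np.sqrt(np.maximum(vals, 1e-12))) @ vecs.T
    return np.stack([r_inv_sqrt @ x for x in trials])

trials = np.random.default_rng(0).normal(size=(20, 8, 256))  # 20 trials, 8 channels
print(euclidean_alignment(trials).shape)                     # (20, 8, 256)
```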
Numerous experiments have shown that our method guarantees accuracy while\nmaintaining security.\n","authors":["Wei Huang","Yinggui Wang","Anda Cheng","Aihui Zhou","Chaofan Yu","Lei Wang"],"pdf_url":"https://arxiv.org/pdf/2401.09796v2.pdf","comment":"Accepted by ICASSP 2024 (Federated LLM)"},{"id":"http://arxiv.org/abs/2306.17248v2","updated":"2024-01-19T15:01:52Z","published":"2023-06-29T18:34:37Z","title":"TemperatureGAN: Generative Modeling of Regional Atmospheric Temperatures","summary":" Stochastic generators are useful for estimating climate impacts on various\nsectors. Projecting climate risk in various sectors, e.g. energy systems,\nrequires generators that are accurate (statistical resemblance to\nground-truth), reliable (do not produce erroneous examples), and efficient.\nLeveraging data from the North American Land Data Assimilation System, we\nintroduce TemperatureGAN, a Generative Adversarial Network conditioned on\nmonths, locations, and time periods, to generate 2m above ground atmospheric\ntemperatures at an hourly resolution. We propose evaluation methods and metrics\nto measure the quality of generated samples. We show that TemperatureGAN\nproduces high-fidelity examples with good spatial representation and temporal\ndynamics consistent with known diurnal cycles.\n","authors":["Emmanuel Balogun","Ram Rajagopal","Arun Majumdar"],"pdf_url":"https://arxiv.org/pdf/2306.17248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.09234v2","updated":"2024-01-19T14:57:06Z","published":"2023-12-14T18:57:16Z","title":"Let's do the time-warp-attend: Learning topological invariants of\n dynamical systems","summary":" Dynamical systems across the sciences, from electrical circuits to ecological\nnetworks, undergo qualitative and often catastrophic changes in behavior,\ncalled bifurcations, when their underlying parameters cross a threshold.\nExisting methods predict oncoming catastrophes in individual systems but are\nprimarily time-series-based and struggle both to categorize qualitative\ndynamical regimes across diverse systems and to generalize to real data. To\naddress this challenge, we propose a data-driven, physically-informed\ndeep-learning framework for classifying dynamical regimes and characterizing\nbifurcation boundaries based on the extraction of topologically invariant\nfeatures. We focus on the paradigmatic case of the supercritical Hopf\nbifurcation, which is used to model periodic dynamics across a wide range of\napplications. Our convolutional attention method is trained with data\naugmentations that encourage the learning of topological invariants which can\nbe used to detect bifurcation boundaries in unseen systems and to design models\nof biological systems like oscillatory gene regulatory networks. We further\ndemonstrate our method's use in analyzing real data by recovering distinct\nproliferation and differentiation dynamics along pancreatic endocrinogenesis\ntrajectory in gene expression space based on single-cell data. 
Our method\nprovides valuable insights into the qualitative, long-term behavior of a wide\nrange of dynamical systems, and can detect bifurcations or catastrophic\ntransitions in large-scale physical and biological systems.\n","authors":["Noa Moriel","Matthew Ricci","Mor Nitzan"],"pdf_url":"https://arxiv.org/pdf/2312.09234v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.02901v2","updated":"2024-01-19T14:53:51Z","published":"2023-03-06T05:35:32Z","title":"$α$-divergence Improves the Entropy Production Estimation via\n Machine Learning","summary":" Recent years have seen a surge of interest in the algorithmic estimation of\nstochastic entropy production (EP) from trajectory data via machine learning. A\ncrucial element of such algorithms is the identification of a loss function\nwhose minimization guarantees the accurate EP estimation. In this study, we\nshow that there exists a host of loss functions, namely those implementing a\nvariational representation of the $\\alpha$-divergence, which can be used for\nthe EP estimation. By fixing $\\alpha$ to a value between $-1$ and $0$, the\n$\\alpha$-NEEP (Neural Estimator for Entropy Production) exhibits a much more\nrobust performance against strong nonequilibrium driving or slow dynamics,\nwhich adversely affects the existing method based on the Kullback-Leibler\ndivergence ($\\alpha = 0$). In particular, the choice of $\\alpha = -0.5$ tends\nto yield the optimal results. To corroborate our findings, we present an\nexactly solvable simplification of the EP estimation problem, whose loss\nfunction landscape and stochastic properties give deeper intuition into the\nrobustness of the $\\alpha$-NEEP.\n","authors":["Euijoon Kwon","Yongjoo Baek"],"pdf_url":"https://arxiv.org/pdf/2303.02901v2.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10726v1","updated":"2024-01-19T14:43:04Z","published":"2024-01-19T14:43:04Z","title":"Empowering Aggregators with Practical Data-Driven Tools: Harnessing\n Aggregated and Disaggregated Flexibility for Demand Response","summary":" This study explores the crucial interplay between aggregators and building\noccupants in activating flexibility through Demand Response (DR) programs, with\na keen focus on achieving robust decarbonization and fortifying the resilience\nof the energy system amidst the uncertainties presented by Renewable Energy\nSources (RES). Firstly, it introduces a methodology of optimizing aggregated\nflexibility provision strategies in environments with limited data, utilizing\nDiscrete Fourier Transformation (DFT) and clustering techniques to identify\nbuilding occupant's activity patterns. Secondly, the study assesses the\ndisaggregated flexibility provision of Heating Ventilation and Air Conditioning\n(HVAC) systems during DR events, employing machine learning and optimization\ntechniques for precise, device-level analysis. The first approach offers a\nnon-intrusive pathway for aggregators to provide flexibility services in\nenvironments of a single smart meter for the whole building's consumption,\nwhile the second approach carefully considers building occupants' thermal\ncomfort profiles, while maximizing flexibility in case of existence of\ndedicated smart meters to the HVAC systems. 
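A rough sketch of the "DFT plus clustering" step for identifying occupant activity patterns: describe each daily consumption profile by the magnitudes of its first few Fourier coefficients and cluster those descriptors. The number of coefficients and clusters, and the random profiles, are assumptions for illustration rather than the paper's configuration.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
profiles = rng.normal(size=(200, 96))        # 200 days of 15-minute smart-meter readings

# low-frequency DFT magnitudes as a compact descriptor of each daily pattern
features = np.abs(np.fft.rfft(profiles, axis=1))[:, :8]

labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(features)
print(np.bincount(labels))                   # size of each activity-pattern cluster
```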
Through the application of\ndata-driven techniques and encompassing case studies from both industrial and\nresidential buildings, this paper not only unveils pivotal opportunities for\naggregators in the balancing and emerging flexibility markets but also\nsuccessfully develops end-to-end practical tools for aggregators. Furthermore,\nthe efficacy of this tool is validated through detailed case studies,\nsubstantiating its operational capability and contributing to the evolution of\na resilient and efficient energy system.\n","authors":["Costas Mylonas","Donata Boric","Leila Luttenberger Maric","Alexandros Tsitsanis","Eleftheria Petrianou","Magda Foti"],"pdf_url":"https://arxiv.org/pdf/2401.10726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10724v1","updated":"2024-01-19T14:36:01Z","published":"2024-01-19T14:36:01Z","title":"Real-Time Zero-Day Intrusion Detection System for Automotive Controller\n Area Network on FPGAs","summary":" Increasing automation in vehicles enabled by increased connectivity to the\noutside world has exposed vulnerabilities in previously siloed automotive\nnetworks like controller area networks (CAN). Attributes of CAN such as\nbroadcast-based communication among electronic control units (ECUs) that\nlowered deployment costs are now being exploited to carry out active injection\nattacks like denial of service (DoS), fuzzing, and spoofing attacks. Research\nliterature has proposed multiple supervised machine learning models deployed as\nIntrusion detection systems (IDSs) to detect such malicious activity; however,\nthese are largely limited to identifying previously known attack vectors. With\nthe ever-increasing complexity of active injection attacks, detecting zero-day\n(novel) attacks in these networks in real-time (to prevent propagation) becomes\na problem of particular interest. This paper presents an\nunsupervised-learning-based convolutional autoencoder architecture for\ndetecting zero-day attacks, which is trained only on benign (attack-free) CAN\nmessages. We quantise the model using Vitis-AI tools from AMD/Xilinx targeting\na resource-constrained Zynq Ultrascale platform as our IDS-ECU system for\nintegration. The proposed model successfully achieves equal or higher\nclassification accuracy (> 99.5%) on unseen DoS, fuzzing, and spoofing attacks\nfrom a publicly available attack dataset when compared to the state-of-the-art\nunsupervised learning-based IDSs. Additionally, by cleverly overlapping IDS\noperation on a window of CAN messages with the reception, the model is able to\nmeet line-rate detection (0.43 ms per window) of high-speed CAN, which when\ncoupled with the low energy consumption per inference, makes this architecture\nideally suited for detecting zero-day attacks on critical CAN networks.\n","authors":["Shashwat Khandelwal","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10724v1.pdf","comment":"8 pages, 6 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.03976v2","updated":"2024-01-19T14:34:47Z","published":"2023-11-07T13:24:01Z","title":"A Foundation Graph Model","summary":" The principal benefit of unsupervised graph representation learning is that a\npre-trained model can be fine-tuned where data or labels are scarce. Existing\napproaches are domain specific, maintaining consistent node and edge attributes\nacross the pre-training and target datasets. This precludes transfer to other\ndomains. 
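The zero-day detection logic of the autoencoder IDS above ultimately reduces to thresholding reconstruction error, with the threshold learned from benign traffic only. A minimal sketch of that decision rule follows; the quantile and the synthetic error distribution are assumptions, and the paper's quantised FPGA pipeline is not reproduced.

```python
import numpy as np

def fit_threshold(benign_errors, quantile=0.995):
    """Pick an anomaly threshold from reconstruction errors on benign-only CAN windows."""
    return float(np.quantile(benign_errors, quantile))

def is_attack(window_error, threshold):
    """Flag a CAN message window whose reconstruction error exceeds the benign threshold."""
    return window_error > threshold

benign_errors = np.random.default_rng(0).gamma(shape=2.0, scale=0.01, size=10_000)
thr = fit_threshold(benign_errors)
print(thr, is_attack(0.25, thr))   # an unusually poorly reconstructed window is flagged
```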
A model capable of positive transfer on arbitrary tasks and domains\nwould represent the first foundation graph model.\n In this work we use adversarial contrastive learning to present FoToM, a\ngraph pre-training method based on node and edge feature exclusion. We use\nFoToM to pre-train models over multiple graph domains, producing the first\nfoundation graph models. We demonstrate positive transfer on evaluation\ndatasets from multiple domains, including domains not present in pre-training\ndata. On all datasets performance is at worst on-par and on 76% significantly\nbetter than a supervised baseline ($P \\leq 0.01$), with an 8 to 40% reduction\nin error at 95% confidence. Contrary to other research, pre-training on a\ndataset with the target domain excluded leads to better performance than\npre-training on a dataset from only the target domain. The multi-domain model\nat worst matches, and on 56% of tasks significantly outperforms, single-domain\nmodels ($P \\leq 0.01$). These results include when node labels are used in evaluation,\nwhere performance is consistently superior to single-domain or non-pre-trained\nmodels. Notably, FoToM benefits scenarios in both large and scarce data regimes\nfor the target domains.\n","authors":["Alex O. Davies","Riku W. Green","Nirav S. Ajmeri","Telmo M. Silva Filho"],"pdf_url":"https://arxiv.org/pdf/2311.03976v2.pdf","comment":"Presented at the NeurIPS 2023 New Frontiers in Graph Learning\n workshop"},{"id":"http://arxiv.org/abs/2401.10721v1","updated":"2024-01-19T14:32:50Z","published":"2024-01-19T14:32:50Z","title":"Generative Model for Constructing Reaction Path from Initial to Final\n States","summary":" Mapping out reaction pathways and their corresponding activation barriers is\na significant aspect of molecular simulation. Given their inherent complexity\nand nonlinearity, even generating an initial guess of these paths remains a\nchallenging problem. Presented in this paper is an innovative approach that\nutilizes neural networks to generate an initial guess for these reaction pathways.\nThe proposed method is initiated by inputting the coordinates of the initial\nstate, followed by progressive alterations to its structure. This iterative\nprocess culminates in the generation of the approximate representation of the\nreaction path and the coordinates of the final state. The application of this\nmethod extends to complex reaction pathways illustrated by organic reactions.\nTraining was executed on the Transition1x dataset, an organic reaction pathway\ndataset. The results revealed the generation of reactions that bore substantial\nsimilarities with the corresponding test data. The method's flexibility allows\nfor reactions to be generated either to conform to predetermined conditions or\nin a randomized manner.\n","authors":["Akihide Hayashi","So Takamoto","Ju Li","Daisuke Okanohara"],"pdf_url":"https://arxiv.org/pdf/2401.10721v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10710v1","updated":"2024-01-19T14:18:32Z","published":"2024-01-19T14:18:32Z","title":"Classification with neural networks with quadratic decision functions","summary":" Neural networks with quadratic decision functions have been introduced as\nalternatives to standard neural networks with affine linear ones. They are\nadvantageous when the objects to be identified are of compact basic geometries\nlike circles, ellipses, etc. In this paper we investigate the use of such ansatz\nfunctions for classification. 
In particular we test and compare the algorithm\non the MNIST dataset for classification of handwritten digits and for\nclassification of subspecies. We also show, that the implementation can be\nbased on the neural network structure in the software Tensorflow and Keras,\nrespectively.\n","authors":["Leon Frischauf","Otmar Scherzer","Cong Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15591v2","updated":"2024-01-19T14:08:23Z","published":"2023-12-25T02:32:05Z","title":"Privacy-Preserving Neural Graph Databases","summary":" In the era of big data and rapidly evolving information systems, efficient\nand accurate data retrieval has become increasingly crucial. Neural graph\ndatabases (NGDBs) have emerged as a powerful paradigm that combines the\nstrengths of graph databases (graph DBs) and neural networks to enable\nefficient storage, retrieval, and analysis of graph-structured data. The usage\nof neural embedding storage and complex neural logical query answering provides\nNGDBs with generalization ability. When the graph is incomplete, by extracting\nlatent patterns and representations, neural graph databases can fill gaps in\nthe graph structure, revealing hidden relationships and enabling accurate query\nanswering. Nevertheless, this capability comes with inherent trade-offs, as it\nintroduces additional privacy risks to the database. Malicious attackers can\ninfer more sensitive information in the database using well-designed\ncombinatorial queries, such as by comparing the answer sets of where Turing\nAward winners born before 1950 and after 1940 lived, the living places of\nTuring Award winner Hinton are probably exposed, although the living places may\nhave been deleted in the training due to the privacy concerns. In this work,\ninspired by the privacy protection in graph embeddings, we propose a\nprivacy-preserving neural graph database (P-NGDB) to alleviate the risks of\nprivacy leakage in NGDBs. We introduce adversarial training techniques in the\ntraining stage to force the NGDBs to generate indistinguishable answers when\nqueried with private information, enhancing the difficulty of inferring\nsensitive information through combinations of multiple innocuous queries.\nExtensive experiment results on three datasets show that P-NGDB can effectively\nprotect private information in the graph database while delivering high-quality\npublic answers responses to queries.\n","authors":["Qi Hu","Haoran Li","Jiaxin Bai","Yangqiu Song"],"pdf_url":"https://arxiv.org/pdf/2312.15591v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10700v1","updated":"2024-01-19T14:05:09Z","published":"2024-01-19T14:05:09Z","title":"Safe Offline Reinforcement Learning with Feasibility-Guided Diffusion\n Model","summary":" Safe offline RL is a promising way to bypass risky online interactions\ntowards safe policy learning. Most existing methods only enforce soft\nconstraints, i.e., constraining safety violations in expectation below\nthresholds predetermined. This can lead to potentially unsafe outcomes, thus\nunacceptable in safety-critical scenarios. An alternative is to enforce the\nhard constraint of zero violation. However, this can be challenging in offline\nsetting, as it needs to strike the right balance among three highly intricate\nand correlated aspects: safety constraint satisfaction, reward maximization,\nand behavior regularization imposed by offline datasets. 
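One way to realise the quadratic decision functions mentioned above as a Keras layer is to let each unit respond to a weighted squared distance from a learned centre. This is an assumed parameterisation for illustration, not necessarily the authors' exact formulation; the layer sizes and input dimension are also made up.

```python
import tensorflow as tf

class QuadraticDecisionLayer(tf.keras.layers.Layer):
    """Each unit computes sum_j a_j * (x_j - c_j)^2 + b, a quadratic decision
    function suited to compact shapes such as circles and ellipses."""
    def __init__(self, units):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        d = int(input_shape[-1])
        self.c = self.add_weight(shape=(self.units, d), initializer="random_normal",
                                 trainable=True, name="centers")
        self.a = self.add_weight(shape=(self.units, d), initializer="ones",
                                 trainable=True, name="scales")
        self.b = self.add_weight(shape=(self.units,), initializer="zeros",
                                 trainable=True, name="bias")

    def call(self, x):
        diff = tf.expand_dims(x, 1) - self.c              # (batch, units, d)
        return tf.reduce_sum(self.a * tf.square(diff), axis=-1) + self.b

model = tf.keras.Sequential([QuadraticDecisionLayer(16),
                             tf.keras.layers.Dense(10, activation="softmax")])
print(model(tf.random.normal((4, 784))).shape)            # (4, 10)
```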
Interestingly, we\ndiscover that via reachability analysis of safe-control theory, the hard safety\nconstraint can be equivalently translated to identifying the largest feasible\nregion given the offline dataset. This seamlessly converts the original trilogy\nproblem to a feasibility-dependent objective, i.e., maximizing reward value\nwithin the feasible region while minimizing safety risks in the infeasible\nregion. Inspired by these, we propose FISOR (FeasIbility-guided Safe Offline\nRL), which allows safety constraint adherence, reward maximization, and offline\npolicy learning to be realized via three decoupled processes, while offering\nstrong safety performance and stability. In FISOR, the optimal policy for the\ntranslated optimization problem can be derived in a special form of weighted\nbehavior cloning. Thus, we propose a novel energy-guided diffusion model that\ndoes not require training a complicated time-dependent classifier to extract\nthe policy, greatly simplifying the training. We compare FISOR against\nbaselines on DSRL benchmark for safe offline RL. Evaluation results show that\nFISOR is the only method that can guarantee safety satisfaction in all tasks,\nwhile achieving top returns in most tasks.\n","authors":["Yinan Zheng","Jianxiong Li","Dongjie Yu","Yujie Yang","Shengbo Eben Li","Xianyuan Zhan","Jingjing Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10700v1.pdf","comment":"ICLR 2024, 30pages, 11 figures"},{"id":"http://arxiv.org/abs/2401.09902v2","updated":"2024-01-19T14:04:22Z","published":"2024-01-18T11:32:50Z","title":"Interplay between depth and width for interpolation in neural ODEs","summary":" Neural ordinary differential equations (neural ODEs) have emerged as a\nnatural tool for supervised learning from a control perspective, yet a complete\nunderstanding of their optimal architecture remains elusive. In this work, we\nexamine the interplay between their width $p$ and number of layer transitions\n$L$ (effectively the depth $L+1$). Specifically, we assess the model\nexpressivity in terms of its capacity to interpolate either a finite dataset\n$D$ comprising $N$ pairs of points or two probability measures in\n$\\mathbb{R}^d$ within a Wasserstein error margin $\\varepsilon>0$. Our findings\nreveal a balancing trade-off between $p$ and $L$, with $L$ scaling as\n$O(1+N/p)$ for dataset interpolation, and\n$L=O\\left(1+(p\\varepsilon^d)^{-1}\\right)$ for measure interpolation.\n In the autonomous case, where $L=0$, a separate study is required, which we\nundertake focusing on dataset interpolation. We address the relaxed problem of\n$\\varepsilon$-approximate controllability and establish an error decay of\n$\\varepsilon\\sim O(\\log(p)p^{-1/d})$. This decay rate is a consequence of\napplying a universal approximation theorem to a custom-built Lipschitz vector\nfield that interpolates $D$. In the high-dimensional setting, we further\ndemonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact\ncontrol.\n","authors":["Antonio Álvarez-López","Arselane Hadj Slimane","Enrique Zuazua"],"pdf_url":"https://arxiv.org/pdf/2401.09902v2.pdf","comment":"16 pages, 10 figures, double column"},{"id":"http://arxiv.org/abs/2401.10690v1","updated":"2024-01-19T13:41:08Z","published":"2024-01-19T13:41:08Z","title":"Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and\n unfairness in dyadic regression models","summary":" Dyadic regression models, which predict real-valued outcomes for pairs of\nentities, are fundamental in many domains (e.g. 
predicting the rating of a user\nto a product in Recommender Systems) and promising and under exploration in\nmany others (e.g. approximating the adequate dosage of a drug for a patient in\npersonalized pharmacology). In this work, we demonstrate that non-uniformity in\nthe observed value distributions of individual entities leads to severely\nbiased predictions in state-of-the-art models, skewing predictions towards the\naverage of observed past values for the entity and providing worse-than-random\npredictive power in eccentric yet equally important cases. We show that the\nusage of global error metrics like Root Mean Squared Error (RMSE) and Mean\nAbsolute Error (MAE) is insufficient to capture this phenomenon, which we name\neccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as\na new complementary metric that can quantify it in all studied models and\ndatasets. We also prove the adequateness of EAUC by using naive de-biasing\ncorrections to demonstrate that a lower model bias correlates with a lower EAUC\nand vice-versa. This work contributes a bias-aware evaluation of dyadic\nregression models to avoid potential unfairness and risks in critical\nreal-world applications of such systems.\n","authors":["Jorge Paz-Ruza","Amparo Alonso-Betanzos","Bertha Guijarro-Berdiñas","Brais Cancela","Carlos Eiras-Franco"],"pdf_url":"https://arxiv.org/pdf/2401.10690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10689v1","updated":"2024-01-19T13:39:05Z","published":"2024-01-19T13:39:05Z","title":"A Lightweight Multi-Attack CAN Intrusion Detection System on Hybrid\n FPGAs","summary":" Rising connectivity in vehicles is enabling new capabilities like connected\nautonomous driving and advanced driver assistance systems (ADAS) for improving\nthe safety and reliability of next-generation vehicles. This increased access\nto in-vehicle functions compromises critical capabilities that use legacy\ninvehicle networks like Controller Area Network (CAN), which has no inherent\nsecurity or authentication mechanism. Intrusion detection and mitigation\napproaches, particularly using machine learning models, have shown promising\nresults in detecting multiple attack vectors in CAN through their ability to\ngeneralise to new vectors. However, most deployments require dedicated\ncomputing units like GPUs to perform line-rate detection, consuming much higher\npower. In this paper, we present a lightweight multi-attack quantised machine\nlearning model that is deployed using Xilinx's Deep Learning Processing Unit IP\non a Zynq Ultrascale+ (XCZU3EG) FPGA, which is trained and validated using the\npublic CAN Intrusion Detection dataset. The quantised model detects denial of\nservice and fuzzing attacks with an accuracy of above 99 % and a false positive\nrate of 0.07%, which are comparable to the state-of-the-art techniques in the\nliterature. The Intrusion Detection System (IDS) execution consumes just 2.0 W\nwith software tasks running on the ECU and achieves a 25 % reduction in\nper-message processing latency over the state-of-the-art implementations. 
This\ndeployment allows the ECU function to coexist with the IDS with minimal changes\nto the tasks, making it ideal for real-time IDS in in-vehicle systems.\n","authors":["Shashwat Khandelwal","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10689v1.pdf","comment":"5 pages, 2 figures, 6 tables"},{"id":"http://arxiv.org/abs/2401.10686v1","updated":"2024-01-19T13:33:23Z","published":"2024-01-19T13:33:23Z","title":"Manipulating Sparse Double Descent","summary":" This paper investigates the double descent phenomenon in two-layer neural\nnetworks, focusing on the role of L1 regularization and representation\ndimensions. It explores an alternative double descent phenomenon, named sparse\ndouble descent. The study emphasizes the complex relationship between model\ncomplexity, sparsity, and generalization, and suggests further research into\nmore diverse models and datasets. The findings contribute to a deeper\nunderstanding of neural network training and optimization.\n","authors":["Ya Shi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10685v1","updated":"2024-01-19T13:32:55Z","published":"2024-01-19T13:32:55Z","title":"Towards End-to-End GPS Localization with Neural Pseudorange Correction","summary":" Pseudorange errors are the root cause of localization inaccuracy in GPS.\nPrevious data-driven methods regress and eliminate pseudorange errors using\nhandcrafted intermediate labels. Unlike them, we propose an end-to-end GPS\nlocalization framework, E2E-PrNet, to train a neural network for pseudorange\ncorrection (PrNet) directly using the final task loss calculated with the\nground truth of GPS receiver states. The gradients of the loss with respect to\nlearnable parameters are backpropagated through a differentiable nonlinear\nleast squares optimizer to PrNet. The feasibility is verified with GPS data\ncollected by Android phones, showing that E2E-PrNet outperforms the\nstate-of-the-art end-to-end GPS localization methods.\n","authors":["Xu Weng","KV Ling","Haochen Liu","Kun Cao"],"pdf_url":"https://arxiv.org/pdf/2401.10685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10674v1","updated":"2024-01-19T13:13:38Z","published":"2024-01-19T13:13:38Z","title":"Deep Learning-based Embedded Intrusion Detection System for Automotive\n CAN","summary":" Rising complexity of in-vehicle electronics is enabling new capabilities like\nautonomous driving and active safety. However, rising automation also increases\nrisk of security threats which is compounded by lack of in-built security\nmeasures in legacy networks like CAN, allowing attackers to observe, tamper and\nmodify information shared over such broadcast networks. Various intrusion\ndetection approaches have been proposed to detect and tackle such threats, with\nmachine learning models proving highly effective. However, deploying machine\nlearning models will require high processing power through high-end processors\nor GPUs to perform them close to line rate. In this paper, we propose a hybrid\nFPGA-based ECU approach that can transparently integrate IDS functionality\nthrough a dedicated off-the-shelf hardware accelerator that implements a\ndeep-CNN intrusion detection model. 
Our results show that the proposed approach\nprovides an average accuracy of over 99% across multiple attack datasets with\n0.64% false detection rates while consuming 94% less energy and achieving 51.8%\nreduction in per-message processing latency when compared to IDS\nimplementations on GPUs.\n","authors":["Shashwat Khandelwal","Eashan Wadhwa","Shreejith Shanker"],"pdf_url":"https://arxiv.org/pdf/2401.10674v1.pdf","comment":"5 pages, 1 figure, 8 tables"},{"id":"http://arxiv.org/abs/2401.09691v2","updated":"2024-01-19T12:43:36Z","published":"2024-01-18T02:44:18Z","title":"Imitation Learning Inputting Image Feature to Each Layer of Neural\n Network","summary":" Imitation learning enables robots to learn and replicate human behavior from\ntraining data. Recent advances in machine learning enable end-to-end learning\napproaches that directly process high-dimensional observation data, such as\nimages. However, these approaches face a critical challenge when processing\ndata from multiple modalities, inadvertently ignoring data with a lower\ncorrelation to the desired output, especially when using short sampling\nperiods. This paper presents a useful method to address this challenge, which\namplifies the influence of data with a relatively low correlation to the output\nby inputting the data into each neural network layer. The proposed approach\neffectively incorporates diverse data sources into the learning process.\nThrough experiments using a simple pick-and-place operation with raw images and\njoint information as input, significant improvements in success rates are\ndemonstrated even when dealing with data from short sampling periods.\n","authors":["Koki Yamane","Sho Sakaino","Toshiaki Tsuji"],"pdf_url":"https://arxiv.org/pdf/2401.09691v2.pdf","comment":"6 pages, 4 figures, Accepted at AMC2024"},{"id":"http://arxiv.org/abs/2312.01185v2","updated":"2024-01-19T12:34:07Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address (SOTU) dataset from Kaggle\nto make some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2.\n While it is widely believed that BERT (and its variations) is most suitable\nfor NLP classification tasks, we find out that GPT-2 in conjunction with\nnonlinear dimension reduction methods such as UMAP provide better separation\nand stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In\nour case, no model fine-tuning is required, and the pre-trained out-of-the-box\nGPT-2 model is enough.\n We also used a fine-tuned DistilBERT model for classification detecting which\nPresident delivered which address, with very good results (accuracy 93\\% - 95\\%\ndepending on the run). An analogous task was performed to determine the year of\nwriting, and we were able to pin it down to about 4 years (which is a single\npresidential term).\n It is worth noting that SOTU addresses provide relatively small writing\nsamples (with about 8000 words on average, and varying widely from under 2000\nwords to more than 20000), and that the amount of authors is relatively large\n(we used SOTU addresses of 42 US presidents). 
This shows that the techniques\nemployed turn out to be rather efficient, while all the computations described\nin this note can be performed using a single GPU instance of Google Colab.\n The accompanying code is available on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v2.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2312.08010v2","updated":"2024-01-19T12:19:48Z","published":"2023-12-13T09:33:08Z","title":"EZ-CLIP: Efficient Zeroshot Video Action Recognition","summary":" Recent advancements in large-scale pre-training of visual-language models on\npaired image-text data have demonstrated impressive generalization capabilities\nfor zero-shot tasks. Building on this success, efforts have been made to adapt\nthese image-based visual-language models, such as CLIP, for videos extending\ntheir zero-shot capabilities to the video domain. While these adaptations have\nshown promising results, they come at a significant computational cost and\nstruggle with effectively modeling the crucial temporal aspects inherent to the\nvideo domain. In this study, we present EZ-CLIP, a simple and efficient\nadaptation of CLIP that addresses these challenges. EZ-CLIP leverages temporal\nvisual prompting for seamless temporal adaptation, requiring no fundamental\nalterations to the core CLIP architecture while preserving its remarkable\ngeneralization abilities. Moreover, we introduce a novel learning objective\nthat guides the temporal visual prompts to focus on capturing motion, thereby\nenhancing its learning capabilities from video data. We conducted extensive\nexperiments on five different benchmark datasets, thoroughly evaluating EZ-CLIP\nfor zero-shot learning and base-to-novel video action recognition, and also\ndemonstrating its potential for few-shot generalization.Impressively, with a\nmere 5.2 million learnable parameters (as opposed to the 71.1 million in the\nprior best model), EZ-CLIP can be efficiently trained on a single GPU,\noutperforming existing approaches in several evaluations.\n","authors":["Shahzad Ahmad","Sukalpa Chanda","Yogesh S Rawat"],"pdf_url":"https://arxiv.org/pdf/2312.08010v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10657v1","updated":"2024-01-19T12:04:31Z","published":"2024-01-19T12:04:31Z","title":"FIMBA: Evaluating the Robustness of AI in Genomics via Feature\n Importance Adversarial Attacks","summary":" With the steady rise of the use of AI in bio-technical applications and the\nwidespread adoption of genomics sequencing, an increasing amount of AI-based\nalgorithms and tools is entering the research and production stage affecting\ncritical decision-making streams like drug discovery and clinical outcomes.\nThis paper demonstrates the vulnerability of AI models often utilized\ndownstream tasks on recognized public genomics datasets. We undermine model\nrobustness by deploying an attack that focuses on input transformation while\nmimicking the real data and confusing the model decision-making, ultimately\nyielding a pronounced deterioration in model performance. Further, we enhance\nour approach by generating poisoned data using a variational autoencoder-based\nmodel. Our empirical findings unequivocally demonstrate a decline in model\nperformance, underscored by diminished accuracy and an upswing in false\npositives and false negatives. 
Furthermore, we analyze the resulting\nadversarial samples via spectral analysis yielding conclusions for\ncountermeasures against such attacks.\n","authors":["Heorhii Skovorodnikov","Hoda Alkhzaimi"],"pdf_url":"https://arxiv.org/pdf/2401.10657v1.pdf","comment":"15 pages, core code available at:\n https://github.com/HeorhiiS/fimba-attack"},{"id":"http://arxiv.org/abs/2401.10653v1","updated":"2024-01-19T11:59:13Z","published":"2024-01-19T11:59:13Z","title":"Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech\n Detection","summary":" With the recent surge and exponential growth of social media usage,\nscrutinizing social media content for the presence of any hateful content is of\nutmost importance. Researchers have been diligently working since the past\ndecade on distinguishing between content that promotes hatred and content that\ndoes not. Traditionally, the main focus has been on analyzing textual content.\nHowever, recent research attempts have also commenced into the identification\nof audio-based content. Nevertheless, studies have shown that relying solely on\naudio or text-based content may be ineffective, as recent upsurge indicates\nthat individuals often employ sarcasm in their speech and writing. To overcome\nthese challenges, we present an approach to identify whether a speech promotes\nhate or not utilizing both audio and textual representations. Our methodology\nis based on the Transformer framework that incorporates both audio and text\nsampling, accompanied by our very own layer called \"Attentive Fusion\". The\nresults of our study surpassed previous state-of-the-art techniques, achieving\nan impressive macro F1 score of 0.927 on the Test Set.\n","authors":["Atanu Mandal","Gargi Roy","Amit Barman","Indranil Dutta","Sudip Kumar Naskar"],"pdf_url":"https://arxiv.org/pdf/2401.10653v1.pdf","comment":"Accepted in 20th International Conference on Natural Language\n Processing (ICON)"},{"id":"http://arxiv.org/abs/2401.10652v1","updated":"2024-01-19T11:58:13Z","published":"2024-01-19T11:58:13Z","title":"AutoChunk: Automated Activation Chunk for Memory-Efficient Long Sequence\n Inference","summary":" Large deep learning models have achieved impressive performance across a\nrange of applications. However, their large memory requirements, including\nparameter memory and activation memory, have become a significant challenge for\ntheir practical serving. While existing methods mainly address parameter\nmemory, the importance of activation memory has been overlooked. Especially for\nlong input sequences, activation memory is expected to experience a significant\nexponential growth as the length of sequences increases. In this approach, we\npropose AutoChunk, an automatic and adaptive compiler system that efficiently\nreduces activation memory for long sequence inference by chunk strategies. The\nproposed system generates chunk plans by optimizing through multiple stages. In\neach stage, the chunk search pass explores all possible chunk candidates and\nthe chunk selection pass identifies the optimal one. At runtime, AutoChunk\nemploys code generation to automatically apply chunk strategies. 
The\nexperiments demonstrate that AutoChunk can reduce over 80\\% of activation\nmemory while maintaining speed loss within 10%, extend max sequence length by\n3.2x to 11.7x, and outperform state-of-the-art methods by a large margin.\n","authors":["Xuanlei Zhao","Shenggan Cheng","Guangyang Lu","Jiarui Fang","Haotian Zhou","Bin Jia","Ziming Liu","Yang You"],"pdf_url":"https://arxiv.org/pdf/2401.10652v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10648v1","updated":"2024-01-19T11:48:52Z","published":"2024-01-19T11:48:52Z","title":"Area Modeling using Stay Information for Large-Scale Users and Analysis\n for Influence of COVID-19","summary":" Understanding how people use area in a city can be a valuable information in\na wide range of fields, from marketing to urban planning. Area usage is subject\nto change over time due to various events including seasonal shifts and\npandemics. Before the spread of smartphones, this data had been collected\nthrough questionnaire survey. However, this is not a sustainable approach in\nterms of time to results and cost. There are many existing studies on area\nmodeling, which characterize an area with some kind of information, using Point\nof Interest (POI) or inter-area movement data. However, since POI is data that\nis statically tied to space, and inter-area movement data ignores the behavior\nof people within an area, existing methods are not sufficient in terms of\ncapturing area usage changes. In this paper, we propose a novel area modeling\nmethod named Area2Vec, inspired by Word2Vec, which models areas based on\npeople's location data. This method is based on the discovery that it is\npossible to characterize an area based on its usage by using people's stay\ninformation in the area. And it is a novel method that can reflect the\ndynamically changing people's behavior in an area in the modeling results. We\nvalidated Area2vec by performing a functional classification of areas in a\ndistrict of Japan. The results show that Area2Vec can be usable in general area\nanalysis. We also investigated area usage changes due to COVID-19 in two\ndistricts in Japan. We could find that COVID-19 made people refrain from\nunnecessary going out, such as visiting entertainment areas.\n","authors":["Kazuyuki Shoji","Shunsuke Aoki","Takuro Yonezawa","Nobuo Kawaguchi"],"pdf_url":"https://arxiv.org/pdf/2401.10648v1.pdf","comment":"This paper is an English translation of the paper published in the\n Transactions of the Information Processing Society of Japan\n (http://doi.org/10.20729/00213190)"},{"id":"http://arxiv.org/abs/2401.10646v1","updated":"2024-01-19T11:47:49Z","published":"2024-01-19T11:47:49Z","title":"Empowering HWNs with Efficient Data Labeling: A Clustered Federated\n Semi-Supervised Learning Approach","summary":" Clustered Federated Multitask Learning (CFL) has gained considerable\nattention as an effective strategy for overcoming statistical challenges,\nparticularly when dealing with non independent and identically distributed (non\nIID) data across multiple users. However, much of the existing research on CFL\noperates under the unrealistic premise that devices have access to accurate\nground truth labels. This assumption becomes especially problematic in\nhierarchical wireless networks (HWNs), where edge networks contain a large\namount of unlabeled data, resulting in slower convergence rates and increased\nprocessing times, particularly when dealing with two layers of model\naggregation. 
To address these issues, we introduce a novel framework, Clustered\nFederated Semi-Supervised Learning (CFSL), designed for more realistic HWN\nscenarios. Our approach leverages a best-performing specialized model\nalgorithm, wherein each device is assigned a specialized model that is highly\nadept at generating accurate pseudo-labels for unlabeled data, even when the\ndata stems from diverse environments. We validate the efficacy of CFSL through\nextensive experiments, comparing it with existing methods highlighted in recent\nliterature. Our numerical results demonstrate that CFSL significantly improves\nupon key metrics such as testing accuracy, labeling accuracy, and labeling\nlatency under varying proportions of labeled and unlabeled data while also\naccommodating the non-IID nature of the data and the unique characteristics of\nwireless edge networks.\n","authors":["Moqbel Hamood","Abdullatif Albaseer","Mohamed Abdallah","Ala Al-Fuqaha"],"pdf_url":"https://arxiv.org/pdf/2401.10646v1.pdf","comment":"Accepted for IEEE Wireless Communications and Networking Conference\n (WCNC) 2024"},{"id":"http://arxiv.org/abs/2401.10643v1","updated":"2024-01-19T11:45:10Z","published":"2024-01-19T11:45:10Z","title":"A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification:\n Models, Data Sets and Challenges","summary":" Vehicle re-identification (ReID) endeavors to associate vehicle images\ncollected from a distributed network of cameras spanning diverse traffic\nenvironments. This task assumes paramount importance within the spectrum of\nvehicle-centric technologies, playing a pivotal role in deploying Intelligent\nTransportation Systems (ITS) and advancing smart city initiatives. Rapid\nadvancements in deep learning have significantly propelled the evolution of\nvehicle ReID technologies in recent years. Consequently, undertaking a\ncomprehensive survey of methodologies centered on deep learning for vehicle\nre-identification has become imperative and inescapable. This paper extensively\nexplores deep learning techniques applied to vehicle ReID. It outlines the\ncategorization of these methods, encompassing supervised and unsupervised\napproaches, delves into existing research within these categories, introduces\ndatasets and evaluation criteria, and delineates forthcoming challenges and\npotential research directions. This comprehensive assessment examines the\nlandscape of deep learning in vehicle ReID and establishes a foundation and\nstarting point for future works. It aims to serve as a complete reference by\nhighlighting challenges and emerging trends, fostering advancements and\napplications in vehicle ReID utilizing deep learning models.\n","authors":["Ali Amiri","Aydin Kaya","Ali Seydi Keceli"],"pdf_url":"https://arxiv.org/pdf/2401.10643v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10637v1","updated":"2024-01-19T11:35:07Z","published":"2024-01-19T11:35:07Z","title":"Towards Universal Unsupervised Anomaly Detection in Medical Imaging","summary":" The increasing complexity of medical imaging data underscores the need for\nadvanced anomaly detection methods to automatically identify diverse\npathologies. Current methods face challenges in capturing the broad spectrum of\nanomalies, often limiting their use to specific lesion types in brain scans. 
To\naddress this challenge, we introduce a novel unsupervised approach, termed\n\\textit{Reversed Auto-Encoders (RA)}, designed to create realistic\npseudo-healthy reconstructions that enable the detection of a wider range of\npathologies. We evaluate the proposed method across various imaging modalities,\nincluding magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray,\nand chest X-ray, and demonstrate superior performance in detecting anomalies\ncompared to existing state-of-the-art methods. Our unsupervised anomaly\ndetection approach may enhance diagnostic accuracy in medical imaging by\nidentifying a broader range of unknown pathologies. Our code is publicly\navailable at: \\url{https://github.com/ci-ber/RA}.\n","authors":["Cosmin I. Bercea","Benedikt Wiestler","Daniel Rueckert","Julia A. Schnabel"],"pdf_url":"https://arxiv.org/pdf/2401.10637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10632v1","updated":"2024-01-19T11:20:31Z","published":"2024-01-19T11:20:31Z","title":"Interventional Fairness on Partially Known Causal Graphs: A Constrained\n Optimization Approach","summary":" Fair machine learning aims to prevent discrimination against individuals or\nsub-populations based on sensitive attributes such as gender and race. In\nrecent years, causal inference methods have been increasingly used in fair\nmachine learning to measure unfairness by causal effects. However, current\nmethods assume that the true causal graph is given, which is often not true in\nreal-world applications. To address this limitation, this paper proposes a\nframework for achieving causal fairness based on the notion of interventions\nwhen the true causal graph is partially known. The proposed approach involves\nmodeling fair prediction using a Partially Directed Acyclic Graph (PDAG),\nspecifically, a class of causal DAGs that can be learned from observational\ndata combined with domain knowledge. The PDAG is used to measure causal\nfairness, and a constrained optimization problem is formulated to balance\nbetween fairness and accuracy. Results on both simulated and real-world\ndatasets demonstrate the effectiveness of this method.\n","authors":["Aoqi Zuo","Yiqing Li","Susan Wei","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2401.10632v1.pdf","comment":"Accepted to ICLR24"},{"id":"http://arxiv.org/abs/2401.10620v1","updated":"2024-01-19T10:52:57Z","published":"2024-01-19T10:52:57Z","title":"Polytopic Autoencoders with Smooth Clustering for Reduced-order\n Modelling of Flows","summary":" With the advancement of neural networks, there has been a notable increase,\nboth in terms of quantity and variety, in research publications concerning the\napplication of autoencoders to reduced-order models. We propose a polytopic\nautoencoder architecture that includes a lightweight nonlinear encoder, a\nconvex combination decoder, and a smooth clustering network. Supported by\nseveral proofs, the model architecture ensures that all reconstructed states\nlie within a polytope, accompanied by a metric indicating the quality of the\nconstructed polytopes, referred to as polytope error. Additionally, it offers a\nminimal number of convex coordinates for polytopic linear-parameter varying\nsystems while achieving acceptable reconstruction errors compared to proper\northogonal decomposition (POD). To validate our proposed model, we conduct\nsimulations involving two flow scenarios with the incompressible Navier-Stokes\nequation. 
Numerical results demonstrate the guaranteed properties of the model,\nlow reconstruction errors compared to POD, and the improvement in error using a\nclustering network.\n","authors":["Jan Heiland","Yongho Kim"],"pdf_url":"https://arxiv.org/pdf/2401.10620v1.pdf","comment":"28 pages, 18 figures"},{"id":"http://arxiv.org/abs/2401.10603v1","updated":"2024-01-19T10:21:27Z","published":"2024-01-19T10:21:27Z","title":"ZnTrack -- Data as Code","summary":" The past decade has seen tremendous breakthroughs in computation and there is\nno indication that this will slow any time soon. Machine learning, large-scale\ncomputing resources, and increased industry focus have resulted in rising\ninvestments in computer-driven solutions for data management, simulations, and\nmodel generation. However, with this growth in computation has come an even\nlarger expansion of data and with it, complexity in data storage, sharing, and\ntracking. In this work, we introduce ZnTrack, a Python-driven data versioning\ntool. ZnTrack builds upon established version control systems to provide a\nuser-friendly and easy-to-use interface for tracking parameters in experiments,\ndesigning workflows, and storing and sharing data. From this ability to reduce\nlarge datasets to a simple Python script emerges the concept of Data as Code, a\ncore component of the work presented here and an undoubtedly important concept\nas the age of computation continues to evolve. ZnTrack offers an open-source,\nFAIR data compatible Python package to enable users to harness these concepts\nof the future.\n","authors":["Fabian Zills","Moritz Schäfer","Samuel Tovey","Johannes Kästner","Christian Holm"],"pdf_url":"https://arxiv.org/pdf/2401.10603v1.pdf","comment":"22 pages, 10 figures, 2MB PDF"},{"id":"http://arxiv.org/abs/2311.11809v2","updated":"2024-01-19T10:10:27Z","published":"2023-11-20T14:42:13Z","title":"LogLead -- Fast and Integrated Log Loader, Enhancer, and Anomaly\n Detector","summary":" This paper introduces LogLead, a tool designed for efficient log analysis\nbenchmarking. LogLead combines three essential steps in log processing:\nloading, enhancing, and anomaly detection. The tool leverages Polars, a\nhigh-speed DataFrame library. We currently have Loaders for eight systems that\nare publicly available (HDFS, Hadoop, BGL, Thunderbird, Spirit, Liberty,\nTrainTicket, and GC Webshop). We have multiple enhancers with three parsers\n(Drain, Spell, LenMa), Bert embedding creation and other log representation\ntechniques like bag-of-words. LogLead integrates to five supervised and four\nunsupervised machine learning algorithms for anomaly detection from SKLearn. By\nintegrating diverse datasets, log representation methods and anomaly detectors,\nLogLead facilitates comprehensive benchmarking in log analysis research. We\nshow that log loading from raw file to dataframe is over 10x faster with\nLogLead compared to past solutions. We demonstrate roughly 2x improvement in\nDrain parsing speed by off-loading log message normalization to LogLead. Our\nbrief benchmarking on HDFS indicates that log representations extending beyond\nthe bag-of-words approach offer limited additional benefits. 
Tool URL:\nhttps://github.com/EvoTestOps/LogLead\n","authors":["Mika Mäntylä","Yuqing Wang","Jesse Nyyssölä"],"pdf_url":"https://arxiv.org/pdf/2311.11809v2.pdf","comment":"2024 IEEE International Conference on Software Analysis, Evolution\n and Reengineering (SANER)"},{"id":"http://arxiv.org/abs/2401.10590v1","updated":"2024-01-19T10:02:20Z","published":"2024-01-19T10:02:20Z","title":"Adversarially Robust Signed Graph Contrastive Learning from Balance\n Augmentation","summary":" Signed graphs consist of edges and signs, which can be separated into\nstructural information and balance-related information, respectively. Existing\nsigned graph neural networks (SGNNs) typically rely on balance-related\ninformation to generate embeddings. Nevertheless, the emergence of recent\nadversarial attacks has had a detrimental impact on the balance-related\ninformation. Similar to how structure learning can restore unsigned graphs,\nbalance learning can be applied to signed graphs by improving the balance\ndegree of the poisoned graph. However, this approach encounters the challenge\n\"Irreversibility of Balance-related Information\" - while the balance degree\nimproves, the restored edges may not be the ones originally affected by\nattacks, resulting in poor defense effectiveness. To address this challenge, we\npropose a robust SGNN framework called Balance Augmented-Signed Graph\nContrastive Learning (BA-SGCL), which combines Graph Contrastive Learning\nprinciples with balance augmentation techniques. Experimental results\ndemonstrate that BA-SGCL not only enhances robustness against existing\nadversarial attacks but also achieves superior performance on link sign\nprediction task across various datasets.\n","authors":["Jialong Zhou","Xing Ai","Yuni Lai","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.10590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10191v2","updated":"2024-01-19T10:01:36Z","published":"2024-01-18T18:25:29Z","title":"Divide and not forget: Ensemble of selectively trained experts in\n Continual Learning","summary":" Class-incremental learning is becoming more popular as it helps models widen\ntheir applicability while not forgetting what they already know. A trend in\nthis area is to use a mixture-of-expert technique, where different models work\ntogether to solve the task. However, the experts are usually trained all at\nonce using whole task data, which makes them all prone to forgetting and\nincreasing computational burden. To address this limitation, we introduce a\nnovel approach named SEED. SEED selects only one, the most optimal expert for a\nconsidered task, and uses data from this task to fine-tune only this expert.\nFor this purpose, each expert represents each class with a Gaussian\ndistribution, and the optimal expert is selected based on the similarity of\nthose distributions. Consequently, SEED increases diversity and heterogeneity\nwithin the experts while maintaining the high stability of this ensemble\nmethod. 
The extensive experiments demonstrate that SEED achieves\nstate-of-the-art performance in exemplar-free settings across various\nscenarios, showing the potential of expert diversification through data in\ncontinual learning.\n","authors":["Grzegorz Rypeść","Sebastian Cygert","Valeriya Khan","Tomasz Trzciński","Bartosz Zieliński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2401.10191v2.pdf","comment":"Accepted for ICLR 2024 (main track), code is available at:\n https://github.com/grypesc/SEED"},{"id":"http://arxiv.org/abs/2401.10586v1","updated":"2024-01-19T09:54:23Z","published":"2024-01-19T09:54:23Z","title":"PuriDefense: Randomized Local Implicit Adversarial Purification for\n Defending Black-box Query-based Attacks","summary":" Black-box query-based attacks constitute significant threats to Machine\nLearning as a Service (MLaaS) systems since they can generate adversarial\nexamples without accessing the target model's architecture and parameters.\nTraditional defense mechanisms, such as adversarial training, gradient masking,\nand input transformations, either impose substantial computational costs or\ncompromise the test accuracy of non-adversarial inputs. To address these\nchallenges, we propose an efficient defense mechanism, PuriDefense, that\nemploys random patch-wise purifications with an ensemble of lightweight\npurification models at a low level of inference cost. These models leverage the\nlocal implicit function and rebuild the natural image manifold. Our theoretical\nanalysis suggests that this approach slows down the convergence of query-based\nattacks by incorporating randomness into purifications. Extensive experiments\non CIFAR-10 and ImageNet validate the effectiveness of our proposed\npurifier-based defense mechanism, demonstrating significant improvements in\nrobustness against query-based attacks.\n","authors":["Ping Guo","Zhiyuan Yang","Xi Lin","Qingchuan Zhao","Qingfu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.10586v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12399v3","updated":"2024-01-19T09:49:46Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. 
The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v3.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.10566v1","updated":"2024-01-19T09:10:58Z","published":"2024-01-19T09:10:58Z","title":"Robust Multi-Modal Density Estimation","summary":" Development of multi-modal, probabilistic prediction models has lead to a\nneed for comprehensive evaluation metrics. While several metrics can\ncharacterize the accuracy of machine-learned models (e.g., negative\nlog-likelihood, Jensen-Shannon divergence), these metrics typically operate on\nprobability densities. Applying them to purely sample-based prediction models\nthus requires that the underlying density function is estimated. However,\ncommon methods such as kernel density estimation (KDE) have been demonstrated\nto lack robustness, while more complex methods have not been evaluated in\nmulti-modal estimation problems. In this paper, we present ROME (RObust\nMulti-modal density Estimator), a non-parametric approach for density\nestimation which addresses the challenge of estimating multi-modal, non-normal,\nand highly correlated distributions. ROME utilizes clustering to segment a\nmulti-modal set of samples into multiple uni-modal ones and then combines\nsimple KDE estimates obtained for individual clusters in a single multi-modal\nestimate. We compared our approach to state-of-the-art methods for density\nestimation as well as ablations of ROME, showing that it not only outperforms\nestablished methods but is also more robust to a variety of distributions. Our\nresults demonstrate that ROME can overcome the issues of over-fitting and\nover-smoothing exhibited by other estimators, promising a more robust\nevaluation of probabilistic machine learning models.\n","authors":["Anna Mészáros","Julian F. Schumann","Javier Alonso-Mora","Arkady Zgonnikov","Jens Kober"],"pdf_url":"https://arxiv.org/pdf/2401.10566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10559v1","updated":"2024-01-19T08:50:54Z","published":"2024-01-19T08:50:54Z","title":"OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy","summary":" We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel\nmulti-adapter method, OrchMoE, which capitalizes on modular skill architecture\nfor enhanced forward transfer in neural networks. Unlike prior models that\ndepend on explicit task identification inputs, OrchMoE automatically discerns\ntask categories, streamlining the learning process. This is achieved through an\nintegrated mechanism comprising an Automatic Task Classification module and a\nTask-Skill Allocation module, which collectively deduce task-specific\nclassifications and tailor skill allocation matrices. Our extensive evaluations\non the 'Super Natural Instructions' dataset, featuring 1,600 diverse\ninstructional tasks, indicate that OrchMoE substantially outperforms comparable\nmulti-adapter baselines in terms of both performance and sample utilization\nefficiency, all while operating within the same parameter constraints. 
These\nfindings suggest that OrchMoE offers a significant leap forward in multi-task\nlearning efficiency.\n","authors":["Haowen Wang","Tao Sun","Kaixiang Ji","Jian Wang","Cong Fan","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2401.10559v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.10549v1","updated":"2024-01-19T08:26:44Z","published":"2024-01-19T08:26:44Z","title":"Unified View Imputation and Feature Selection Learning for Incomplete\n Multi-view Data","summary":" Although multi-view unsupervised feature selection (MUFS) is an effective\ntechnology for reducing dimensionality in machine learning, existing methods\ncannot directly deal with incomplete multi-view data where some samples are\nmissing in certain views. These methods should first apply predetermined values\nto impute missing data, then perform feature selection on the complete dataset.\nSeparating imputation and feature selection processes fails to capitalize on\nthe potential synergy where local structural information gleaned from feature\nselection could guide the imputation, thereby improving the feature selection\nperformance in turn. Additionally, previous methods only focus on leveraging\nsamples' local structure information, while ignoring the intrinsic locality of\nthe feature space. To tackle these problems, a novel MUFS method, called\nUNified view Imputation and Feature selectIon lEaRning (UNIFIER), is proposed.\nUNIFIER explores the local structure of multi-view data by adaptively learning\nsimilarity-induced graphs from both the sample and feature spaces. Then,\nUNIFIER dynamically recovers the missing views, guided by the sample and\nfeature similarity graphs during the feature selection procedure. Furthermore,\nthe half-quadratic minimization technique is used to automatically weight\ndifferent instances, alleviating the impact of outliers and unreliable restored\ndata. Comprehensive experimental results demonstrate that UNIFIER outperforms\nother state-of-the-art methods.\n","authors":["Yanyong Huang","Zongxin Shen","Tianrui Li","Fengmao Lv"],"pdf_url":"https://arxiv.org/pdf/2401.10549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10547v1","updated":"2024-01-19T08:13:10Z","published":"2024-01-19T08:13:10Z","title":"PhoGAD: Graph-based Anomaly Behavior Detection with Persistent Homology\n Optimization","summary":" A multitude of toxic online behaviors, ranging from network attacks to\nanonymous traffic and spam, have severely disrupted the smooth operation of\nnetworks. Due to the inherent sender-receiver nature of network behaviors,\ngraph-based frameworks are commonly used for detecting anomalous behaviors.\nHowever, in real-world scenarios, the boundary between normal and anomalous\nbehaviors tends to be ambiguous. The local heterophily of graphs interferes\nwith the detection, and existing methods based on nodes or edges introduce\nunwanted noise into representation results, thereby impacting the effectiveness\nof detection. To address these issues, we propose PhoGAD, a graph-based anomaly\ndetection framework. PhoGAD leverages persistent homology optimization to\nclarify behavioral boundaries. Building upon this, the weights of adjacent\nedges are designed to mitigate the effects of local heterophily. Subsequently,\nto tackle the noise problem, we conduct a formal analysis and propose a\ndisentangled representation-based explicit embedding method, ultimately\nachieving anomaly behavior detection. 
Experiments on intrusion, traffic, and\nspam datasets verify that PhoGAD has surpassed the performance of\nstate-of-the-art (SOTA) frameworks in detection efficacy. Notably, PhoGAD\ndemonstrates robust detection even with diminished anomaly proportions,\nhighlighting its applicability to real-world scenarios. The analysis of\npersistent homology demonstrates its effectiveness in capturing the topological\nstructure formed by normal edge features. Additionally, ablation experiments\nvalidate the effectiveness of the innovative mechanisms integrated within\nPhoGAD.\n","authors":["Ziqi Yuan","Haoyi Zhou","Tianyu Chen","Jianxin Li"],"pdf_url":"https://arxiv.org/pdf/2401.10547v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2401.08169v2","updated":"2024-01-19T07:48:24Z","published":"2024-01-16T07:18:47Z","title":"Statistical Test for Attention Map in Vision Transformer","summary":" The Vision Transformer (ViT) demonstrates exceptional performance in various\ncomputer vision tasks. Attention is crucial for ViT to capture complex\nwide-ranging relationships among image patches, allowing the model to weigh the\nimportance of image patches and aiding our understanding of the decision-making\nprocess. However, when utilizing the attention of ViT as evidence in\nhigh-stakes decision-making tasks such as medical diagnostics, a challenge\narises due to the potential of attention mechanisms erroneously focusing on\nirrelevant regions. In this study, we propose a statistical test for ViT's\nattentions, enabling us to use the attentions as reliable quantitative evidence\nindicators for ViT's decision-making with a rigorously controlled error rate.\nUsing the framework called selective inference, we quantify the statistical\nsignificance of attentions in the form of p-values, which enables the\ntheoretically grounded quantification of the false positive detection\nprobability of attentions. We demonstrate the validity and the effectiveness of\nthe proposed method through numerical experiments and applications to brain\nimage diagnoses.\n","authors":["Tomohiro Shiraishi","Daiki Miwa","Teruyuki Katsuoka","Vo Nguyen Le Duy","Kouichi Taji","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2401.08169v2.pdf","comment":"42pages, 17figures"},{"id":"http://arxiv.org/abs/2401.10541v1","updated":"2024-01-19T07:44:32Z","published":"2024-01-19T07:44:32Z","title":"I-SplitEE: Image classification in Split Computing DNNs with Early Exits","summary":" The recent advances in Deep Neural Networks (DNNs) stem from their\nexceptional performance across various domains. However, their inherent large\nsize hinders deploying these networks on resource-constrained devices like\nedge, mobile, and IoT platforms. Strategies have emerged, from partial cloud\ncomputation offloading (split computing) to integrating early exits within DNN\nlayers. Our work presents an innovative unified approach merging early exits\nand split computing. We determine the 'splitting layer', the optimal depth in\nthe DNN for edge device computations, and whether to infer on edge device or be\noffloaded to the cloud for inference considering accuracy, computational\nefficiency, and communication costs. Also, Image classification faces diverse\nenvironmental distortions, influenced by factors like time of day, lighting,\nand weather. To adapt to these distortions, we introduce I-SplitEE, an online\nunsupervised algorithm ideal for scenarios lacking ground truths and with\nsequential data. 
Experimental validation using Caltech-256 and Cifar-10\ndatasets subjected to varied distortions showcases I-SplitEE's ability to\nreduce costs by a minimum of 55% with marginal performance degradation of at\nmost 5%.\n","authors":["Divya Jyoti Bajpai","Aastha Jaiswal","Manjesh Kumar Hanawal"],"pdf_url":"https://arxiv.org/pdf/2401.10541v1.pdf","comment":"To appear in proceedings of IEEE International Conference on\n Communications 2024"},{"id":"http://arxiv.org/abs/2401.10535v1","updated":"2024-01-19T07:21:45Z","published":"2024-01-19T07:21:45Z","title":"The \"Colonial Impulse\" of Natural Language Processing: An Audit of\n Bengali Sentiment Analysis Tools and Their Identity-based Biases","summary":" While colonization has sociohistorically impacted people's identities across\nvarious dimensions, those colonial values and biases continue to be perpetuated\nby sociotechnical systems. One category of sociotechnical systems--sentiment\nanalysis tools--can also perpetuate colonial values and bias, yet less\nattention has been paid to how such tools may be complicit in perpetuating\ncoloniality, although they are often used to guide various practices (e.g.,\ncontent moderation). In this paper, we explore potential bias in sentiment\nanalysis tools in the context of Bengali communities that have experienced and\ncontinue to experience the impacts of colonialism. Drawing on identity\ncategories most impacted by colonialism amongst local Bengali communities, we\nfocused our analytic attention on gender, religion, and nationality. We\nconducted an algorithmic audit of all sentiment analysis tools for Bengali,\navailable on the Python package index (PyPI) and GitHub. Despite similar\nsemantic content and structure, our analyses showed that in addition to\ninconsistencies in output from different tools, Bengali sentiment analysis\ntools exhibit bias between different identity categories and respond\ndifferently to different ways of identity expression. Connecting our findings\nwith colonially shaped sociocultural structures of Bengali communities, we\ndiscuss the implications of downstream bias of sentiment analysis tools.\n","authors":["Dipto Das","Shion Guha","Jed Brubaker","Bryan Semaan"],"pdf_url":"https://arxiv.org/pdf/2401.10535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10529v1","updated":"2024-01-19T07:10:13Z","published":"2024-01-19T07:10:13Z","title":"Mementos: A Comprehensive Benchmark for Multimodal Large Language Model\n Reasoning over Image Sequences","summary":" Multimodal Large Language Models (MLLMs) have demonstrated proficiency in\nhandling a variety of visual-language tasks. However, current MLLM benchmarks\nare predominantly designed to evaluate reasoning based on static information\nabout a single image, and the ability of modern MLLMs to extrapolate from image\nsequences, which is essential for understanding our ever-changing world, has\nbeen less investigated. To address this challenge, this paper introduces\nMementos, a new benchmark designed to assess MLLMs' sequential image reasoning\nabilities. Mementos features 4,761 diverse image sequences with varying\nlengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning\nperformance. 
Through a careful evaluation of nine recent MLLMs on Mementos,\nincluding GPT-4V and Gemini, we find that they struggle to accurately describe\ndynamic information about given image sequences, often leading to\nhallucinations/misrepresentations of objects and their corresponding behaviors.\nOur quantitative analysis and case studies identify three key factors impacting\nMLLMs' sequential image reasoning: the correlation between object and\nbehavioral hallucinations, the influence of cooccurring behaviors, and the\ncompounding impact of behavioral hallucinations. Our dataset is available at\nhttps://github.com/umd-huang-lab/Mementos.\n","authors":["Xiyao Wang","Yuhang Zhou","Xiaoyu Liu","Hongjin Lu","Yuancheng Xu","Feihong He","Jaehong Yoon","Taixi Lu","Gedas Bertasius","Mohit Bansal","Huaxiu Yao","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.10529v1.pdf","comment":"27 pages, 23 figures"},{"id":"http://arxiv.org/abs/2401.10522v1","updated":"2024-01-19T06:56:09Z","published":"2024-01-19T06:56:09Z","title":"FARe: Fault-Aware GNN Training on ReRAM-based PIM Accelerators","summary":" Resistive random-access memory (ReRAM)-based processing-in-memory (PIM)\narchitecture is an attractive solution for training Graph Neural Networks\n(GNNs) on edge platforms. However, the immature fabrication process and limited\nwrite endurance of ReRAMs make them prone to hardware faults, thereby limiting\ntheir widespread adoption for GNN training. Further, the existing\nfault-tolerant solutions prove inadequate for effectively training GNNs in the\npresence of faults. In this paper, we propose a fault-aware framework referred\nto as FARe that mitigates the effect of faults during GNN training. FARe\noutperforms existing approaches in terms of both accuracy and timing overhead.\nExperimental results demonstrate that FARe framework can restore GNN test\naccuracy by 47.6% on faulty ReRAM hardware with a ~1% timing overhead compared\nto the fault-free counterpart.\n","authors":["Pratyush Dhingra","Chukwufumnanya Ogbogu","Biresh Kumar Joardar","Janardhan Rao Doppa","Ananth Kalyanaraman","Partha Pratim Pande"],"pdf_url":"https://arxiv.org/pdf/2401.10522v1.pdf","comment":"This paper has been accepted to the conference DATE (Design,\n Automation and Test in Europe) - 2024"},{"id":"http://arxiv.org/abs/2401.10518v1","updated":"2024-01-19T06:26:05Z","published":"2024-01-19T06:26:05Z","title":"Spatial-temporal Forecasting for Regions without Observations","summary":" Spatial-temporal forecasting plays an important role in many real-world\napplications, such as traffic forecasting, air pollutant forecasting,\ncrowd-flow forecasting, and so on. State-of-the-art spatial-temporal\nforecasting models take data-driven approaches and rely heavily on data\navailability. Such models suffer from accuracy issues when data is incomplete,\nwhich is common in reality due to the heavy costs of deploying and maintaining\nsensors for data collection. A few recent studies attempted to address the\nissue of incomplete data. They typically assume some data availability in a\nregion of interest either for a short period or at a few locations. In this\npaper, we further study spatial-temporal forecasting for a region of interest\nwithout any historical observations, to address scenarios such as unbalanced\nregion development, progressive deployment of sensors or lack of open data. We\npropose a model named STSM for the task. 
The model takes a contrastive\nlearning-based approach to learn spatial-temporal patterns from adjacent\nregions that have recorded data. Our key insight is to learn from the locations\nthat resemble those in the region of interest, and we propose a selective\nmasking strategy to enable the learning. As a result, our model outperforms\nadapted state-of-the-art models, reducing errors consistently over both traffic\nand air pollutant forecasting tasks. The source code is available at\nhttps://github.com/suzy0223/STSM.\n","authors":["Xinyu Su","Jianzhong Qi","Egemen Tanin","Yanchuan Chang","Majid Sarvi"],"pdf_url":"https://arxiv.org/pdf/2401.10518v1.pdf","comment":"Accepted by EDBT2024"},{"id":"http://arxiv.org/abs/2401.07494v2","updated":"2024-01-19T06:16:59Z","published":"2024-01-15T06:26:53Z","title":"Input Convex Lipschitz RNN: A Fast and Robust Approach for Engineering\n Tasks","summary":" Computational efficiency and adversarial robustness are critical factors in\nreal-world engineering applications. Yet, conventional neural networks often\nfall short in addressing both simultaneously, or even separately. Drawing\ninsights from natural physical systems and existing literature, it is known\nthat an input convex architecture enhances computational efficiency, while a\nLipschitz-constrained architecture bolsters adversarial robustness. By\nleveraging the strengths of convexity and Lipschitz continuity, we develop a\nnovel network architecture, termed Input Convex Lipschitz Recurrent Neural\nNetworks. This model outperforms existing recurrent units across a spectrum of\nengineering tasks in terms of computational efficiency and adversarial\nrobustness. These tasks encompass a benchmark MNIST image classification,\nreal-world solar irradiance prediction for Solar PV system planning at LHT\nHoldings in Singapore, and real-time Model Predictive Control optimization for\na chemical reactor.\n","authors":["Zihao Wang","P S Pravin","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2401.07494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10516v1","updated":"2024-01-19T06:14:36Z","published":"2024-01-19T06:14:36Z","title":"Episodic Reinforcement Learning with Expanded State-reward Space","summary":" Empowered by deep neural networks, deep reinforcement learning (DRL) has\ndemonstrated tremendous empirical successes in various domains, including\ngames, health care, and autonomous driving. Despite these advancements, DRL is\nstill identified as data-inefficient as effective policies demand vast numbers\nof environmental samples. Recently, episodic control (EC)-based model-free DRL\nmethods enable sample efficiency by recalling past experiences from episodic\nmemory. However, existing EC-based methods suffer from the limitation of\npotential misalignment between the state and reward spaces for neglecting the\nutilization of (past) retrieval states with extensive information, which\nprobably causes inaccurate value estimation and degraded policy performance. To\ntackle this issue, we introduce an efficient EC-based DRL framework with\nexpanded state-reward space, where the expanded states used as the input and\nthe expanded rewards used in the training both contain historical and current\ninformation. To be specific, we reuse the historical states retrieved by EC as\npart of the input states and integrate the retrieved MC-returns into the\nimmediate reward in each interactive transition. 
As a result, our method is\nable to simultaneously achieve the full utilization of retrieval information\nand the better evaluation of state values by a Temporal Difference (TD) loss.\nEmpirical results on challenging Box2d and Mujoco tasks demonstrate the\nsuperiority of our method over a recent sibling method and common baselines.\nFurther, we also verify our method's effectiveness in alleviating Q-value\noverestimation by additional experiments of Q-value comparison.\n","authors":["Dayang Liang","Yaru Zhang","Yunlong Liu"],"pdf_url":"https://arxiv.org/pdf/2401.10516v1.pdf","comment":"Accepted at AAMAS'24"},{"id":"http://arxiv.org/abs/2310.05492v3","updated":"2024-01-19T06:06:46Z","published":"2023-10-09T07:56:16Z","title":"How Abilities in Large Language Models are Affected by Supervised\n Fine-tuning Data Composition","summary":" Large language models (LLMs) with enormous pre-training tokens and parameters\nemerge diverse abilities, including math reasoning, code generation, and\ninstruction following. These abilities are further enhanced by supervised\nfine-tuning (SFT). While the open-source community has explored ad-hoc SFT for\nenhancing individual capabilities, proprietary LLMs exhibit versatility across\nvarious skills. Therefore, understanding the facilitation of multiple abilities\nvia SFT is paramount. In this study, we specifically focuses on the interplay\nof data composition between mathematical reasoning, code generation, and\ngeneral human-aligning abilities during SFT. We propose four intriguing\nresearch questions to explore the association between model performance and\nvarious factors including data amount, composition ratio, model size and SFT\nstrategies. Our experiments reveal that distinct capabilities scale differently\nand larger models generally show superior performance with same amount of data.\nMathematical reasoning and code generation consistently improve with increasing\ndata amount, whereas general abilities plateau after roughly a thousand\nsamples. Moreover, we observe data composition appears to enhance various\nabilities under limited data conditions, yet can lead to performance conflicts\nwhen data is plentiful. Our findings also suggest the amount of composition\ndata influences performance more than the composition ratio. In analysis of SFT\nstrategies, we find that sequentially learning multiple skills risks\ncatastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT)\nstrategy offers a promising solution to learn multiple abilities with different\nscaling patterns.\n","authors":["Guanting Dong","Hongyi Yuan","Keming Lu","Chengpeng Li","Mingfeng Xue","Dayiheng Liu","Wei Wang","Zheng Yuan","Chang Zhou","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.05492v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10510v1","updated":"2024-01-19T05:58:30Z","published":"2024-01-19T05:58:30Z","title":"A match made in consistency heaven: when large language models meet\n evolutionary algorithms","summary":" Pre-trained large language models (LLMs) have powerful capabilities for\ngenerating creative natural text. Evolutionary algorithms (EAs) can discover\ndiverse solutions to complex real-world problems. 
Motivated by the common\ncollective and directionality of text sequence generation and evolution, this\npaper illustrates the strong consistency of LLMs and EAs, which includes\nmultiple one-to-one key characteristics: token embedding and genotype-phenotype\nmapping, position encoding and fitness shaping, position embedding and\nselection, attention and crossover, feed-forward neural network and mutation,\nmodel training and parameter update, and multi-task learning and\nmulti-objective optimization. Based on this consistency perspective, existing\ncoupling studies are analyzed, including evolutionary fine-tuning and\nLLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap\nfor future research in coupling LLMs and EAs, while highlighting key challenges\nalong the way. The consistency not only reveals the evolution mechanism behind\nLLMs but also facilitates the development of evolved artificial agents that\napproach or surpass biological organisms.\n","authors":["Wang Chao","Jiaxuan Zhao","Licheng Jiao","Lingling Li","Fang Liu","Shuyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2401.10510v1.pdf","comment":"A perspective article under review"},{"id":"http://arxiv.org/abs/2311.07202v3","updated":"2024-01-19T05:54:53Z","published":"2023-11-13T09:41:32Z","title":"Input Convex LSTM: A Convex Approach for Fast Lyapunov-Based Model\n Predictive Control","summary":" Leveraging Input Convex Neural Networks (ICNNs), ICNN-based Model Predictive\nControl (MPC) successfully attains globally optimal solutions by upholding\nconvexity within the MPC framework. However, current ICNN architectures\nencounter the issue of vanishing/exploding gradients, which limits their\nability to serve as deep neural networks for complex tasks. Additionally, the\ncurrent neural network-based MPC, including conventional neural network-based\nMPC and ICNN-based MPC, faces slower convergence speed when compared to MPC\nbased on first-principles models. In this study, we leverage the principles of\nICNNs to propose a novel Input Convex LSTM for Lyapunov-based MPC, with the\nspecific goal of reducing convergence time and mitigating the\nvanishing/exploding gradient problem while ensuring closed-loop stability. From\na simulation study of a nonlinear chemical reactor, we observed a mitigation of\nvanishing/exploding gradient problem and a reduction in convergence time, with\na percentage decrease of 46.7%, 31.3%, and 20.2% compared to baseline plain\nRNN, plain LSTM, and Input Convex Recurrent Neural Networks, respectively.\n","authors":["Zihao Wang","Zhe Wu"],"pdf_url":"https://arxiv.org/pdf/2311.07202v3.pdf","comment":"Submitted to 6th Annual Learning for Dynamics & Control Conference\n (L4DC 2024)"},{"id":"http://arxiv.org/abs/2401.08216v2","updated":"2024-01-19T05:31:07Z","published":"2024-01-16T09:02:34Z","title":"Towards Efficient and Certified Recovery from Poisoning Attacks in\n Federated Learning","summary":" Federated learning (FL) is vulnerable to poisoning attacks, where malicious\nclients manipulate their updates to affect the global model. Although various\nmethods exist for detecting those clients in FL, identifying malicious clients\nrequires sufficient model updates, and hence by the time malicious clients are\ndetected, FL models have been already poisoned. 
Thus, a method is needed to\nrecover an accurate global model after malicious clients are identified.\nCurrent recovery methods rely on (i) all historical information from\nparticipating FL clients and (ii) the initial model unaffected by the malicious\nclients, leading to a high demand for storage and computational resources. In\nthis paper, we show that highly effective recovery can still be achieved based\non (i) selective historical information rather than all historical information\nand (ii) a historical model that has not been significantly affected by\nmalicious clients rather than the initial model. In this scenario, while\nmaintaining comparable recovery performance, we can accelerate the recovery\nspeed and decrease memory consumption. Following this concept, we introduce\nCrab, an efficient and certified recovery method, which relies on selective\ninformation storage and adaptive model rollback. Theoretically, we demonstrate\nthat the difference between the global model recovered by Crab and the one\nrecovered by train-from-scratch can be bounded under certain assumptions. Our\nempirical evaluation, conducted across three datasets over multiple machine\nlearning models, and a variety of untargeted and targeted poisoning attacks\nreveals that Crab is both accurate and efficient, and consistently outperforms\nprevious approaches in terms of both recovery speed and memory consumption.\n","authors":["Yu Jiang","Jiyuan Shen","Ziyao Liu","Chee Wei Tan","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2401.08216v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10495v1","updated":"2024-01-19T05:18:28Z","published":"2024-01-19T05:18:28Z","title":"Causal Layering via Conditional Entropy","summary":" Causal discovery aims to recover information about an unobserved causal graph\nfrom the observable data it generates. Layerings are orderings of the variables\nwhich place causes before effects. In this paper, we provide ways to recover\nlayerings of a graph by accessing the data via a conditional entropy oracle,\nwhen distributions are discrete. Our algorithms work by repeatedly removing\nsources or sinks from the graph. Under appropriate assumptions and\nconditioning, we can separate the sources or sinks from the remainder of the\nnodes by comparing their conditional entropy to the unconditional entropy of\ntheir noise. Our algorithms are provably correct and run in worst-case\nquadratic time. The main assumptions are faithfulness and injective noise, and\neither known noise entropies or weakly monotonically increasing noise entropies\nalong directed paths. In addition, we require one of either a very mild\nextension of faithfulness, or strictly monotonically increasing noise\nentropies, or expanding noise injectivity to include an additional single\nargument in the structural functions.\n","authors":["Itai Feigenbaum","Devansh Arpit","Huan Wang","Shelby Heinecke","Juan Carlos Niebles","Weiran Yao","Caiming Xiong","Silvio Savarese"],"pdf_url":"https://arxiv.org/pdf/2401.10495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10490v1","updated":"2024-01-19T05:01:43Z","published":"2024-01-19T05:01:43Z","title":"Generalization Error Guaranteed Auto-Encoder-Based Nonlinear Model\n Reduction for Operator Learning","summary":" Many physical processes in science and engineering are naturally represented\nby operators between infinite-dimensional function spaces. 
The problem of\noperator learning, in this context, seeks to extract these physical processes\nfrom empirical data, which is challenging due to the infinite or high\ndimensionality of data. An integral component in addressing this challenge is\nmodel reduction, which reduces both the data dimensionality and problem size.\nIn this paper, we utilize low-dimensional nonlinear structures in model\nreduction by investigating Auto-Encoder-based Neural Network (AENet). AENet\nfirst learns the latent variables of the input data and then learns the\ntransformation from these latent variables to corresponding output data. Our\nnumerical experiments validate the ability of AENet to accurately learn the\nsolution operator of nonlinear partial differential equations. Furthermore, we\nestablish a mathematical and statistical estimation theory that analyzes the\ngeneralization error of AENet. Our theoretical framework shows that the sample\ncomplexity of training AENet is intricately tied to the intrinsic dimension of\nthe modeled process, while also demonstrating the remarkable resilience of\nAENet to noise.\n","authors":["Hao Liu","Biraj Dahal","Rongjie Lai","Wenjing Liao"],"pdf_url":"https://arxiv.org/pdf/2401.10490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06120v3","updated":"2024-01-19T04:13:33Z","published":"2023-02-13T06:00:56Z","title":"Knowledge from Large-Scale Protein Contact Prediction Models Can Be\n Transferred to the Data-Scarce RNA Contact Prediction Task","summary":" RNA, whose functionality is largely determined by its structure, plays an\nimportant role in many biological activities. The prediction of pairwise\nstructural proximity between each nucleotide of an RNA sequence can\ncharacterize the structural information of the RNA. Historically, this problem\nhas been tackled by machine learning models using expert-engineered features\nand trained on scarce labeled datasets. Here, we find that the knowledge\nlearned by a protein-coevolution Transformer-based deep neural network can be\ntransferred to the RNA contact prediction task. As protein datasets are orders\nof magnitude larger than those for RNA contact prediction, our findings and the\nsubsequent framework greatly reduce the data scarcity bottleneck. Experiments\nconfirm that RNA contact prediction through transfer learning using a publicly\navailable protein model is greatly improved. Our findings indicate that the\nlearned structural patterns of proteins can be transferred to RNAs, opening up\npotential new avenues for research.\n","authors":["Yiren Jian","Chongyang Gao","Chen Zeng","Yunjie Zhao","Soroush Vosoughi"],"pdf_url":"https://arxiv.org/pdf/2302.06120v3.pdf","comment":"The code is available at\n https://github.com/yiren-jian/CoT-RNA-Transfer"},{"id":"http://arxiv.org/abs/2401.10478v1","updated":"2024-01-19T04:02:49Z","published":"2024-01-19T04:02:49Z","title":"Budgeted Online Model Selection and Fine-Tuning via Federated Learning","summary":" Online model selection involves selecting a model from a set of candidate\nmodels 'on the fly' to perform prediction on a stream of data. The choice of\ncandidate models henceforth has a crucial impact on the performance. Although\nemploying a larger set of candidate models naturally leads to more flexibility\nin model selection, this may be infeasible in cases where prediction tasks are\nperformed on edge devices with limited memory. 
Faced with this challenge, the\npresent paper proposes an online federated model selection framework where a\ngroup of learners (clients) interacts with a server with sufficient memory such\nthat the server stores all candidate models. However, each client only chooses\nto store a subset of models that can be fit into its memory and performs its\nown prediction task using one of the stored models. Furthermore, employing the\nproposed algorithm, clients and the server collaborate to fine-tune models to\nadapt them to a non-stationary environment. Theoretical analysis proves that\nthe proposed algorithm enjoys sub-linear regret with respect to the best model\nin hindsight. Experiments on real datasets demonstrate the effectiveness of the\nproposed algorithm.\n","authors":["Pouya M. Ghari","Yanning Shen"],"pdf_url":"https://arxiv.org/pdf/2401.10478v1.pdf","comment":"Accepted by Transactions on Machine Learning Research (TMLR)"},{"id":"http://arxiv.org/abs/2401.10474v1","updated":"2024-01-19T03:50:19Z","published":"2024-01-19T03:50:19Z","title":"LDReg: Local Dimensionality Regularized Self-Supervised Learning","summary":" Representations learned via self-supervised learning (SSL) can be susceptible\nto dimensional collapse, where the learned representation subspace is of\nextremely low dimensionality and thus fails to represent the full data\ndistribution and modalities. Dimensional collapse also known as the\n\"underfilling\" phenomenon is one of the major causes of degraded performance on\ndownstream tasks. Previous work has investigated the dimensional collapse\nproblem of SSL at a global level. In this paper, we demonstrate that\nrepresentations can span over high dimensional space globally, but collapse\nlocally. To address this, we propose a method called $\\textit{local\ndimensionality regularization (LDReg)}$. Our formulation is based on the\nderivation of the Fisher-Rao metric to compare and optimize local distance\ndistributions at an asymptotically small radius for each data point. By\nincreasing the local intrinsic dimensionality, we demonstrate through a range\nof experiments that LDReg improves the representation quality of SSL. The\nresults also show that LDReg can regularize dimensionality at both local and\nglobal levels.\n","authors":["Hanxun Huang","Ricardo J. G. B. Campello","Sarah Monazam Erfani","Xingjun Ma","Michael E. Houle","James Bailey"],"pdf_url":"https://arxiv.org/pdf/2401.10474v1.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2401.10467v1","updated":"2024-01-19T03:39:43Z","published":"2024-01-19T03:39:43Z","title":"Learning Backdoors for Mixed Integer Programs with Contrastive Learning","summary":" Many real-world problems can be efficiently modeled as Mixed Integer Programs\n(MIPs) and solved with the Branch-and-Bound method. Prior work has shown the\nexistence of MIP backdoors, small sets of variables such that prioritizing\nbranching on them when possible leads to faster running times. However, finding\nhigh-quality backdoors that improve running times remains an open question.\nPrevious work learns to estimate the relative solver speed of randomly sampled\nbackdoors through ranking and then decide whether to use it. In this paper, we\nutilize the Monte-Carlo tree search method to collect backdoors for training,\nrather than relying on random sampling, and adapt a contrastive learning\nframework to train a Graph Attention Network model to predict backdoors. 
Our\nmethod, evaluated on four common MIP problem domains, demonstrates performance\nimprovements over both Gurobi and previous models.\n","authors":["Junyang Cai","Taoan Huang","Bistra Dilkina"],"pdf_url":"https://arxiv.org/pdf/2401.10467v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05225v2","updated":"2024-01-19T03:34:11Z","published":"2023-12-08T18:20:43Z","title":"Neural Spectral Methods: Self-supervised learning in the spectral domain","summary":" We present Neural Spectral Methods, a technique to solve parametric Partial\nDifferential Equations (PDEs), grounded in classical spectral methods. Our\nmethod uses orthogonal bases to learn PDE solutions as mappings between\nspectral coefficients. In contrast to current machine learning approaches which\nenforce PDE constraints by minimizing the numerical quadrature of the residuals\nin the spatiotemporal domain, we leverage Parseval's identity and introduce a\nnew training strategy through a \\textit{spectral loss}. Our spectral loss\nenables more efficient differentiation through the neural network, and\nsubstantially reduces training complexity. At inference time, the computational\ncost of our method remains constant, regardless of the spatiotemporal\nresolution of the domain. Our experimental results demonstrate that our method\nsignificantly outperforms previous machine learning approaches in terms of\nspeed and accuracy by one to two orders of magnitude on multiple different\nproblems. When compared to numerical solvers of the same accuracy, our method\ndemonstrates a $10\\times$ increase in performance speed.\n","authors":["Yiheng Du","Nithin Chalapathi","Aditi Krishnapriyan"],"pdf_url":"https://arxiv.org/pdf/2312.05225v2.pdf","comment":"Accepted to International Conference on Learning Representations\n (ICLR) 2024"},{"id":"http://arxiv.org/abs/2401.10463v1","updated":"2024-01-19T03:24:36Z","published":"2024-01-19T03:24:36Z","title":"Critical Data Size of Language Models from a Grokking Perspective","summary":" We explore the critical data size in language models, a threshold that marks\na fundamental shift from quick memorization to slow generalization. We\nformalize the phase transition under the grokking configuration into the Data\nEfficiency Hypothesis and identify data insufficiency, sufficiency, and surplus\nregimes in language models training dynamics. We develop a grokking\nconfiguration to reproduce grokking on simplistic language models stably by\nrescaling initialization and weight decay. We show that generalization occurs\nonly when language models reach a critical size. We analyze grokking across\nsample-wise and model-wise, verifying the proposed data efficiency hypothesis.\nOur experiments reveal smoother phase transitions occurring at the critical\ndataset size for language datasets. As the model size increases, this critical\npoint also becomes larger, indicating that larger models require more data. 
Our\nresults deepen the understanding of language model training, offering a novel\nperspective on the role of data in the learning mechanism of language models.\n","authors":["Xuekai Zhu","Yao Fu","Bowen Zhou","Zhouhan Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10463v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.11171v4","updated":"2024-01-19T03:23:21Z","published":"2023-04-21T03:26:29Z","title":"Granular-ball computing: an efficient, robust, and interpretable\n adaptive multi-granularity representation and computation method","summary":" Human cognition operates on a \"Global-first\" cognitive mechanism,\nprioritizing information processing based on coarse-grained details. This\nmechanism inherently possesses an adaptive multi-granularity description\ncapacity, resulting in computational traits such as efficiency, robustness, and\ninterpretability. The analysis pattern reliance on the finest granularity and\nsingle-granularity makes most existing computational methods less efficient,\nrobust, and interpretable, which is an important reason for the current lack of\ninterpretability in neural networks. Multi-granularity granular-ball computing\nemploys granular-balls of varying sizes to daptively represent and envelop the\nsample space, facilitating learning based on these granular-balls. Given that\nthe number of coarse-grained \"granular-balls\" is fewer than sample points,\ngranular-ball computing proves more efficient. Moreover, the inherent\ncoarse-grained nature of granular-balls reduces susceptibility to fine-grained\nsample disturbances, enhancing robustness. The multi-granularity construct of\ngranular-balls generates topological structures and coarse-grained\ndescriptions, naturally augmenting interpretability. Granular-ball computing\nhas successfully ventured into diverse AI domains, fostering the development of\ninnovative theoretical methods, including granular-ball classifiers, clustering\ntechniques, neural networks, rough sets, and evolutionary computing. This has\nnotably ameliorated the efficiency, noise robustness, and interpretability of\ntraditional methods. Overall, granular-ball computing is a rare and innovative\ntheoretical approach in AI that can adaptively and simultaneously enhance\nefficiency, robustness, and interpretability. This article delves into the main\napplication landscapes for granular-ball computing, aiming to equip future\nresearchers with references and insights to refine and expand this promising\ntheory.\n","authors":["Shuyin Xia","Guoyin Wang","Xinbo Gao","Xiaoyu Lian"],"pdf_url":"https://arxiv.org/pdf/2304.11171v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01521v2","updated":"2024-01-19T03:21:28Z","published":"2022-12-03T03:39:44Z","title":"Distribution Fitting for Combating Mode Collapse in Generative\n Adversarial Networks","summary":" Mode collapse is a significant unsolved issue of generative adversarial\nnetworks. In this work, we examine the causes of mode collapse from a novel\nperspective. Due to the nonuniform sampling in the training process, some\nsub-distributions may be missed when sampling data. As a result, even when the\ngenerated distribution differs from the real one, the GAN objective can still\nachieve the minimum. To address the issue, we propose a global distribution\nfitting (GDF) method with a penalty term to confine the generated data\ndistribution. 
When the generated distribution differs from the real one, GDF\nwill make the objective harder to reach the minimal value, while the original\nglobal minimum is not changed. To deal with the circumstance when the overall\nreal data is unreachable, we also propose a local distribution fitting (LDF)\nmethod. Experiments on several benchmarks demonstrate the effectiveness and\ncompetitive performance of GDF and LDF.\n","authors":["Yanxiang Gong","Zhiwei Xie","Guozhen Duan","Zheng Ma","Mei Xie"],"pdf_url":"https://arxiv.org/pdf/2212.01521v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18426v3","updated":"2024-01-19T02:56:41Z","published":"2023-11-30T10:24:07Z","title":"Convergence Analysis of Fractional Gradient Descent","summary":" Fractional derivatives are a well-studied generalization of integer order\nderivatives. Naturally, for optimization, it is of interest to understand the\nconvergence properties of gradient descent using fractional derivatives.\nConvergence analysis of fractional gradient descent is currently limited both\nin the methods analyzed and the settings analyzed. This paper aims to fill in\nthese gaps by analyzing variations of fractional gradient descent in smooth and\nconvex, smooth and strongly convex, and smooth and non-convex settings. First,\nnovel bounds will be established bridging fractional and integer derivatives.\nThen, these bounds will be applied to the aforementioned settings to prove\nlinear convergence for smooth and strongly convex functions and $O(1/T)$\nconvergence for smooth and convex functions. Additionally, we prove $O(1/T)$\nconvergence for smooth and non-convex functions using an extended notion of\nsmoothness - H\\\"older smoothness - that is more natural for fractional\nderivatives. Finally, empirical results will be presented on the potential\nspeed up of fractional gradient descent over standard gradient descent as well\nas the challenges of predicting which will be faster in general.\n","authors":["Ashwani Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2311.18426v3.pdf","comment":"24 pages, 4 figures. Added additional results for smooth and convex\n functions"},{"id":"http://arxiv.org/abs/2401.10460v1","updated":"2024-01-19T02:51:00Z","published":"2024-01-19T02:51:00Z","title":"Ultra-lightweight Neural Differential DSP Vocoder For High Quality\n Speech Synthesis","summary":" Neural vocoders model the raw audio waveform and synthesize high-quality\naudio, but even the highly efficient ones, like MB-MelGAN and LPCNet, fail to\nrun real-time on a low-end device like a smartglass. A pure digital signal\nprocessing (DSP) based vocoder can be implemented via lightweight fast Fourier\ntransforms (FFT), and therefore, is a magnitude faster than any neural vocoder.\nA DSP vocoder often gets a lower audio quality due to consuming over-smoothed\nacoustic model predictions of approximate representations for the vocal tract.\nIn this paper, we propose an ultra-lightweight differential DSP (DDSP) vocoder\nthat uses a jointly optimized acoustic model with a DSP vocoder, and learns\nwithout an extracted spectral feature for the vocal tract. The model achieves\naudio quality comparable to neural vocoders with a high average MOS of 4.36\nwhile being efficient as a DSP vocoder. 
Our C++ implementation, without any\nhardware-specific optimization, is at 15 MFLOPS, surpasses MB-MelGAN by 340\ntimes in terms of FLOPS, and achieves a vocoder-only RTF of 0.003 and overall\nRTF of 0.044 while running single-threaded on a 2GHz Intel Xeon CPU.\n","authors":["Prabhav Agrawal","Thilo Koehler","Zhiping Xiu","Prashant Serai","Qing He"],"pdf_url":"https://arxiv.org/pdf/2401.10460v1.pdf","comment":"Accepted for ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.03320v4","updated":"2024-01-19T02:47:51Z","published":"2023-10-05T05:30:42Z","title":"BioBridge: Bridging Biomedical Foundation Models via Knowledge Graphs","summary":" Foundation models (FMs) are able to leverage large volumes of unlabeled data\nto demonstrate superior performance across a wide range of tasks. However, FMs\ndeveloped for biomedical domains have largely remained unimodal, i.e.,\nindependently trained and used for tasks on protein sequences alone, small\nmolecule structures alone, or clinical data alone. To overcome this limitation\nof biomedical FMs, we present BioBridge, a novel parameter-efficient learning\nframework, to bridge independently trained unimodal FMs to establish multimodal\nbehavior. BioBridge achieves it by utilizing Knowledge Graphs (KG) to learn\ntransformations between one unimodal FM and another without fine-tuning any\nunderlying unimodal FMs. Our empirical results demonstrate that BioBridge can\nbeat the best baseline KG embedding methods (on average by around 76.3%) in\ncross-modal retrieval tasks. We also identify BioBridge demonstrates\nout-of-domain generalization ability by extrapolating to unseen modalities or\nrelations. Additionally, we also show that BioBridge presents itself as a\ngeneral purpose retriever that can aid biomedical multimodal question answering\nas well as enhance the guided generation of novel drugs.\n","authors":["Zifeng Wang","Zichen Wang","Balasubramaniam Srinivasan","Vassilis N. Ioannidis","Huzefa Rangwala","Rishita Anubhai"],"pdf_url":"https://arxiv.org/pdf/2310.03320v4.pdf","comment":"ICLR 2024"},{"id":"http://arxiv.org/abs/2311.15497v3","updated":"2024-01-19T02:45:44Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed improvements of up to 1.6% in test data, while maintaining the same\ninference time, and a substantial 1.0% points performance gain in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08897v2","updated":"2024-01-19T02:39:59Z","published":"2024-01-17T00:46:24Z","title":"CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in\n Variational AutoEncoder","summary":" Symmetries of input and latent vectors have provided valuable insights for\ndisentanglement learning in VAEs.However, only a few works were proposed as an\nunsupervised method, and even these works require known factor information in\ntraining data. We propose a novel method, Composite Factor-Aligned Symmetry\nLearning (CFASL), which is integrated into VAEs for learning symmetry-based\ndisentanglement in unsupervised learning without any knowledge of the dataset\nfactor information.CFASL incorporates three novel features for learning\nsymmetry-based disentanglement: 1) Injecting inductive bias to align latent\nvector dimensions to factor-aligned symmetries within an explicit learnable\nsymmetry codebook 2) Learning a composite symmetry to express unknown factors\nchange between two random samples by learning factor-aligned symmetries within\nthe codebook 3) Inducing group equivariant encoder and decoder in training VAEs\nwith the two conditions. In addition, we propose an extended evaluation metric\nfor multi-factor changes in comparison to disentanglement evaluation in VAEs.\nIn quantitative and in-depth qualitative analysis, CFASL demonstrates a\nsignificant improvement of disentanglement in single-factor change, and\nmulti-factor change conditions compared to state-of-the-art methods.\n","authors":["Hee-Jun Jung","Jaehyoung Jeong","Kangil Kim"],"pdf_url":"https://arxiv.org/pdf/2401.08897v2.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2303.03183v2","updated":"2024-01-19T02:31:58Z","published":"2023-03-03T03:17:45Z","title":"Utilizing synthetic training data for the supervised classification of\n rat ultrasonic vocalizations","summary":" Murine rodents generate ultrasonic vocalizations (USVs) with frequencies that\nextend to around 120kHz. These calls are important in social behaviour, and so\ntheir analysis can provide insights into the function of vocal communication,\nand its dysfunction. The manual identification of USVs, and subsequent\nclassification into different subcategories is time consuming. Although machine\nlearning approaches for identification and classification can lead to enormous\nefficiency gains, the time and effort required to generate training data can be\nhigh, and the accuracy of current approaches can be problematic. Here we\ncompare the detection and classification performance of a trained human against\ntwo convolutional neural networks (CNNs), DeepSqueak and VocalMat, on audio\ncontaining rat USVs. Furthermore, we test the effect of inserting synthetic\nUSVs into the training data of the VocalMat CNN as a means of reducing the\nworkload associated with generating a training set. Our results indicate that\nVocalMat outperformed the DeepSqueak CNN on measures of call identification,\nand classification. 
Additionally, we found that the augmentation of training\ndata with synthetic images resulted in a further improvement in accuracy, such\nthat it was sufficiently close to human performance to allow for the use of\nthis software in laboratory conditions.\n","authors":["K. Jack Scott","Lucinda J. Speers","David K. Bilkey"],"pdf_url":"https://arxiv.org/pdf/2303.03183v2.pdf","comment":"25 pages, 5 main figures, 2 tables"},{"id":"http://arxiv.org/abs/2302.13854v2","updated":"2024-01-19T02:19:29Z","published":"2023-02-24T04:28:46Z","title":"A Deep Neural Network Based Reverse Radio Spectrogram Search Algorithm","summary":" Modern radio astronomy instruments generate vast amounts of data, and the\nincreasingly challenging radio frequency interference (RFI) environment\nnecessitates ever-more sophisticated RFI rejection algorithms. The \"needle in a\nhaystack\" nature of searches for transients and technosignatures requires us to\ndevelop methods that can determine whether a signal of interest has unique\nproperties, or is a part of some larger set of pernicious RFI. In the past,\nthis vetting has required onerous manual inspection of very large numbers of\nsignals. In this paper we present a fast and modular deep learning algorithm to\nsearch for lookalike signals of interest in radio spectrogram data. First, we\ntrained a B-Variational Autoencoder on signals returned by an energy detection\nalgorithm. We then adapted a positional embedding layer from classical\nTransformer architecture to a embed additional metadata, which we demonstrate\nusing a frequency-based embedding. Next we used the encoder component of the\nB-Variational Autoencoder to extract features from small (~ 715,Hz, with a\nresolution of 2.79Hz per frequency bin) windows in the radio spectrogram. We\nused our algorithm to conduct a search for a given query (encoded signal of\ninterest) on a set of signals (encoded features of searched items) to produce\nthe top candidates with similar features. We successfully demonstrate that the\nalgorithm retrieves signals with similar appearance, given only the original\nradio spectrogram data. This algorithm can be used to improve the efficiency of\nvetting signals of interest in technosignature searches, but could also be\napplied to a wider variety of searches for \"lookalike\" signals in large\nastronomical datasets.\n","authors":["Peter Xiangyuan Ma","Steve Croft","Chris Lintott","Andrew P. V. Siemion"],"pdf_url":"https://arxiv.org/pdf/2302.13854v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.10458v1","updated":"2024-01-19T02:16:30Z","published":"2024-01-19T02:16:30Z","title":"Contrastive Unlearning: A Contrastive Approach to Machine Unlearning","summary":" Machine unlearning aims to eliminate the influence of a subset of training\nsamples (i.e., unlearning samples) from a trained model. Effectively and\nefficiently removing the unlearning samples without negatively impacting the\noverall model performance is still challenging. In this paper, we propose a\ncontrastive unlearning framework, leveraging the concept of representation\nlearning for more effective unlearning. It removes the influence of unlearning\nsamples by contrasting their embeddings against the remaining samples so that\nthey are pushed away from their original classes and pulled toward other\nclasses. By directly optimizing the representation space, it effectively\nremoves the influence of unlearning samples while maintaining the\nrepresentations learned from the remaining samples. 
Experiments on a variety of\ndatasets and models on both class unlearning and sample unlearning showed that\ncontrastive unlearning achieves the best unlearning effects and efficiency with\nthe lowest performance loss compared with the state-of-the-art algorithms.\n","authors":["Hong kyu Lee","Qiuchen Zhang","Carl Yang","Jian Lou","Li Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.10458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10451v1","updated":"2024-01-19T01:40:58Z","published":"2024-01-19T01:40:58Z","title":"Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian\n Optimization Approach","summary":" Solving large-scale capacity expansion problems (CEPs) is central to\ncost-effective decarbonization of regional-scale energy systems. To ensure the\nintended outcomes of CEPs, modeling uncertainty due to weather-dependent\nvariable renewable energy (VRE) supply and energy demand becomes crucially\nimportant. However, the resulting stochastic optimization models are often less\ncomputationally tractable than their deterministic counterparts. Here, we\npropose a learning-assisted approximate solution method to tractably solve\ntwo-stage stochastic CEPs. Our method identifies low-cost planning decisions by\nconstructing and solving a sequence of tractable temporally aggregated\nsurrogate problems. We adopt a Bayesian optimization approach to searching the\nspace of time series aggregation hyperparameters and compute approximate\nsolutions that minimize costs on a validation set of supply-demand projections.\nImportantly, we evaluate solved planning outcomes on a held-out set of test\nprojections. We apply our approach to generation and transmission expansion\nplanning for a joint power-gas system spanning New England. We show that our\napproach yields an estimated cost savings of up to 3.8% in comparison to\nbenchmark time series aggregation approaches.\n","authors":["Aron Brenner","Rahman Khorramfar","Dharik Mallapragada","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2401.10451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.05359v3","updated":"2024-01-19T01:30:56Z","published":"2022-05-11T09:11:02Z","title":"Exploring Local Explanations of Nonlinear Models Using Animated Linear\n Projections","summary":" The increased predictive power of machine learning models comes at the cost\nof increased complexity and loss of interpretability, particularly in\ncomparison to parametric statistical models. This trade-off has led to the\nemergence of eXplainable AI (XAI) which provides methods, such as local\nexplanations (LEs) and local variable attributions (LVAs), to shed light on how\na model use predictors to arrive at a prediction. These provide a point\nestimate of the linear variable importance in the vicinity of a single\nobservation. However, LVAs tend not to effectively handle association between\npredictors. To understand how the interaction between predictors affects the\nvariable importance estimate, we can convert LVAs into linear projections and\nuse the radial tour. This is also useful for learning how a model has made a\nmistake, or the effect of outliers, or the clustering of observations. The\napproach is illustrated with examples from categorical (penguin species,\nchocolate types) and quantitative (soccer/football salaries, house prices)\nresponse models. 
The methods are implemented in the R package cheem, available\non CRAN.\n","authors":["Nicholas Spyrison","Dianne Cook","Przemyslaw Biecek"],"pdf_url":"https://arxiv.org/pdf/2205.05359v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10447v1","updated":"2024-01-19T01:30:16Z","published":"2024-01-19T01:30:16Z","title":"Investigating Training Strategies and Model Robustness of Low-Rank\n Adaptation for Language Modeling in Speech Recognition","summary":" The use of low-rank adaptation (LoRA) with frozen pretrained language models\n(PLMs) has become increasing popular as a mainstream, resource-efficient\nmodeling approach for memory-constrained hardware. In this study, we first\nexplore how to enhance model performance by introducing various LoRA training\nstrategies, achieving relative word error rate reductions of 3.50\\% on the\npublic Librispeech dataset and of 3.67\\% on an internal dataset in the\nmessaging domain. To further characterize the stability of LoRA-based\nsecond-pass speech recognition models, we examine robustness against input\nperturbations. These perturbations are rooted in homophone replacements and a\nnovel metric called N-best Perturbation-based Rescoring Robustness (NPRR), both\ndesigned to measure the relative degradation in the performance of rescoring\nmodels. Our experimental results indicate that while advanced variants of LoRA,\nsuch as dynamic rank-allocated LoRA, lead to performance degradation in\n$1$-best perturbation, they alleviate the degradation in $N$-best perturbation.\nThis finding is in comparison to fully-tuned models and vanilla LoRA tuning\nbaselines, suggesting that a comprehensive selection is needed when using\nLoRA-based adaptation for compute-cost savings and robust language modeling.\n","authors":["Yu Yu","Chao-Han Huck Yang","Tuan Dinh","Sungho Ryu","Jari Kolehmainen","Roger Ren","Denis Filimonov","Prashanth G. Shivakumar","Ankur Gandhe","Ariya Rastow","Jia Xu","Ivan Bulyko","Andreas Stolcke"],"pdf_url":"https://arxiv.org/pdf/2401.10447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04336v3","updated":"2024-01-19T01:30:04Z","published":"2024-01-09T03:29:40Z","title":"Deep Efficient Private Neighbor Generation for Subgraph Federated\n Learning","summary":" Behemoth graphs are often fragmented and separately stored by multiple data\nowners as distributed subgraphs in many realistic applications. Without harming\ndata privacy, it is natural to consider the subgraph federated learning\n(subgraph FL) scenario, where each local client holds a subgraph of the entire\nglobal graph, to obtain globally generalized graph mining models. To overcome\nthe unique challenge of incomplete information propagation on local subgraphs\ndue to missing cross-subgraph neighbors, previous works resort to the\naugmentation of local neighborhoods through the joint FL of missing neighbor\ngenerators and GNNs. Yet their technical designs have profound limitations\nregarding the utility, efficiency, and privacy goals of FL. In this work, we\npropose FedDEP to comprehensively tackle these challenges in subgraph FL.\nFedDEP consists of a series of novel technical designs: (1) Deep neighbor\ngeneration through leveraging the GNN embeddings of potential missing\nneighbors; (2) Efficient pseudo-FL for neighbor generation through embedding\nprototyping; and (3) Privacy protection through noise-less\nedge-local-differential-privacy. We analyze the correctness and efficiency of\nFedDEP, and provide theoretical guarantees on its privacy. 
Empirical results on\nfour real-world datasets justify the clear benefits of proposed techniques.\n","authors":["Ke Zhang","Lichao Sun","Bolin Ding","Siu Ming Yiu","Carl Yang"],"pdf_url":"https://arxiv.org/pdf/2401.04336v3.pdf","comment":"Accepted to SDM 2024"},{"id":"http://arxiv.org/abs/2401.10446v1","updated":"2024-01-19T01:29:27Z","published":"2024-01-19T01:29:27Z","title":"Large Language Models are Efficient Learners of Noise-Robust Speech\n Recognition","summary":" Recent advances in large language models (LLMs) have promoted generative\nerror correction (GER) for automatic speech recognition (ASR), which leverages\nthe rich linguistic knowledge and powerful reasoning ability of LLMs to improve\nrecognition results. The latest work proposes a GER benchmark with HyPoradise\ndataset to learn the mapping from ASR N-best hypotheses to ground-truth\ntranscription by efficient LLM finetuning, which shows great effectiveness but\nlacks specificity on noise-robust ASR. In this work, we extend the benchmark to\nnoisy conditions and investigate if we can teach LLMs to perform denoising for\nGER just like what robust ASR do}, where one solution is introducing noise\ninformation as a conditioner into LLM. However, directly incorporating noise\nembeddings from audio encoder could harm the LLM tuning due to cross-modality\ngap. To this end, we propose to extract a language-space noise embedding from\nthe N-best list to represent the noise conditions of source speech, which can\npromote the denoising process in GER. Furthermore, in order to enhance its\nrepresentation ability of audio noise, we design a knowledge distillation (KD)\napproach via mutual information estimation to distill the real noise\ninformation in audio embeddings to our language embedding. Experiments on\nvarious latest LLMs demonstrate our approach achieves a new breakthrough with\nup to 53.9% correction improvement in terms of word error rate while with\nlimited training data. Analysis shows that our language-space noise embedding\ncan well represent the noise conditions of source speech, under which\noff-the-shelf LLMs show strong ability of language-space denoising.\n","authors":["Yuchen Hu","Chen Chen","Chao-Han Huck Yang","Ruizhe Li","Chao Zhang","Pin-Yu Chen","EnSiong Chng"],"pdf_url":"https://arxiv.org/pdf/2401.10446v1.pdf","comment":"Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be\n open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license"},{"id":"http://arxiv.org/abs/2312.10401v2","updated":"2024-01-19T01:25:39Z","published":"2023-12-16T10:05:18Z","title":"Rethinking Dimensional Rationale in Graph Contrastive Learning from\n Causal Perspective","summary":" Graph contrastive learning is a general learning paradigm excelling at\ncapturing invariant information from diverse perturbations in graphs. Recent\nworks focus on exploring the structural rationale from graphs, thereby\nincreasing the discriminability of the invariant information. However, such\nmethods may incur in the mis-learning of graph models towards the\ninterpretability of graphs, and thus the learned noisy and task-agnostic\ninformation interferes with the prediction of graphs. To this end, with the\npurpose of exploring the intrinsic rationale of graphs, we accordingly propose\nto capture the dimensional rationale from graphs, which has not received\nsufficient attention in the literature. The conducted exploratory experiments\nattest to the feasibility of the aforementioned roadmap. 
To elucidate the\ninnate mechanism behind the performance improvement arising from the\ndimensional rationale, we rethink the dimensional rationale in graph\ncontrastive learning from a causal perspective and further formalize the\ncausality among the variables in the pre-training stage to build the\ncorresponding structural causal model. On the basis of the understanding of the\nstructural causal model, we propose the dimensional rationale-aware graph\ncontrastive learning approach, which introduces a learnable dimensional\nrationale acquiring network and a redundancy reduction constraint. The\nlearnable dimensional rationale acquiring network is updated by leveraging a\nbi-level meta-learning technique, and the redundancy reduction constraint\ndisentangles the redundant features through a decorrelation process during\nlearning. Empirically, compared with state-of-the-art methods, our method can\nyield significant performance boosts on various benchmarks with respect to\ndiscriminability and transferability. The code implementation of our method is\navailable at https://github.com/ByronJi/DRGCL.\n","authors":["Qirui Ji","Jiangmeng Li","Jie Hu","Rui Wang","Changwen Zheng","Fanjiang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.10401v2.pdf","comment":"Accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.10442v1","updated":"2024-01-19T01:11:44Z","published":"2024-01-19T01:11:44Z","title":"Path Choice Matters for Clear Attribution in Path Methods","summary":" Rigorousness and clarity are both essential for interpretations of DNNs to\nengender human trust. Path methods are commonly employed to generate rigorous\nattributions that satisfy three axioms. However, the meaning of attributions\nremains ambiguous due to distinct path choices. To address the ambiguity, we\nintroduce \\textbf{Concentration Principle}, which centrally allocates high\nattributions to indispensable features, thereby endowing aesthetic and\nsparsity. We then present \\textbf{SAMP}, a model-agnostic interpreter, which\nefficiently searches the near-optimal path from a pre-defined set of\nmanipulation paths. Moreover, we propose the infinitesimal constraint (IC) and\nmomentum strategy (MS) to improve the rigorousness and optimality.\nVisualizations show that SAMP can precisely reveal DNNs by pinpointing salient\nimage pixels. We also perform quantitative experiments and observe that our\nmethod significantly outperforms the counterparts. Code:\nhttps://github.com/zbr17/SAMP.\n","authors":["Borui Zhang","Wenzhao Zheng","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2401.10442v1.pdf","comment":"ICLR 2024 accepted"},{"id":"http://arxiv.org/abs/2210.02672v3","updated":"2024-01-19T00:57:05Z","published":"2022-10-06T04:30:59Z","title":"A Novel Maximum-Entropy-Driven Technique for Low-Rank Orthogonal\n Nonnegative Matrix Factorization with $\\ell_0$-Norm sparsity Constraint","summary":" In data-driven control and machine learning, a common requirement involves\nbreaking down large matrices into smaller, low-rank factors that possess\nspecific levels of sparsity. This paper introduces an innovative solution to\nthe orthogonal nonnegative matrix factorization (ONMF) problem. The objective\nis to approximate input data by using two low-rank nonnegative matrices,\nadhering to both orthogonality and $\\ell_0$-norm sparsity constraints. the\nproposed maximum-entropy-principle based framework ensures orthogonality and\nsparsity of features or the mixing matrix, while maintaining nonnegativity in\nboth. 
Additionally, the methodology offers a quantitative determination of the\n``true'' number of underlying features, a crucial hyperparameter for ONMF.\nExperimental evaluation on synthetic and a standard datasets highlights the\nmethod's superiority in terms of sparsity, orthogonality, and computational\nspeed compared to existing approaches. Notably, the proposed method achieves\ncomparable or improved reconstruction errors in line with the literature.\n","authors":["Salar Basiri","Srinivasa Salapaka"],"pdf_url":"https://arxiv.org/pdf/2210.02672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.00110v3","updated":"2024-01-19T00:35:35Z","published":"2023-12-30T01:24:25Z","title":"Diffusion Model with Perceptual Loss","summary":" Diffusion models trained with mean squared error loss tend to generate\nunrealistic samples. Current state-of-the-art models rely on classifier-free\nguidance to improve sample quality, yet its surprising effectiveness is not\nfully understood. In this paper, we show that the effectiveness of\nclassifier-free guidance partly originates from it being a form of implicit\nperceptual guidance. As a result, we can directly incorporate perceptual loss\nin diffusion training to improve sample quality. Since the score matching\nobjective used in diffusion training strongly resembles the denoising\nautoencoder objective used in unsupervised training of perceptual networks, the\ndiffusion model itself is a perceptual network and can be used to generate\nmeaningful perceptual loss. We propose a novel self-perceptual objective that\nresults in diffusion models capable of generating more realistic samples. For\nconditional generation, our method only improves sample quality without\nentanglement with the conditional input and therefore does not sacrifice sample\ndiversity. Our method can also improve sample quality for unconditional\ngeneration, which was not possible with classifier-free guidance before.\n","authors":["Shanchuan Lin","Xiao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.00110v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07988v3","updated":"2024-01-19T00:28:45Z","published":"2023-09-14T19:01:08Z","title":"Folding Attention: Memory and Power Optimization for On-Device\n Transformer-based Streaming Speech Recognition","summary":" Transformer-based models excel in speech recognition. Existing efforts to\noptimize Transformer inference, typically for long-context applications, center\non simplifying attention score calculations. However, streaming speech\nrecognition models usually process a limited number of tokens each time, making\nattention score calculation less of a bottleneck. Instead, the bottleneck lies\nin the linear projection layers of multi-head attention and feedforward\nnetworks, constituting a substantial portion of the model size and contributing\nsignificantly to computation, memory, and power usage.\n To address this bottleneck, we propose folding attention, a technique\ntargeting these linear layers, significantly reducing model size and improving\nmemory and power efficiency. Experiments on on-device Transformer-based\nstreaming speech recognition models show that folding attention reduces model\nsize (and corresponding memory consumption) by up to 24% and power consumption\nby up to 23%, all without compromising model accuracy or computation overhead.\n","authors":["Yang Li","Liangzhen Lai","Yuan Shangguan","Forrest N. 
Iandola","Zhaoheng Ni","Ernie Chang","Yangyang Shi","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.07988v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10432v1","updated":"2024-01-19T00:27:34Z","published":"2024-01-19T00:27:34Z","title":"A2Q+: Improving Accumulator-Aware Weight Quantization","summary":" Quantization techniques commonly reduce the inference costs of neural\nnetworks by restricting the precision of weights and activations. Recent\nstudies show that also reducing the precision of the accumulator can further\nimprove hardware efficiency at the risk of numerical overflow, which introduces\narithmetic errors that can degrade model accuracy. To avoid numerical overflow\nwhile maintaining accuracy, recent work proposed accumulator-aware quantization\n(A2Q), a quantization-aware training method that constrains model weights\nduring training to safely use a target accumulator bit width during inference.\nAlthough this shows promise, we demonstrate that A2Q relies on an overly\nrestrictive constraint and a sub-optimal weight initialization strategy that\neach introduce superfluous quantization error. To address these shortcomings,\nwe introduce: (1) an improved bound that alleviates accumulator constraints\nwithout compromising overflow avoidance; and (2) a new strategy for\ninitializing quantized weights from pre-trained floating-point checkpoints. We\ncombine these contributions with weight normalization to introduce A2Q+. We\nsupport our analysis with experiments that show A2Q+ significantly improves the\ntrade-off between accumulator bit width and model accuracy and characterize new\ntrade-offs that arise as a consequence of accumulator constraints.\n","authors":["Ian Colbert","Alessandro Pappalardo","Jakoba Petri-Koenig","Yaman Umuroglu"],"pdf_url":"https://arxiv.org/pdf/2401.10432v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.01409v4","updated":"2024-01-19T00:23:28Z","published":"2022-06-03T06:34:09Z","title":"Hybrid Parameter Search and Dynamic Model Selection for Mixed-Variable\n Bayesian Optimization","summary":" This paper presents a new type of hybrid model for Bayesian optimization (BO)\nadept at managing mixed variables, encompassing both quantitative (continuous\nand integer) and qualitative (categorical) types. Our proposed new hybrid\nmodels (named hybridM) merge the Monte Carlo Tree Search structure (MCTS) for\ncategorical variables with Gaussian Processes (GP) for continuous ones. hybridM\nleverages the upper confidence bound tree search (UCTS) for MCTS strategy,\nshowcasing the tree architecture's integration into Bayesian optimization. Our\ninnovations, including dynamic online kernel selection in the surrogate\nmodeling phase and a unique UCTS search strategy, position our hybrid models as\nan advancement in mixed-variable surrogate models. Numerical experiments\nunderscore the superiority of hybrid models, highlighting their potential in\nBayesian optimization.\n","authors":["Hengrui Luo","Younghyun Cho","James W. Demmel","Xiaoye S. Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2206.01409v4.pdf","comment":"33 pages, 8 Figures"},{"id":"http://arxiv.org/abs/2305.14402v3","updated":"2024-01-19T00:16:49Z","published":"2023-05-23T10:16:08Z","title":"Enhancing Speech Emotion Recognition Through Differentiable Architecture\n Search","summary":" Speech Emotion Recognition (SER) is a critical enabler of emotion-aware\ncommunication in human-computer interactions. 
Recent advancements in Deep\nLearning (DL) have substantially enhanced the performance of SER models through\nincreased model complexity. However, designing optimal DL architectures\nrequires prior experience and experimental evaluations. Encouragingly, Neural\nArchitecture Search (NAS) offers a promising avenue to determine an optimal DL\nmodel automatically. In particular, Differentiable Architecture Search (DARTS)\nis an efficient method of using NAS to search for optimised models. This paper\nproposes a DARTS-optimised joint CNN and LSTM architecture, to improve SER\nperformance, where the literature informs the selection of CNN and LSTM\ncoupling to offer improved performance. While DARTS has previously been applied\nto CNN and LSTM combinations, our approach introduces a novel mechanism,\nparticularly in selecting CNN operations using DARTS. In contrast to previous\nstudies, we refrain from imposing constraints on the order of the layers for\nthe CNN within the DARTS cell; instead, we allow DARTS to determine the optimal\nlayer order autonomously. Experimenting with the IEMOCAP and MSP-IMPROV\ndatasets, we demonstrate that our proposed methodology achieves significantly\nhigher SER accuracy than hand-engineering the CNN-LSTM configuration. It also\noutperforms the best-reported SER results achieved using DARTS on CNN-LSTM.\n","authors":["Thejan Rajapakshe","Rajib Rana","Sara Khalifa","Berrak Sisman","Björn Schuller"],"pdf_url":"https://arxiv.org/pdf/2305.14402v3.pdf","comment":"5 pages, 4 figures"}],"Multimedia":[{"id":"http://arxiv.org/abs/2401.10608v1","updated":"2024-01-19T10:37:27Z","published":"2024-01-19T10:37:27Z","title":"M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics\n Prediction from Histopathology Images","summary":" The advancement of Spatial Transcriptomics (ST) has facilitated the\nspatially-aware profiling of gene expressions based on histopathology images.\nAlthough ST data offers valuable insights into the micro-environment of tumors,\nits acquisition cost remains expensive. Therefore, directly predicting the ST\nexpressions from digital pathology images is desired. Current methods usually\nadopt existing regression backbones for this task, which ignore the inherent\nmulti-scale hierarchical data structure of digital pathology images. To address\nthis limit, we propose M2ORT, a many-to-one regression Transformer that can\naccommodate the hierarchical structure of the pathology images through a\ndecoupled multi-scale feature extractor. Different from traditional models that\nare trained with one-to-one image-label pairs, M2ORT accepts multiple pathology\nimages of different magnifications at a time to jointly predict the gene\nexpressions at their corresponding common ST spot, aiming at learning a\nmany-to-one relationship through training. We have tested M2ORT on three public\nST datasets and the experimental results show that M2ORT can achieve\nstate-of-the-art performance with fewer parameters and floating-point\noperations (FLOPs). 
The code is available at:\nhttps://github.com/Dootmaan/M2ORT/.\n","authors":["Hongyi Wang","Xiuju Du","Jing Liu","Shuyi Ouyang","Yen-Wei Chen","Lanfen Lin"],"pdf_url":"https://arxiv.org/pdf/2401.10608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10475v1","updated":"2024-01-19T03:54:58Z","published":"2024-01-19T03:54:58Z","title":"CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short\n Video Search Scenarios","summary":" Vision-Language Models pre-trained on large-scale image-text datasets have\nshown superior performance in downstream tasks such as image retrieval. Most of\nthe images for pre-training are presented in the form of open domain\ncommon-sense visual elements. Differently, video covers in short video search\nscenarios are presented as user-originated contents that provide important\nvisual summaries of videos. In addition, a portion of the video covers come\nwith manually designed cover texts that provide semantic complements. In order\nto fill in the gaps in short video cover data, we establish the first\nlarge-scale cover-text benchmark for Chinese short video search scenarios.\nSpecifically, we release two large-scale datasets CBVS-5M/10M to provide short\nvideo covers, and the manual fine-labeling dataset CBVS-20K to provide real\nuser queries, which serves as an image-text benchmark test in the Chinese short\nvideo search field. To integrate the semantics of cover text in the case of\nmodality missing, we propose UniCLIP where cover texts play a guiding role\nduring training, however are not relied upon by inference. Extensive evaluation\non CBVS-20K demonstrates the excellent performance of our proposal. UniCLIP has\nbeen deployed to Tencent's online video search systems with hundreds of\nmillions of visits and achieved significant gains. The complete dataset, code\nand checkpoints will be available upon release.\n","authors":["Xiangshuo Qiao","Xianxin Li","Xiaozhe Qu","Jie Zhang","Yang Liu","Yu Luo","Cihang Jin","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2401.10475v1.pdf","comment":null}]},"2024-01-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.06766v2","updated":"2024-01-22T18:55:35Z","published":"2024-01-12T18:58:26Z","title":"Mind Your Format: Towards Consistent Evaluation of In-Context Learning\n Improvements","summary":" Large language models demonstrate a remarkable capability for learning to\nsolve new tasks from a few examples. The prompt template, or the way the input\nexamples are formatted to obtain the prompt, is an important yet often\noverlooked aspect of in-context learning. In this work, we conduct a\ncomprehensive study of the template format's influence on the in-context\nlearning performance. We evaluate the impact of the prompt template across\nmodels (from 770M to 70B parameters) and 4 standard classification datasets. We\nshow that a poor choice of the template can reduce the performance of the\nstrongest models and inference methods to a random guess level. More\nimportantly, the best templates do not transfer between different setups and\neven between models of the same family. Our findings show that the currently\nprevalent approach to evaluation, which ignores template selection, may give\nmisleading results due to different templates in different works. As a first\nstep towards mitigating this issue, we propose Template Ensembles that\naggregate model predictions across several templates. 
This simple test-time\naugmentation boosts average performance while being robust to the choice of\nrandom set of templates.\n","authors":["Anton Voronov","Lena Wolf","Max Ryabinin"],"pdf_url":"https://arxiv.org/pdf/2401.06766v2.pdf","comment":"21 pages, 10 figures. Code:\n https://github.com/yandex-research/mind-your-format"},{"id":"http://arxiv.org/abs/2401.12208v1","updated":"2024-01-22T18:51:07Z","published":"2024-01-22T18:51:07Z","title":"CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation","summary":" Chest X-rays (CXRs) are the most frequently performed imaging test in\nclinical practice. Recent advances in the development of vision-language\nfoundation models (FMs) give rise to the possibility of performing automated\nCXR interpretation, which can assist physicians with clinical decision-making\nand improve patient outcomes. However, developing FMs that can accurately\ninterpret CXRs is challenging due to the (1) limited availability of\nlarge-scale vision-language datasets in the medical image domain, (2) lack of\nvision and language encoders that can capture the complexities of medical data,\nand (3) absence of evaluation frameworks for benchmarking the abilities of FMs\non CXR interpretation. In this work, we address these challenges by first\nintroducing \\emph{CheXinstruct} - a large-scale instruction-tuning dataset\ncurated from 28 publicly-available datasets. We then present \\emph{CheXagent} -\nan instruction-tuned FM capable of analyzing and summarizing CXRs. To build\nCheXagent, we design a clinical large language model (LLM) for parsing\nradiology reports, a vision encoder for representing CXR images, and a network\nto bridge the vision and language modalities. Finally, we introduce\n\\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs\nacross 8 clinically-relevant CXR interpretation tasks. Extensive quantitative\nevaluations and qualitative reviews with five expert radiologists demonstrate\nthat CheXagent outperforms previously-developed general- and medical-domain FMs\non CheXbench tasks. Furthermore, in an effort to improve model transparency, we\nperform a fairness evaluation across factors of sex, race and age to highlight\npotential performance disparities. Our project is at\n\\url{https://stanford-aimi.github.io/chexagent.html}.\n","authors":["Zhihong Chen","Maya Varma","Jean-Benoit Delbrouck","Magdalini Paschali","Louis Blankemeier","Dave Van Veen","Jeya Maria Jose Valanarasu","Alaa Youssef","Joseph Paul Cohen","Eduardo Pontes Reis","Emily B. Tsai","Andrew Johnston","Cameron Olsen","Tanishq Mathew Abraham","Sergios Gatidis","Akshay S. Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2401.12208v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12200v1","updated":"2024-01-22T18:39:40Z","published":"2024-01-22T18:39:40Z","title":"APT: Adaptive Pruning and Tuning Pretrained Language Models for\n Efficient Training and Inference","summary":" Fine-tuning and inference with large Language Models (LM) are generally known\nto be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces\ntraining memory by updating a small number of LM parameters but does not\nimprove inference efficiency. Structured pruning improves LM inference\nefficiency by removing consistent parameter blocks, yet often increases\ntraining memory and time. To improve both training and inference efficiency, we\nintroduce APT that adaptively prunes and tunes parameters for the LMs. 
At the\nearly stage of fine-tuning, APT dynamically adds salient tuning parameters for\nfast and accurate convergence while discarding unimportant parameters for\nefficiency. Compared to baselines, our experiments show that APT maintains up\nto 98% task performance when pruning RoBERTa and T5 models with 40% parameters\nleft while keeping 86.4% LLaMA models' performance with 70% parameters\nremained. Furthermore, APT speeds up LMs fine-tuning by up to 8x and reduces\nlarge LMs memory training footprint by up to 70%.\n","authors":["Bowen Zhao","Hannaneh Hajishirzi","Qingqing Cao"],"pdf_url":"https://arxiv.org/pdf/2401.12200v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.12192v1","updated":"2024-01-22T18:34:42Z","published":"2024-01-22T18:34:42Z","title":"Text Embedding Inversion Attacks on Multilingual Language Models","summary":" Representing textual information as real-numbered embeddings has become the\nnorm in NLP. Moreover, with the rise of public interest in large language\nmodels (LLMs), Embeddings as a Service (EaaS) has rapidly gained traction as a\nbusiness model. This is not without outstanding security risks, as previous\nresearch has demonstrated that sensitive data can be reconstructed from\nembeddings, even without knowledge of the underlying model that generated them.\nHowever, such work is limited by its sole focus on English, leaving all other\nlanguages vulnerable to attacks by malicious actors. %As many international and\nmultilingual companies leverage EaaS, there is an urgent need for research into\nmultilingual LLM security. To this end, this work investigates LLM security\nfrom the perspective of multilingual embedding inversion. Concretely, we define\nthe problem of black-box multilingual and cross-lingual inversion attacks, with\nspecial attention to a cross-domain scenario. Our findings reveal that\nmultilingual models are potentially more vulnerable to inversion attacks than\ntheir monolingual counterparts. This stems from the reduced data requirements\nfor achieving comparable inversion performance in settings where the underlying\nlanguage is not known a-priori. To our knowledge, this work is the first to\ndelve into multilinguality within the context of inversion attacks, and our\nfindings highlight the need for further investigation and enhanced defenses in\nthe area of NLP Security.\n","authors":["Yiyi Chen","Heather Lent","Johannes Bjerva"],"pdf_url":"https://arxiv.org/pdf/2401.12192v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2401.12187v1","updated":"2024-01-22T18:27:08Z","published":"2024-01-22T18:27:08Z","title":"WARM: On the Benefits of Weight Averaged Reward Models","summary":" Aligning large language models (LLMs) with human preferences through\nreinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit\nfailures in the reward model (RM) to achieve seemingly high rewards without\nmeeting the underlying objectives. We identify two primary challenges when\ndesigning RMs to mitigate reward hacking: distribution shifts during the RL\nprocess and inconsistencies in human preferences. As a solution, we propose\nWeight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then\naveraging them in the weight space. This strategy follows the observation that\nfine-tuned weights remain linearly mode connected when sharing the same\npre-training. 
By averaging weights, WARM improves efficiency compared to the\ntraditional ensembling of predictions, while improving reliability under\ndistribution shifts and robustness to preference inconsistencies. Our\nexperiments on summarization tasks, using best-of-N and RL methods, shows that\nWARM improves the overall quality and alignment of LLM predictions; for\nexample, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy\nRL fine-tuned with a single RM.\n","authors":["Alexandre Ramé","Nino Vieillard","Léonard Hussenot","Robert Dadashi","Geoffrey Cideron","Olivier Bachem","Johan Ferret"],"pdf_url":"https://arxiv.org/pdf/2401.12187v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.12181v1","updated":"2024-01-22T18:11:01Z","published":"2024-01-22T18:11:01Z","title":"Universal Neurons in GPT2 Language Models","summary":" A basic question within the emerging field of mechanistic interpretability is\nthe degree to which neural networks learn the same underlying mechanisms. In\nother words, are neural mechanisms universal across different models? In this\nwork, we study the universality of individual neurons across GPT2 models\ntrained from different initial random seeds, motivated by the hypothesis that\nuniversal neurons are likely to be interpretable. In particular, we compute\npairwise correlations of neuron activations over 100 million tokens for every\nneuron pair across five different seeds and find that 1-5\\% of neurons are\nuniversal, that is, pairs of neurons which consistently activate on the same\ninputs. We then study these universal neurons in detail, finding that they\nusually have clear interpretations and taxonomize them into a small number of\nneuron families. We conclude by studying patterns in neuron weights to\nestablish several universal functional roles of neurons in simple circuits:\ndeactivating attention heads, changing the entropy of the next token\ndistribution, and predicting the next token to (not) be within a particular\nset.\n","authors":["Wes Gurnee","Theo Horsley","Zifan Carl Guo","Tara Rezaei Kheirkhah","Qinyi Sun","Will Hathaway","Neel Nanda","Dimitris Bertsimas"],"pdf_url":"https://arxiv.org/pdf/2401.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12178v1","updated":"2024-01-22T18:09:52Z","published":"2024-01-22T18:09:52Z","title":"In-Context Learning for Extreme Multi-Label Classification","summary":" Multi-label classification problems with thousands of classes are hard to\nsolve with in-context learning alone, as language models (LMs) might lack prior\nknowledge about the precise classes or how to assign them, and it is generally\ninfeasible to demonstrate every class in a prompt. We propose a general\nprogram, $\\texttt{Infer--Retrieve--Rank}$, that defines multi-step interactions\nbetween LMs and retrievers to efficiently tackle such problems. We implement\nthis program using the $\\texttt{DSPy}$ programming model, which specifies\nin-context systems in a declarative manner, and use $\\texttt{DSPy}$ optimizers\nto tune it towards specific datasets by bootstrapping only tens of few-shot\nexamples. Our primary extreme classification program, optimized separately for\neach task, attains state-of-the-art results across three benchmarks (HOUSE,\nTECH, TECHWOLF). We apply the same program to a benchmark with vastly different\ncharacteristics and attain competitive performance as well (BioDEX). 
Unlike\nprior work, our proposed solution requires no finetuning, is easily applicable\nto new tasks, alleviates prompt engineering, and requires only tens of labeled\nexamples. Our code is public at https://github.com/KarelDO/xmc.dspy.\n","authors":["Karel D'Oosterlinck","Omar Khattab","François Remy","Thomas Demeester","Chris Develder","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2401.12178v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12143v1","updated":"2024-01-22T17:26:55Z","published":"2024-01-22T17:26:55Z","title":"Anisotropy Is Inherent to Self-Attention in Transformers","summary":" The representation degeneration problem is a phenomenon that is widely\nobserved among self-supervised learning methods based on Transformers. In NLP,\nit takes the form of anisotropy, a singular property of hidden representations\nwhich makes them unexpectedly close to each other in terms of angular distance\n(cosine-similarity). Some recent works tend to show that anisotropy is a\nconsequence of optimizing the cross-entropy loss on long-tailed distributions\nof tokens. We show in this paper that anisotropy can also be observed\nempirically in language models with specific objectives that should not suffer\ndirectly from the same consequences. We also show that the anisotropy problem\nextends to Transformers trained on other modalities. Our observations suggest\nthat anisotropy is actually inherent to Transformers-based models.\n","authors":["Nathan Godey","Éric de la Clergerie","Benoît Sagot"],"pdf_url":"https://arxiv.org/pdf/2401.12143v1.pdf","comment":"Proceedings of EACL 2024. Previously presented at ACL-SRW 2023\n (arXiv:2306.07656). 
arXiv admin note: substantial text overlap with\n arXiv:2306.07656"},{"id":"http://arxiv.org/abs/2401.10491v2","updated":"2024-01-22T17:16:37Z","published":"2024-01-19T05:02:46Z","title":"Knowledge Fusion of Large Language Models","summary":" While training large language models (LLMs) from scratch can generate models\nwith distinct functionalities and strengths, it comes at significant costs and\nmay result in redundant capabilities. Alternatively, a cost-effective and\ncompelling approach is to merge existing pre-trained LLMs into a more potent\nmodel. However, due to the varying architectures of these LLMs, directly\nblending their weights is impractical. In this paper, we introduce the notion\nof knowledge fusion for LLMs, aimed at combining the capabilities of existing\nLLMs and transferring them into a single LLM. By leveraging the generative\ndistributions of source LLMs, we externalize their collective knowledge and\nunique strengths, thereby potentially elevating the capabilities of the target\nmodel beyond those of any individual source LLM. We validate our approach using\nthree popular LLMs with different architectures--Llama-2, MPT, and\nOpenLLaMA--across various benchmarks and tasks. Our findings confirm that the\nfusion of LLMs can improve the performance of the target model across a range\nof capabilities such as reasoning, commonsense, and code generation. Our code,\nmodel weights, and data are public at\n\\url{https://github.com/fanqiwan/FuseLLM}.\n","authors":["Fanqi Wan","Xinting Huang","Deng Cai","Xiaojun Quan","Wei Bi","Shuming Shi"],"pdf_url":"https://arxiv.org/pdf/2401.10491v2.pdf","comment":"Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2304.14317v2","updated":"2024-01-22T17:06:50Z","published":"2023-04-27T16:38:17Z","title":"ICE-Score: Instructing Large Language Models to Evaluate Code","summary":" Recent advancements in the field of natural language generation have\nfacilitated the use of large language models to assess the quality of generated\ntext. Although these models have shown promising results in tasks such as\nmachine translation and summarization, their applicability in code intelligence\ntasks remains limited without human involvement. The complexity of programming\nconcepts required for such tasks makes it difficult to develop evaluation\nmetrics that align with human judgment. Token-matching-based metrics, such as\nBLEU, have demonstrated weak correlations with human practitioners in code\nintelligence tasks. Moreover, utilizing human-written test suites to evaluate\nfunctional correctness can be challenging in domains with low resources. To\novercome these obstacles, we propose \\texttt{ICE-Score}, a new evaluation\nmetric via instructing large language models (LLMs) for code assessments. Our\nmetric addresses the limitations of existing approaches by achieving superior\ncorrelations with functional correctness and human preferences, without the\nneed for test oracles or references. We evaluate the efficacy of our metric on\ntwo different aspects (\\textit{human preference} and \\textit{execution\nsuccess}) and four programming languages. Our results demonstrate that our\nmetric surpasses state-of-the-art metrics for code generation, delivering high\nlevels of accuracy and consistency across various programming languages and\ntasks. 
We also make our evaluation metric and datasets available to the\npublic\\footnote{\\url{https://github.com/terryyz/ice-score}}, encouraging\nfurther research in evaluating code intelligence tasks.\n","authors":["Terry Yue Zhuo"],"pdf_url":"https://arxiv.org/pdf/2304.14317v2.pdf","comment":"Accepted to Findings of EACL 2024"},{"id":"http://arxiv.org/abs/2401.12117v1","updated":"2024-01-22T16:57:05Z","published":"2024-01-22T16:57:05Z","title":"The Curious Case of Nonverbal Abstract Reasoning with Multi-Modal Large\n Language Models","summary":" While large language models (LLMs) are still being adopted to new domains and\nutilized in novel applications, we are experiencing an influx of the new\ngeneration of foundation models, namely multi-modal large language models\n(MLLMs). These models integrate verbal and visual information, opening new\npossibilities to demonstrate more complex reasoning abilities at the\nintersection of the two modalities. However, despite the revolutionizing\nprospect of MLLMs, our understanding of their reasoning abilities is limited.\nIn this study, we assess the nonverbal abstract reasoning abilities of\nopen-source and closed-source MLLMs using variations of Raven's Progressive\nMatrices. Our experiments expose the difficulty of solving such problems while\nshowcasing the immense gap between open-source and closed-source models. We\nalso reveal critical shortcomings with individual visual and textual modules,\nsubjecting the models to low-performance ceilings. Finally, to improve MLLMs'\nperformance, we experiment with various methods, such as Chain-of-Thought\nprompting, resulting in a significant (up to 100%) boost in performance.\n","authors":["Kian Ahrabian","Zhivar Sourati","Kexuan Sun","Jiarui Zhang","Yifan Jiang","Fred Morstatter","Jay Pujara"],"pdf_url":"https://arxiv.org/pdf/2401.12117v1.pdf","comment":"Code and datasets are available at\n https://github.com/kahrabian/mllm-nvar"},{"id":"http://arxiv.org/abs/2401.12097v1","updated":"2024-01-22T16:35:00Z","published":"2024-01-22T16:35:00Z","title":"An Empirical Analysis of In-context Learning Abilities of LLMs for MT","summary":" In-context learning (ICL) has consistently demonstrated superior performance\nover zero-shot performance in large language models (LLMs). However, the\nunderstanding of the dynamics of ICL and the aspects that influence downstream\nperformance remains limited, especially for natural language generation (NLG)\ntasks. This work aims to address this gap by investigating the ICL capabilities\nof LLMs and studying the impact of different aspects of the in-context\ndemonstrations for the task of machine translation (MT). Our preliminary\ninvestigations aim to discern whether in-context learning (ICL) is\npredominantly influenced by demonstrations or instructions by applying diverse\nperturbations to in-context demonstrations while preserving the task\ninstruction. We observe varying behavior to perturbed examples across different\nmodel families, notably with BLOOM-7B derivatives being severely influenced by\nnoise, whereas Llama 2 derivatives not only exhibit robustness but also tend to\nshow enhancements over the clean baseline when subject to perturbed\ndemonstrations. This suggests that the robustness of ICL may be governed by\nseveral factors, including the type of noise, perturbation direction (source or\ntarget), the extent of pretraining of the specific model, and fine-tuning for\ndownstream tasks if applicable. 
Further investigation is warranted to develop a\ncomprehensive understanding of these factors in future research.\n","authors":["Pranjal A. Chitale","Jay Gala","Varun Gumma","Mitesh M. Khapra","Raj Dabre"],"pdf_url":"https://arxiv.org/pdf/2401.12097v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.12088v1","updated":"2024-01-22T16:25:47Z","published":"2024-01-22T16:25:47Z","title":"Unsupervised Learning of Graph from Recipes","summary":" Cooking recipes are one of the most readily available kinds of procedural\ntext. They consist of natural language instructions that can be challenging to\ninterpret. In this paper, we propose a model to identify relevant information\nfrom recipes and generate a graph to represent the sequence of actions in the\nrecipe. In contrast with other approaches, we use an unsupervised approach. We\niteratively learn the graph structure and the parameters of a $\\mathsf{GNN}$\nencoding the texts (text-to-graph) one sequence at a time while providing the\nsupervision by decoding the graph into text (graph-to-text) and comparing the\ngenerated text to the input. We evaluate the approach by comparing the\nidentified entities with annotated datasets, comparing the difference between\nthe input and output texts, and comparing our generated graphs with those\ngenerated by state of the art methods.\n","authors":["Aissatou Diallo","Antonis Bikakis","Luke Dickens","Anthony Hunter","Rob Miller"],"pdf_url":"https://arxiv.org/pdf/2401.12088v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12087v1","updated":"2024-01-22T16:25:27Z","published":"2024-01-22T16:25:27Z","title":"Revisiting Demonstration Selection Strategies in In-Context Learning","summary":" Large language models (LLMs) have shown an impressive ability to perform a\nwide range of tasks using in-context learning (ICL), where a few examples are\nused to describe a task to the model. However, the performance of ICL varies\nsignificantly with the choice of demonstrations, and it is still unclear why\nthis happens or what factors will influence its choice. In this work, we first\nrevisit the factors contributing to this variance from both data and model\naspects, and find that the choice of demonstration is both data- and\nmodel-dependent. We further proposed a data- and model-dependent demonstration\nselection method, \\textbf{TopK + ConE}, based on the assumption that\n\\textit{the performance of a demonstration positively correlates with its\ncontribution to the model's understanding of the test samples}, resulting in a\nsimple and effective recipe for ICL. Empirically, our method yields consistent\nimprovements in both language understanding and generation tasks with different\nmodel scales. Further analyses confirm that, besides the generality and\nstability under different circumstances, our method provides a unified\nexplanation for the effectiveness of previous methods. Code will be released.\n","authors":["Keqin Peng","Liang Ding","Yancheng Yuan","Xuebo Liu","Min Zhang","Yuanxin Ouyang","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2401.12087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12086v1","updated":"2024-01-22T16:24:43Z","published":"2024-01-22T16:24:43Z","title":"West-of-N: Synthetic Preference Generation for Improved Reward Modeling","summary":" The success of reinforcement learning from human feedback (RLHF) in language\nmodel alignment is strongly dependent on the quality of the underlying reward\nmodel. 
In this paper, we present a novel approach to improve reward model\nquality by generating synthetic preference data, thereby augmenting the\ntraining dataset with on-policy, high-quality preference pairs. Motivated by\nthe promising results of Best-of-N sampling strategies in language model\ntraining, we extend their application to reward model training. This results in\na self-training strategy to generate preference pairs by selecting the best and\nworst candidates in a pool of responses to a given query. Empirically, we find\nthat this approach improves the performance of any reward model, with an effect\ncomparable to the addition of a similar quantity of human preference data. This\nwork opens up new avenues of research for improving RLHF for language model\nalignment, by offering synthetic preference generation as a solution to reward\nmodeling challenges.\n","authors":["Alizée Pace","Jonathan Mallinson","Eric Malmi","Sebastian Krause","Aliaksei Severyn"],"pdf_url":"https://arxiv.org/pdf/2401.12086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12078v1","updated":"2024-01-22T16:20:14Z","published":"2024-01-22T16:20:14Z","title":"Temporal Blind Spots in Large Language Models","summary":" Large language models (LLMs) have recently gained significant attention due\nto their unparalleled ability to perform various natural language processing\ntasks. These models, benefiting from their advanced natural language\nunderstanding capabilities, have demonstrated impressive zero-shot performance.\nHowever, the pre-training data utilized in LLMs is often confined to a specific\ncorpus, resulting in inherent freshness and temporal scope limitations.\nConsequently, this raises concerns regarding the effectiveness of LLMs for\ntasks involving temporal intents. In this study, we aim to investigate the\nunderlying limitations of general-purpose LLMs when deployed for tasks that\nrequire a temporal understanding. We pay particular attention to handling\nfactual temporal knowledge through three popular temporal QA datasets.\nSpecifically, we observe low performance on detailed questions about the past\nand, surprisingly, for rather new information. In manual and automatic testing,\nwe find multiple temporal errors and characterize the conditions under which QA\nperformance deteriorates. Our analysis contributes to understanding LLM\nlimitations and offers valuable insights into developing future models that can\nbetter cater to the demands of temporally-oriented tasks. The code is\navailable\\footnote{https://github.com/jwallat/temporalblindspots}.\n","authors":["Jonas Wallat","Adam Jatowt","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2401.12078v1.pdf","comment":"accepted at WSDM'24"},{"id":"http://arxiv.org/abs/2401.12072v1","updated":"2024-01-22T16:13:45Z","published":"2024-01-22T16:13:45Z","title":"Cross-lingual Transfer Learning for Javanese Dependency Parsing","summary":" While structure learning achieves remarkable performance in high-resource\nlanguages, the situation differs for under-represented languages due to the\nscarcity of annotated data. This study focuses on assessing the efficacy of\ntransfer learning in enhancing dependency parsing for Javanese, a language\nspoken by 80 million individuals but characterized by limited representation in\nnatural language processing. We utilized the Universal Dependencies dataset\nconsisting of dependency treebanks from more than 100 languages, including\nJavanese. 
We propose two learning strategies to train the model: transfer\nlearning (TL) and hierarchical transfer learning (HTL). While TL only uses a\nsource language to pre-train the model, the HTL method uses a source language\nand an intermediate language in the learning process. The results show that our\nbest model uses the HTL method, which improves performance with an increase of\n10% for both UAS and LAS evaluations compared to the baseline model.\n","authors":["Fadli Aulawi Al Ghiffari","Ika Alfina","Kurniawati Azizah"],"pdf_url":"https://arxiv.org/pdf/2401.12072v1.pdf","comment":"Accepted at IJCNLP-AACL 2023 SRW"},{"id":"http://arxiv.org/abs/2401.12070v1","updated":"2024-01-22T16:09:47Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v1.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2311.14212v3","updated":"2024-01-22T15:05:30Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations. We introduce the term annotation sensitivity to refer to the\nimpact of annotation data collection methods on the annotations themselves and\non downstream model performance and predictions. We collect annotations of hate\nspeech and offensive language in five experimental conditions of an annotation\ninstrument, randomly assigning annotators to conditions. We then fine-tune BERT\nmodels on each of the five resulting datasets and evaluate model performance on\na holdout portion of each condition. We find considerable differences between\nthe conditions for 1) the share of hate speech/offensive language annotations,\n2) model performance, 3) model predictions, and 4) model learning curves. Our\nresults emphasize the crucial role played by the annotation instrument which\nhas received little attention in the machine learning literature. 
We call for\nadditional research into how and why the instrument impacts the annotations to\ninform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v3.pdf","comment":"EMNLP 2023 Findings:\n https://aclanthology.org/2023.findings-emnlp.992/"},{"id":"http://arxiv.org/abs/2306.00824v2","updated":"2024-01-22T14:57:47Z","published":"2023-06-01T15:46:36Z","title":"Zero and Few-shot Semantic Parsing with Ambiguous Inputs","summary":" Despite the frequent challenges posed by ambiguity when representing meaning\nvia natural language, it is often ignored or deliberately removed in tasks\nmapping language to formally-designed representations, which generally assume a\none-to-one mapping between linguistic and formal representations. We attempt to\naddress this shortcoming by introducing AmP, a framework, dataset, and\nchallenge for translating ambiguous natural language to formal representations\nlike logic and code. We define templates and generate data for five\nwell-documented linguistic ambiguities. Using AmP, we investigate how several\nfew-shot text-to-code systems handle ambiguity, introducing three new metrics.\nWe find that large pre-trained models perform poorly at capturing the\ndistribution of possible meanings without deliberate instruction. However,\nmodels are able to capture the distribution well when ambiguity is attested in\ntheir inputs. These results motivate a call for including ambiguity explicitly\nin datasets and promote considering the distribution of possible outputs when\nevaluating systems. Data and code: https://github.com/esteng/ambiguous_parsing\n","authors":["Elias Stengel-Eskin","Kyle Rawlins","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2306.00824v2.pdf","comment":"ICLR 2024 Camera Ready"},{"id":"http://arxiv.org/abs/2401.12005v1","updated":"2024-01-22T14:53:59Z","published":"2024-01-22T14:53:59Z","title":"ALMs: Authorial Language Models for Authorship Attribution","summary":" In this paper, we introduce an authorship attribution method called Authorial\nLanguage Models (ALMs) that involves identifying the most likely author of a\nquestioned document based on the perplexity of the questioned document\ncalculated for a set of causal language models fine-tuned on the writings of a\nset of candidate author. We benchmarked ALMs against state-of-art-systems using\nthe CCAT50 dataset and the Blogs50 datasets. We find that ALMs achieves a\nmacro-average accuracy score of 83.6% on Blogs50, outperforming all other\nmethods, and 74.9% on CCAT50, matching the performance of the best method. To\nassess the performance of ALMs on shorter texts, we also conducted text\nablation testing. We found that to reach a macro-average accuracy of 70%, ALMs\nneeds 40 tokens on Blogs50 and 400 tokens on CCAT50, while to reach 60% ALMs\nrequires 20 tokens on Blogs50 and 70 tokens on CCAT50.\n","authors":["Weihang Huang","Akira Murakami","Jack Grieve"],"pdf_url":"https://arxiv.org/pdf/2401.12005v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02118v2","updated":"2024-01-22T14:41:43Z","published":"2023-10-03T14:59:35Z","title":"TWIZ-v2: The Wizard of Multimodal Conversational-Stimulus","summary":" In this report, we describe the vision, challenges, and scientific\ncontributions of the Task Wizard team, TWIZ, in the Alexa Prize TaskBot\nChallenge 2022. 
Our vision, is to build TWIZ bot as an helpful, multimodal,\nknowledgeable, and engaging assistant that can guide users towards the\nsuccessful completion of complex manual tasks. To achieve this, we focus our\nefforts on three main research questions: (1) Humanly-Shaped Conversations, by\nproviding information in a knowledgeable way; (2) Multimodal Stimulus, making\nuse of various modalities including voice, images, and videos; and (3)\nZero-shot Conversational Flows, to improve the robustness of the interaction to\nunseen scenarios. TWIZ is an assistant capable of supporting a wide range of\ntasks, with several innovative features such as creative cooking, video\nnavigation through voice, and the robust TWIZ-LLM, a Large Language Model\ntrained for dialoguing about complex manual tasks. Given ratings and feedback\nprovided by users, we observed that TWIZ bot is an effective and robust system,\ncapable of guiding users through tasks while providing several multimodal\nstimuli.\n","authors":["Rafael Ferreira","Diogo Tavares","Diogo Silva","Rodrigo Valério","João Bordalo","Inês Simões","Vasco Ramos","David Semedo","João Magalhães"],"pdf_url":"https://arxiv.org/pdf/2310.02118v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11972v1","updated":"2024-01-22T14:24:03Z","published":"2024-01-22T14:24:03Z","title":"Synergizing Machine Learning & Symbolic Methods: A Survey on Hybrid\n Approaches to Natural Language Processing","summary":" The advancement of machine learning and symbolic approaches have underscored\ntheir strengths and weaknesses in Natural Language Processing (NLP). While\nmachine learning approaches are powerful in identifying patterns in data, they\noften fall short in learning commonsense and the factual knowledge required for\nthe NLP tasks. Meanwhile, the symbolic methods excel in representing\nknowledge-rich data. However, they struggle to adapt dynamic data and\ngeneralize the knowledge. Bridging these two paradigms through hybrid\napproaches enables the alleviation of weaknesses in both while preserving their\nstrengths. Recent studies extol the virtues of this union, showcasing promising\nresults in a wide range of NLP tasks. In this paper, we present an overview of\nhybrid approaches used for NLP. Specifically, we delve into the\nstate-of-the-art hybrid approaches used for a broad spectrum of NLP tasks\nrequiring natural language understanding, generation, and reasoning.\nFurthermore, we discuss the existing resources available for hybrid approaches\nfor NLP along with the challenges, offering a roadmap for future directions.\n","authors":["Rrubaa Panchendrarajan","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.11972v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11969v1","updated":"2024-01-22T14:17:03Z","published":"2024-01-22T14:17:03Z","title":"Claim Detection for Automated Fact-checking: A Survey on Monolingual,\n Multilingual and Cross-Lingual Research","summary":" Automated fact-checking has drawn considerable attention over the past few\ndecades due to the increase in the diffusion of misinformation on online\nplatforms. This is often carried out as a sequence of tasks comprising (i) the\ndetection of sentences circulating in online platforms which constitute claims\nneeding verification, followed by (ii) the verification process of those\nclaims. This survey focuses on the former, by discussing existing efforts\ntowards detecting claims needing fact-checking, with a particular focus on\nmultilingual data and methods. 
This is a challenging and fertile direction\nwhere existing methods are yet far from matching human performance due to the\nprofoundly challenging nature of the issue. Especially, the dissemination of\ninformation across multiple social platforms, articulated in multiple languages\nand modalities demands more generalized solutions for combating misinformation.\nFocusing on multilingual misinformation, we present a comprehensive survey of\nexisting multilingual claim detection research. We present state-of-the-art\nmultilingual claim detection research categorized into three key factors of the\nproblem, verifiability, priority, and similarity. Further, we present a\ndetailed overview of the existing multilingual datasets along with the\nchallenges and suggest possible future advancements.\n","authors":["Rrubaa Panchendrarajan","Arkaitz Zubiaga"],"pdf_url":"https://arxiv.org/pdf/2401.11969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14578v2","updated":"2024-01-22T14:13:51Z","published":"2023-05-23T23:31:24Z","title":"Connecting the Dots: What Graph-Based Text Representations Work Best for\n Text Classification Using Graph Neural Networks?","summary":" Given the success of Graph Neural Networks (GNNs) for structure-aware machine\nlearning, many studies have explored their use for text classification, but\nmostly in specific domains with limited data characteristics. Moreover, some\nstrategies prior to GNNs relied on graph mining and classical machine learning,\nmaking it difficult to assess their effectiveness in modern settings. This work\nextensively investigates graph representation methods for text classification,\nidentifying practical implications and open challenges. We compare different\ngraph construction schemes using a variety of GNN architectures and setups\nacross five datasets, encompassing short and long documents as well as\nunbalanced scenarios in diverse domains. Two Transformer-based large language\nmodels are also included to complement the study. The results show that i)\nalthough the effectiveness of graphs depends on the textual input features and\ndomain, simple graph constructions perform better the longer the documents are,\nii) graph representations are especially beneficial for longer documents,\noutperforming Transformer-based models, iii) graph methods are particularly\nefficient at solving the task.\n","authors":["Margarita Bugueño","Gerard de Melo"],"pdf_url":"https://arxiv.org/pdf/2305.14578v2.pdf","comment":"Accepted to Findings of the Association for Computational\n Linguistics: EMNLP 2023 (Long Paper). 17 pages, 2 figures, 15 tables. The\n Appendix starts on page 12"},{"id":"http://arxiv.org/abs/2310.01386v2","updated":"2024-01-22T13:58:50Z","published":"2023-10-02T17:46:09Z","title":"Who is ChatGPT? Benchmarking LLMs' Psychological Portrayal Using\n PsychoBench","summary":" Large Language Models (LLMs) have recently showcased their remarkable\ncapacities, not only in natural language processing tasks but also across\ndiverse domains such as clinical medicine, legal consultation, and education.\nLLMs become more than mere applications, evolving into assistants capable of\naddressing diverse user requests. This narrows the distinction between human\nbeings and artificial intelligence agents, raising intriguing questions\nregarding the potential manifestation of personalities, temperaments, and\nemotions within LLMs. In this paper, we propose a framework, PsychoBench, for\nevaluating diverse psychological aspects of LLMs. 
Comprising thirteen scales\ncommonly used in clinical psychology, PsychoBench further classifies these\nscales into four distinct categories: personality traits, interpersonal\nrelationships, motivational tests, and emotional abilities. Our study examines\nfive popular models, namely text-davinci-003, gpt-3.5-turbo, gpt-4, LLaMA-2-7b,\nand LLaMA-2-13b. Additionally, we employ a jailbreak approach to bypass the\nsafety alignment protocols and test the intrinsic natures of LLMs. We have made\nPsychoBench openly accessible via https://github.com/CUHK-ARISE/PsychoBench.\n","authors":["Jen-tse Huang","Wenxuan Wang","Eric John Li","Man Ho Lam","Shujie Ren","Youliang Yuan","Wenxiang Jiao","Zhaopeng Tu","Michael R. Lyu"],"pdf_url":"https://arxiv.org/pdf/2310.01386v2.pdf","comment":"Accepted for ICLR 2024 Oral Presentation. 15 pages (main text) and 5\n pages (appendix)"},{"id":"http://arxiv.org/abs/2401.11944v1","updated":"2024-01-22T13:34:34Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU.\n CMMMU includes 12k manually collected multimodal questions from college\nexams, quizzes, and textbooks, covering six core disciplines: Art & Design,\nBusiness, Science, Health & Medicine, Humanities & Social Science, and Tech &\nEngineering, like its companion, MMMU. These questions span 30 subjects and\ncomprise 39 highly heterogeneous image types, such as charts, diagrams, maps,\ntables, music sheets, and chemical structures.\n CMMMU focuses on complex perception and reasoning with domain-specific\nknowledge in the Chinese context. We evaluate 11 open-source LLMs and one\nproprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%,\nindicating a large space for improvement. CMMMU will boost the community to\nbuild the next-generation LMMs towards expert artificial intelligence and\npromote the democratization of LMMs by providing diverse language contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Wenhu Chen","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. 
We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11911v1","updated":"2024-01-22T12:54:04Z","published":"2024-01-22T12:54:04Z","title":"Blinded by Generated Contexts: How Language Models Merge Generated and\n Retrieved Contexts for Open-Domain QA?","summary":" While auxiliary information has become a key to enhance Large Language Models\n(LLMs), relatively little is known about how well LLMs merge these contexts,\nspecifically generated and retrieved. To study this, we formulate a task\nspecifically designed to identify whether the answers, derived from the\nintegration of generated and retrieved contexts, are attributed to either\ngenerated or retrieved contexts. To support this task, we develop a methodology\nto construct datasets with conflicting contexts, where each question is paired\nwith both generated and retrieved contexts, yet only one of them contains the\ncorrect answer. Our experiments reveal a significant bias in LLMs towards\ngenerated contexts, as evidenced across state-of-the-art open (Llama2-7b/13b)\nand closed (GPT 3.5/4) systems. We further identify two key factors\ncontributing to this bias: i) Contexts generated by LLMs typically show greater\nsimilarity to the questions, increasing their likelihood of selection; ii) The\nsegmentation process used in retrieved contexts disrupts their completeness,\nthereby hindering their full utilization in LLMs. Our analysis enhances the\nunderstanding of how LLMs merge diverse contexts, offering valuable insights\nfor advancing current augmentation methods for LLMs.\n","authors":["Hexiang Tan","Fei Sun","Wanli Yang","Yuanzhuo Wang","Qi Cao","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2401.11911v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10337v2","updated":"2024-01-22T12:33:43Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. 
To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v2.pdf","comment":"accepted at EACL 2024, in ARR October 2023"},{"id":"http://arxiv.org/abs/2311.07989v4","updated":"2024-01-22T12:27:47Z","published":"2023-11-14T08:34:26Z","title":"Unifying the Perspectives of NLP and Software Engineering: A Survey on\n Language Models for Code","summary":" In this work we systematically review the recent advancements in code\nprocessing with language models, covering 50+ models, 30+ evaluation tasks,\n170+ datasets, and 700+ related works. We break down code processing models\ninto general language models represented by the GPT family and specialized\nmodels that are specifically pretrained on code, often with tailored\nobjectives. We discuss the relations and differences between these models, and\nhighlight the historical transition of code modeling from statistical models\nand RNNs to pretrained Transformers and LLMs, which is exactly the same course\nthat had been taken by NLP. We also discuss code-specific features such as AST,\nCFG, and unit tests, along with their application in training code language\nmodels, and identify key challenges and potential future directions in this\ndomain. We keep the survey open and updated on GitHub at\nhttps://github.com/codefuse-ai/Awesome-Code-LLM.\n","authors":["Ziyin Zhang","Chaoyu Chen","Bingchang Liu","Cong Liao","Zi Gong","Hang Yu","Jianguo Li","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07989v4.pdf","comment":"Repo is available at https://github.com/codefuse-ai/Awesome-Code-LLM.\n 8 figures, 10 tables, and 713 references"},{"id":"http://arxiv.org/abs/2401.11880v1","updated":"2024-01-22T12:11:55Z","published":"2024-01-22T12:11:55Z","title":"PsySafe: A Comprehensive Framework for Psychological-based Attack,\n Defense, and Evaluation of Multi-agent System Safety","summary":" Multi-agent systems, augmented with Large Language Models (LLMs), demonstrate\nsignificant capabilities for collective intelligence. However, the potential\nmisuse of this intelligence for malicious purposes presents significant risks.\nTo date, comprehensive research on the safety issues associated with\nmulti-agent systems remains limited. From the perspective of agent psychology,\nwe discover that the dark psychological states of agents can lead to severe\nsafety issues. To address these issues, we propose a comprehensive framework\ngrounded in agent psychology. In our framework, we focus on three aspects:\nidentifying how dark personality traits in agents might lead to risky\nbehaviors, designing defense strategies to mitigate these risks, and evaluating\nthe safety of multi-agent systems from both psychological and behavioral\nperspectives. Our experiments reveal several intriguing phenomena, such as the\ncollective dangerous behaviors among agents, agents' propensity for\nself-reflection when engaging in dangerous behavior, and the correlation\nbetween agents' psychological assessments and their dangerous behaviors. We\nanticipate that our framework and observations will provide valuable insights\nfor further research into the safety of multi-agent systems. 
We will make our\ndata and code publicly accessible at https:/github.com/AI4Good24/PsySafe.\n","authors":["Zaibin Zhang","Yongting Zhang","Lijun Li","Hongzhi Gao","Lijun Wang","Huchuan Lu","Feng Zhao","Yu Qiao","Jing Shao"],"pdf_url":"https://arxiv.org/pdf/2401.11880v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11864v1","updated":"2024-01-22T11:37:18Z","published":"2024-01-22T11:37:18Z","title":"Improving Small Language Models' Mathematical Reasoning via Mix Thoughts\n Distillation","summary":" This work addresses the challenge of democratizing advanced Large Language\nModels (LLMs) by compressing their mathematical reasoning capabilities into\nsub-billion parameter Small Language Models (SLMs) without compromising\nperformance. We introduce Equation-of-Thought Distillation (EoTD), a novel\ntechnique that encapsulates the reasoning process into equation-based\nrepresentations to construct an EoTD dataset for fine-tuning SLMs.\nAdditionally, we propose the Mix Thoughts Distillation (MTD) framework to\nenhance the reasoning performance of SLMs. This involves creating a reasoning\ndataset with multiple thought processes and using it for fine-tuning. Our\nexperimental findings demonstrate that EoTD significantly boosts the reasoning\nabilities of SLMs, while MTD enables these models to achieve state-of-the-art\nreasoning performance.\n","authors":["Xunyu Zhu","Jian Li","Yong Liu","Can Ma","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11864v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11852v1","updated":"2024-01-22T11:15:07Z","published":"2024-01-22T11:15:07Z","title":"The Right Model for the Job: An Evaluation of Legal Multi-Label\n Classification Baselines","summary":" Multi-Label Classification (MLC) is a common task in the legal domain, where\nmore than one label may be assigned to a legal document. A wide range of\nmethods can be applied, ranging from traditional ML approaches to the latest\nTransformer-based architectures. In this work, we perform an evaluation of\ndifferent MLC methods using two public legal datasets, POSTURE50K and\nEURLEX57K. By varying the amount of training data and the number of labels, we\nexplore the comparative advantage offered by different approaches in relation\nto the dataset properties. Our findings highlight DistilRoBERTa and LegalBERT\nas performing consistently well in legal MLC with reasonable computational\ndemands. T5 also demonstrates comparable performance while offering advantages\nas a generative model in the presence of changing label sets. Finally, we show\nthat the CrossEncoder exhibits potential for notable macro-F1 score\nimprovements, albeit with increased computational costs.\n","authors":["Martina Forster","Claudia Schulz","Prudhvi Nokku","Melicaalsadat Mirsafian","Jaykumar Kasundra","Stavroula Skylaki"],"pdf_url":"https://arxiv.org/pdf/2401.11852v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11839v1","updated":"2024-01-22T10:57:09Z","published":"2024-01-22T10:57:09Z","title":"AI for social science and social science of AI: A Survey","summary":" Recent advancements in artificial intelligence, particularly with the\nemergence of large language models (LLMs), have sparked a rethinking of\nartificial general intelligence possibilities. The increasing human-like\ncapabilities of AI are also attracting attention in social science research,\nleading to various studies exploring the combination of these two fields. 
In\nthis survey, we systematically categorize previous explorations in the\ncombination of AI and social science into two directions that share common\ntechnical approaches but differ in their research objectives. The first\ndirection is focused on AI for social science, where AI is utilized as a\npowerful tool to enhance various stages of social science research. The second\ndirection is the social science of AI, which examines AI agents as\nsocial entities with their human-like cognitive and linguistic capabilities. By\nconducting a thorough review, particularly on the substantial progress\nfacilitated by recent advancements in large language models, this paper\nintroduces a fresh perspective to reassess the relationship between AI and\nsocial science, provides a cohesive framework that allows researchers to\nunderstand the distinctions and connections between AI for social science and\nsocial science of AI, and also summarizes state-of-the-art experiment simulation\nplatforms to facilitate research in these two directions. We believe that as AI\ntechnology continues to advance and intelligent agents find increasing\napplications in our daily lives, the significance of the combination of AI and\nsocial science will become even more prominent.\n","authors":["Ruoxi Xu","Yingfei Sun","Mengjie Ren","Shiguang Guo","Ruotong Pan","Hongyu Lin","Le Sun","Xianpei Han"],"pdf_url":"https://arxiv.org/pdf/2401.11839v1.pdf","comment":"Accepted by Information Processing and Management (IP&M)"},{"id":"http://arxiv.org/abs/2401.11819v1","updated":"2024-01-22T10:30:11Z","published":"2024-01-22T10:30:11Z","title":"SuperCLUE-Math6: Graded Multi-Step Math Reasoning Benchmark for LLMs in\n Chinese","summary":" We introduce SuperCLUE-Math6(SC-Math6), a new benchmark dataset to evaluate\nthe mathematical reasoning abilities of Chinese language models. SC-Math6 is\ndesigned as an upgraded Chinese version of the GSM8K dataset with enhanced\ndifficulty, diversity, and application scope. It consists of over 2000\nmathematical word problems requiring multi-step reasoning and providing natural\nlanguage solutions. We propose an innovative scheme to quantify the reasoning\ncapability of large models based on performance over problems with different\nreasoning steps. Experiments on 12 representative Chinese models demonstrate a\nclear stratification of reasoning levels, with top models like GPT-4 showing\nsuperior performance. SC-Math6 fills the gap in Chinese mathematical reasoning\nbenchmarks and provides a comprehensive testbed to advance the intelligence of\nChinese language models.\n","authors":["Liang Xu","Hang Xue","Lei Zhu","Kangkang Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.11819v1.pdf","comment":"8 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11817v1","updated":"2024-01-22T10:26:14Z","published":"2024-01-22T10:26:14Z","title":"Hallucination is Inevitable: An Innate Limitation of Large Language\n Models","summary":" Hallucination has been widely recognized to be a significant drawback for\nlarge language models (LLMs). There have been many works that attempt to reduce\nthe extent of hallucination. These efforts have mostly been empirical so far,\nwhich cannot answer the fundamental question whether it can be completely\neliminated. In this paper, we formalize the problem and show that it is\nimpossible to eliminate hallucination in LLMs. 
Specifically, we define a formal\nworld where hallucination is defined as inconsistencies between a computable\nLLM and a computable ground truth function. By employing results from learning\ntheory, we show that LLMs cannot learn all of the computable functions and will\ntherefore always hallucinate. Since the formal world is a part of the real\nworld which is much more complicated, hallucinations are also inevitable for\nreal world LLMs. Furthermore, for real world LLMs constrained by provable time\ncomplexity, we describe the hallucination-prone tasks and empirically validate\nour claims. Finally, using the formal world framework, we discuss the possible\nmechanisms and efficacies of existing hallucination mitigators as well as the\npractical implications on the safe deployment of LLMs.\n","authors":["Ziwei Xu","Sanjay Jain","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2401.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04408v3","updated":"2024-01-22T07:40:02Z","published":"2023-07-10T08:15:40Z","title":"TIM: Teaching Large Language Models to Translate with Comparison","summary":" Open-sourced large language models (LLMs) have demonstrated remarkable\nefficacy in various tasks with instruction tuning. However, these models can\nsometimes struggle with tasks that require more specialized knowledge such as\ntranslation. One possible reason for such deficiency is that instruction tuning\naims to generate fluent and coherent text that continues from a given\ninstruction without being constrained by any task-specific requirements.\nMoreover, it can be more challenging for tuning smaller LLMs with lower-quality\ntraining data. To address this issue, we propose a novel framework using\nexamples in comparison to teach LLMs to learn translation. 
Our approach\ninvolves presenting the model with examples of correct and incorrect\ntranslations and using a preference loss to guide the model's learning. We\nevaluate our method on WMT2022 test sets and show that it outperforms existing\nmethods. Our findings offer a new perspective on fine-tuning LLMs for\ntranslation tasks and provide a promising solution for generating high-quality\ntranslations. Please refer to Github for more details:\nhttps://github.com/lemon0830/TIM.\n","authors":["Jiali Zeng","Fandong Meng","Yongjing Yin","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.04408v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2309.12247v2","updated":"2024-01-22T07:24:30Z","published":"2023-09-21T16:47:30Z","title":"Bad Actor, Good Advisor: Exploring the Role of Large Language Models in\n Fake News Detection","summary":" Detecting fake news requires both a delicate sense of diverse clues and a\nprofound understanding of the real-world background, which remains challenging\nfor detectors based on small language models (SLMs) due to their knowledge and\ncapability limitations. Recent advances in large language models (LLMs) have\nshown remarkable performance in various tasks, but whether and how LLMs could\nhelp with fake news detection remains underexplored. In this paper, we\ninvestigate the potential of LLMs in fake news detection. First, we conduct an\nempirical study and find that a sophisticated LLM such as GPT 3.5 could\ngenerally expose fake news and provide desirable multi-perspective rationales\nbut still underperforms the basic SLM, fine-tuned BERT. Our subsequent analysis\nattributes such a gap to the LLM's inability to select and integrate rationales\nproperly to conclude. Based on these findings, we propose that current LLMs may\nnot substitute fine-tuned SLMs in fake news detection but can be a good advisor\nfor SLMs by providing multi-perspective instructive rationales. To instantiate\nthis proposal, we design an adaptive rationale guidance network for fake news\ndetection (ARG), in which SLMs selectively acquire insights on news analysis\nfrom the LLMs' rationales. We further derive a rationale-free version of ARG by\ndistillation, namely ARG-D, which services cost-sensitive scenarios without\nquerying LLMs. Experiments on two real-world datasets demonstrate that ARG and\nARG-D outperform three types of baseline methods, including SLM-based,\nLLM-based, and combinations of small and large language models.\n","authors":["Beizhe Hu","Qiang Sheng","Juan Cao","Yuhui Shi","Yang Li","Danding Wang","Peng Qi"],"pdf_url":"https://arxiv.org/pdf/2309.12247v2.pdf","comment":"16 pages, 5 figures, and 9 tables. To appear at AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11725v1","updated":"2024-01-22T07:07:06Z","published":"2024-01-22T07:07:06Z","title":"Speak It Out: Solving Symbol-Related Problems with Symbol-to-Language\n Conversion for Language Models","summary":" Symbols (or more broadly, non-natural language textual representations) such\nas numerical sequences, molecular formulas, and table delimiters widely exist,\nplaying important roles in various tasks such as abstract reasoning, chemical\nproperty prediction, and table question answering. 
Despite the impressive\nnatural language comprehension capabilities of large language models (LLMs),\ntheir reasoning abilities for symbols remain inadequate, which could be attributed\nto the difference between symbol representations and general natural languages.\nWe propose symbol-to-language (S2L), a tuning-free method that enables large\nlanguage models to solve symbol-related problems with information expressed in\nnatural language. Specifically, S2L first converts the symbols involved to\nlanguage-based representations, which can be implemented by prompting LLMs or\nleveraging external tools; then these language-based representations are\nintegrated into the original problem via direct substitution or concatenation,\nserving as useful input information for LLMs. We evaluate the S2L method using\nboth API-based (GPT-4, ChatGPT) and open-source (OpenChat) models over eight\nsymbol-related tasks, ranging from symbol-only abstract reasoning to sentiment\nanalysis in social media. Experimental results show that S2L consistently leads\nto superior performance. For example, by employing S2L for GPT-4, there can be\naverage significant improvements of +21.9% and +9.5% for subtasks in 1D-ARC and\nDyck language, respectively. Codes and data are available at\nhttps://github.com/THUNLP-MT/symbol2language.\n","authors":["Yile Wang","Sijie Cheng","Zixin Sun","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11725v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09798v2","updated":"2024-01-22T06:22:55Z","published":"2024-01-18T08:36:54Z","title":"All in How You Ask for It: Simple Black-Box Method for Jailbreak Attacks","summary":" Large Language Models (LLMs) like ChatGPT face `jailbreak' challenges, where\nsafeguards are bypassed to produce ethically harmful prompts. This study\nproposes a simple black-box method to effectively generate jailbreak prompts,\novercoming the high complexity and computational costs associated with existing\nmethods. The proposed technique iteratively rewrites harmful prompts into\nnon-harmful expressions using the target LLM itself, based on the hypothesis\nthat LLMs can directly sample expressions that bypass safeguards. Demonstrated\nthrough experiments with ChatGPT (GPT-3.5 and GPT-4) and Gemini-Pro, this\nmethod achieved an attack success rate of over 80% within an average of 5\niterations and remained effective despite model updates. The generated\njailbreak prompts were naturally-worded and concise; moreover, they were\ndifficult-to-defend. These results indicate that creating effective jailbreak\nprompts is simpler than previously considered, suggesting that black-box\njailbreak attacks pose a more serious threat.\n","authors":["Kazuhiro Takemoto"],"pdf_url":"https://arxiv.org/pdf/2401.09798v2.pdf","comment":"12 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.11700v1","updated":"2024-01-22T05:46:11Z","published":"2024-01-22T05:46:11Z","title":"Keep Decoding Parallel with Effective Knowledge Distillation from\n Language Models to End-to-end Speech Recognisers","summary":" This study presents a novel approach for knowledge distillation (KD) from a\nBERT teacher model to an automatic speech recognition (ASR) model using\nintermediate layers. To distil the teacher's knowledge, we use an attention\ndecoder that learns from BERT's token probabilities. Our method shows that\nlanguage model (LM) information can be more effectively distilled into an ASR\nmodel using both the intermediate layers and the final layer. 
By using the\nintermediate layers as distillation target, we can more effectively distil LM\nknowledge into the lower network layers. Using our method, we achieve better\nrecognition accuracy than with shallow fusion of an external LM, allowing us to\nmaintain fast parallel decoding. Experiments on the LibriSpeech dataset\ndemonstrate the effectiveness of our approach in enhancing greedy decoding with\nconnectionist temporal classification (CTC).\n","authors":["Michael Hentschel","Yuta Nishikawa","Tatsuya Komatsu","Yusuke Fujita"],"pdf_url":"https://arxiv.org/pdf/2401.11700v1.pdf","comment":"Accepted at ICASSP 2024"},{"id":"http://arxiv.org/abs/2304.03047v3","updated":"2024-01-22T04:57:32Z","published":"2023-04-06T13:07:17Z","title":"ETPNav: Evolving Topological Planning for Vision-Language Navigation in\n Continuous Environments","summary":" Vision-language navigation is a task that requires an agent to follow\ninstructions to navigate in environments. It becomes increasingly crucial in\nthe field of embodied AI, with potential applications in autonomous navigation,\nsearch and rescue, and human-robot interaction. In this paper, we propose to\naddress a more practical yet challenging counterpart setting - vision-language\nnavigation in continuous environments (VLN-CE). To develop a robust VLN-CE\nagent, we propose a new navigation framework, ETPNav, which focuses on two\ncritical skills: 1) the capability to abstract environments and generate\nlong-range navigation plans, and 2) the ability of obstacle-avoiding control in\ncontinuous environments. ETPNav performs online topological mapping of\nenvironments by self-organizing predicted waypoints along a traversed path,\nwithout prior environmental experience. It privileges the agent to break down\nthe navigation procedure into high-level planning and low-level control.\nConcurrently, ETPNav utilizes a transformer-based cross-modal planner to\ngenerate navigation plans based on topological maps and instructions. The plan\nis then performed through an obstacle-avoiding controller that leverages a\ntrial-and-error heuristic to prevent navigation from getting stuck in\nobstacles. Experimental results demonstrate the effectiveness of the proposed\nmethod. ETPNav yields more than 10% and 20% improvements over prior\nstate-of-the-art on R2R-CE and RxR-CE datasets, respectively. Our code is\navailable at https://github.com/MarSaKi/ETPNav.\n","authors":["Dong An","Hanqing Wang","Wenguan Wang","Zun Wang","Yan Huang","Keji He","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.03047v3.pdf","comment":"Project page: https://github.com/MarSaKi/ETPNav"},{"id":"http://arxiv.org/abs/2305.05352v6","updated":"2024-01-22T04:15:13Z","published":"2023-05-09T11:37:16Z","title":"A Taxonomy of Foundation Model based Systems through the Lens of\n Software Architecture","summary":" The recent release of large language model (LLM) based chatbots, such as\nChatGPT, has attracted huge interest in foundation models. It is widely\nbelieved that foundation models will serve as the fundamental building blocks\nfor future AI systems. As foundation models are in their early stages, the\ndesign of foundation model based systems has not yet been systematically\nexplored. There is limited understanding about the impact of introducing\nfoundation models in software architecture. 
Therefore, in this paper, we\npropose a taxonomy of foundation model based systems, which classifies and\ncompares the characteristics of foundation models and design options of\nfoundation model based systems. Our taxonomy comprises three categories: the\npretraining and adaptation of foundation models, the architecture design of\nfoundation model based systems, and responsible-AI-by-design. This taxonomy can\nserve as concrete guidance for making major architectural design decisions when\ndesigning foundation model based systems and highlights trade-offs arising from\ndesign decisions.\n","authors":["Qinghua Lu","Liming Zhu","Xiwei Xu","Yue Liu","Zhenchang Xing","Jon Whittle"],"pdf_url":"https://arxiv.org/pdf/2305.05352v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01538v3","updated":"2024-01-22T02:39:17Z","published":"2023-09-04T11:38:02Z","title":"ChatRule: Mining Logical Rules with Large Language Models for Knowledge\n Graph Reasoning","summary":" Logical rules are essential for uncovering the logical connections between\nrelations, which could improve reasoning performance and provide interpretable\nresults on knowledge graphs (KGs). Although there have been many efforts to\nmine meaningful logical rules over KGs, existing methods suffer from\ncomputationally intensive searches over the rule space and a lack of\nscalability for large-scale KGs. Besides, they often ignore the semantics of\nrelations which is crucial for uncovering logical connections. Recently, large\nlanguage models (LLMs) have shown impressive performance in the field of\nnatural language processing and various applications, owing to their emergent\nability and generalizability. In this paper, we propose a novel framework,\nChatRule, unleashing the power of large language models for mining logical\nrules over knowledge graphs. Specifically, the framework is initiated with an\nLLM-based rule generator, leveraging both the semantic and structural\ninformation of KGs to prompt LLMs to generate logical rules. To refine the\ngenerated rules, a rule ranking module estimates the rule quality by\nincorporating facts from existing KGs. Last, the ranked rules can be used to\nconduct reasoning over KGs. ChatRule is evaluated on four large-scale KGs,\nw.r.t. different rule quality metrics and downstream tasks, showing the\neffectiveness and scalability of our method.\n","authors":["Linhao Luo","Jiaxin Ju","Bo Xiong","Yuan-Fang Li","Gholamreza Haffari","Shirui Pan"],"pdf_url":"https://arxiv.org/pdf/2309.01538v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11645v1","updated":"2024-01-22T01:44:42Z","published":"2024-01-22T01:44:42Z","title":"Streaming Bilingual End-to-End ASR model using Attention over Multiple\n Softmax","summary":" Even with several advancements in multilingual modeling, it is challenging to\nrecognize multiple languages using a single neural model, without knowing the\ninput language and most multilingual models assume the availability of the\ninput language. In this work, we propose a novel bilingual end-to-end (E2E)\nmodeling approach, where a single neural model can recognize both languages and\nalso support switching between the languages, without any language input from\nthe user. The proposed model has shared encoder and prediction networks, with\nlanguage-specific joint networks that are combined via a self-attention\nmechanism. 
As the language-specific posteriors are combined, it produces a\nsingle posterior probability over all the output symbols, enabling a single\nbeam search decoding and also allowing dynamic switching between the languages.\nThe proposed approach outperforms the conventional bilingual baseline with\n13.3%, 8.23% and 1.3% word error rate relative reduction on Hindi, English and\ncode-mixed test sets, respectively.\n","authors":["Aditya Patil","Vikas Joshi","Purvi Agrawal","Rupesh Mehta"],"pdf_url":"https://arxiv.org/pdf/2401.11645v1.pdf","comment":"Published in IEEE's Spoken Language Technology (SLT) 2022, 8 pages (6\n + 2 for references), 5 figures"},{"id":"http://arxiv.org/abs/2109.01636v4","updated":"2024-01-22T01:23:23Z","published":"2021-09-03T17:28:04Z","title":"Empirical Study of Named Entity Recognition Performance Using\n Distribution-aware Word Embedding","summary":" With the fast development of Deep Learning techniques, Named Entity\nRecognition (NER) is becoming more and more important in the information\nextraction task. The greatest difficulty that the NER task faces is to keep the\ndetectability even when types of NE and documents are unfamiliar. Realizing\nthat the specificity information may contain potential meanings of a word and\ngenerate semantic-related features for word embedding, we develop a\ndistribution-aware word embedding and implement three different methods to make\nuse of the distribution information in a NER framework. And the result shows\nthat the performance of NER will be improved if the word specificity is\nincorporated into existing NER methods.\n","authors":["Xin Chen","Qi Zhao","Xinyang Liu"],"pdf_url":"https://arxiv.org/pdf/2109.01636v4.pdf","comment":"Want to correct"},{"id":"http://arxiv.org/abs/2401.11641v1","updated":"2024-01-22T01:06:17Z","published":"2024-01-22T01:06:17Z","title":"Revolutionizing Finance with LLMs: An Overview of Applications and\n Insights","summary":" In recent years, Large Language Models (LLMs) like ChatGPT have seen\nconsiderable advancements and have been applied in diverse fields. Built on the\nTransformer architecture, these models are trained on extensive datasets,\nenabling them to understand and generate human language effectively. In the\nfinancial domain, the deployment of LLMs is gaining momentum. These models are\nbeing utilized for automating financial report generation, forecasting market\ntrends, analyzing investor sentiment, and offering personalized financial\nadvice. Leveraging their natural language processing capabilities, LLMs can\ndistill key insights from vast financial data, aiding institutions in making\ninformed investment choices and enhancing both operational efficiency and\ncustomer satisfaction. In this study, we provide a comprehensive overview of\nthe emerging integration of LLMs into various financial tasks. Additionally, we\nconducted holistic tests on multiple financial tasks through the combination of\nnatural language instructions. Our findings show that GPT-4 effectively follow\nprompt instructions across various financial tasks. 
This survey and evaluation\nof LLMs in the financial domain aim to deepen the understanding of LLMs'\ncurrent role in finance for both financial practitioners and LLM researchers,\nidentify new research and application prospects, and highlight how these\ntechnologies can be leveraged to solve practical challenges in the finance\nindustry.\n","authors":["Huaqin Zhao","Zhengliang Liu","Zihao Wu","Yiwei Li","Tianze Yang","Peng Shu","Shaochen Xu","Haixing Dai","Lin Zhao","Gengchen Mai","Ninghao Liu","Tianming Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.14358v2","updated":"2024-01-22T00:38:08Z","published":"2022-06-29T01:57:44Z","title":"Using Twitter Data to Understand Public Perceptions of Approved versus\n Off-label Use for COVID-19-related Medications","summary":" Understanding public discourse on emergency use of unproven therapeutics is\ncrucial for monitoring safe use and combating misinformation. We developed a\nnatural language processing-based pipeline to comprehend public perceptions of\nand stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter\nover time. This retrospective study included 609,189 US-based tweets from\nJanuary 29, 2020, to November 30, 2021, about four drugs that garnered\nsignificant public attention during the COVID-19 pandemic: (1)\nHydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2)\nMolnupiravir and Remdesivir, FDA-approved treatments for eligible patients.\nTime-trend analysis was employed to understand popularity trends and related\nevents. Content and demographic analyses were conducted to explore potential\nrationales behind people's stances on each drug. Time-trend analysis indicated\nthat Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir\nand Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and\nIvermectin discussions were highly politicized, related to conspiracy theories,\nhearsay, and celebrity influences. The distribution of stances between the two\nmajor US political parties was significantly different (P < .001); Republicans\nwere more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than\nDemocrats. People with healthcare backgrounds tended to oppose\nHydroxychloroquine (7%) more than the general population, while the general\npopulation was more likely to support Ivermectin (14%). Our study found that\nsocial media users have varying perceptions and stances on off-label versus\nFDA-authorized drug use at different stages of COVID-19. This indicates that\nhealth systems, regulatory agencies, and policymakers should design tailored\nstrategies to monitor and reduce misinformation to promote safe drug use.\n","authors":["Yining Hua","Hang Jiang","Shixu Lin","Jie Yang","Joseph M. Plasek","David W. Bates","Li Zhou"],"pdf_url":"https://arxiv.org/pdf/2206.14358v2.pdf","comment":"Full paper published in JAMIA"},{"id":"http://arxiv.org/abs/2306.16001v2","updated":"2024-01-22T00:27:45Z","published":"2023-06-28T08:20:35Z","title":"Streamlining Social Media Information Extraction for Public Health\n Research with Deep Learning","summary":" Objective: Social media-based public health research is crucial for epidemic\nsurveillance, but most studies identify relevant corpora with keyword matching.\nThis study develops a system to streamline the process of curating colloquial\nmedical dictionaries. 
We demonstrate the pipeline by curating a UMLS-colloquial\nsymptom dictionary from COVID-19-related tweets as proof of concept. Methods:\nCOVID-19-related tweets from February 1, 2020, to April 30, 2022 were used. The\npipeline includes three modules: a named entity recognition module to detect\nsymptoms in tweets; an entity normalization module to aggregate detected\nentities; and a mapping module that iteratively maps entities to Unified\nMedical Language System concepts. A random 500-entity sample was drawn from\nthe final dictionary for accuracy validation. Additionally, we conducted a\nsymptom frequency distribution analysis to compare our dictionary to a\npre-defined lexicon from previous research. Results: We identified 498,480\nunique symptom entity expressions from the tweets. Pre-processing reduces the\nnumber to 18,226. The final dictionary contains 38,175 unique expressions of\nsymptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). Symptom\ndistribution analysis found that our dictionary detects more symptoms and is\neffective at identifying psychiatric disorders like anxiety and depression,\noften missed by pre-defined lexicons. Conclusion: This study advances public\nhealth research by implementing a novel, systematic pipeline for curating\nsymptom lexicons from social media data. The final lexicon's high accuracy,\nvalidated by medical professionals, underscores the potential of this\nmethodology to reliably interpret and categorize vast amounts of unstructured\nsocial media data into actionable medical insights across diverse linguistic\nand regional landscapes.\n","authors":["Yining Hua","Shixu Lin","Minghui Li","Yujie Zhang","Dinah Foer","Siwen Wang","Peilin Zhou","Li Zhou","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2306.16001v2.pdf","comment":"Updated full paper. Abstract presented at IEEE ICHI 2023 and AMIA\n Annual Symposium 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.12217v1","updated":"2024-01-22T18:59:29Z","published":"2024-01-22T18:59:29Z","title":"Exploring Simple Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation models aim to accurately assign a\nsemantic label to each pixel in an image from a set of arbitrary\nopen-vocabulary texts. In order to learn such pixel-level alignment, current\napproaches typically rely on a combination of (i) image-level VL model (e.g.\nCLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this\npaper, we introduce S-Seg, a novel model that can achieve surprisingly strong\nperformance without depending on any of the above elements. S-Seg leverages\npseudo-mask and language to train a MaskFormer, and can be easily trained from\npublicly available image-text datasets. Contrary to prior works, our model\ndirectly trains for pixel-level features and language alignment. Once trained,\nS-Seg generalizes well to multiple testing datasets without requiring\nfine-tuning. In addition, S-Seg has the extra benefits of scalability with data\nand consistent improvement when augmented with self-training. 
We believe that\nour simple yet effective approach will serve as a solid baseline for future\nresearch.\n","authors":["Zihang Lai"],"pdf_url":"https://arxiv.org/pdf/2401.12217v1.pdf","comment":"Code is available at: https://github.com/zlai0/S-Seg"},{"id":"http://arxiv.org/abs/2401.12215v1","updated":"2024-01-22T18:59:07Z","published":"2024-01-22T18:59:07Z","title":"Less Could Be Better: Parameter-efficient Fine-tuning Advances Medical\n Vision Foundation Models","summary":" Parameter-efficient fine-tuning (PEFT) that was initially developed for\nexploiting pre-trained large language models has recently emerged as an\neffective approach to perform transfer learning on computer vision tasks.\nHowever, the effectiveness of PEFT on medical vision foundation models is still\nunclear and remains to be explored. As a proof of concept, we conducted a\ndetailed empirical study on applying PEFT to chest radiography foundation\nmodels. Specifically, we delved into LoRA, a representative PEFT method, and\ncompared it against full-parameter fine-tuning (FFT) on two self-supervised\nradiography foundation models across three well-established chest radiograph\ndatasets. Our results showed that LoRA outperformed FFT in 13 out of 18\ntransfer learning tasks by at most 2.9% using fewer than 1% tunable parameters.\nCombining LoRA with foundation models, we set up new state-of-the-art on a\nrange of data-efficient learning tasks, such as an AUROC score of 80.6% using\n1% labeled data on NIH ChestX-ray14. We hope this study can evoke more\nattention from the community in the use of PEFT for transfer learning on\nmedical imaging tasks. Code and models are available at\nhttps://github.com/RL4M/MED-PEFT.\n","authors":["Chenyu Lian","Hong-Yu Zhou","Yizhou Yu","Liansheng Wang"],"pdf_url":"https://arxiv.org/pdf/2401.12215v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2310.00647v2","updated":"2024-01-22T18:53:48Z","published":"2023-10-01T12:02:59Z","title":"Beyond Task Performance: Evaluating and Reducing the Flaws of Large\n Multimodal Models with In-Context Learning","summary":" Following the success of Large Language Models (LLMs), Large Multimodal\nModels (LMMs), such as the Flamingo model and its subsequent competitors, have\nstarted to emerge as natural steps towards generalist agents. However,\ninteracting with recent LMMs reveals major limitations that are hardly captured\nby the current evaluation benchmarks. Indeed, task performances (e.g., VQA\naccuracy) alone do not provide enough clues to understand their real\ncapabilities, limitations, and to which extent such models are aligned to human\nexpectations. To refine our understanding of those flaws, we deviate from the\ncurrent evaluation paradigm, and (1) evaluate 10 recent open-source LMMs from\n3B up to 80B parameter scale, on 5 different axes; hallucinations, abstention,\ncompositionality, explainability and instruction following. Our evaluation on\nthese axes reveals major flaws in LMMs. While the current go-to solution to\nalign these models is based on training, such as instruction tuning or RLHF, we\nrather (2) explore the training-free in-context learning (ICL) as a solution,\nand study how it affects these limitations. Based on our ICL study, (3) we push\nICL further and propose new multimodal ICL variants such as; Multitask-ICL,\nChain-of-Hindsight-ICL, and Self-Correcting-ICL. Our findings are as follows.\n(1) Despite their success, LMMs have flaws that remain unsolved with scaling\nalone. 
(2) The effect of ICL on LMMs flaws is nuanced; despite its\neffectiveness for improved explainability, answer abstention, ICL only slightly\nimproves instruction following, does not improve compositional abilities, and\nactually even amplifies hallucinations. (3) The proposed ICL variants are\npromising as post-hoc approaches to efficiently tackle some of those flaws. The\ncode is available here: https://github.com/mshukor/EvALign-ICL.\n","authors":["Mustafa Shukor","Alexandre Rame","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2310.00647v2.pdf","comment":"ICLR 2024. Project Page: https://evalign-icl.github.io/"},{"id":"http://arxiv.org/abs/2401.12210v1","updated":"2024-01-22T18:52:51Z","published":"2024-01-22T18:52:51Z","title":"Connecting the Dots: Leveraging Spatio-Temporal Graph Neural Networks\n for Accurate Bangla Sign Language Recognition","summary":" Recent advances in Deep Learning and Computer Vision have been successfully\nleveraged to serve marginalized communities in various contexts. One such area\nis Sign Language - a primary means of communication for the deaf community.\nHowever, so far, the bulk of research efforts and investments have gone into\nAmerican Sign Language, and research activity into low-resource sign languages\n- especially Bangla Sign Language - has lagged significantly. In this research\npaper, we present a new word-level Bangla Sign Language dataset - BdSL40 -\nconsisting of 611 videos over 40 words, along with two different approaches:\none with a 3D Convolutional Neural Network model and another with a novel Graph\nNeural Network approach for the classification of BdSL40 dataset. This is the\nfirst study on word-level BdSL recognition, and the dataset was transcribed\nfrom Indian Sign Language (ISL) using the Bangla Sign Language Dictionary\n(1997). The proposed GNN model achieved an F1 score of 89%. The study\nhighlights the significant lexical and semantic similarity between BdSL, West\nBengal Sign Language, and ISL, and the lack of word-level datasets for BdSL in\nthe literature. We release the dataset and source code to stimulate further\nresearch.\n","authors":["Haz Sameen Shahgir","Khondker Salman Sayeed","Md Toki Tahmid","Tanjeem Azwad Zaman","Md. Zarif Ul Alam"],"pdf_url":"https://arxiv.org/pdf/2401.12210v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12208v1","updated":"2024-01-22T18:51:07Z","published":"2024-01-22T18:51:07Z","title":"CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation","summary":" Chest X-rays (CXRs) are the most frequently performed imaging test in\nclinical practice. Recent advances in the development of vision-language\nfoundation models (FMs) give rise to the possibility of performing automated\nCXR interpretation, which can assist physicians with clinical decision-making\nand improve patient outcomes. However, developing FMs that can accurately\ninterpret CXRs is challenging due to the (1) limited availability of\nlarge-scale vision-language datasets in the medical image domain, (2) lack of\nvision and language encoders that can capture the complexities of medical data,\nand (3) absence of evaluation frameworks for benchmarking the abilities of FMs\non CXR interpretation. In this work, we address these challenges by first\nintroducing \\emph{CheXinstruct} - a large-scale instruction-tuning dataset\ncurated from 28 publicly-available datasets. We then present \\emph{CheXagent} -\nan instruction-tuned FM capable of analyzing and summarizing CXRs. 
To build\nCheXagent, we design a clinical large language model (LLM) for parsing\nradiology reports, a vision encoder for representing CXR images, and a network\nto bridge the vision and language modalities. Finally, we introduce\n\\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs\nacross 8 clinically-relevant CXR interpretation tasks. Extensive quantitative\nevaluations and qualitative reviews with five expert radiologists demonstrate\nthat CheXagent outperforms previously-developed general- and medical-domain FMs\non CheXbench tasks. Furthermore, in an effort to improve model transparency, we\nperform a fairness evaluation across factors of sex, race and age to highlight\npotential performance disparities. Our project is at\n\\url{https://stanford-aimi.github.io/chexagent.html}.\n","authors":["Zhihong Chen","Maya Varma","Jean-Benoit Delbrouck","Magdalini Paschali","Louis Blankemeier","Dave Van Veen","Jeya Maria Jose Valanarasu","Alaa Youssef","Joseph Paul Cohen","Eduardo Pontes Reis","Emily B. Tsai","Andrew Johnston","Cameron Olsen","Tanishq Mathew Abraham","Sergios Gatidis","Akshay S. Chaudhari","Curtis Langlotz"],"pdf_url":"https://arxiv.org/pdf/2401.12208v1.pdf","comment":"24 pages, 8 figures"},{"id":"http://arxiv.org/abs/2401.12202v1","updated":"2024-01-22T18:42:20Z","published":"2024-01-22T18:42:20Z","title":"OK-Robot: What Really Matters in Integrating Open-Knowledge Models for\n Robotics","summary":" Remarkable progress has been made in recent years in the fields of vision,\nlanguage, and robotics. We now have vision models capable of recognizing\nobjects based on language queries, navigation systems that can effectively\ncontrol mobile systems, and grasping models that can handle a wide range of\nobjects. Despite these advancements, general-purpose applications of robotics\nstill lag behind, even though they rely on these fundamental capabilities of\nrecognition, navigation, and grasping. In this paper, we adopt a systems-first\napproach to develop a new Open Knowledge-based robotics framework called\nOK-Robot. By combining Vision-Language Models (VLMs) for object detection,\nnavigation primitives for movement, and grasping primitives for object\nmanipulation, OK-Robot offers a integrated solution for pick-and-drop\noperations without requiring any training. To evaluate its performance, we run\nOK-Robot in 10 real-world home environments. The results demonstrate that\nOK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks,\nrepresenting a new state-of-the-art in Open Vocabulary Mobile Manipulation\n(OVMM) with nearly 1.8x the performance of prior work. On cleaner, uncluttered\nenvironments, OK-Robot's performance increases to 82%. However, the most\nimportant insight gained from OK-Robot is the critical role of nuanced details\nwhen combining Open Knowledge systems like VLMs with robotic modules. Videos of\nour experiments are available on our website: https://ok-robot.github.io\n","authors":["Peiqi Liu","Yaswanth Orru","Chris Paxton","Nur Muhammad Mahi Shafiullah","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2401.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12198v1","updated":"2024-01-22T18:38:44Z","published":"2024-01-22T18:38:44Z","title":"LONEStar: The Lunar Flashlight Optical Navigation Experiment","summary":" This paper documents the results from the highly successful Lunar flashlight\nOptical Navigation Experiment with a Star tracker (LONEStar). 
Launched in\nDecember 2022, Lunar Flashlight (LF) was a NASA-funded technology demonstration\nmission. After a propulsion system anomaly prevented capture in lunar orbit, LF\nwas ejected from the Earth-Moon system and into heliocentric space. NASA\nsubsequently transferred ownership of LF to Georgia Tech to conduct an unfunded\nextended mission to demonstrate further advanced technology objectives,\nincluding LONEStar. From August-December 2023, the LONEStar team performed\non-orbit calibration of the optical instrument and a number of different OPNAV\nexperiments. This campaign included the processing of nearly 400 images of star\nfields, Earth and Moon, and four other planets (Mercury, Mars, Jupiter, and\nSaturn). LONEStar provided the first on-orbit demonstrations of heliocentric\nnavigation using only optical observations of planets. Of special note is the\nsuccessful in-flight demonstration of (1) instantaneous triangulation with\nsimultaneous sightings of two planets with the LOST algorithm and (2) dynamic\ntriangulation with sequential sightings of multiple planets.\n","authors":["Michael Krause","Ava Thrasher","Priyal Soni","Liam Smego","Reuben Isaac","Jennifer Nolan","Micah Pledger","E. Glenn Lightsey","W. Jud Ready","John Christian"],"pdf_url":"https://arxiv.org/pdf/2401.12198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12176v1","updated":"2024-01-22T18:09:15Z","published":"2024-01-22T18:09:15Z","title":"Broiler-Net: A Deep Convolutional Framework for Broiler Behavior\n Analysis in Poultry Houses","summary":" Detecting anomalies in poultry houses is crucial for maintaining optimal\nchicken health conditions, minimizing economic losses and bolstering\nprofitability. This paper presents a novel real-time framework for analyzing\nchicken behavior in cage-free poultry houses to detect abnormal behaviors.\nSpecifically, two significant abnormalities, namely inactive broiler and\nhuddling behavior, are investigated in this study. The proposed framework\ncomprises three key steps: (1) chicken detection utilizing a state-of-the-art\ndeep learning model, (2) tracking individual chickens across consecutive frames\nwith a fast tracker module, and (3) detecting abnormal behaviors within the\nvideo stream. Experimental studies are conducted to evaluate the efficacy of\nthe proposed algorithm in accurately assessing chicken behavior. The results\nillustrate that our framework provides a precise and efficient solution for\nreal-time anomaly detection, facilitating timely interventions to maintain\nchicken health and enhance overall productivity on poultry farms. Github:\nhttps://github.com/TaherehZarratEhsan/Chicken-Behavior-Analysis\n","authors":["Tahereh Zarrat Ehsan","Seyed Mehdi Mohtavipour"],"pdf_url":"https://arxiv.org/pdf/2401.12176v1.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.05916v3","updated":"2024-01-22T18:08:52Z","published":"2023-10-09T17:59:04Z","title":"Interpreting CLIP's Image Representation via Text-Based Decomposition","summary":" We investigate the CLIP image encoder by analyzing how individual model\ncomponents affect the final representation. We decompose the image\nrepresentation as a sum across individual image patches, model layers, and\nattention heads, and use CLIP's text representation to interpret the summands.\nInterpreting the attention heads, we characterize each head's role by\nautomatically finding text representations that span its output space, which\nreveals property-specific roles for many heads (e.g. 
location or shape). Next,\ninterpreting the image patches, we uncover an emergent spatial localization\nwithin CLIP. Finally, we use this understanding to remove spurious features\nfrom CLIP and to create a strong zero-shot image segmenter. Our results\nindicate that a scalable understanding of transformer models is attainable and\ncan be used to repair and improve models.\n","authors":["Yossi Gandelsman","Alexei A. Efros","Jacob Steinhardt"],"pdf_url":"https://arxiv.org/pdf/2310.05916v3.pdf","comment":"Project page and code:\n https://yossigandelsman.github.io/clip_decomposition/"},{"id":"http://arxiv.org/abs/2401.12175v1","updated":"2024-01-22T18:08:22Z","published":"2024-01-22T18:08:22Z","title":"Single-View 3D Human Digitalization with Large Reconstruction Models","summary":" In this paper, we introduce Human-LRM, a single-stage feed-forward Large\nReconstruction Model designed to predict human Neural Radiance Fields (NeRF)\nfrom a single image. Our approach demonstrates remarkable adaptability in\ntraining using extensive datasets containing 3D scans and multi-view capture.\nFurthermore, to enhance the model's applicability for in-the-wild scenarios\nespecially with occlusions, we propose a novel strategy that distills\nmulti-view reconstruction into single-view via a conditional triplane diffusion\nmodel. This generative extension addresses the inherent variations in human\nbody shapes when observed from a single view, and makes it possible to\nreconstruct the full body human from an occluded image. Through extensive\nexperiments, we show that Human-LRM surpasses previous methods by a significant\nmargin on several benchmarks.\n","authors":["Zhenzhen Weng","Jingyuan Liu","Hao Tan","Zhan Xu","Yang Zhou","Serena Yeung-Levy","Jimei Yang"],"pdf_url":"https://arxiv.org/pdf/2401.12175v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. 
Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12164v1","updated":"2024-01-22T17:56:07Z","published":"2024-01-22T17:56:07Z","title":"Semi-supervised segmentation of land cover images using nonlinear\n canonical correlation analysis with multiple features and t-SNE","summary":" Image segmentation is a clustering task whereby each pixel is assigned a\ncluster label. Remote sensing data usually consists of multiple bands of\nspectral images in which there exist semantically meaningful land cover\nsubregions, co-registered with other source data such as LIDAR (LIght Detection\nAnd Ranging) data, where available. This suggests that, in order to account for\nspatial correlation between pixels, a feature vector associated with each pixel\nmay be a vectorized tensor representing the multiple bands and a local patch as\nappropriate. Similarly, multiple types of texture features based on a pixel's\nlocal patch would also be beneficial for encoding locally statistical\ninformation and spatial variations, without necessarily labelling pixel-wise a\nlarge amount of ground truth, then training a supervised model, which is\nsometimes impractical. In this work, by resorting to label only a small\nquantity of pixels, a new semi-supervised segmentation approach is proposed.\nInitially, over all pixels, an image data matrix is created in high dimensional\nfeature space. Then, t-SNE projects the high dimensional data onto 3D\nembedding. By using radial basis functions as input features, which use the\nlabelled data samples as centres, to pair with the output class labels, a\nmodified canonical correlation analysis algorithm, referred to as RBF-CCA, is\nintroduced which learns the associated projection matrix via the small labelled\ndata set. The associated canonical variables, obtained for the full image, are\napplied by k-means clustering algorithm. The proposed semi-supervised RBF-CCA\nalgorithm has been implemented on several remotely sensed multispectral images,\ndemonstrating excellent segmentation results.\n","authors":["Hong Wei","James Xiao","Yichao Zhang","Xia Hong"],"pdf_url":"https://arxiv.org/pdf/2401.12164v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12161v1","updated":"2024-01-22T17:55:16Z","published":"2024-01-22T17:55:16Z","title":"Automated facial recognition system using deep learning for pain\n assessment in adults with cerebral palsy","summary":" Background: Pain assessment in individuals with neurological conditions,\nespecially those with limited self-report ability and altered facial\nexpressions, presents challenges. Existing measures, relying on direct\nobservation by caregivers, lack sensitivity and specificity. 
In cerebral palsy,\npain is a common comorbidity and a reliable evaluation protocol is crucial.\nThus, having an automatic system that recognizes facial expressions could be of\nenormous help when diagnosing pain in this type of patient.\n Objectives: 1) to build a dataset of facial pain expressions in individuals\nwith cerebral palsy, and 2) to develop an automated facial recognition system\nbased on deep learning for pain assessment addressed to this population.\n Methods: Ten neural networks were trained on three pain image databases,\nincluding the UNBC-McMaster Shoulder Pain Expression Archive Database, the\nMultimodal Intensity Pain Dataset, and the Delaware Pain Database.\nAdditionally, a curated dataset (CPPAIN) was created, consisting of 109\npreprocessed facial pain expression images from individuals with cerebral\npalsy, categorized by two physiotherapists using the Facial Action Coding\nSystem observational scale.\n Results: InceptionV3 exhibited promising performance on the CP-PAIN dataset,\nachieving an accuracy of 62.67% and an F1 score of 61.12%. Explainable\nartificial intelligence techniques revealed consistent essential features for\npain identification across models.\n Conclusion: This study demonstrates the potential of deep learning models for\nrobust pain detection in populations with neurological conditions and\ncommunication disabilities. The creation of a larger dataset specific to\ncerebral palsy would further enhance model accuracy, offering a valuable tool\nfor discerning subtle and idiosyncratic pain expressions. The insights gained\ncould extend to other complex neurological conditions.\n","authors":["Álvaro Sabater-Gárriz","F. Xavier Gaya-Morey","José María Buades-Rubio","Cristina Manresa Yee","Pedro Montoya","Inmaculada Riquelme"],"pdf_url":"https://arxiv.org/pdf/2401.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08573v2","updated":"2024-01-22T17:54:58Z","published":"2024-01-16T18:58:36Z","title":"Benchmarking the Robustness of Image Watermarks","summary":" This paper investigates the weaknesses of image watermarking techniques. We\npresent WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel\nbenchmark for assessing watermark robustness, overcoming the limitations of\ncurrent evaluation methods.WAVES integrates detection and identification tasks,\nand establishes a standardized evaluation protocol comprised of a diverse range\nof stress tests. The attacks in WAVES range from traditional image distortions\nto advanced and novel variations of diffusive, and adversarial attacks. Our\nevaluation examines two pivotal dimensions: the degree of image quality\ndegradation and the efficacy of watermark detection after attacks. We develop a\nseries of Performance vs. Quality 2D plots, varying over several prominent\nimage similarity metrics, which are then aggregated in a heuristically novel\nmanner to paint an overall picture of watermark robustness and attack potency.\nOur comprehensive evaluation reveals previously undetected vulnerabilities of\nseveral modern watermarking algorithms. We envision WAVES as a toolkit for the\nfuture development of robust watermarking systems. 
The project is available at\nhttps://wavesbench.github.io/\n","authors":["Bang An","Mucong Ding","Tahseen Rabbani","Aakriti Agrawal","Yuancheng Xu","Chenghao Deng","Sicheng Zhu","Abdirisak Mohamed","Yuxin Wen","Tom Goldstein","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02273v4","updated":"2024-01-22T17:37:03Z","published":"2023-07-05T13:17:14Z","title":"Joint Hierarchical Priors and Adaptive Spatial Resolution for Efficient\n Neural Image Compression","summary":" Recently, the performance of neural image compression (NIC) has steadily\nimproved thanks to the last line of study, reaching or outperforming\nstate-of-the-art conventional codecs. Despite significant progress, current NIC\nmethods still rely on ConvNet-based entropy coding, limited in modeling\nlong-range dependencies due to their local connectivity and the increasing\nnumber of architectural biases and priors, resulting in complex underperforming\nmodels with high decoding latency. Motivated by the efficiency investigation of\nthe Tranformer-based transform coding framework, namely SwinT-ChARM, we propose\nto enhance the latter, as first, with a more straightforward yet effective\nTranformer-based channel-wise auto-regressive prior model, resulting in an\nabsolute image compression transformer (ICT). Through the proposed ICT, we can\ncapture both global and local contexts from the latent representations and\nbetter parameterize the distribution of the quantized latents. Further, we\nleverage a learnable scaling module with a sandwich ConvNeXt-based\npre-/post-processor to accurately extract more compact latent codes while\nreconstructing higher-quality images. Extensive experimental results on\nbenchmark datasets showed that the proposed framework significantly improves\nthe trade-off between coding efficiency and decoder complexity over the\nversatile video coding (VVC) reference encoder (VTM-18.0) and the neural codec\nSwinT-ChARM. Moreover, we provide model scaling studies to verify the\ncomputational efficiency of our approach and conduct several objective and\nsubjective analyses to bring to the fore the performance gap between the\nadaptive image compression transformer (AICT) and the neural codec SwinT-ChARM.\n","authors":["Ahmed Ghorbel","Wassim Hamidouche","Luce Morin"],"pdf_url":"https://arxiv.org/pdf/2307.02273v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12133v1","updated":"2024-01-22T17:15:02Z","published":"2024-01-22T17:15:02Z","title":"VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear\n Responses in VR Stand-up Interactive Games","summary":" Understanding and recognizing emotions are important and challenging issues\nin the metaverse era. Understanding, identifying, and predicting fear, which is\none of the fundamental human emotions, in virtual reality (VR) environments\nplays an essential role in immersive game development, scene development, and\nnext-generation virtual human-computer interaction applications. In this\narticle, we used VR horror games as a medium to analyze fear emotions by\ncollecting multi-modal data (posture, audio, and physiological signals) from 23\nplayers. We used an LSTM-based model to predict fear with accuracies of 65.31%\nand 90.47% under 6-level classification (no fear and five different levels of\nfear) and 2-level classification (no fear and fear), respectively. 
We\nconstructed a multi-modal natural behavior dataset of immersive human fear\nresponses (VRMN-bD) and compared it with existing relevant advanced datasets.\nThe results show that our dataset has fewer limitations in terms of collection\nmethod, data scale and audience scope. We are unique and advanced in targeting\nmulti-modal datasets of fear and behavior in VR stand-up interactive\nenvironments. Moreover, we discussed the implications of this work for\ncommunities and applications. The dataset and pre-trained model are available\nat https://github.com/KindOPSTAR/VRMN-bD.\n","authors":["He Zhang","Xinyang Li","Yuanxi Sun","Xinyi Fu","Christine Qiu","John M. Carroll"],"pdf_url":"https://arxiv.org/pdf/2401.12133v1.pdf","comment":"Accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.06144v2","updated":"2024-01-22T17:11:57Z","published":"2023-11-30T23:31:33Z","title":"DFU: scale-robust diffusion model for zero-shot super-resolution image\n generation","summary":" Diffusion generative models have achieved remarkable success in generating\nimages with a fixed resolution. However, existing models have limited ability\nto generalize to different resolutions when training data at those resolutions\nare not available. Leveraging techniques from operator learning, we present a\nnovel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the\nscore operator by combining both spatial and spectral information at multiple\nresolutions. Comparisons of DFU to baselines demonstrate its scalability: 1)\nsimultaneously training on multiple resolutions improves FID over training at\nany single fixed resolution; 2) DFU generalizes beyond its training\nresolutions, allowing for coherent, high-fidelity generation at\nhigher-resolutions with the same model, i.e. zero-shot super-resolution\nimage-generation; 3) we propose a fine-tuning strategy to further enhance the\nzero-shot super-resolution image-generation capability of our model, leading to\na FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which no\nother method can come close to achieving.\n","authors":["Alex Havrilla","Kevin Rojas","Wenjing Liao","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2401.06144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12129v1","updated":"2024-01-22T17:11:01Z","published":"2024-01-22T17:11:01Z","title":"Out-of-Distribution Detection & Applications With Ablated Learned\n Temperature Energy","summary":" As deep neural networks become adopted in high-stakes domains, it is crucial\nto be able to identify when inference inputs are Out-of-Distribution (OOD) so\nthat users can be alerted of likely drops in performance and calibration\ndespite high confidence. Among many others, existing methods use the following\ntwo scores to do so without training on any apriori OOD examples: a learned\ntemperature and an energy score. In this paper we introduce Ablated Learned\nTemperature Energy (or \"AbeT\" for short), a method which combines these prior\nmethods in novel ways with effective modifications. Due to these contributions,\nAbeT lowers the False Positive Rate at $95\\%$ True Positive Rate (FPR@95) by\n$35.39\\%$ in classification (averaged across all ID and OOD datasets measured)\ncompared to state of the art without training networks in multiple stages or\nrequiring hyperparameters or test-time backward passes. 
We additionally provide\nempirical insights as to how our model learns to distinguish between\nIn-Distribution (ID) and OOD samples while only being explicitly trained on ID\nsamples via exposure to misclassified ID examples at training time. Lastly, we\nshow the efficacy of our method in identifying predicted bounding boxes and\npixels corresponding to OOD objects in object detection and semantic\nsegmentation, respectively - with an AUROC increase of $5.15\\%$ in object\ndetection and both a decrease in FPR@95 of $41.48\\%$ and an increase in AUPRC\nof $34.20\\%$ on average in semantic segmentation compared to previous state of\nthe art.\n","authors":["Will LeVine","Benjamin Pikus","Jacob Phillips","Berk Norman","Fernando Amat Gil","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2401.12129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00454v2","updated":"2024-01-22T17:10:49Z","published":"2023-09-30T18:13:41Z","title":"UniLVSeg: Unified Left Ventricular Segmentation with Sparsely Annotated\n Echocardiogram Videos through Self-Supervised Temporal Masking and Weakly\n Supervised Training","summary":" Echocardiography has become an indispensable clinical imaging modality for\ngeneral heart health assessment. From calculating biomarkers such as ejection\nfraction to the probability of a patient's heart failure, accurate segmentation\nof the heart and its structures allows doctors to plan and execute treatments\nwith greater precision and accuracy. However, achieving accurate and robust\nleft ventricle segmentation is time-consuming and challenging due to different\nreasons. This work introduces a novel approach for consistent left ventricular\n(LV) segmentation from sparsely annotated echocardiogram videos. We achieve\nthis through (1) self-supervised learning (SSL) using temporal masking followed\nby (2) weakly supervised training. We investigate two different segmentation\napproaches: 3D segmentation and a novel 2D superimage (SI). We demonstrate how\nour proposed method outperforms the state-of-the-art solutions by achieving a\n93.32% (95%CI 93.21-93.43%) dice score on a large-scale dataset\n(EchoNet-Dynamic) while being more efficient. To show the effectiveness of our\napproach, we provide extensive ablation studies, including pre-training\nsettings and various deep learning backbones. Additionally, we discuss how our\nproposed methodology achieves high data utility by incorporating unlabeled\nframes in the training process. To help support the AI in medicine community,\nthe complete solution with the source code will be made publicly available upon\nacceptance.\n","authors":["Fadillah Maani","Asim Ukaye","Nada Saadi","Numan Saeed","Mohammad Yaqub"],"pdf_url":"https://arxiv.org/pdf/2310.00454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12074v1","updated":"2024-01-22T16:14:26Z","published":"2024-01-22T16:14:26Z","title":"DeepCERES: A Deep learning method for cerebellar lobule segmentation\n using ultra-high resolution multimodal MRI","summary":" This paper introduces a novel multimodal and high-resolution human brain\ncerebellum lobule segmentation method. Unlike current tools that operate at\nstandard resolution ($1 \\text{ mm}^{3}$) or using mono-modal data, the proposed\nmethod improves cerebellum lobule segmentation through the use of a multimodal\nand ultra-high resolution ($0.125 \\text{ mm}^{3}$) training dataset. 
To develop\nthe method, first, a database of semi-automatically labelled cerebellum lobules\nwas created to train the proposed method with ultra-high resolution T1 and T2\nMR images. Then, an ensemble of deep networks has been designed and developed,\nallowing the proposed method to excel in the complex cerebellum lobule\nsegmentation task, improving precision while being memory efficient. Notably,\nour approach deviates from the traditional U-Net model by exploring alternative\narchitectures. We have also integrated deep learning with classical machine\nlearning methods incorporating a priori knowledge from multi-atlas\nsegmentation, which improved precision and robustness. Finally, a new online\npipeline, named DeepCERES, has been developed to make available the proposed\nmethod to the scientific community requiring as input only a single T1 MR image\nat standard resolution.\n","authors":["Sergio Morell-Ortega","Marina Ruiz-Perez","Marien Gadea","Roberto Vivo-Hernando","Gregorio Rubio","Fernando Aparici","Mariam de la Iglesia-Vaya","Gwenaelle Catheline","Pierrick Coupé","José V. Manjón"],"pdf_url":"https://arxiv.org/pdf/2401.12074v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.12051v1","updated":"2024-01-22T15:42:21Z","published":"2024-01-22T15:42:21Z","title":"CloSe: A 3D Clothing Segmentation Dataset and Model","summary":" 3D Clothing modeling and datasets play crucial role in the entertainment,\nanimation, and digital fashion industries. Existing work often lacks detailed\nsemantic understanding or uses synthetic datasets, lacking realism and\npersonalization. To address this, we first introduce CloSe-D: a novel\nlarge-scale dataset containing 3D clothing segmentation of 3167 scans, covering\na range of 18 distinct clothing classes. Additionally, we propose CloSe-Net,\nthe first learning-based 3D clothing segmentation model for fine-grained\nsegmentation from colored point clouds. CloSe-Net uses local point features,\nbody-clothing correlation, and a garment-class and point features-based\nattention module, improving performance over baselines and prior work. The\nproposed attention module enables our model to learn appearance and\ngeometry-dependent clothing prior from data. We further validate the efficacy\nof our approach by successfully segmenting publicly available datasets of\npeople in clothing. We also introduce CloSe-T, a 3D interactive tool for\nrefining segmentation labels. Combining the tool with CloSe-T in a continual\nlearning setup demonstrates improved generalization on real-world data.\nDataset, model, and tool can be found at\nhttps://virtualhumans.mpi-inf.mpg.de/close3dv24/.\n","authors":["Dimitrije Antić","Garvita Tiwari","Batuhan Ozcomlekci","Riccardo Marin","Gerard Pons-Moll"],"pdf_url":"https://arxiv.org/pdf/2401.12051v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12048v1","updated":"2024-01-22T15:40:24Z","published":"2024-01-22T15:40:24Z","title":"HomeRobot Open Vocabulary Mobile Manipulation Challenge 2023 Participant\n Report (Team KuzHum)","summary":" We report an improvements to NeurIPS 2023 HomeRobot: Open Vocabulary Mobile\nManipulation (OVMM) Challenge reinforcement learning baseline. More\nspecifically, we propose more accurate semantic segmentation module, along with\nbetter place skill policy, and high-level heuristic that outperforms the\nbaseline by 2.4% of overall success rate (sevenfold improvement) and 8.2% of\npartial success rate (1.75 times improvement) on Test Standard split of the\nchallenge dataset. 
With aforementioned enhancements incorporated our agent\nscored 3rd place in the challenge on both simulation and real-world stages.\n","authors":["Volodymyr Kuzma","Vladyslav Humennyy","Ruslan Partsey"],"pdf_url":"https://arxiv.org/pdf/2401.12048v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08865v2","updated":"2024-01-22T15:30:08Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v2.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2401.12039v1","updated":"2024-01-22T15:26:01Z","published":"2024-01-22T15:26:01Z","title":"Look, Listen and Recognise: Character-Aware Audio-Visual Subtitling","summary":" The goal of this paper is automatic character-aware subtitle generation.\nGiven a video and a minimal amount of metadata, we propose an audio-visual\nmethod that generates a full transcript of the dialogue, with precise speech\ntimestamps, and the character speaking identified. The key idea is to first use\naudio-visual cues to select a set of high-precision audio exemplars for each\ncharacter, and then use these exemplars to classify all speech segments by\nspeaker identity. Notably, the method does not require face detection or\ntracking. We evaluate the method over a variety of TV sitcoms, including\nSeinfeld, Fraiser and Scrubs. 
We envision this system being useful for the\nautomatic generation of subtitles to improve the accessibility of the vast\namount of videos available on modern streaming services. Project page :\n\\url{https://www.robots.ox.ac.uk/~vgg/research/look-listen-recognise/}\n","authors":["Bruno Korbar","Jaesung Huh","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2401.12039v1.pdf","comment":"Accepted for publication in ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12033v1","updated":"2024-01-22T15:19:18Z","published":"2024-01-22T15:19:18Z","title":"Momentum-SAM: Sharpness Aware Minimization without Computational\n Overhead","summary":" The recently proposed optimization algorithm for deep neural networks\nSharpness Aware Minimization (SAM) suggests perturbing parameters before\ngradient calculation by a gradient ascent step to guide the optimization into\nparameter space regions of flat loss. While significant generalization\nimprovements and thus reduction of overfitting could be demonstrated, the\ncomputational costs are doubled due to the additionally needed gradient\ncalculation, making SAM unfeasible in case of limited computationally\ncapacities. Motivated by Nesterov Accelerated Gradient (NAG) we propose\nMomentum-SAM (MSAM), which perturbs parameters in the direction of the\naccumulated momentum vector to achieve low sharpness without significant\ncomputational overhead or memory demands over SGD or Adam. We evaluate MSAM in\ndetail and reveal insights on separable mechanisms of NAG, SAM and MSAM\nregarding training optimization and generalization. Code is available at\nhttps://github.com/MarlonBecker/MSAM.\n","authors":["Marlon Becker","Frederick Altrock","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2401.12033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09495v3","updated":"2024-01-22T15:05:43Z","published":"2024-01-17T01:33:40Z","title":"IPR-NeRF: Ownership Verification meets Neural Radiance Field","summary":" Neural Radiance Field (NeRF) models have gained significant attention in the\ncomputer vision community in the recent past with state-of-the-art visual\nquality and produced impressive demonstrations. Since then, technopreneurs have\nsought to leverage NeRF models into a profitable business. Therefore, NeRF\nmodels make it worth the risk of plagiarizers illegally copying,\nre-distributing, or misusing those models. This paper proposes a comprehensive\nintellectual property (IP) protection framework for the NeRF model in both\nblack-box and white-box settings, namely IPR-NeRF. In the black-box setting, a\ndiffusion-based solution is introduced to embed and extract the watermark via a\ntwo-stage optimization process. In the white-box setting, a designated digital\nsignature is embedded into the weights of the NeRF model by adopting the sign\nloss objective. 
Our extensive experiments demonstrate that not only does our\napproach maintain the fidelity (\\ie, the rendering quality) of IPR-NeRF models,\nbut it is also robust against both ambiguity and removal attacks compared to\nprior arts.\n","authors":["Win Kent Ong","Kam Woh Ng","Chee Seng Chan","Yi Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2401.09495v3.pdf","comment":"Error on result tabulation for the state of the art method which\n might cause misleading to the readers"},{"id":"http://arxiv.org/abs/2401.12019v1","updated":"2024-01-22T15:05:05Z","published":"2024-01-22T15:05:05Z","title":"Stereo-Matching Knowledge Distilled Monocular Depth Estimation Filtered\n by Multiple Disparity Consistency","summary":" In stereo-matching knowledge distillation methods of the self-supervised\nmonocular depth estimation, the stereo-matching network's knowledge is\ndistilled into a monocular depth network through pseudo-depth maps. In these\nmethods, the learning-based stereo-confidence network is generally utilized to\nidentify errors in the pseudo-depth maps to prevent transferring the errors.\nHowever, the learning-based stereo-confidence networks should be trained with\nground truth (GT), which is not feasible in a self-supervised setting. In this\npaper, we propose a method to identify and filter errors in the pseudo-depth\nmap using multiple disparity maps by checking their consistency without the\nneed for GT and a training process. Experimental results show that the proposed\nmethod outperforms the previous methods and works well on various\nconfigurations by filtering out erroneous areas where the stereo-matching is\nvulnerable, especially such as textureless regions, occlusion boundaries, and\nreflective surfaces.\n","authors":["Woonghyun Ka","Jae Young Lee","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12019v1.pdf","comment":"ICASSP 2024. The first two authors are equally contributed"},{"id":"http://arxiv.org/abs/2401.12014v1","updated":"2024-01-22T15:00:32Z","published":"2024-01-22T15:00:32Z","title":"Robustness to distribution shifts of compressed networks for edge\n devices","summary":" It is necessary to develop efficient DNNs deployed on edge devices with\nlimited computation resources. However, the compressed networks often execute\nnew tasks in the target domain, which is different from the source domain where\nthe original network is trained. It is important to investigate the robustness\nof compressed networks in two types of data distribution shifts: domain shifts\nand adversarial perturbations. In this study, we discover that compressed\nmodels are less robust to distribution shifts than their original networks.\nInterestingly, larger networks are more vulnerable to losing robustness than\nsmaller ones, even when they are compressed to a similar size as the smaller\nnetworks. Furthermore, compact networks obtained by knowledge distillation are\nmuch more robust to distribution shifts than pruned networks. Finally,\npost-training quantization is a reliable method for achieving significant\nrobustness to distribution shifts, and it outperforms both pruned and distilled\nmodels in terms of robustness.\n","authors":["Lulan Shen","Ali Edalati","Brett Meyer","Warren Gross","James J. 
Clark"],"pdf_url":"https://arxiv.org/pdf/2401.12014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11841v4","updated":"2024-01-22T14:59:20Z","published":"2023-12-19T04:14:11Z","title":"MixRT: Mixed Neural Representations For Real-Time NeRF Rendering","summary":" Neural Radiance Field (NeRF) has emerged as a leading technique for novel\nview synthesis, owing to its impressive photorealistic reconstruction and\nrendering capability. Nevertheless, achieving real-time NeRF rendering in\nlarge-scale scenes has presented challenges, often leading to the adoption of\neither intricate baked mesh representations with a substantial number of\ntriangles or resource-intensive ray marching in baked representations. We\nchallenge these conventions, observing that high-quality geometry, represented\nby meshes with substantial triangles, is not necessary for achieving\nphotorealistic rendering quality. Consequently, we propose MixRT, a novel NeRF\nrepresentation that includes a low-quality mesh, a view-dependent displacement\nmap, and a compressed NeRF model. This design effectively harnesses the\ncapabilities of existing graphics hardware, thus enabling real-time NeRF\nrendering on edge devices. Leveraging a highly-optimized WebGL-based rendering\nframework, our proposed MixRT attains real-time rendering speeds on edge\ndevices (over 30 FPS at a resolution of 1280 x 720 on a MacBook M1 Pro laptop),\nbetter rendering quality (0.2 PSNR higher in indoor scenes of the Unbounded-360\ndatasets), and a smaller storage size (less than 80% compared to\nstate-of-the-art methods).\n","authors":["Chaojian Li","Bichen Wu","Peter Vajda"," Yingyan"," Lin"],"pdf_url":"https://arxiv.org/pdf/2312.11841v4.pdf","comment":"Accepted by 3DV'24. Project Page: https://licj15.github.io/MixRT/"},{"id":"http://arxiv.org/abs/2312.10105v2","updated":"2024-01-22T14:56:52Z","published":"2023-12-15T04:11:34Z","title":"Forging Tokens for Improved Storage-efficient Training","summary":" Recent advancements in Deep Neural Network (DNN) models have significantly\nimproved performance across computer vision tasks. However, achieving highly\ngeneralizable and high-performing vision models requires extensive datasets,\nleading to large storage requirements. This storage challenge poses a critical\nbottleneck for scaling up vision models. Motivated by the success of discrete\nrepresentations, SeiT proposes to use Vector-Quantized (VQ) feature vectors\n(i.e., tokens) as network inputs for vision classification. However, applying\ntraditional data augmentations to tokens faces challenges due to input domain\nshift. To address this issue, we introduce TokenAdapt and ColorAdapt, simple\nyet effective token-based augmentation strategies. TokenAdapt realigns token\nembedding space for compatibility with spatial augmentations, preserving the\nmodel's efficiency without requiring fine-tuning. Additionally, ColorAdapt\naddresses color-based augmentations for tokens inspired by Adaptive Instance\nNormalization (AdaIN). We evaluate our approach across various scenarios,\nincluding storage-efficient ImageNet-1k classification, fine-grained\nclassification, robustness benchmarks, and ADE-20k semantic segmentation.\nExperimental results demonstrate consistent performance improvement in diverse\nexperiments. 
Code is available at https://github.com/naver-ai/tokenadapt.\n","authors":["Minhyun Lee","Song Park","Byeongho Heo","Dongyoon Han","Hyunjung Shim"],"pdf_url":"https://arxiv.org/pdf/2312.10105v2.pdf","comment":"First two authors contributed equally"},{"id":"http://arxiv.org/abs/2311.03782v3","updated":"2024-01-22T14:52:14Z","published":"2023-11-07T08:05:09Z","title":"CapST: An Enhanced and Lightweight Model Attribution Approach for\n Synthetic Videos","summary":" Deepfake videos, generated through AI faceswapping techniques, have garnered\nconsiderable attention due to their potential for powerful impersonation\nattacks. While existing research primarily focuses on binary classification to\ndiscern between real and fake videos, however determining the specific\ngeneration model for a fake video is crucial for forensic investigation.\nAddressing this gap, this paper investigates the model attribution problem of\nDeepfake videos from a recently proposed dataset, Deepfakes from Different\nModels (DFDM), derived from various Autoencoder models. The dataset comprises\n6,450 Deepfake videos generated by five distinct models with variations in\nencoder, decoder, intermediate layer, input resolution, and compression ratio.\nThis study formulates Deepfakes model attribution as a multiclass\nclassification task, proposing a segment of VGG19 as a feature extraction\nbackbone, known for its effectiveness in imagerelated tasks, while integrated a\nCapsule Network with a Spatio-Temporal attention mechanism. The Capsule module\ncaptures intricate hierarchies among features for robust identification of\ndeepfake attributes. Additionally, the video-level fusion technique leverages\ntemporal attention mechanisms to handle concatenated feature vectors,\ncapitalizing on inherent temporal dependencies in deepfake videos. By\naggregating insights across frames, our model gains a comprehensive\nunderstanding of video content, resulting in more precise predictions.\nExperimental results on the deepfake benchmark dataset (DFDM) demonstrate the\nefficacy of our proposed method, achieving up to a 4% improvement in accurately\ncategorizing deepfake videos compared to baseline models while demanding fewer\ncomputational resources.\n","authors":["Wasim Ahmad","Yan-Tsung Peng","Yuan-Hao Chang","Gaddisa Olani Ganfure","Sarwar Khan","Sahibzada Adil Shahzad"],"pdf_url":"https://arxiv.org/pdf/2311.03782v3.pdf","comment":"Rejected from jounal and will have to conduct several more\n experiments"},{"id":"http://arxiv.org/abs/2401.12001v1","updated":"2024-01-22T14:52:08Z","published":"2024-01-22T14:52:08Z","title":"Modeling Stereo-Confidence Out of the End-to-End Stereo-Matching Network\n via Disparity Plane Sweep","summary":" We propose a novel stereo-confidence that can be measured externally to\nvarious stereo-matching networks, offering an alternative input modality choice\nof the cost volume for learning-based approaches, especially in safety-critical\nsystems. Grounded in the foundational concepts of disparity definition and the\ndisparity plane sweep, the proposed stereo-confidence method is built upon the\nidea that any shift in a stereo-image pair should be updated in a corresponding\namount shift in the disparity map. Based on this idea, the proposed\nstereo-confidence method can be summarized in three folds. 
1) Using the\ndisparity plane sweep, multiple disparity maps can be obtained and treated as a\n3-D volume (predicted disparity volume), like the cost volume is constructed.\n2) One of these disparity maps serves as an anchor, allowing us to define a\ndesirable (or ideal) disparity profile at every spatial point. 3) By comparing\nthe desirable and predicted disparity profiles, we can quantify the level of\nmatching ambiguity between left and right images for confidence measurement.\nExtensive experimental results using various stereo-matching networks and\ndatasets demonstrate that the proposed stereo-confidence method not only shows\ncompetitive performance on its own but also consistent performance improvements\nwhen it is used as an input modality for learning-based stereo-confidence\nmethods.\n","authors":["Jae Young Lee","Woonghyun Ka","Jaehyun Choi","Junmo Kim"],"pdf_url":"https://arxiv.org/pdf/2401.12001v1.pdf","comment":"AAAI 2024. The first two authors contributed equally"},{"id":"http://arxiv.org/abs/2401.11985v1","updated":"2024-01-22T14:38:25Z","published":"2024-01-22T14:38:25Z","title":"Scaling Face Interaction Graph Networks to Real World Scenes","summary":" Accurately simulating real world object dynamics is essential for various\napplications such as robotics, engineering, graphics, and design. To better\ncapture complex real dynamics such as contact and friction, learned simulators\nbased on graph networks have recently shown great promise. However, applying\nthese learned simulators to real scenes comes with two major challenges: first,\nscaling learned simulators to handle the complexity of real world scenes which\ncan involve hundreds of objects each with complicated 3D shapes, and second,\nhandling inputs from perception rather than 3D state information. Here we\nintroduce a method which substantially reduces the memory required to run\ngraph-based learned simulators. Based on this memory-efficient simulation\nmodel, we then present a perceptual interface in the form of editable NeRFs\nwhich can convert real-world scenes into a structured representation that can\nbe processed by graph network simulator. We show that our method uses\nsubstantially less memory than previous graph-based simulators while retaining\ntheir accuracy, and that the simulators learned in synthetic environments can\nbe applied to real world scenes captured from multiple camera angles. This\npaves the way for expanding the application of learned simulators to settings\nwhere only perceptual information is available at inference time.\n","authors":["Tatiana Lopez-Guevara","Yulia Rubanova","William F. Whitney","Tobias Pfaff","Kimberly Stachenfeld","Kelsey R. Allen"],"pdf_url":"https://arxiv.org/pdf/2401.11985v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.11960v1","updated":"2024-01-22T14:02:56Z","published":"2024-01-22T14:02:56Z","title":"Observation-Guided Meteorological Field Downscaling at Station Scale: A\n Benchmark and a New Method","summary":" Downscaling (DS) of meteorological variables involves obtaining\nhigh-resolution states from low-resolution meteorological fields and is an\nimportant task in weather forecasting. Previous methods based on deep learning\ntreat downscaling as a super-resolution task in computer vision and utilize\nhigh-resolution gridded meteorological fields as supervision to improve\nresolution at specific grid scales. 
However, this approach has struggled to\nalign with the continuous distribution characteristics of meteorological\nfields, leading to an inherent systematic bias between the downscaled results\nand the actual observations at meteorological stations. In this paper, we\nextend meteorological downscaling to arbitrary scattered station scales,\nestablish a brand new benchmark and dataset, and retrieve meteorological states\nat any given station location from a coarse-resolution meteorological field.\nInspired by data assimilation techniques, we integrate observational data into\nthe downscaling process, providing multi-scale observational priors. Building\non this foundation, we propose a new downscaling model based on hypernetwork\narchitecture, namely HyperDS, which efficiently integrates different\nobservational information into the model training, achieving continuous scale\nmodeling of the meteorological field. Through extensive experiments, our\nproposed method outperforms other specially designed baseline models on\nmultiple surface variables. Notably, the mean squared error (MSE) for wind\nspeed and surface pressure improved by 67% and 19.5% compared to other methods.\nWe will release the dataset and code subsequently.\n","authors":["Zili Liu","Hao Chen","Lei Bai","Wenyuan Li","Keyan Chen","Zhengyi Wang","Wanli Ouyang","Zhengxia Zou","Zhenwei Shi"],"pdf_url":"https://arxiv.org/pdf/2401.11960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11949v1","updated":"2024-01-22T13:38:24Z","published":"2024-01-22T13:38:24Z","title":"Feature Denoising Diffusion Model for Blind Image Quality Assessment","summary":" Blind Image Quality Assessment (BIQA) aims to evaluate image quality in line\nwith human perception, without reference benchmarks. Currently, deep learning\nBIQA methods typically depend on using features from high-level tasks for\ntransfer learning. However, the inherent differences between BIQA and these\nhigh-level tasks inevitably introduce noise into the quality-aware features. In\nthis paper, we take an initial step towards exploring the diffusion model for\nfeature denoising in BIQA, namely Perceptual Feature Diffusion for IQA\n(PFD-IQA), which aims to remove noise from quality-aware features.\nSpecifically, (i) We propose a {Perceptual Prior Discovery and Aggregation\nmodule to establish two auxiliary tasks to discover potential low-level\nfeatures in images that are used to aggregate perceptual text conditions for\nthe diffusion model. (ii) We propose a Perceptual Prior-based Feature\nRefinement strategy, which matches noisy features to predefined denoising\ntrajectories and then performs exact feature denoising based on text\nconditions. Extensive experiments on eight standard BIQA datasets demonstrate\nthe superior performance to the state-of-the-art BIQA methods, i.e., achieving\nthe PLCC values of 0.935 ( vs. 0.905 in KADID) and 0.922 ( vs. 0.894 in LIVEC).\n","authors":["Xudong Li","Jingyuan Zheng","Runze Hu","Yan Zhang","Ke Li","Yunhang Shen","Xiawu Zheng","Yutao Liu","ShengChuan Zhang","Pingyang Dai","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2401.11949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11944v1","updated":"2024-01-22T13:34:34Z","published":"2024-01-22T13:34:34Z","title":"CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding\n Benchmark","summary":" As the capabilities of large multimodal models (LMMs) continue to advance,\nevaluating the performance of LMMs emerges as an increasing need. 
Additionally,\nthere is an even larger gap in evaluating the advanced knowledge and reasoning\nabilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU,\na new Chinese Massive Multi-discipline Multimodal Understanding benchmark\ndesigned to evaluate LMMs on tasks demanding college-level subject knowledge\nand deliberate reasoning in a Chinese context. CMMMU is inspired by and\nstrictly follows the annotation and analysis pattern of MMMU.\n CMMMU includes 12k manually collected multimodal questions from college\nexams, quizzes, and textbooks, covering six core disciplines: Art & Design,\nBusiness, Science, Health & Medicine, Humanities & Social Science, and Tech &\nEngineering, like its companion, MMMU. These questions span 30 subjects and\ncomprise 39 highly heterogeneous image types, such as charts, diagrams, maps,\ntables, music sheets, and chemical structures.\n CMMMU focuses on complex perception and reasoning with domain-specific\nknowledge in the Chinese context. We evaluate 11 open-source LLMs and one\nproprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%,\nindicating a large space for improvement. CMMMU will boost the community to\nbuild the next-generation LMMs towards expert artificial intelligence and\npromote the democratization of LMMs by providing diverse language contexts.\n","authors":["Ge Zhang","Xinrun Du","Bei Chen","Yiming Liang","Tongxu Luo","Tianyu Zheng","Kang Zhu","Yuyang Cheng","Chunpu Xu","Shuyue Guo","Haoran Zhang","Xingwei Qu","Junjie Wang","Ruibin Yuan","Yizhi Li","Zekun Wang","Yudong Liu","Yu-Hsuan Tsai","Fengji Zhang","Chenghua Lin","Wenhao Huang","Wenhu Chen","Jie Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11944v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2303.07064v3","updated":"2024-01-22T13:26:32Z","published":"2023-03-13T12:38:07Z","title":"A Generalized Multi-Modal Fusion Detection Framework","summary":" LiDAR point clouds have become the most common data source in autonomous\ndriving. However, due to the sparsity of point clouds, accurate and reliable\ndetection cannot be achieved in specific scenarios. Because of their\ncomplementarity with point clouds, images are getting increasing attention.\nAlthough with some success, existing fusion methods either perform hard fusion\nor do not fuse in a direct manner. 
In this paper, we propose a generic 3D\ndetection framework called MMFusion, using multi-modal features. The framework\naims to achieve accurate fusion between LiDAR and images to improve 3D\ndetection in complex scenes. Our framework consists of two separate streams:\nthe LiDAR stream and the camera stream, which can be compatible with any\nsingle-modal feature extraction network. The Voxel Local Perception Module in\nthe LiDAR stream enhances local feature representation, and then the\nMulti-modal Feature Fusion Module selectively combines feature output from\ndifferent streams to achieve better fusion. Extensive experiments have shown\nthat our framework not only outperforms existing benchmarks but also improves\ntheir detection, especially for detecting cyclists and pedestrians on KITTI\nbenchmarks, with strong robustness and generalization capabilities. Hopefully,\nour work will stimulate more research into multi-modal fusion for autonomous\ndriving tasks.\n","authors":["Leichao Cui","Xiuxian Li","Min Meng","Xiaoyu Mo"],"pdf_url":"https://arxiv.org/pdf/2303.07064v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15567v3","updated":"2024-01-22T13:17:21Z","published":"2023-07-28T14:04:06Z","title":"Panoptic Scene Graph Generation with Semantics-Prototype Learning","summary":" Panoptic Scene Graph Generation (PSG) parses objects and predicts their\nrelationships (predicate) to connect human language and visual scenes. However,\ndifferent language preferences of annotators and semantic overlaps between\npredicates lead to biased predicate annotations in the dataset, i.e. different\npredicates for same object pairs. Biased predicate annotations make PSG models\nstruggle in constructing a clear decision plane among predicates, which greatly\nhinders the real application of PSG models. To address the intrinsic bias\nabove, we propose a novel framework named ADTrans to adaptively transfer biased\npredicate annotations to informative and unified ones. To promise consistency\nand accuracy during the transfer process, we propose to measure the invariance\nof representations in each predicate class, and learn unbiased prototypes of\npredicates with different intensities. Meanwhile, we continuously measure the\ndistribution changes between each presentation and its prototype, and\nconstantly screen potential biased data. Finally, with the unbiased\npredicate-prototype representation embedding space, biased annotations are\neasily identified. Experiments show that ADTrans significantly improves the\nperformance of benchmark models, achieving a new state-of-the-art performance,\nand shows great generalization and effectiveness on multiple datasets.\n","authors":["Li Li","Wei Ji","Yiming Wu","Mengze Li","You Qin","Lina Wei","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2307.15567v3.pdf","comment":"AAAI 2024"},{"id":"http://arxiv.org/abs/2310.09126v2","updated":"2024-01-22T13:14:33Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Practical Low-light Raw Image\n Denoising","summary":" Recently, the mainstream practice for training low-light raw image denoising\nmethods has shifted towards employing synthetic data. 
Noise modeling, which\nfocuses on characterizing the noise distribution of real-world sensors,\nprofoundly influences the effectiveness and practicality of synthetic data.\nCurrently, physics-based noise modeling struggles to characterize the entire\nreal noise distribution, while learning-based noise modeling impractically\ndepends on paired real data. In this paper, we propose a novel strategy:\nlearning the noise model from dark frames instead of paired real data, to break\ndown the data dependency. Based on this strategy, we introduce an efficient\nphysics-guided noise neural proxy (PNNP) to approximate the real-world sensor\nnoise model. Specifically, we integrate physical priors into neural proxies and\nintroduce three efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution loss (DDL).\nPND decouples the dark frame into different components and handles different\nlevels of noise flexibly, which reduces the complexity of noise modeling. PPM\nincorporates physical priors to constrain the generated noise, which promotes\nthe accuracy of noise modeling. DDL provides explicit and reliable supervision\nfor noise distribution, which promotes the precision of noise modeling. PNNP\nexhibits powerful potential in characterizing the real noise distribution.\nExtensive experiments on public datasets demonstrate superior performance in\npractical low-light raw image denoising. The code will be available at\n\\url{https://github.com/fenghansen/PNNP}.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Lin Zhu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11914v1","updated":"2024-01-22T13:01:35Z","published":"2024-01-22T13:01:35Z","title":"A Saliency Enhanced Feature Fusion based multiscale RGB-D Salient Object\n Detection Network","summary":" Multiscale convolutional neural network (CNN) has demonstrated remarkable\ncapabilities in solving various vision problems. However, fusing features of\ndifferent scales alwaysresults in large model sizes, impeding the application\nof multiscale CNNs in RGB-D saliency detection. In this paper, we propose a\ncustomized feature fusion module, called Saliency Enhanced Feature Fusion\n(SEFF), for RGB-D saliency detection. SEFF utilizes saliency maps of the\nneighboring scales to enhance the necessary features for fusing, resulting in\nmore representative fused features. Our multiscale RGB-D saliency detector uses\nSEFF and processes images with three different scales. SEFF is used to fuse the\nfeatures of RGB and depth images, as well as the features of decoders at\ndifferent scales. 
Extensive experiments on five benchmark datasets have\ndemonstrated the superiority of our method over ten SOTA saliency detectors.\n","authors":["Rui Huang","Qingyi Zhao","Yan Xing","Sihua Gao","Weifeng Xu","Yuxiang Zhang","Wei Fan"],"pdf_url":"https://arxiv.org/pdf/2401.11914v1.pdf","comment":"Accpeted by 2024 IEEE International Conference on Acoustics, Speech,\n and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.11913v1","updated":"2024-01-22T13:01:28Z","published":"2024-01-22T13:01:28Z","title":"Large receptive field strategy and important feature extraction strategy\n in 3D object detection","summary":" The enhancement of 3D object detection is pivotal for precise environmental\nperception and improved task execution capabilities in autonomous driving.\nLiDAR point clouds, offering accurate depth information, serve as a crucial\ninformation for this purpose. Our study focuses on key challenges in 3D target\ndetection. To tackle the challenge of expanding the receptive field of a 3D\nconvolutional kernel, we introduce the Dynamic Feature Fusion Module (DFFM).\nThis module achieves adaptive expansion of the 3D convolutional kernel's\nreceptive field, balancing the expansion with acceptable computational loads.\nThis innovation reduces operations, expands the receptive field, and allows the\nmodel to dynamically adjust to different object requirements. Simultaneously,\nwe identify redundant information in 3D features. Employing the Feature\nSelection Module (FSM) quantitatively evaluates and eliminates non-important\nfeatures, achieving the separation of output box fitting and feature\nextraction. This innovation enables the detector to focus on critical features,\nresulting in model compression, reduced computational burden, and minimized\ncandidate frame interference. Extensive experiments confirm that both DFFM and\nFSM not only enhance current benchmarks, particularly in small target\ndetection, but also accelerate network performance. Importantly, these modules\nexhibit effective complementarity.\n","authors":["Leichao Cui","Xiuxian Li","Min Meng"],"pdf_url":"https://arxiv.org/pdf/2401.11913v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11902v1","updated":"2024-01-22T12:50:21Z","published":"2024-01-22T12:50:21Z","title":"A Training-Free Defense Framework for Robust Learned Image Compression","summary":" We study the robustness of learned image compression models against\nadversarial attacks and present a training-free defense technique based on\nsimple image transform functions. Recent learned image compression models are\nvulnerable to adversarial attacks that result in poor compression rate, low\nreconstruction quality, or weird artifacts. To address the limitations, we\npropose a simple but effective two-way compression algorithm with random input\ntransforms, which is conveniently applicable to existing image compression\nmodels. Unlike the na\\\"ive approaches, our approach preserves the original\nrate-distortion performance of the models on clean images. Moreover, the\nproposed algorithm requires no additional training or modification of existing\nmodels, making it more practical. 
We demonstrate the effectiveness of the\nproposed techniques through extensive experiments under multiple compression\nmodels, evaluation metrics, and attack scenarios.\n","authors":["Myungseo Song","Jinyoung Choi","Bohyung Han"],"pdf_url":"https://arxiv.org/pdf/2401.11902v1.pdf","comment":"10 pages and 14 figures"},{"id":"http://arxiv.org/abs/2203.13718v2","updated":"2024-01-22T12:47:52Z","published":"2022-03-25T15:40:44Z","title":"Digital Fingerprinting of Microstructures","summary":" Finding efficient means of fingerprinting microstructural information is a\ncritical step towards harnessing data-centric machine learning approaches. A\nstatistical framework is systematically developed for compressed\ncharacterisation of a population of images, which includes some classical\ncomputer vision methods as special cases. The focus is on materials\nmicrostructure. The ultimate purpose is to rapidly fingerprint sample images in\nthe context of various high-throughput design/make/test scenarios. This\nincludes, but is not limited to, quantification of the disparity between\nmicrostructures for quality control, classifying microstructures, predicting\nmaterials properties from image data and identifying potential processing\nroutes to engineer new materials with specific properties. Here, we consider\nmicrostructure classification and utilise the resulting features over a range\nof related machine learning tasks, namely supervised, semi-supervised, and\nunsupervised learning.\n The approach is applied to two distinct datasets to illustrate various\naspects and some recommendations are made based on the findings. In particular,\nmethods that leverage transfer learning with convolutional neural networks\n(CNNs), pretrained on the ImageNet dataset, are generally shown to outperform\nother methods. Additionally, dimensionality reduction of these CNN-based\nfingerprints is shown to have negligible impact on classification accuracy for\nthe supervised learning approaches considered. In situations where there is a\nlarge dataset with only a handful of images labelled, graph-based label\npropagation to unlabelled data is shown to be favourable over discarding\nunlabelled data and performing supervised learning. In particular, label\npropagation by Poisson learning is shown to be highly effective at low label\nrates.\n","authors":["Michael D. White","Alexander Tarakanov","Christopher P. Race","Philip J. Withers","Kody J. H. Law"],"pdf_url":"https://arxiv.org/pdf/2203.13718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.00067v3","updated":"2024-01-22T12:24:36Z","published":"2022-06-30T19:13:23Z","title":"Rethinking Unsupervised Domain Adaptation for Semantic Segmentation","summary":" Unsupervised domain adaptation (UDA) adapts a model trained on one domain\n(called source) to a novel domain (called target) using only unlabeled data.\nDue to its high annotation cost, researchers have developed many UDA methods\nfor semantic segmentation, which assume no labeled sample is available in the\ntarget domain. We question the practicality of this assumption for two reasons.\nFirst, after training a model with a UDA method, we must somehow verify the\nmodel before deployment. Second, UDA methods have at least a few\nhyper-parameters that need to be determined. The surest solution to these is to\nevaluate the model using validation data, i.e., a certain amount of labeled\ntarget-domain samples. This question about the basic assumption of UDA leads us\nto rethink UDA from a data-centric point of view. 
Specifically, we assume we\nhave access to a minimum level of labeled data. Then, we ask how much is\nnecessary to find good hyper-parameters of existing UDA methods. We then\nconsider what if we use the same data for supervised training of the same\nmodel, e.g., finetuning. We conducted experiments to answer these questions\nwith popular scenarios, {GTA5, SYNTHIA}$\\rightarrow$Cityscapes. We found that\ni) choosing good hyper-parameters needs only a few labeled images for some UDA\nmethods whereas a lot more for others; and ii) simple finetuning works\nsurprisingly well; it outperforms many UDA methods if only several dozens of\nlabeled images are available.\n","authors":["Zhijie Wang","Masanori Suganuma","Takayuki Okatani"],"pdf_url":"https://arxiv.org/pdf/2207.00067v3.pdf","comment":"Under review in Pattern Recognition Letters"},{"id":"http://arxiv.org/abs/2401.11877v1","updated":"2024-01-22T12:02:40Z","published":"2024-01-22T12:02:40Z","title":"Evaluating the Feasibility of Standard Facial Expression Recognition in\n Individuals with Moderate to Severe Intellectual Disabilities","summary":" Recent research has underscored the increasing preference of users for\nhuman-like interactions with machines. Consequently, facial expression\nrecognition has gained significance as a means of imparting social robots with\nthe capacity to discern the emotional states of users. In this investigation,\nwe assess the suitability of deep learning approaches, known for their\nremarkable performance in this domain, for recognizing facial expressions in\nindividuals with intellectual disabilities, which has not been yet studied in\nthe literature, to the best of our knowledge. To address this objective, we\ntrain a set of twelve distinct convolutional neural networks in different\napproaches, including an ensemble of datasets without individuals with\nintellectual disabilities and a dataset featuring such individuals. Our\nexamination of the outcomes achieved by the various models under distinct\ntraining conditions, coupled with a comprehensive analysis of critical facial\nregions during expression recognition facilitated by explainable artificial\nintelligence techniques, revealed significant distinctions in facial\nexpressions between individuals with and without intellectual disabilities, as\nwell as among individuals with intellectual disabilities. Remarkably, our\nfindings demonstrate the feasibility of facial expression recognition within\nthis population through tailored user-specific training methodologies, which\nenable the models to effectively address the unique expressions of each user.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis","Jose M. Buades-Rubio","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11874v1","updated":"2024-01-22T12:00:37Z","published":"2024-01-22T12:00:37Z","title":"Detect-Order-Construct: A Tree Construction based Approach for\n Hierarchical Document Structure Analysis","summary":" Document structure analysis (aka document layout analysis) is crucial for\nunderstanding the physical layout and logical structure of documents, with\napplications in information retrieval, document summarization, knowledge\nextraction, etc. In this paper, we concentrate on Hierarchical Document\nStructure Analysis (HDSA) to explore hierarchical relationships within\nstructured documents created using authoring software employing hierarchical\nschemas, such as LaTeX, Microsoft Word, and HTML. 
To comprehensively analyze\nhierarchical document structures, we propose a tree construction based approach\nthat addresses multiple subtasks concurrently, including page object detection\n(Detect), reading order prediction of identified objects (Order), and the\nconstruction of intended hierarchical structure (Construct). We present an\neffective end-to-end solution based on this framework to demonstrate its\nperformance. To assess our approach, we develop a comprehensive benchmark\ncalled Comp-HRDoc, which evaluates the above subtasks simultaneously. Our\nend-to-end system achieves state-of-the-art performance on two large-scale\ndocument layout analysis datasets (PubLayNet and DocLayNet), a high-quality\nhierarchical document structure reconstruction dataset (HRDoc), and our\nComp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate\nfurther research in this field.\n","authors":["Jiawei Wang","Kai Hu","Zhuoyao Zhong","Lei Sun","Qiang Huo"],"pdf_url":"https://arxiv.org/pdf/2401.11874v1.pdf","comment":"Submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2401.11859v1","updated":"2024-01-22T11:28:24Z","published":"2024-01-22T11:28:24Z","title":"LKFormer: Large Kernel Transformer for Infrared Image Super-Resolution","summary":" Given the broad application of infrared technology across diverse fields,\nthere is an increasing emphasis on investigating super-resolution techniques\nfor infrared images within the realm of deep learning. Despite the impressive\nresults of current Transformer-based methods in image super-resolution tasks,\ntheir reliance on the self-attentive mechanism intrinsic to the Transformer\narchitecture results in images being treated as one-dimensional sequences,\nthereby neglecting their inherent two-dimensional structure. Moreover, infrared\nimages exhibit a uniform pixel distribution and a limited gradient range,\nposing challenges for the model to capture effective feature information.\nConsequently, we suggest a potent Transformer model, termed Large Kernel\nTransformer (LKFormer), to address this issue. Specifically, we have designed a\nLarge Kernel Residual Depth-wise Convolutional Attention (LKRDA) module with\nlinear complexity. This mainly employs depth-wise convolution with large\nkernels to execute non-local feature modeling, thereby substituting the\nstandard self-attentive layer. Additionally, we have devised a novel\nfeed-forward network structure called Gated-Pixel Feed-Forward Network (GPFN)\nto augment the LKFormer's capacity to manage the information flow within the\nnetwork. Comprehensive experimental results reveal that our method surpasses\nthe most advanced techniques available, using fewer parameters and yielding\nconsiderably superior performance.\n","authors":["Feiwei Qin","Kang Yan","Changmiao Wang","Ruiquan Ge","Yong Peng","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11859v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11856v1","updated":"2024-01-22T11:25:59Z","published":"2024-01-22T11:25:59Z","title":"MOSformer: Momentum encoder-based inter-slice fusion transformer for\n medical image segmentation","summary":" Medical image segmentation takes an important position in various clinical\napplications. Deep learning has emerged as the predominant solution for\nautomated segmentation of volumetric medical images. 2.5D-based segmentation\nmodels bridge computational efficiency of 2D-based models and spatial\nperception capabilities of 3D-based models. 
However, prevailing 2.5D-based\nmodels often treat each slice equally, failing to effectively learn and exploit\ninter-slice information, resulting in suboptimal segmentation performances. In\nthis paper, a novel Momentum encoder-based inter-slice fusion transformer\n(MOSformer) is proposed to overcome this issue by leveraging inter-slice\ninformation at multi-scale feature maps extracted by different encoders.\nSpecifically, dual encoders are employed to enhance feature distinguishability\namong different slices. One of the encoders is moving-averaged to maintain the\nconsistency of slice representations. Moreover, an IF-Swin transformer module\nis developed to fuse inter-slice multi-scale features. The MOSformer is\nevaluated on three benchmark datasets (Synapse, ACDC, and AMOS), establishing a\nnew state-of-the-art with 85.63%, 92.19%, and 85.43% of DSC, respectively.\nThese promising results indicate its competitiveness in medical image\nsegmentation. Codes and models of MOSformer will be made publicly available\nupon acceptance.\n","authors":["De-Xing Huang","Xiao-Hu Zhou","Xiao-Liang Xie","Shi-Qi Liu","Zhen-Qiu Feng","Mei-Jiang Gui","Hao Li","Tian-Yu Xiang","Xiu-Ling Liu","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2401.11856v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11847v1","updated":"2024-01-22T11:04:55Z","published":"2024-01-22T11:04:55Z","title":"SignVTCL: Multi-Modal Continuous Sign Language Recognition Enhanced by\n Visual-Textual Contrastive Learning","summary":" Sign language recognition (SLR) plays a vital role in facilitating\ncommunication for the hearing-impaired community. SLR is a weakly supervised\ntask where entire videos are annotated with glosses, making it challenging to\nidentify the corresponding gloss within a video segment. Recent studies\nindicate that the main bottleneck in SLR is the insufficient training caused by\nthe limited availability of large-scale datasets. To address this challenge, we\npresent SignVTCL, a multi-modal continuous sign language recognition framework\nenhanced by visual-textual contrastive learning, which leverages the full\npotential of multi-modal data and the generalization ability of language model.\nSignVTCL integrates multi-modal data (video, keypoints, and optical flow)\nsimultaneously to train a unified visual backbone, thereby yielding more robust\nvisual representations. Furthermore, SignVTCL contains a visual-textual\nalignment approach incorporating gloss-level and sentence-level alignment to\nensure precise correspondence between visual features and glosses at the level\nof individual glosses and sentence. 
Experimental results conducted on three\ndatasets, Phoenix-2014, Phoenix-2014T, and CSL-Daily, demonstrate that SignVTCL\nachieves state-of-the-art results compared with previous methods.\n","authors":["Hao Chen","Jiaze Wang","Ziyu Guo","Jinpeng Li","Donghao Zhou","Bian Wu","Chenyong Guan","Guangyong Chen","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2401.11847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11844v1","updated":"2024-01-22T11:01:52Z","published":"2024-01-22T11:01:52Z","title":"Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field\n Crop Yield Prediction","summary":" Accurate crop yield prediction is of utmost importance for informed\ndecision-making in agriculture, aiding farmers, and industry stakeholders.\nHowever, this task is complex and depends on multiple factors, such as\nenvironmental conditions, soil properties, and management practices. Combining\nheterogeneous data views poses a fusion challenge, like identifying the\nview-specific contribution to the predictive task. We present a novel\nmulti-view learning approach to predict crop yield for different crops\n(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our\nmulti-view input data includes multi-spectral optical images from Sentinel-2\nsatellites and weather data as dynamic features during the crop growing season,\ncomplemented by static features like soil properties and topographic\ninformation. To effectively fuse the data, we introduce a Multi-view Gated\nFusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU)\nmodule. The view-encoders handle the heterogeneity of data sources with varying\ntemporal resolutions by learning a view-specific representation. These\nrepresentations are adaptively fused via a weighted sum. The fusion weights are\ncomputed for each sample by the GU using a concatenation of the\nview-representations. The MVGF model is trained at sub-field level with 10 m\nresolution pixels. Our evaluations show that the MVGF outperforms conventional\nmodels on the same task, achieving the best results by incorporating all the\ndata sources, unlike the usual fusion results in the literature. For Argentina,\nthe MVGF model achieves an R2 value of 0.68 at sub-field yield prediction,\nwhile at field level evaluation (comparing field averages), it reaches around\n0.80 across different countries. The GU module learned different weights based\non the country and crop-type, aligning with the variable significance of each\ndata source to the prediction task.\n","authors":["Francisco Mena","Deepak Pathak","Hiba Najjar","Cristhian Sanchez","Patrick Helber","Benjamin Bischke","Peter Habelitz","Miro Miranda","Jayanth Siddamsetty","Marlon Nuske","Marcela Charfuelan","Diego Arenas","Michaela Vollmer","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11835v1","updated":"2024-01-22T10:52:02Z","published":"2024-01-22T10:52:02Z","title":"Unveiling the Human-like Similarities of Automatic Facial Expression\n Recognition: An Empirical Exploration through Explainable AI","summary":" Facial expression recognition is vital for human behavior analysis, and deep\nlearning has enabled models that can outperform humans. However, it is unclear\nhow closely they mimic human processing. This study aims to explore the\nsimilarity between deep neural networks and human perception by comparing\ntwelve different networks, including both general object classifiers and\nFER-specific models. 
We employ an innovative global explainable AI method to\ngenerate heatmaps, revealing crucial facial regions for the twelve networks\ntrained on six facial expressions. We assess these results both quantitatively\nand qualitatively, comparing them to ground truth masks based on Friesen and\nEkman's description and among them. We use Intersection over Union (IoU) and\nnormalized correlation coefficients for comparisons. We generate 72 heatmaps to\nhighlight critical regions for each expression and architecture. Qualitatively,\nmodels with pre-trained weights show more similarity in heatmaps compared to\nthose without pre-training. Specifically, eye and nose areas influence certain\nfacial expressions, while the mouth is consistently important across all models\nand expressions. Quantitatively, we find low average IoU values (avg. 0.2702)\nacross all expressions and architectures. The best-performing architecture\naverages 0.3269, while the worst-performing one averages 0.2066. Dendrograms,\nbuilt with the normalized correlation coefficient, reveal two main clusters for\nmost expressions: models with pre-training and models without pre-training.\nFindings suggest limited alignment between human and AI facial expression\nrecognition, with network architectures influencing the similarity, as similar\narchitectures prioritize similar facial regions.\n","authors":["F. Xavier Gaya-Morey","Silvia Ramis-Guarinos","Cristina Manresa-Yee","Jose M. Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11835v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11831v1","updated":"2024-01-22T10:42:51Z","published":"2024-01-22T10:42:51Z","title":"A Fair Evaluation of Various Deep Learning-Based Document Image\n Binarization Approaches","summary":" Binarization of document images is an important pre-processing step in the\nfield of document analysis. Traditional image binarization techniques usually\nrely on histograms or local statistics to identify a valid threshold to\ndifferentiate between different aspects of the image. Deep learning techniques\nare able to generate binarized versions of the images by learning\ncontext-dependent features that are less error-prone to degradation typically\noccurring in document images. In recent years, many deep learning-based methods\nhave been developed for document binarization. But which one to choose? There\nhave been no studies that compare these methods rigorously. Therefore, this\nwork focuses on the evaluation of different deep learning-based methods under\nthe same evaluation protocol. We evaluate them on different Document Image\nBinarization Contest (DIBCO) datasets and obtain very heterogeneous results. We\nshow that the DE-GAN model was able to perform better compared to other models\nwhen evaluated on the DIBCO2013 dataset while DP-LinkNet performed best on the\nDIBCO2017 dataset. The 2-StageGAN performed best on the DIBCO2018 dataset while\nSauvolaNet outperformed the others on the DIBCO2019 challenge. 
Finally, we make\nthe code, all models and evaluation publicly available\n(https://github.com/RichSu95/Document_Binarization_Collection) to ensure\nreproducibility and simplify future binarization evaluations.\n","authors":["Richin Sukesh","Mathias Seuret","Anguelos Nicolaou","Martin Mayr","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2401.11831v1.pdf","comment":"DAS 2022"},{"id":"http://arxiv.org/abs/2401.11824v1","updated":"2024-01-22T10:37:59Z","published":"2024-01-22T10:37:59Z","title":"Rethinking Centered Kernel Alignment in Knowledge Distillation","summary":" Knowledge distillation has emerged as a highly effective method for bridging\nthe representation discrepancy between large-scale models and lightweight\nmodels. Prevalent approaches involve leveraging appropriate metrics to minimize\nthe divergence or distance between the knowledge extracted from the teacher\nmodel and the knowledge learned by the student model. Centered Kernel Alignment\n(CKA) is widely used to measure representation similarity and has been applied\nin several knowledge distillation methods. However, these methods are complex\nand fail to uncover the essence of CKA, thus not answering the question of how\nto use CKA to achieve simple and effective distillation properly. This paper\nfirst provides a theoretical perspective to illustrate the effectiveness of\nCKA, which decouples CKA into the upper bound of Maximum Mean Discrepancy~(MMD)\nand a constant term. Drawing from this, we propose a novel Relation-Centered\nKernel Alignment~(RCKA) framework, which practically establishes a connection\nbetween CKA and MMD. Furthermore, we dynamically customize the application of\nCKA based on the characteristics of each task, with lower computational cost\nyet comparable performance to the previous methods. The extensive experiments\non the CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves\nstate-of-the-art performance on almost all teacher-student pairs for image\nclassification and object detection, validating the effectiveness of our\napproaches.\n","authors":["Zikai Zhou","Yunhang Shen","Shitong Shao","Huanran Chen","Linrui Gong","Shaohui Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11824v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11814v1","updated":"2024-01-22T10:22:14Z","published":"2024-01-22T10:22:14Z","title":"Symbrain: A large-scale dataset of MRI images for neonatal brain\n symmetry analysis","summary":" This paper presents an annotated dataset of brain MRI images designed to\nadvance the field of brain symmetry study. Magnetic resonance imaging (MRI) has\ngained interest in analyzing brain symmetry in neonatal infants, and challenges\nremain due to the vast size differences between fetal and adult brains.\nClassification methods for brain structural MRI use scales and visual cues to\nassess hemisphere symmetry, which can help diagnose neonatal patients by\ncomparing hemispheres and anatomical regions of interest in the brain. Using\nthe Developing Human Connectome Project dataset, this work presents a dataset\ncomprising cerebral images extracted as slices across selected portions of\ninterest for clinical evaluation. All the extracted images are annotated with\nthe brain's midline. 
From the assumption that a decrease in symmetry is directly related to\npossible clinical pathologies, the dataset can contribute to a more precise\ndiagnosis because it can be used to train deep learning model application in\nneonatal cerebral MRI anomaly detection from postnatal infant scans thanks to\ncomputer vision. Such models learn to identify and classify anomalies by\nidentifying potential asymmetrical patterns in medical MRI images. Furthermore,\nthis dataset can contribute to the research and development of methods using\nthe relative symmetry of the two brain hemispheres for crucial diagnosis and\ntreatment planning.\n","authors":["Arnaud Gucciardi","Safouane El Ghazouali","Francesca Venturini","Vida Groznik","Umberto Michelucci"],"pdf_url":"https://arxiv.org/pdf/2401.11814v1.pdf","comment":"7 pages, 2 figures, Dataset Paper, Medical AI"},{"id":"http://arxiv.org/abs/2401.02436v2","updated":"2024-01-22T10:08:28Z","published":"2023-11-17T14:40:43Z","title":"Compressed 3D Gaussian Splatting for Accelerated Novel View Synthesis","summary":" Recently, high-fidelity scene reconstruction with an optimized 3D Gaussian\nsplat representation has been introduced for novel view synthesis from sparse\nimage sets. Making such representations suitable for applications like network\nstreaming and rendering on low-power devices requires significantly reduced\nmemory consumption as well as improved rendering efficiency. We propose a\ncompressed 3D Gaussian splat representation that utilizes sensitivity-aware\nvector clustering with quantization-aware training to compress directional\ncolors and Gaussian parameters. The learned codebooks have low bitrates and\nachieve a compression rate of up to $31\\times$ on real-world scenes with only\nminimal degradation of visual quality. We demonstrate that the compressed splat\nrepresentation can be efficiently rendered with hardware rasterization on\nlightweight GPUs at up to $4\\times$ higher framerates than reported via an\noptimized GPU compute pipeline. Extensive experiments across multiple datasets\ndemonstrate the robustness and rendering speed of the proposed approach.\n","authors":["Simon Niedermayr","Josef Stumpfegger","Rüdiger Westermann"],"pdf_url":"https://arxiv.org/pdf/2401.02436v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11796v1","updated":"2024-01-22T09:53:20Z","published":"2024-01-22T09:53:20Z","title":"Local Agnostic Video Explanations: a Study on the Applicability of\n Removal-Based Explanations to Video","summary":" Explainable artificial intelligence techniques are becoming increasingly\nimportant with the rise of deep learning applications in various domains. These\ntechniques aim to provide a better understanding of complex \"black box\" models\nand enhance user trust while maintaining high learning performance. While many\nstudies have focused on explaining deep learning models in computer vision for\nimage input, video explanations remain relatively unexplored due to the\ntemporal dimension's complexity. In this paper, we present a unified framework\nfor local agnostic explanations in the video domain. Our contributions include:\n(1) Extending a fine-grained explanation framework tailored for computer vision\ndata, (2) Adapting six existing explanation techniques to work on video data by\nincorporating temporal information and enabling local explanations, and (3)\nConducting an evaluation and comparison of the adapted explanation methods\nusing different models and datasets. 
We discuss the possibilities and choices\ninvolved in the removal-based explanation process for visual data. The\nadaptation of six explanation methods for video is explained, with comparisons\nto existing approaches. We evaluate the performance of the methods using\nautomated metrics and user-based evaluation, showing that 3D RISE, 3D LIME, and\n3D Kernel SHAP outperform other methods. By decomposing the explanation process\ninto manageable steps, we facilitate the study of each choice's impact and\nallow for further refinement of explanation methods to suit specific datasets\nand models.\n","authors":["F. Xavier Gaya-Morey","Jose M. Buades-Rubio","Cristina Manresa-Yee"],"pdf_url":"https://arxiv.org/pdf/2401.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12817v2","updated":"2024-01-22T09:44:18Z","published":"2023-10-19T15:12:44Z","title":"2D-3D Interlaced Transformer for Point Cloud Segmentation with\n Scene-Level Supervision","summary":" We present a Multimodal Interlaced Transformer (MIT) that jointly considers\n2D and 3D data for weakly supervised point cloud segmentation. Research studies\nhave shown that 2D and 3D features are complementary for point cloud\nsegmentation. However, existing methods require extra 2D annotations to achieve\n2D-3D information fusion. Considering the high annotation cost of point clouds,\neffective 2D and 3D feature fusion based on weakly supervised learning is in\ngreat demand. To this end, we propose a transformer model with two encoders and\none decoder for weakly supervised point cloud segmentation using only\nscene-level class tags. Specifically, the two encoders compute the\nself-attended features for 3D point clouds and 2D multi-view images,\nrespectively. The decoder implements interlaced 2D-3D cross-attention and\ncarries out implicit 2D and 3D feature fusion. We alternately switch the roles\nof queries and key-value pairs in the decoder layers. It turns out that the 2D\nand 3D features are iteratively enriched by each other. Experiments show that\nit performs favorably against existing weakly supervised point cloud\nsegmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The\nproject page will be available at https://jimmy15923.github.io/mit_web/.\n","authors":["Cheng-Kun Yang","Min-Hung Chen","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2310.12817v2.pdf","comment":"ICCV 2023 (main + supp). Website:\n https://jimmy15923.github.io/mit_web/"},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. 
More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11790v1","updated":"2024-01-22T09:40:52Z","published":"2024-01-22T09:40:52Z","title":"Deep Learning for Computer Vision based Activity Recognition and Fall\n Detection of the Elderly: a Systematic Review","summary":" As the percentage of elderly people in developed countries increases\nworldwide, the healthcare of this collective is a worrying matter, especially\nif it includes the preservation of their autonomy. In this direction, many\nstudies are being published on Ambient Assisted Living (AAL) systems, which\nhelp to reduce the preoccupations raised by the independent living of the\nelderly. In this study, a systematic review of the literature is presented on\nfall detection and Human Activity Recognition (HAR) for the elderly, as the two\nmain tasks to solve to guarantee the safety of elderly people living alone. To\naddress the current tendency to perform these two tasks, the review focuses on\nthe use of Deep Learning (DL) based approaches on computer vision data. In\naddition, different collections of data like DL models, datasets or hardware\n(e.g. depth or thermal cameras) are gathered from the reviewed studies and\nprovided for reference in future studies. Strengths and weaknesses of existing\napproaches are also discussed and, based on them, our recommendations for\nfuture works are provided.\n","authors":["F. Xavier Gaya-Morey","Cristina Manresa-Yee","Jose M. Buades-Rubio"],"pdf_url":"https://arxiv.org/pdf/2401.11790v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11783v1","updated":"2024-01-22T09:29:42Z","published":"2024-01-22T09:29:42Z","title":"Full-Body Motion Reconstruction with Sparse Sensing from Graph\n Perspective","summary":" Estimating 3D full-body pose from sparse sensor data is a pivotal technique\nemployed for the reconstruction of realistic human motions in Augmented Reality\nand Virtual Reality. However, translating sparse sensor signals into\ncomprehensive human motion remains a challenge since the sparsely distributed\nsensors in common VR systems fail to capture the motion of full human body. In\nthis paper, we use well-designed Body Pose Graph (BPG) to represent the human\nbody and translate the challenge into a prediction problem of graph missing\nnodes. Then, we propose a novel full-body motion reconstruction framework based\non BPG. To establish BPG, nodes are initially endowed with features extracted\nfrom sparse sensor signals. Features from identifiable joint nodes across\ndiverse sensors are amalgamated and processed from both temporal and spatial\nperspectives. 
Temporal dynamics are captured using the Temporal Pyramid\nStructure, while spatial relations in joint movements inform the spatial\nattributes. The resultant features serve as the foundational elements of the\nBPG nodes. To further refine the BPG, node features are updated through a graph\nneural network that incorporates edge reflecting varying joint relations. Our\nmethod's effectiveness is evidenced by the attained state-of-the-art\nperformance, particularly in lower body motion, outperforming other baseline\nmethods. Additionally, an ablation study validates the efficacy of each module\nin our proposed framework.\n","authors":["Feiyu Yao","Zongkai Wu","Li Yi"],"pdf_url":"https://arxiv.org/pdf/2401.11783v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11775v1","updated":"2024-01-22T09:11:12Z","published":"2024-01-22T09:11:12Z","title":"Collaborative Position Reasoning Network for Referring Image\n Segmentation","summary":" Given an image and a natural language expression as input, the goal of\nreferring image segmentation is to segment the foreground masks of the entities\nreferred by the expression. Existing methods mainly focus on interactive\nlearning between vision and language to enhance the multi-modal representations\nfor global context reasoning. However, predicting directly in pixel-level space\ncan lead to collapsed positioning and poor segmentation results. Its main\nchallenge lies in how to explicitly model entity localization, especially for\nnon-salient entities. In this paper, we tackle this problem by executing a\nCollaborative Position Reasoning Network (CPRN) via the proposed novel\nRow-and-Column interactive (RoCo) and Guided Holistic interactive (Holi)\nmodules. Specifically, RoCo aggregates the visual features into the row- and\ncolumn-wise features corresponding two directional axes respectively. It offers\na fine-grained matching behavior that perceives the associations between the\nlinguistic features and two decoupled visual features to perform position\nreasoning over a hierarchical space. Holi integrates features of the two\nmodalities by a cross-modal attention mechanism, which suppresses the\nirrelevant redundancy under the guide of positioning information from RoCo.\nThus, with the incorporation of RoCo and Holi modules, CPRN captures the visual\ndetails of position reasoning so that the model can achieve more accurate\nsegmentation. To our knowledge, this is the first work that explicitly focuses\non position reasoning modeling. We also validate the proposed method on three\nevaluation datasets. It consistently outperforms existing state-of-the-art\nmethods.\n","authors":["Jianjian Cao","Beiya Dai","Yulin Li","Xiameng Qin","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11775v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17004v2","updated":"2024-01-22T09:10:04Z","published":"2023-12-28T13:16:03Z","title":"Continual Learning in Medical Image Analysis: A Comprehensive Review of\n Recent Advancements and Future Prospects","summary":" Medical imaging analysis has witnessed remarkable advancements even\nsurpassing human-level performance in recent years, driven by the rapid\ndevelopment of advanced deep-learning algorithms. However, when the inference\ndataset slightly differs from what the model has seen during one-time training,\nthe model performance is greatly compromised. 
The situation requires restarting\nthe training process using both the old and the new data which is\ncomputationally costly, does not align with the human learning process, and\nimposes storage constraints and privacy concerns. Alternatively, continual\nlearning has emerged as a crucial approach for developing unified and\nsustainable deep models to deal with new classes, tasks, and the drifting\nnature of data in non-stationary environments for various application areas.\nContinual learning techniques enable models to adapt and accumulate knowledge\nover time, which is essential for maintaining performance on evolving datasets\nand novel tasks. This systematic review paper provides a comprehensive overview\nof the state-of-the-art in continual learning techniques applied to medical\nimaging analysis. We present an extensive survey of existing research, covering\ntopics including catastrophic forgetting, data drifts, stability, and\nplasticity requirements. Further, an in-depth discussion of key components of a\ncontinual learning framework such as continual learning scenarios, techniques,\nevaluation schemes, and metrics is provided. Continual learning techniques\nencompass various categories, including rehearsal, regularization,\narchitectural, and hybrid strategies. We assess the popularity and\napplicability of continual learning categories in various medical sub-fields\nlike radiology and histopathology...\n","authors":["Pratibha Kumari","Joohi Chauhan","Afshin Bozorgpour","Boqiang Huang","Reza Azad","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2312.17004v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11767v1","updated":"2024-01-22T09:02:52Z","published":"2024-01-22T09:02:52Z","title":"Concealed Object Segmentation with Hierarchical Coherence Modeling","summary":" Concealed object segmentation (COS) is a challenging task that involves\nlocalizing and segmenting those concealed objects that are visually blended\nwith their surrounding environments. Despite achieving remarkable success,\nexisting COS segmenters still struggle to achieve complete segmentation results\nin extremely concealed scenarios. In this paper, we propose a Hierarchical\nCoherence Modeling (HCM) segmenter for COS, aiming to address this incomplete\nsegmentation limitation. In specific, HCM promotes feature coherence by\nleveraging the intra-stage coherence and cross-stage coherence modules,\nexploring feature correlations at both the single-stage and contextual levels.\nAdditionally, we introduce the reversible re-calibration decoder to detect\npreviously undetected parts in low-confidence regions, resulting in further\nenhancing segmentation performance. Extensive experiments conducted on three\nCOS tasks, including camouflaged object detection, polyp image segmentation,\nand transparent object detection, demonstrate the promising results achieved by\nthe proposed HCM segmenter.\n","authors":["Fengyang Xiao","Pan Zhang","Chunming He","Runze Hu","Yutao Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11767v1.pdf","comment":"Accepted to CICAI 2023. 13 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11751v1","updated":"2024-01-22T08:23:52Z","published":"2024-01-22T08:23:52Z","title":"Boosting Multi-view Stereo with Late Cost Aggregation","summary":" Pairwise matching cost aggregation is a crucial step for modern\nlearning-based Multi-view Stereo (MVS). Prior works adopt an early aggregation\nscheme, which adds up pairwise costs into an intermediate cost. 
However, we\nfind that this process can degrade informative pairwise matchings, thereby\nblocking the depth network from fully utilizing the original geometric matching\ncues. To address this challenge, we present a late aggregation approach that\nallows for aggregating pairwise costs throughout the network feed-forward\nprocess, achieving accurate estimations with only minor changes to the plain\nCasMVSNet. Instead of building an intermediate cost by weighted sum, late\naggregation preserves all pairwise costs along a distinct view channel. This\nenables the succeeding depth network to fully utilize the crucial geometric\ncues without loss of cost fidelity. Grounded in the new aggregation scheme, we\npropose further techniques addressing view order dependence inside the\npreserved cost, handling flexible testing views, and improving the depth\nfiltering process. Despite its technical simplicity, our method improves\nsignificantly upon the baseline cascade-based approach, achieving comparable\nresults with state-of-the-art methods with favorable computation overhead.\n","authors":["Jiang Wu","Rui Li","Yu Zhu","Wenxun Zhao","Jinqiu Sun","Yanning Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11751v1.pdf","comment":"Code and models are available at https://github.com/Wuuu3511/LAMVSNET"},{"id":"http://arxiv.org/abs/2401.11740v1","updated":"2024-01-22T07:37:25Z","published":"2024-01-22T07:37:25Z","title":"Multi-level Cross-modal Alignment for Image Clustering","summary":" Recently, the cross-modal pretraining model has been employed to produce\nmeaningful pseudo-labels to supervise the training of an image clustering\nmodel. However, numerous erroneous alignments in a cross-modal pre-training\nmodel could produce poor-quality pseudo-labels and degrade clustering\nperformance. To solve the aforementioned issue, we propose a novel\n\\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in\na cross-modal pretraining model for downstream tasks, by building a smaller but\nbetter semantic space and aligning the images and texts in three levels, i.e.,\ninstance-level, prototype-level, and semantic-level. Theoretical results show\nthat our proposed method converges, and suggest effective means to reduce the\nexpected clustering risk of our method. Experimental results on five benchmark\ndatasets clearly show the superiority of our new method.\n","authors":["Liping Qiu","Qin Zhang","Xiaojun Chen","Shaotian Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11739v1","updated":"2024-01-22T07:34:06Z","published":"2024-01-22T07:34:06Z","title":"EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models","summary":" Diffusion models have recently received increasing research attention for\ntheir remarkable transfer abilities in semantic segmentation tasks. However,\ngenerating fine-grained segmentation masks with diffusion models often requires\nadditional training on annotated datasets, leaving it unclear to what extent\npre-trained diffusion models alone understand the semantic relations of their\ngenerated images. To address this question, we leverage the semantic knowledge\nextracted from Stable Diffusion (SD) and aim to develop an image segmentor\ncapable of generating fine-grained segmentation maps without any additional\ntraining. 
The primary difficulty stems from the fact that semantically\nmeaningful feature maps typically exist only in the spatially lower-dimensional\nlayers, which poses a challenge in directly extracting pixel-level semantic\nrelations from these feature maps. To overcome this issue, our framework\nidentifies semantic correspondences between image pixels and spatial locations\nof low-dimensional feature maps by exploiting SD's generation process and\nutilizes them for constructing image-resolution segmentation maps. In extensive\nexperiments, the produced segmentation maps are demonstrated to be well\ndelineated and capture detailed parts of the images, indicating the existence\nof highly accurate pixel-level semantic knowledge in diffusion models.\n","authors":["Koichi Namekata","Amirmojtaba Sabour","Sanja Fidler","Seung Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11739v1.pdf","comment":"ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/"},{"id":"http://arxiv.org/abs/2401.11738v1","updated":"2024-01-22T07:31:52Z","published":"2024-01-22T07:31:52Z","title":"MetaSeg: Content-Aware Meta-Net for Omni-Supervised Semantic\n Segmentation","summary":" Noisy labels, inevitably existing in pseudo segmentation labels generated\nfrom weak object-level annotations, severely hamper model optimization for\nsemantic segmentation. Previous works often rely on massive hand-crafted losses\nand carefully-tuned hyper-parameters to resist noise, suffering poor\ngeneralization capability and high model complexity. Inspired by recent\nadvances in meta learning, we argue that rather than struggling to tolerate\nnoise hidden behind clean labels passively, a more feasible solution would be\nto find out the noisy regions actively, so as to simply ignore them during\nmodel optimization. With this in mind, this work presents a novel meta learning\nbased semantic segmentation method, MetaSeg, that comprises a primary\ncontent-aware meta-net (CAM-Net) to serve as a noise indicator for an arbitrary\nsegmentation model counterpart. Specifically, CAM-Net learns to generate\npixel-wise weights to suppress noisy regions with incorrect pseudo labels while\nhighlighting clean ones by exploiting hybrid strengthened features from image\ncontent, providing straightforward and reliable guidance for optimizing the\nsegmentation model. Moreover, to break the barrier of time-consuming training\nwhen applying meta learning to common large segmentation models, we further\npresent a new decoupled training strategy that optimizes different model layers\nin a divide-and-conquer manner. Extensive experiments on object, medical,\nremote sensing and human segmentation show that our method achieves superior\nperformance, approaching that of fully supervised settings, which paves a new\npromising way for omni-supervised semantic segmentation.\n","authors":["Shenwang Jiang","Jianan Li","Ying Wang","Wenxuan Wu","Jizhou Zhang","Bo Huang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11738v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.16244v2","updated":"2024-01-22T07:29:09Z","published":"2023-12-25T11:39:00Z","title":"Modality-missing RGBT Tracking via Invertible Prompt Learning and A\n High-quality Data Simulation Method","summary":" Current RGBT tracking research mainly focuses on the modality-complete\nscenarios, overlooking the modality-missing challenge in real-world scenes. 
In\nthis work, we comprehensively investigate the impact of modality-missing\nchallenge in RGBT tracking and propose a novel invertible prompt learning\napproach, which integrates the content-preserving prompts into a well-trained\ntracking model to adapt to various modality-missing scenarios, for\nmodality-missing RGBT tracking. In particular, given one modality-missing\nscenario, we propose to utilize the available modality to generate the prompt\nof the missing modality to adapt to RGBT tracking model. However, the\ncross-modality gap between available and missing modalities usually causes\nsemantic distortion and information loss in prompt generation. To handle this\nissue, we propose the invertible prompt learning scheme by incorporating the\nfull reconstruction of the input available modality from the prompt in prompt\ngeneration model. Considering that there lacks a modality-missing RGBT tracking\ndataset and many modality-missing scenarios are difficult to capture, we design\na high-quality data simulation method based on hierarchical combination schemes\nto generate real-world modality-missing data. Extensive experiments on three\nmodality-missing datasets show that our method achieves significant performance\nimprovements compared with state-of-the-art methods. We will release the code\nand simulation dataset.\n","authors":["Andong Lu","Jiacong Zhao","Chenglong Li","Jin Tang","Bin Luo"],"pdf_url":"https://arxiv.org/pdf/2312.16244v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14571v4","updated":"2024-01-22T07:24:58Z","published":"2022-10-26T09:01:19Z","title":"Towards the Detection of Diffusion Model Deepfakes","summary":" In the course of the past few years, diffusion models (DMs) have reached an\nunprecedented level of visual quality. However, relatively little attention has\nbeen paid to the detection of DM-generated images, which is critical to prevent\nadverse impacts on our society. In contrast, generative adversarial networks\n(GANs), have been extensively studied from a forensic perspective. In this\nwork, we therefore take the natural next step to evaluate whether previous\nmethods can be used to detect images generated by DMs. Our experiments yield\ntwo key findings: (1) state-of-the-art GAN detectors are unable to reliably\ndistinguish real from DM-generated images, but (2) re-training them on\nDM-generated images allows for almost perfect detection, which remarkably even\ngeneralizes to GANs. Together with a feature space analysis, our results lead\nto the hypothesis that DMs produce fewer detectable artifacts and are thus more\ndifficult to detect compared to GANs. One possible reason for this is the\nabsence of grid-like frequency artifacts in DM-generated images, which are a\nknown weakness of GANs. However, we make the interesting observation that\ndiffusion models tend to underestimate high frequencies, which we attribute to\nthe learning objective.\n","authors":["Jonas Ricker","Simon Damm","Thorsten Holz","Asja Fischer"],"pdf_url":"https://arxiv.org/pdf/2210.14571v4.pdf","comment":"Accepted at VISAPP 2024. This is the extended version with additional\n experiments and supplemental material. 
Code and data:\n https://github.com/jonasricker/diffusion-model-deepfake-detection"},{"id":"http://arxiv.org/abs/2401.11734v1","updated":"2024-01-22T07:23:44Z","published":"2024-01-22T07:23:44Z","title":"Colorectal Polyp Segmentation in the Deep Learning Era: A Comprehensive\n Survey","summary":" Colorectal polyp segmentation (CPS), an essential problem in medical image\nanalysis, has garnered growing research attention. Recently, deep\nlearning-based models have completely overwhelmed traditional methods in the field of\nCPS, and more and more deep CPS methods have emerged, bringing CPS into the\ndeep learning era. To help the researchers quickly grasp the main techniques,\ndatasets, evaluation metrics, challenges, and trends of deep CPS, this paper\npresents a systematic and comprehensive review of deep-learning-based CPS\nmethods from 2014 to 2023, a total of 115 technical papers. In particular, we\nfirst provide a comprehensive review of the current deep CPS with a novel\ntaxonomy, including network architectures, level of supervision, and learning\nparadigm. More specifically, network architectures include eight subcategories,\nthe level of supervision comprises six subcategories, and the learning paradigm\nencompasses 12 subcategories, totaling 26 subcategories. Then, we provide a\ncomprehensive analysis of the characteristics of each dataset, including the\nnumber of datasets, annotation types, image resolution, polyp size, contrast\nvalues, and polyp location. Following that, we summarize CPS's commonly used\nevaluation metrics and conduct a detailed analysis of 40 deep SOTA models,\nincluding out-of-distribution generalization and attribute-based performance\nanalysis. Finally, we discuss deep learning-based CPS methods' main\nchallenges and opportunities.\n","authors":["Zhenyu Wu","Fengmao Lv","Chenglizhao Chen","Aimin Hao","Shuo Li"],"pdf_url":"https://arxiv.org/pdf/2401.11734v1.pdf","comment":"21 pages, 8 figures"},{"id":"http://arxiv.org/abs/2309.02773v3","updated":"2024-01-22T07:18:55Z","published":"2023-09-06T06:31:08Z","title":"Diffusion Model is Secretly a Training-free Open Vocabulary Semantic\n Segmenter","summary":" The pre-trained text-image discriminative models, such as CLIP, have been\nexplored for open-vocabulary semantic segmentation with unsatisfactory results\ndue to the loss of crucial localization information and awareness of object\nshapes. Recently, there has been a growing interest in expanding the\napplication of generative models from generation tasks to semantic\nsegmentation. These approaches utilize generative models either for generating\nannotated data or extracting features to facilitate semantic segmentation. This\ntypically involves generating a considerable amount of synthetic data or\nrequiring additional mask annotations. To this end, we uncover the potential of\ngenerative text-to-image diffusion models (e.g., Stable Diffusion) as highly\nefficient open-vocabulary semantic segmenters, and introduce a novel\ntraining-free approach named DiffSegmenter. The insight is that to generate\nrealistic objects that are semantically faithful to the input text, both the\ncomplete object shapes and the corresponding semantics are implicitly learned\nby diffusion models. 
We discover that the object shapes are characterized by\nthe self-attention maps while the semantics are indicated through the\ncross-attention maps produced by the denoising U-Net, forming the basis of our\nsegmentation results. Additionally, we carefully design effective textual\nprompts and a category filtering mechanism to further enhance the segmentation\nresults. Extensive experiments on three benchmark datasets show that the\nproposed DiffSegmenter achieves impressive results for open-vocabulary semantic\nsegmentation.\n","authors":["Jinglong Wang","Xiawei Li","Jing Zhang","Qingyuan Xu","Qin Zhou","Qian Yu","Lu Sheng","Dong Xu"],"pdf_url":"https://arxiv.org/pdf/2309.02773v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11726v1","updated":"2024-01-22T07:07:32Z","published":"2024-01-22T07:07:32Z","title":"Detecting Out-of-Distribution Samples via Conditional Distribution\n Entropy with Optimal Transport","summary":" When deploying a trained machine learning model in the real world, it is\ninevitable to receive inputs from out-of-distribution (OOD) sources. For\ninstance, in continual learning settings, it is common to encounter OOD samples\ndue to the non-stationarity of a domain. More generally, when we have access to\na set of test inputs, the existing rich line of OOD detection solutions,\nespecially the recent promise of distance-based methods, falls short in\neffectively utilizing the distribution information from training samples and\ntest inputs. In this paper, we argue that empirical probability distributions\nthat incorporate geometric information from both training samples and test\ninputs can be highly beneficial for OOD detection when test\ninputs are available. To address this, we propose to model OOD detection as a\ndiscrete optimal transport problem. Within the framework of optimal transport,\nwe propose a novel score function known as the \\emph{conditional distribution\nentropy} to quantify the uncertainty of a test input being an OOD sample. Our\nproposal inherits the merits of certain distance-based methods while\neliminating the reliance on distribution assumptions, a priori knowledge, and\nspecific training mechanisms. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method outperforms its competitors in OOD\ndetection.\n","authors":["Chuanwen Feng","Wenlong Chen","Ao Ke","Yilong Ren","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11724v1","updated":"2024-01-22T06:56:52Z","published":"2024-01-22T06:56:52Z","title":"Augmenting Prototype Network with TransMix for Few-shot Hyperspectral\n Image Classification","summary":" Few-shot hyperspectral image classification aims to identify the classes of\neach pixel in the images by marking only a few of these pixels. In order to\nobtain the spatial-spectral joint features of each pixel, the fixed-size\npatches centering around each pixel are often used for classification. However,\nobserving the classification results of existing methods, we found that\nboundary patches corresponding to the pixels which are located at the boundary\nof the objects in the hyperspectral images, are hard to classify. These\nboundary patches are mixed with multi-class spectral information. Inspired by\nthis, we propose to augment the prototype network with TransMix for few-shot\nhyperspectral image classification (APNT). 
While taking the prototype network\nas the backbone, it adopts the transformer as feature extractor to learn the\npixel-to-pixel relation and pays different attention to different pixels. At\nthe same time, instead of directly using the patches which are cut from the\nhyperspectral images for training, it randomly mixes up two patches to imitate\nthe boundary patches and uses the synthetic patches to train the model, with\nthe aim of enlarging the number of hard training samples and enhancing their\ndiversity. By following the data augmentation technique TransMix, the\nattention returned by the transformer is also used to mix up the labels of two\npatches to generate better labels for synthetic patches. Compared with existing\nmethods, the proposed method has demonstrated state-of-the-art performance and\nbetter robustness for few-shot hyperspectral image classification in our\nexperiments.\n","authors":["Chun Liu","Longwei Yang","Dongmei Dong","Zheng Li","Wei Yang","Zhigang Han","Jiayao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17240v3","updated":"2024-01-22T06:53:23Z","published":"2023-12-28T18:58:33Z","title":"LISA++: An Improved Baseline for Reasoning Segmentation with Large\n Language Model","summary":" While LISA effectively bridges the gap between segmentation and large\nlanguage models to enable reasoning segmentation, it poses certain limitations:\nunable to distinguish different instances of the target region, and constrained\nby the pre-defined textual response formats. In this work, we introduce LISA++,\nan update to the existing LISA model, focusing on improving core\nfunctionalities while keeping the base architecture intact. The main\nenhancements in LISA++ include: \\textbf{1) Enhanced Segmentation}: The instance\nsegmentation ability has been added, providing a more detailed scene analysis\nalong with the existing multi-region semantic segmentation. \\textbf{2) More\nNatural Conversation}: Improved capability for multi-turn dialogue, with the\nability to incorporate segmentation results directly into text responses, i.e.,\nSegmentation in Dialogue (SiD). These improvements are achieved by curating the\nexisting samples of generic segmentation datasets, aimed specifically at\nenhancing the segmentation and conversational skills without structural change\nand additional data sources. Comparative analysis with the original LISA model\nshows significant advancements in these areas, positioning LISA++ as a notable\nupgrade in visual understanding and interaction. LISA++'s adaptability and\nimproved features highlight the versatility of the mask-as-embedding paradigm\nproposed by LISA, and the potential as a foundational model for diverse\napplications.\n","authors":["Senqiao Yang","Tianyuan Qu","Xin Lai","Zhuotao Tian","Bohao Peng","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17240v3.pdf","comment":"Typo fixed"},{"id":"http://arxiv.org/abs/2211.08824v4","updated":"2024-01-22T06:46:27Z","published":"2022-11-16T10:49:48Z","title":"SMILEtrack: SiMIlarity LEarning for Occlusion-Aware Multiple Object\n Tracking","summary":" Despite recent progress in Multiple Object Tracking (MOT), several obstacles\nsuch as occlusions, similar objects, and complex scenes remain an open\nchallenge. Meanwhile, a systematic study of the cost-performance tradeoff for\nthe popular tracking-by-detection paradigm is still lacking. 
This paper\nintroduces SMILEtrack, an innovative object tracker that effectively addresses\nthese challenges by integrating an efficient object detector with a Siamese\nnetwork-based Similarity Learning Module (SLM). The technical contributions of\nSMILETrack are twofold. First, we propose an SLM that calculates the appearance\nsimilarity between two objects, overcoming the limitations of feature\ndescriptors in Separate Detection and Embedding (SDE) models. The SLM\nincorporates a Patch Self-Attention (PSA) block inspired by the vision\nTransformer, which generates reliable features for accurate similarity\nmatching. Second, we develop a Similarity Matching Cascade (SMC) module with a\nnovel GATE function for robust object matching across consecutive video frames,\nfurther enhancing MOT performance. Together, these innovations help SMILETrack\nachieve an improved trade-off between the cost ({\\em e.g.}, running speed) and\nperformance (e.g., tracking accuracy) over several existing state-of-the-art\nbenchmarks, including the popular BYTETrack method. SMILETrack outperforms\nBYTETrack by 0.4-0.8 MOTA and 2.1-2.2 HOTA points on MOT17 and MOT20 datasets.\nCode is available at https://github.com/pingyang1117/SMILEtrack_Official\n","authors":["Yu-Hsiang Wang","Jun-Wei Hsieh","Ping-Yang Chen","Ming-Ching Chang","Hung Hin So","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2211.08824v4.pdf","comment":"Our paper was accepted by AAAI2024"},{"id":"http://arxiv.org/abs/2401.11719v1","updated":"2024-01-22T06:43:13Z","published":"2024-01-22T06:43:13Z","title":"SFC: Shared Feature Calibration in Weakly Supervised Semantic\n Segmentation","summary":" Image-level weakly supervised semantic segmentation has received increasing\nattention due to its low annotation cost. Existing methods mainly rely on Class\nActivation Mapping (CAM) to obtain pseudo-labels for training semantic\nsegmentation models. In this work, we are the first to demonstrate that\nlong-tailed distribution in training data can cause the CAM calculated through\nclassifier weights over-activated for head classes and under-activated for tail\nclasses due to the shared features among head- and tail- classes. This degrades\npseudo-label quality and further influences final semantic segmentation\nperformance. To address this issue, we propose a Shared Feature Calibration\n(SFC) method for CAM generation. Specifically, we leverage the class prototypes\nthat carry positive shared features and propose a Multi-Scaled\nDistribution-Weighted (MSDW) consistency loss for narrowing the gap between the\nCAMs generated through classifier weights and class prototypes during training.\nThe MSDW loss counterbalances over-activation and under-activation by\ncalibrating the shared features in head-/tail-class classifier weights.\nExperimental results show that our SFC significantly improves CAM boundaries\nand achieves new state-of-the-art performances. 
The project is available at\nhttps://github.com/Barrett-python/SFC.\n","authors":["Xinqiao Zhao","Feilong Tang","Xiaoyang Wang","Jimin Xiao"],"pdf_url":"https://arxiv.org/pdf/2401.11719v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11718v1","updated":"2024-01-22T06:42:23Z","published":"2024-01-22T06:42:23Z","title":"MsSVT++: Mixed-scale Sparse Voxel Transformer with Center Voting for 3D\n Object Detection","summary":" Accurate 3D object detection in large-scale outdoor scenes, characterized by\nconsiderable variations in object scales, necessitates features rich in both\nlong-range and fine-grained information. While recent detectors have utilized\nwindow-based transformers to model long-range dependencies, they tend to\noverlook fine-grained details. To bridge this gap, we propose MsSVT++, an\ninnovative Mixed-scale Sparse Voxel Transformer that simultaneously captures\nboth types of information through a divide-and-conquer approach. This approach\ninvolves explicitly dividing attention heads into multiple groups, each\nresponsible for attending to information within a specific range. The outputs\nof these groups are subsequently merged to obtain final mixed-scale features.\nTo mitigate the computational complexity associated with applying a\nwindow-based transformer in 3D voxel space, we introduce a novel Chessboard\nSampling strategy and implement voxel sampling and gathering operations\nsparsely using a hash map. Moreover, an important challenge stems from the\nobservation that non-empty voxels are primarily located on the surface of\nobjects, which impedes the accurate estimation of bounding boxes. To overcome\nthis challenge, we introduce a Center Voting module that integrates newly voted\nvoxels enriched with mixed-scale contextual information towards the centers of\nthe objects, thereby improving precise object localization. Extensive\nexperiments demonstrate that our single-stage detector, built upon the\nfoundation of MsSVT++, consistently delivers exceptional performance across\ndiverse datasets.\n","authors":["Jianan Li","Shaocong Dong","Lihe Ding","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.03842v4","updated":"2024-01-22T06:30:15Z","published":"2022-04-08T05:11:04Z","title":"From 2D Images to 3D Model:Weakly Supervised Multi-View Face\n Reconstruction with Deep Fusion","summary":" While weakly supervised multi-view face reconstruction (MVR) is garnering\nincreased attention, one critical issue still remains open: how to effectively\nfuse multiple image information to reconstruct high-precision 3D models. In\nthis regard, we propose a novel model called Deep Fusion MVR (DF-MVR) to\nreconstruct high-precision 3D facial shapes from multi-view images.\nSpecifically, we introduce MulEn-Unet, a multi-view encoding to single decoding\nframework with skip connections and attention. This design allows for the\nextraction, integration, and compensation of deep features with attention from\nmulti-view images. Furthermore, we adopt the involution kernel to enrich deep\nfusion features with channel features. In addition, we develop the face parse\nnetwork to learn, identify, and emphasize the critical common face area within\nmulti-view images. Experiments on Pixel-Face and Bosphorus datasets indicate\nthe superiority of our model. Without 3D annotation, DF-MVR achieves 5.2% and\n3.0% RMSE improvement over the existing weakly supervised MVRs respectively on\nPixel-Face and Bosphorus dataset. 
Code will be available publicly at\nhttps://github.com/weiguangzhao/DF_MVR.\n","authors":["Weiguang Zhao","Chaolong Yang","Jianan Ye","Rui Zhang","Yuyao Yan","Xi Yang","Bin Dong","Amir Hussain","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2204.03842v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11713v1","updated":"2024-01-22T06:29:52Z","published":"2024-01-22T06:29:52Z","title":"Medical Image Debiasing by Learning Adaptive Agreement from a Biased\n Council","summary":" Deep learning could be prone to learning shortcuts raised by dataset bias and\nresult in inaccurate, unreliable, and unfair models, which impedes its adoption\nin real-world clinical applications. Despite its significance, there is a\ndearth of research in the medical image classification domain to address\ndataset bias. Furthermore, the bias labels are often agnostic, as identifying\nbiases can be laborious and depend on post-hoc interpretation. This paper\nproposes learning Adaptive Agreement from a Biased Council (Ada-ABC), a\ndebiasing framework that does not rely on explicit bias labels to tackle\ndataset bias in medical images. Ada-ABC develops a biased council consisting of\nmultiple classifiers optimized with generalized cross entropy loss to learn the\ndataset bias. A debiasing model is then simultaneously trained under the\nguidance of the biased council. Specifically, the debiasing model is required\nto learn adaptive agreement with the biased council by agreeing on the\ncorrectly predicted samples and disagreeing on the wrongly predicted samples by\nthe biased council. In this way, the debiasing model could learn the target\nattribute on the samples without spurious correlations while also avoiding\nignoring the rich information in samples with spurious correlations. We\ntheoretically demonstrated that the debiasing model could learn the target\nfeatures when the biased model successfully captures dataset bias. Moreover, to\nour best knowledge, we constructed the first medical debiasing benchmark from\nfour datasets containing seven different bias scenarios. Our extensive\nexperiments practically showed that our proposed Ada-ABC outperformed\ncompetitive approaches, verifying its effectiveness in mitigating dataset bias\nfor medical image classification. The codes and organized benchmark datasets\nwill be made publicly available.\n","authors":["Luyang Luo","Xin Huang","Minghao Wang","Zhuoyue Wan","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11713v1.pdf","comment":"10 pages, 5 figures, 3 tables. Code and benchmark will be released\n via https://github.com/LLYXC/Ada-ABC/tree/main"},{"id":"http://arxiv.org/abs/2401.11711v1","updated":"2024-01-22T06:28:08Z","published":"2024-01-22T06:28:08Z","title":"HG3-NeRF: Hierarchical Geometric, Semantic, and Photometric Guided\n Neural Radiance Fields for Sparse View Inputs","summary":" Neural Radiance Fields (NeRF) have garnered considerable attention as a\nparadigm for novel view synthesis by learning scene representations from\ndiscrete observations. Nevertheless, NeRF exhibit pronounced performance\ndegradation when confronted with sparse view inputs, consequently curtailing\nits further applicability. In this work, we introduce Hierarchical Geometric,\nSemantic, and Photometric Guided NeRF (HG3-NeRF), a novel methodology that can\naddress the aforementioned limitation and enhance consistency of geometry,\nsemantic content, and appearance across different views. 
We propose\nHierarchical Geometric Guidance (HGG) to incorporate the attachment of\nStructure from Motion (SfM), namely sparse depth prior, into the scene\nrepresentations. Different from direct depth supervision, HGG samples volume\npoints from local-to-global geometric regions, mitigating the misalignment\ncaused by inherent bias in the depth prior. Furthermore, we draw inspiration\nfrom notable variations in semantic consistency observed across images of\ndifferent resolutions and propose Hierarchical Semantic Guidance (HSG) to learn\nthe coarse-to-fine semantic content, which corresponds to the coarse-to-fine\nscene representations. Experimental results demonstrate that HG3-NeRF can\noutperform other state-of-the-art methods on different standard benchmarks and\nachieve high-fidelity synthesis results for sparse view inputs.\n","authors":["Zelin Gao","Weichen Dai","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11711v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.11708v1","updated":"2024-01-22T06:16:29Z","published":"2024-01-22T06:16:29Z","title":"Mastering Text-to-Image Diffusion: Recaptioning, Planning, and\n Generating with Multimodal LLMs","summary":" Diffusion models have exhibited exceptional performance in text-to-image\ngeneration and editing. However, existing methods often face challenges when\nhandling complex text prompts that involve multiple objects with multiple\nattributes and relationships. In this paper, we propose a brand new\ntraining-free text-to-image generation/editing framework, namely Recaption,\nPlan and Generate (RPG), harnessing the powerful chain-of-thought reasoning\nability of multimodal LLMs to enhance the compositionality of text-to-image\ndiffusion models. Our approach employs the MLLM as a global planner to\ndecompose the process of generating complex images into multiple simpler\ngeneration tasks within subregions. We propose complementary regional diffusion\nto enable region-wise compositional generation. Furthermore, we integrate\ntext-guided image generation and editing within the proposed RPG in a\nclosed-loop fashion, thereby enhancing generalization ability. Extensive\nexperiments demonstrate that our RPG outperforms state-of-the-art text-to-image\ndiffusion models, including DALL-E 3 and SDXL, particularly in multi-category\nobject composition and text-image semantic alignment. Notably, our RPG\nframework exhibits wide compatibility with various MLLM architectures (e.g.,\nMiniGPT-4) and diffusion backbones (e.g., ControlNet). Our code is available\nat: https://github.com/YangLing0818/RPG-DiffusionMaster\n","authors":["Ling Yang","Zhaochen Yu","Chenlin Meng","Minkai Xu","Stefano Ermon","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11708v1.pdf","comment":"Project: https://github.com/YangLing0818/RPG-DiffusionMaster"},{"id":"http://arxiv.org/abs/2401.11704v1","updated":"2024-01-22T06:05:26Z","published":"2024-01-22T06:05:26Z","title":"EK-Net:Real-time Scene Text Detection with Expand Kernel Distance","summary":" Recently, scene text detection has received significant attention due to its\nwide application. However, accurate detection in complex scenes of multiple\nscales, orientations, and curvature remains a challenge. Numerous detection\nmethods adopt the Vatti clipping (VC) algorithm for multiple-instance training\nto address the issue of arbitrary-shaped text. Yet we identify a bias\nintroduced by these approaches, called the \"shrinked kernel\". 
Specifically, it\nrefers to a decrease in accuracy resulting from an output that overly favors\nthe text kernel. In this paper, we propose a new approach named Expand Kernel\nNetwork (EK-Net) with expand kernel distance to compensate for the previous\ndeficiency, which includes three-stage regression to complete instance\ndetection. Moreover, EK-Net not only realizes the precise positioning of\narbitrary-shaped text, but also achieves a trade-off between performance and\nspeed. Evaluation results demonstrate that EK-Net achieves state-of-the-art or\ncompetitive performance compared to other advanced methods, e.g., F-measure of\n85.72% at 35.42 FPS on ICDAR 2015, F-measure of 85.75% at 40.13 FPS on CTW1500.\n","authors":["Boyuan Zhu","Fagui Liu","Xi Chen","Quan Tang"],"pdf_url":"https://arxiv.org/pdf/2401.11704v1.pdf","comment":"2024 IEEE International Conference on Acoustics, Speech and Signal\n Processing"},{"id":"http://arxiv.org/abs/2304.03047v3","updated":"2024-01-22T04:57:32Z","published":"2023-04-06T13:07:17Z","title":"ETPNav: Evolving Topological Planning for Vision-Language Navigation in\n Continuous Environments","summary":" Vision-language navigation is a task that requires an agent to follow\ninstructions to navigate in environments. It becomes increasingly crucial in\nthe field of embodied AI, with potential applications in autonomous navigation,\nsearch and rescue, and human-robot interaction. In this paper, we propose to\naddress a more practical yet challenging counterpart setting - vision-language\nnavigation in continuous environments (VLN-CE). To develop a robust VLN-CE\nagent, we propose a new navigation framework, ETPNav, which focuses on two\ncritical skills: 1) the capability to abstract environments and generate\nlong-range navigation plans, and 2) the ability of obstacle-avoiding control in\ncontinuous environments. ETPNav performs online topological mapping of\nenvironments by self-organizing predicted waypoints along a traversed path,\nwithout prior environmental experience. This enables the agent to break down\nthe navigation procedure into high-level planning and low-level control.\nConcurrently, ETPNav utilizes a transformer-based cross-modal planner to\ngenerate navigation plans based on topological maps and instructions. The plan\nis then performed through an obstacle-avoiding controller that leverages a\ntrial-and-error heuristic to prevent navigation from getting stuck in\nobstacles. Experimental results demonstrate the effectiveness of the proposed\nmethod. ETPNav yields more than 10% and 20% improvements over prior\nstate-of-the-art on R2R-CE and RxR-CE datasets, respectively. Our code is\navailable at https://github.com/MarSaKi/ETPNav.\n","authors":["Dong An","Hanqing Wang","Wenguan Wang","Zun Wang","Yan Huang","Keji He","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2304.03047v3.pdf","comment":"Project page: https://github.com/MarSaKi/ETPNav"},{"id":"http://arxiv.org/abs/2401.11687v1","updated":"2024-01-22T04:54:42Z","published":"2024-01-22T04:54:42Z","title":"TIM: An Efficient Temporal Interaction Module for Spiking Transformer","summary":" Spiking Neural Networks (SNNs), as the third generation of neural networks,\nhave gained prominence for their biological plausibility and computational\nefficiency, especially in processing diverse datasets. The integration of\nattention mechanisms, inspired by advancements in neural network architectures,\nhas led to the development of Spiking Transformers. 
These have shown promise in\nenhancing SNNs' capabilities, particularly in the realms of both static and\nneuromorphic datasets. Despite their progress, a discernible gap exists in\nthese systems, specifically in the Spiking Self Attention (SSA) mechanism's\neffectiveness in leveraging the temporal processing potential of SNNs. To\naddress this, we introduce the Temporal Interaction Module (TIM), a novel,\nconvolution-based enhancement designed to augment the temporal data processing\nabilities within SNN architectures. TIM's integration into existing SNN\nframeworks is seamless and efficient, requiring minimal additional parameters\nwhile significantly boosting their temporal information handling capabilities.\nThrough rigorous experimentation, TIM has demonstrated its effectiveness in\nexploiting temporal information, leading to state-of-the-art performance across\nvarious neuromorphic datasets.\n","authors":["Sicheng Shen","Dongcheng Zhao","Guobin Shen","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.11687v1.pdf","comment":"10pages,6figures"},{"id":"http://arxiv.org/abs/2310.09221v2","updated":"2024-01-22T04:48:57Z","published":"2023-10-13T16:18:48Z","title":"Ultrasound Image Segmentation of Thyroid Nodule via Latent Semantic\n Feature Co-Registration","summary":" Segmentation of nodules in thyroid ultrasound imaging plays a crucial role in\nthe detection and treatment of thyroid cancer. However, owing to the diversity\nof scanner vendors and imaging protocols in different hospitals, the automatic\nsegmentation model, which has already demonstrated expert-level accuracy in the\nfield of medical image segmentation, finds its accuracy reduced as the result\nof its weak generalization performance when being applied in clinically\nrealistic environments. To address this issue, the present paper proposes ASTN,\na framework for thyroid nodule segmentation achieved through a new type\nco-registration network. By extracting latent semantic information from the\natlas and target images and utilizing in-depth features to accomplish the\nco-registration of nodules in thyroid ultrasound images, this framework can\nensure the integrity of anatomical structure and reduce the impact on\nsegmentation as the result of overall differences in image caused by different\ndevices. In addition, this paper also provides an atlas selection algorithm to\nmitigate the difficulty of co-registration. As shown by the evaluation results\ncollected from the datasets of different devices, thanks to the method we\nproposed, the model generalization has been greatly improved while maintaining\na high level of segmentation accuracy.\n","authors":["Xuewei Li","Yaqiao Zhu","Jie Gao","Xi Wei","Ruixuan Zhang","Yuan Tian","ZhiQiang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.09221v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07278v2","updated":"2024-01-22T04:43:04Z","published":"2024-01-14T12:22:34Z","title":"Semi-supervised Semantic Segmentation using Redesigned Self-Training for\n White Blood Cell","summary":" Artificial Intelligence (AI) in healthcare, especially in white blood cell\ncancer diagnosis, is hindered by two primary challenges: the lack of\nlarge-scale labeled datasets for white blood cell (WBC) segmentation and\noutdated segmentation methods. To address the first challenge, a\nsemi-supervised learning framework should be brought to efficiently annotate\nthe large dataset. In this work, we address this issue by proposing a novel\nself-training pipeline with the incorporation of FixMatch. 
We discover that by\nincorporating FixMatch in the self-training pipeline, the performance improves\nin the majority of cases. We achieved the best performance with\nthe self-training scheme with consistency on the DeepLab-V3 architecture with\nResNet-50, reaching 90.69%, 87.37%, and 76.49% on the Zheng 1, Zheng 2, and LISC\ndatasets, respectively.\n","authors":["Vinh Quoc Luu","Duy Khanh Le","Huy Thanh Nguyen","Minh Thanh Nguyen","Thinh Tien Nguyen","Vinh Quang Dinh"],"pdf_url":"https://arxiv.org/pdf/2401.07278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11674v1","updated":"2024-01-22T03:24:45Z","published":"2024-01-22T03:24:45Z","title":"Memory-Efficient Prompt Tuning for Incremental Histopathology\n Classification","summary":" Recent studies have made remarkable progress in histopathology\nclassification. Based on current successes, contemporary works proposed to\nfurther upgrade the model towards a more generalizable and robust direction\nthrough incrementally learning from the sequentially delivered domains. Unlike\nprevious parameter isolation based approaches that usually demand massive\ncomputation resources during model updating, we present a memory-efficient\nprompt tuning framework to cultivate model generalization potential at\neconomical memory cost. For each incoming domain, we reuse the existing\nparameters of the initial classification model and attach lightweight trainable\nprompts into it for customized tuning. Considering the domain heterogeneity, we\nperform decoupled prompt tuning, where we adopt a domain-specific prompt for\neach domain to independently investigate its distinctive characteristics, and\none domain-invariant prompt shared across all domains to continually explore\nthe common content embedding throughout time. All domain-specific prompts will\nbe appended to the prompt bank and isolated from further changes to prevent\nforgetting the distinctive features of early-seen domains, while the\ndomain-invariant prompt will be passed on and iteratively evolve by\nstyle-augmented prompt refining to improve model generalization capability over\ntime. Specifically, we construct a graph with existing prompts and build a\nstyle-augmented graph attention network to guide the domain-invariant prompt\nexploring the overlapped latent embedding among all delivered domains for more\ndomain generic representations. We have extensively evaluated our framework\nwith two histopathology tasks, i.e., breast cancer metastasis classification\nand epithelium-stroma tissue classification, where our approach yielded\nsuperior performance and memory efficiency over the competing methods.\n","authors":["Yu Zhu","Kang Li","Lequan Yu","Pheng-Ann Heng"],"pdf_url":"https://arxiv.org/pdf/2401.11674v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11673v1","updated":"2024-01-22T03:22:49Z","published":"2024-01-22T03:22:49Z","title":"MVSFormer++: Revealing the Devil in Transformer's Details for Multi-View\n Stereo","summary":" Recent advancements in learning-based Multi-View Stereo (MVS) methods have\nprominently featured transformer-based models with attention mechanisms.\nHowever, existing approaches have not thoroughly investigated the profound\ninfluence of transformers on different MVS modules, resulting in limited depth\nestimation capabilities. In this paper, we introduce MVSFormer++, a method that\nprudently maximizes the inherent characteristics of attention to enhance\nvarious components of the MVS pipeline. 
Formally, our approach involves\ninfusing cross-view information into the pre-trained DINOv2 model to facilitate\nMVS learning. Furthermore, we employ different attention mechanisms for the\nfeature encoder and cost volume regularization, focusing on feature and spatial\naggregations respectively. Additionally, we uncover that some design details\nwould substantially impact the performance of transformer modules in MVS,\nincluding normalized 3D positional encoding, adaptive attention scaling, and\nthe position of layer normalization. Comprehensive experiments on DTU,\nTanks-and-Temples, BlendedMVS, and ETH3D validate the effectiveness of the\nproposed method. Notably, MVSFormer++ achieves state-of-the-art performance on\nthe challenging DTU and Tanks-and-Temples benchmarks.\n","authors":["Chenjie Cao","Xinlin Ren","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2401.11673v1.pdf","comment":"Accepted to ICLR2024"},{"id":"http://arxiv.org/abs/2310.01852v7","updated":"2024-01-22T03:11:15Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, naming as\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. LanguageBind has achieved superior performance on a wide range of\n15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple\nexperiments have provided evidence for the effectiveness of LanguageBind in\nachieving indirect alignment and complementarity among diverse modalities. Code\naddress: https://github.com/PKU-YuanGroup/LanguageBind\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","HongFa Wang","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Wancai Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v7.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11671v1","updated":"2024-01-22T03:09:00Z","published":"2024-01-22T03:09:00Z","title":"RTA-Former: Reverse Transformer Attention for Polyp Segmentation","summary":" Polyp segmentation is a key aspect of colorectal cancer prevention, enabling\nearly detection and guiding subsequent treatments. Intelligent diagnostic\ntools, including deep learning solutions, are widely explored to streamline and\npotentially automate this process. 
However, even with many powerful network\narchitectures, producing accurate edge segmentation remains a problem. In this paper, we introduce a novel network, namely RTA-Former,\nthat employs a transformer model as the encoder backbone and innovatively\nadapts Reverse Attention (RA) with a transformer stage in the decoder for\nenhanced edge segmentation. The results of the experiments illustrate that\nRTA-Former achieves state-of-the-art (SOTA) performance on five polyp\nsegmentation datasets. The strong capability of RTA-Former holds promise in\nimproving the accuracy of Transformer-based polyp segmentation, potentially\nleading to better clinical decisions and patient outcomes. Our code will be\npublicly available on GitHub.\n","authors":["Zhikai Li","Murong Yi","Ali Uneri","Sihan Niu","Craig Jones"],"pdf_url":"https://arxiv.org/pdf/2401.11671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.08898v3","updated":"2024-01-22T03:01:28Z","published":"2023-01-21T05:34:29Z","title":"Recurrent Generic Contour-based Instance Segmentation with Progressive\n Learning","summary":" Contour-based instance segmentation has been actively studied, thanks to its\nflexibility and elegance in processing visual objects within complex\nbackgrounds. In this work, we propose a novel deep network architecture, i.e.,\nPolySnake, for generic contour-based instance segmentation. Motivated by the\nclassic Snake algorithm, the proposed PolySnake achieves superior and robust\nsegmentation performance with an iterative and progressive contour refinement\nstrategy. Technically, PolySnake introduces a recurrent update operator to\nestimate the object contour iteratively. It maintains a single estimate of the\ncontour that is progressively deformed toward the object boundary. At each\niteration, PolySnake builds a semantic-rich representation for the current\ncontour and feeds it to the recurrent operator for further contour adjustment.\nThrough the iterative refinements, the contour progressively converges to a\nstable status that tightly encloses the object instance. Beyond the scope of\ngeneral instance segmentation, extensive experiments are conducted to validate\nthe effectiveness and generalizability of our PolySnake in two additional\nspecific task scenarios, including scene text detection and lane detection. The\nresults demonstrate that the proposed PolySnake outperforms the existing\nadvanced methods on multiple prevalent benchmarks across the three\ntasks. The codes and pre-trained models are available at\nhttps://github.com/fh2019ustc/PolySnake\n","authors":["Hao Feng","Keyi Zhou","Wengang Zhou","Yufei Yin","Jiajun Deng","Qi Sun","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2301.08898v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07444v3","updated":"2024-01-22T02:56:05Z","published":"2023-04-15T01:33:14Z","title":"The Art of Camouflage: Few-shot Learning for Animal Detection and\n Segmentation","summary":" Camouflaged object detection and segmentation is a new and challenging\nresearch topic in computer vision. There is a serious lack of data on\ncamouflaged objects, such as camouflaged animals, in natural scenes. In this\npaper, we address the problem of few-shot learning for camouflaged object\ndetection and segmentation. To this end, we first collect a new dataset,\nCAMO-FS, for the benchmark. We then propose a novel method to efficiently\ndetect and segment the camouflaged objects in the images. 
In particular, we\nintroduce the instance triplet loss and the instance memory storage. The\nextensive experiments demonstrated that our proposed method achieves\nstate-of-the-art performance on the newly collected dataset.\n","authors":["Thanh-Danh Nguyen","Anh-Khoa Nguyen Vu","Nhat-Duy Nguyen","Vinh-Tiep Nguyen","Thanh Duc Ngo","Thanh-Toan Do","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2304.07444v3.pdf","comment":"Under-review Journal"},{"id":"http://arxiv.org/abs/2305.16789v2","updated":"2024-01-22T02:47:50Z","published":"2023-05-26T09:59:48Z","title":"Modulate Your Spectrum in Self-Supervised Learning","summary":" Whitening loss offers a theoretical guarantee against feature collapse in\nself-supervised learning (SSL) with joint embedding architectures. Typically,\nit involves a hard whitening approach, transforming the embedding and applying\nloss to the whitened output. In this work, we introduce Spectral Transformation\n(ST), a framework to modulate the spectrum of embedding and to seek for\nfunctions beyond whitening that can avoid dimensional collapse. We show that\nwhitening is a special instance of ST by definition, and our empirical\ninvestigations unveil other ST instances capable of preventing collapse.\nAdditionally, we propose a novel ST instance named IterNorm with trace loss\n(INTL). Theoretical analysis confirms INTL's efficacy in preventing collapse\nand modulating the spectrum of embedding toward equal-eigenvalues during\noptimization. Our experiments on ImageNet classification and COCO object\ndetection demonstrate INTL's potential in learning superior representations.\nThe code is available at https://github.com/winci-ai/INTL.\n","authors":["Xi Weng","Yunhao Ni","Tengwei Song","Jie Luo","Rao Muhammad Anwer","Salman Khan","Fahad Shahbaz Khan","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2305.16789v2.pdf","comment":"Accepted at ICLR 2024. The code is available at\n https://github.com/winci-ai/intl"},{"id":"http://arxiv.org/abs/2401.10150v3","updated":"2024-01-22T02:40:52Z","published":"2024-01-18T17:22:37Z","title":"Motion-Zero: Zero-Shot Moving Object Control Framework for\n Diffusion-Based Video Generation","summary":" Recent large-scale pre-trained diffusion models have demonstrated a powerful\ngenerative ability to produce high-quality videos from detailed text\ndescriptions. However, exerting control over the motion of objects in videos\ngenerated by any video diffusion model is a challenging problem. In this paper,\nwe propose a novel zero-shot moving object trajectory control framework,\nMotion-Zero, to enable a bounding-box-trajectories-controlled text-to-video\ndiffusion model. To this end, an initial noise prior module is designed to\nprovide a position-based prior to improve the stability of the appearance of\nthe moving object and the accuracy of position. In addition, based on the\nattention map of the U-net, spatial constraints are directly applied to the\ndenoising process of diffusion models, which further ensures the positional and\nspatial consistency of moving objects during the inference. Furthermore,\ntemporal consistency is guaranteed with a proposed shift temporal attention\nmechanism. Our method can be flexibly applied to various state-of-the-art video\ndiffusion models without any training process. 
Extensive experiments\ndemonstrate our proposed method can control the motion trajectories of objects\nand generate high-quality videos.\n","authors":["Changgu Chen","Junwei Shu","Lianggangxu Chen","Gaoqi He","Changbo Wang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2401.10150v3.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2401.11654v1","updated":"2024-01-22T02:21:26Z","published":"2024-01-22T02:21:26Z","title":"ActionHub: A Large-scale Action Video Description Dataset for Zero-shot\n Action Recognition","summary":" Zero-shot action recognition (ZSAR) aims to learn an alignment model between\nvideos and class descriptions of seen actions that is transferable to unseen\nactions. The text queries (class descriptions) used in existing ZSAR works,\nhowever, are often short action names that fail to capture the rich semantics\nin the videos, leading to misalignment. With the intuition that video content\ndescriptions (e.g., video captions) can provide rich contextual information of\nvisual concepts in videos, we propose to utilize human annotated video\ndescriptions to enrich the semantics of the class descriptions of each action.\nHowever, all existing action video description datasets are limited in terms of\nthe number of actions, the semantics of video descriptions, etc. To this end,\nwe collect a large-scale action video descriptions dataset named ActionHub,\nwhich covers a total of 1,211 common actions and provides 3.6 million action\nvideo descriptions. With the proposed ActionHub dataset, we further propose a\nnovel Cross-modality and Cross-action Modeling (CoCo) framework for ZSAR, which\nconsists of a Dual Cross-modality Alignment module and a Cross-action\nInvariance Mining module. Specifically, the Dual Cross-modality Alignment\nmodule utilizes both action labels and video descriptions from ActionHub to\nobtain rich class semantic features for feature alignment. The Cross-action\nInvariance Mining module exploits a cycle-reconstruction process between the\nclass semantic feature spaces of seen actions and unseen actions, aiming to\nguide the model to learn cross-action invariant representations. Extensive\nexperimental results demonstrate that our CoCo framework significantly\noutperforms the state-of-the-art on three popular ZSAR benchmarks (i.e.,\nKinetics-ZSAR, UCF101 and HMDB51) under two different learning protocols in\nZSAR. We will release our code, models, and the proposed ActionHub dataset.\n","authors":["Jiaming Zhou","Junwei Liang","Kun-Yu Lin","Jinrui Yang","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2401.11654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11652v1","updated":"2024-01-22T02:17:36Z","published":"2024-01-22T02:17:36Z","title":"OnDev-LCT: On-Device Lightweight Convolutional Transformers towards\n federated learning","summary":" Federated learning (FL) has emerged as a promising approach to\ncollaboratively train machine learning models across multiple edge devices\nwhile preserving privacy. The success of FL hinges on the efficiency of\nparticipating models and their ability to handle the unique challenges of\ndistributed learning. While several variants of Vision Transformer (ViT) have\nshown great potential as alternatives to modern convolutional neural networks\n(CNNs) for centralized training, the unprecedented size and higher\ncomputational demands hinder their deployment on resource-constrained edge\ndevices, challenging their widespread application in FL. 
Since client devices\nin FL typically have limited computing resources and communication bandwidth,\nmodels intended for such devices must strike a balance between model size,\ncomputational efficiency, and the ability to adapt to the diverse and non-IID\ndata distributions encountered in FL. To address these challenges, we propose\nOnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks\nwith limited training data and resources. Our models incorporate image-specific\ninductive biases through the LCT tokenizer by leveraging efficient depthwise\nseparable convolutions in residual linear bottleneck blocks to extract local\nfeatures, while the multi-head self-attention (MHSA) mechanism in the LCT\nencoder implicitly facilitates capturing global representations of images.\nExtensive experiments on benchmark image datasets indicate that our models\noutperform existing lightweight vision models while having fewer parameters and\nlower computational demands, making them suitable for FL scenarios with data\nheterogeneity and communication bottlenecks.\n","authors":["Chu Myaet Thwal","Minh N. H. Nguyen","Ye Lin Tun","Seong Tae Kim","My T. Thai","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11652v1.pdf","comment":"Published in Neural Networks"},{"id":"http://arxiv.org/abs/2401.11650v1","updated":"2024-01-22T02:05:33Z","published":"2024-01-22T02:05:33Z","title":"PointGL: A Simple Global-Local Framework for Efficient Point Cloud\n Analysis","summary":" Efficient analysis of point clouds holds paramount significance in real-world\n3D applications. Currently, prevailing point-based models adhere to the\nPointNet++ methodology, which involves embedding and abstracting point features\nwithin a sequence of spatially overlapping local point sets, resulting in\nnoticeable computational redundancy. Drawing inspiration from the streamlined\nparadigm of pixel embedding followed by regional pooling in Convolutional\nNeural Networks (CNNs), we introduce a novel, uncomplicated yet potent\narchitecture known as PointGL, crafted to facilitate efficient point cloud\nanalysis. PointGL employs a hierarchical process of feature acquisition through\ntwo recursive steps. First, the Global Point Embedding leverages\nstraightforward residual Multilayer Perceptrons (MLPs) to effectuate feature\nembedding for each individual point. Second, the novel Local Graph Pooling\ntechnique characterizes point-to-point relationships and abstracts regional\nrepresentations through succinct local graphs. The harmonious fusion of\none-time point embedding and parameter-free graph pooling contributes to\nPointGL's defining attributes of minimized model complexity and heightened\nefficiency. Our PointGL attains state-of-the-art accuracy on the ScanObjectNN\ndataset while exhibiting a runtime that is more than 5 times faster and\nutilizing only approximately 4% of the FLOPs and 30% of the parameters compared\nto the recent PointMLP model. 
The code for PointGL is available at\nhttps://github.com/Roywangj/PointGL.\n","authors":["Jianan Li","Jie Wang","Tingfa Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11649v1","updated":"2024-01-22T02:03:31Z","published":"2024-01-22T02:03:31Z","title":"M2-CLIP: A Multimodal, Multi-task Adapting Framework for Video Action\n Recognition","summary":" Recently, the rise of large-scale vision-language pretrained models like\nCLIP, coupled with the technology of Parameter-Efficient FineTuning (PEFT), has\nattracted substantial attention in video action recognition. Nevertheless,\nprevailing approaches tend to prioritize strong supervised performance at the\nexpense of compromising the models' generalization capabilities during\ntransfer. In this paper, we introduce a novel Multimodal, Multi-task CLIP\nadapting framework named M2-CLIP to address these challenges, preserving both\nhigh supervised performance and robust transferability. Firstly, to enhance the\nindividual modality architectures, we introduce multimodal adapters to both the\nvisual and text branches. Specifically, we design a novel visual TED-Adapter\nthat performs global Temporal Enhancement and local temporal Difference\nmodeling to improve the temporal representation capabilities of the visual\nencoder. Moreover, we adopt text encoder adapters to strengthen the learning of\nsemantic label information. Secondly, we design a multi-task decoder with a\nrich set of supervisory signals to adeptly satisfy the need for strong\nsupervised performance and generalization within a multimodal framework.\nExperimental results validate the efficacy of our approach, demonstrating\nexceptional performance in supervised learning while maintaining strong\ngeneralization in zero-shot scenarios.\n","authors":["Mengmeng Wang","Jiazheng Xing","Boyuan Jiang","Jun Chen","Jianbiao Mei","Xingxing Zuo","Guang Dai","Jingdong Wang","Yong Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05482v2","updated":"2024-01-22T01:48:29Z","published":"2023-04-11T20:28:33Z","title":"Computational Pathology: A Survey Review and The Way Forward","summary":" Computational Pathology (CPath) is an interdisciplinary science that augments\ndevelopments of computational approaches to analyze and model medical\nhistopathology images. The main objective for CPath is to develop\ninfrastructure and workflows of digital diagnostics as an assistive CAD system\nfor clinical pathology, facilitating transformational changes in the diagnosis\nand treatment of cancer that are mainly addressed by CPath tools. With\never-growing developments in deep learning and computer vision algorithms, and\nthe ease of the data flow from digital pathology, currently CPath is witnessing\na paradigm shift. Despite the sheer volume of engineering and scientific works\nbeing introduced for cancer image analysis, there is still a considerable gap\nin adopting and integrating these algorithms in clinical practice. This raises\na significant question regarding the direction and trends that are undertaken\nin CPath. In this article we provide a comprehensive review of more than 800\npapers to address the challenges faced in problem design all the way to the\napplication and implementation viewpoints. We have catalogued each paper into a\nmodel-card by examining the key works and challenges faced to lay out the\ncurrent landscape in CPath. 
We hope this helps the community to locate relevant\nworks and facilitate understanding of the field's future directions. In a\nnutshell, we oversee the CPath developments in cycle of stages which are\nrequired to be cohesively linked together to address the challenges associated\nwith such multidisciplinary science. We overview this cycle from different\nperspectives of data-centric, model-centric, and application-centric problems.\nWe finally sketch remaining challenges and provide directions for future\ntechnical developments and clinical integration of CPath\n(https://github.com/AtlasAnalyticsLab/CPath_Survey).\n","authors":["Mahdi S. Hosseini","Babak Ehteshami Bejnordi","Vincent Quoc-Huy Trinh","Danial Hasan","Xingwen Li","Taehyo Kim","Haochen Zhang","Theodore Wu","Kajanan Chinniah","Sina Maghsoudlou","Ryan Zhang","Stephen Yang","Jiadai Zhu","Lyndon Chan","Samir Khaki","Andrei Buin","Fatemeh Chaji","Ala Salehi","Bich Ngoc Nguyen","Dimitris Samaras","Konstantinos N. Plataniotis"],"pdf_url":"https://arxiv.org/pdf/2304.05482v2.pdf","comment":"Accepted in Elsevier Journal of Pathology Informatics (JPI) 2024"},{"id":"http://arxiv.org/abs/2401.11644v1","updated":"2024-01-22T01:34:03Z","published":"2024-01-22T01:34:03Z","title":"Friends Across Time: Multi-Scale Action Segmentation Transformer for\n Surgical Phase Recognition","summary":" Automatic surgical phase recognition is a core technology for modern\noperating rooms and online surgical video assessment platforms. Current\nstate-of-the-art methods use both spatial and temporal information to tackle\nthe surgical phase recognition task. Building on this idea, we propose the\nMulti-Scale Action Segmentation Transformer (MS-AST) for offline surgical phase\nrecognition and the Multi-Scale Action Segmentation Causal Transformer\n(MS-ASCT) for online surgical phase recognition. We use ResNet50 or\nEfficientNetV2-M for spatial feature extraction. Our MS-AST and MS-ASCT can\nmodel temporal information at different scales with multi-scale temporal\nself-attention and multi-scale temporal cross-attention, which enhances the\ncapture of temporal relationships between frames and segments. We demonstrate\nthat our method can achieve 95.26% and 96.15% accuracy on the Cholec80 dataset\nfor online and offline surgical phase recognition, respectively, which achieves\nnew state-of-the-art results. Our method can also achieve state-of-the-art\nresults on non-medical datasets in the video action segmentation domain.\n","authors":["Bokai Zhang","Jiayuan Meng","Bin Cheng","Dean Biskup","Svetlana Petculescu","Angela Chapman"],"pdf_url":"https://arxiv.org/pdf/2401.11644v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17778v3","updated":"2024-01-22T00:54:30Z","published":"2023-06-30T16:31:14Z","title":"Look, Remember and Reason: Grounded reasoning in videos with language\n models","summary":" Multi-modal language models (LM) have recently shown promising performance in\nhigh-level reasoning tasks on videos. However, existing methods still fall\nshort in tasks like causal or compositional spatiotemporal reasoning over\nactions, in which model predictions need to be grounded in fine-grained\nlow-level details, such as object motions and object interactions. In this\nwork, we propose training an LM end-to-end on low-level surrogate tasks,\nincluding object detection, re-identification, and tracking, to endow the model\nwith the required low-level visual capabilities. 
We show that a two-stream\nvideo encoder with spatiotemporal attention is effective at capturing the\nrequired static and motion-based cues in the video. By leveraging the LM's\nability to perform the low-level surrogate tasks, we can cast reasoning in\nvideos as the three-step process of Look, Remember, Reason wherein visual\ninformation is extracted using low-level visual skills step-by-step and then\nintegrated to arrive at a final answer. We demonstrate the effectiveness of our\nframework on diverse visual reasoning tasks from the ACRE, CATER,\nSomething-Else and STAR datasets. Our approach is trainable end-to-end and\nsurpasses state-of-the-art task-specific methods across these tasks by a large\nmargin.\n","authors":["Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Reza Pourreza","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2306.17778v3.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2309.01409v5","updated":"2024-01-22T00:22:14Z","published":"2023-09-04T07:40:30Z","title":"Implicit Neural Image Stitching","summary":" Existing frameworks for image stitching often provide visually reasonable\nstitchings. However, they suffer from blurry artifacts and disparities in\nillumination, depth level, etc. Although the recent learning-based stitchings\nrelax such disparities, the required methods impose sacrifice of image\nqualities failing to capture high-frequency details for stitched images. To\naddress the problem, we propose a novel approach, implicit Neural Image\nStitching (NIS) that extends arbitrary-scale super-resolution. Our method\nestimates Fourier coefficients of images for quality-enhancing warps. Then, the\nsuggested model blends color mismatches and misalignment in the latent space\nand decodes the features into RGB values of stitched images. Our experiments\nshow that our approach achieves improvement in resolving the low-definition\nimaging of the previous deep image stitching with favorable accelerated\nimage-enhancing methods. Our source code is available at\nhttps://github.com/minshu-kim/NIS.\n","authors":["Minsu Kim","Jaewon Lee","Byeonghun Lee","Sunghoon Im","Kyong Hwan Jin"],"pdf_url":"https://arxiv.org/pdf/2309.01409v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11633v1","updated":"2024-01-22T00:00:30Z","published":"2024-01-22T00:00:30Z","title":"Zoom-shot: Fast and Efficient Unsupervised Zero-Shot Transfer of CLIP to\n Vision Encoders with Multimodal Loss","summary":" The fusion of vision and language has brought about a transformative shift in\ncomputer vision through the emergence of Vision-Language Models (VLMs).\nHowever, the resource-intensive nature of existing VLMs poses a significant\nchallenge. We need an accessible method for developing the next generation of\nVLMs. To address this issue, we propose Zoom-shot, a novel method for\ntransferring the zero-shot capabilities of CLIP to any pre-trained vision\nencoder. We do this by exploiting the multimodal information (i.e. text and\nimage) present in the CLIP latent space through the use of specifically\ndesigned multimodal loss functions. These loss functions are (1)\ncycle-consistency loss and (2) our novel prompt-guided knowledge distillation\nloss (PG-KD). PG-KD combines the concept of knowledge distillation with CLIP's\nzero-shot classification, to capture the interactions between text and image\nfeatures. 
With our multimodal losses, we train a $\\textbf{linear mapping}$\nbetween the CLIP latent space and the latent space of a pre-trained vision\nencoder, for only a $\\textbf{single epoch}$. Furthermore, Zoom-shot is entirely\nunsupervised and is trained using $\\textbf{unpaired}$ data. We test the\nzero-shot capabilities of a range of vision encoders augmented as new VLMs, on\ncoarse and fine-grained classification datasets, outperforming the previous\nstate-of-the-art in this problem domain. In our ablations, we find Zoom-shot\nallows for a trade-off between data and compute during training; and our\nstate-of-the-art results can be obtained by reducing training from 20% to 1% of\nthe ImageNet training data with 20 epochs. All code and models are available on\nGitHub.\n","authors":["Jordan Shipard","Arnold Wiliem","Kien Nguyen Thanh","Wei Xiang","Clinton Fookes"],"pdf_url":"https://arxiv.org/pdf/2401.11633v1.pdf","comment":"15 pages"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2204.11209v3","updated":"2024-01-22T14:13:11Z","published":"2022-04-24T07:18:04Z","title":"Hierarchical Locality Sensitive Hashing for Structured Data: A Survey","summary":" Data similarity (or distance) computation is a fundamental research topic\nwhich fosters a variety of similarity-based machine learning and data mining\napplications. In big data analytics, it is impractical to compute the exact\nsimilarity of data instances due to high computational cost. To this end, the\nLocality Sensitive Hashing (LSH) technique has been proposed to provide\naccurate estimators for various similarity measures between sets or vectors in\nan efficient manner without the learning process. Structured data (e.g.,\nsequences, trees and graphs), which are composed of elements and relations\nbetween the elements, are commonly seen in the real world, but the traditional\nLSH algorithms cannot preserve the structure information represented as\nrelations between elements. In order to conquer the issue, researchers have\nbeen devoted to the family of the hierarchical LSH algorithms. In this paper,\nwe explore the present progress of the research into hierarchical LSH from the\nfollowing perspectives: 1) Data structures, where we review various\nhierarchical LSH algorithms for three typical data structures and uncover their\ninherent connections; 2) Applications, where we review the hierarchical LSH\nalgorithms in multiple application scenarios; 3) Challenges, where we discuss\nsome potential challenges as future directions.\n","authors":["Wei Wu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2204.11209v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16034v2","updated":"2024-01-22T11:26:35Z","published":"2023-09-27T21:26:01Z","title":"Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale\n Localization","summary":" Advancements in nanotechnology and material science are paving the way toward\nnanoscale devices that combine sensing, computing, data and energy storage, and\nwireless communication. In precision medicine, these nanodevices show promise\nfor disease diagnostics, treatment, and monitoring from within the patients'\nbloodstreams. Assigning the location of a sensed biological event with the\nevent itself, which is the main proposition of flow-guided in-body nanoscale\nlocalization, would be immensely beneficial from the perspective of precision\nmedicine. 
The nanoscale nature of the nanodevices and the challenging\nenvironment that the bloodstream represents, result in current flow-guided\nlocalization approaches being constrained in their communication and\nenergy-related capabilities. The communication and energy constraints of the\nnanodevices result in different features of raw data for flow-guided\nlocalization, in turn affecting its performance. An analytical modeling of the\neffects of imperfect communication and constrained energy causing intermittent\noperation of the nanodevices on the raw data produced by the nanodevices would\nbe beneficial. Hence, we propose an analytical model of raw data for\nflow-guided localization, where the raw data is modeled as a function of\ncommunication and energy-related capabilities of the nanodevice. We evaluate\nthe model by comparing its output with the one obtained through the utilization\nof a simulator for objective evaluation of flow-guided localization, featuring\ncomparably higher level of realism. Our results across a number of scenarios\nand heterogeneous performance metrics indicate high similarity between the\nmodel and simulator-generated raw datasets.\n","authors":["Guillem Pascual","Filip Lemic","Carmen Delgado","Xavier Costa-Perez"],"pdf_url":"https://arxiv.org/pdf/2309.16034v2.pdf","comment":"6 pages, 7 figures, 4 tables, 16 references"},{"id":"http://arxiv.org/abs/2401.11800v1","updated":"2024-01-22T10:01:06Z","published":"2024-01-22T10:01:06Z","title":"Revisiting Document-Level Relation Extraction with Context-Guided Link\n Prediction","summary":" Document-level relation extraction (DocRE) poses the challenge of identifying\nrelationships between entities within a document as opposed to the traditional\nRE setting where a single sentence is input. Existing approaches rely on\nlogical reasoning or contextual cues from entities. This paper reframes\ndocument-level RE as link prediction over a knowledge graph with distinct\nbenefits: 1) Our approach combines entity context with document-derived logical\nreasoning, enhancing link prediction quality. 2) Predicted links between\nentities offer interpretability, elucidating employed reasoning. We evaluate\nour approach on three benchmark datasets: DocRED, ReDocRED, and DWIE. The\nresults indicate that our proposed method outperforms the state-of-the-art\nmodels and suggests that incorporating context-based link prediction techniques\ncan enhance the performance of document-level relation extraction models.\n","authors":["Monika Jain","Raghava Mutharaju","Ramakanth Kavuluru","Kuldeep Singh"],"pdf_url":"https://arxiv.org/pdf/2401.11800v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2305.19604v3","updated":"2024-01-22T08:13:50Z","published":"2023-05-31T07:22:15Z","title":"Medication Recommendation via Domain Knowledge Informed Deep Learning","summary":" Medication recommendation is a fundamental yet crucial branch of healthcare,\nwhich provides opportunities to support clinical physicians with more accurate\nmedication prescriptions for patients with complex health conditions. Learning\nfrom electronic health records (EHR) to recommend medications is the most\ncommon way in previous studies. However, most of them neglect incorporating\ndomain knowledge according to the clinical manifestations in the EHR of the\npatient. 
To address these issues, we propose a novel \\textbf{D}omain\n\\textbf{K}nowledge \\textbf{I}nformed \\textbf{Net}work (DKINet) to integrate\ndomain knowledge with observable clinical manifestations of the patient, which\nis the first dynamic domain knowledge informed framework toward medication\nrecommendation. In particular, we first design a knowledge-driven encoder to\ncapture the domain information and then develop a data-driven encoder to\nintegrate domain knowledge into the observable EHR. To endow the model with the\ncapability of temporal decision, we design an explicit medication encoder for\nlearning the longitudinal dependence of the patient. Extensive experiments on\nthree publicly available datasets verify the superiority of our method. The\ncode will be public upon acceptance.\n","authors":["Sicen Liu","Xiaolong Wang","Xianbing Zhao","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19604v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11742v1","updated":"2024-01-22T08:00:49Z","published":"2024-01-22T08:00:49Z","title":"Knowledge Navigation: Inferring the Interlocking Map of Knowledge from\n Research Trajectories","summary":" \"If I have seen further, it is by standing on the shoulders of giants,\" Isaac\nNewton's renowned statement hints that new knowledge builds upon existing\nfoundations, which means there exists an interdependent relationship between\nknowledge, which, yet uncovered, is implied in the historical development of\nscientific systems for hundreds of years. By leveraging natural language\nprocessing techniques, this study introduces an innovative embedding scheme\ndesigned to infer the \"knowledge interlocking map.\" This map, derived from the\nresearch trajectories of millions of scholars, reveals the intricate\nconnections among knowledge. We validate that the inferred map effectively\ndelineates disciplinary boundaries and captures the intricate relationships\nbetween diverse concepts. The utility of the interlocking map is showcased\nthrough multiple applications. Firstly, we demonstrated the multi-step analogy\ninferences within the knowledge space and the functional connectivity between\nconcepts in different disciplines. Secondly, we trace the evolution of\nknowledge across domains, observing trends such as shifts from \"Theoretical\" to\n\"Applied\" or \"Chemistry\" to \"Biomedical\" along predefined functional\ndirections. Lastly, by analyzing the high-dimensional knowledge network\nstructure, we found that knowledge connects each other with shorter global\npathways, and the interdisciplinary knowledge plays a critical role in\naccessibility of the global knowledge network. Our framework offers a novel\napproach to mining knowledge inheritance pathways in extensive scientific\nliterature, which is of great significance for understanding scientific\ndevelopment patterns, tailoring scientific learning trajectories, and\naccelerating scientific progress.\n","authors":["Shibing Xiang","Bing Liu","Yurui Huang","Chaolin Tian","Xin Jiang","Yifang Ma"],"pdf_url":"https://arxiv.org/pdf/2401.11742v1.pdf","comment":"28 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2304.01225v2","updated":"2024-01-22T06:31:50Z","published":"2023-04-02T07:25:01Z","title":"A greedy approach for increased vehicle utilization in ridesharing\n networks","summary":" In recent years, ridesharing platforms have become a prominent mode of\ntransportation for the residents of urban areas. 
As a fundamental problem,\nroute recommendation for these platforms is vital for their sustenance. The\nworks done in this direction have recommended routes with higher passenger\ndemand. Despite the existing works, statistics have suggested that these\nservices cause increased greenhouse emissions compared to private vehicles as\nthey roam around in search of riders. This analysis provides finer details\nregarding the functionality of ridesharing systems and it reveals that in the\nface of their boom, they have not utilized the vehicle capacity efficiently. We\npropose to overcome the above limitations and recommend routes that will fetch\nmultiple passengers simultaneously which will result in increased vehicle\nutilization and thereby decrease the effect of these systems on the\nenvironment. As route recommendation is NP-hard, we propose a k-hop-based\nsliding window approximation algorithm that reduces the search space from\nentire road network to a window. We further demonstrate that maximizing\nexpected demand is submodular and greedy algorithms can be used to optimize our\nobjective function within a window. We evaluate our proposed model on\nreal-world datasets and experimental results demonstrate superior performance\nby our proposed model.\n","authors":["Aqsa Ashraf Makhdomi","Iqra Altaf Gillani"],"pdf_url":"https://arxiv.org/pdf/2304.01225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11705v1","updated":"2024-01-22T06:12:48Z","published":"2024-01-22T06:12:48Z","title":"Domain-Aware Cross-Attention for Cross-domain Recommendation","summary":" Cross-domain recommendation (CDR) is an important method to improve\nrecommender system performance, especially when observations in target domains\nare sparse. However, most existing cross-domain recommendations fail to fully\nutilize the target domain's special features and are hard to be generalized to\nnew domains. The designed network is complex and is not suitable for rapid\nindustrial deployment. Our method introduces a two-step domain-aware\ncross-attention, extracting transferable features of the source domain from\ndifferent granularity, which allows the efficient expression of both domain and\nuser interests. In addition, we simplify the training process, and our model\ncan be easily deployed on new domains. We conduct experiments on both public\ndatasets and industrial datasets, and the experimental results demonstrate the\neffectiveness of our method. We have also deployed the model in an online\nadvertising system and observed significant improvements in both\nClick-Through-Rate (CTR) and effective cost per mille (ECPM).\n","authors":["Yuhao Luo","Shiwei Ma","Mingjun Nie","Changping Peng","Zhangang Lin","Jingping Shao","Qianfang Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11705v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2401.11648v1","updated":"2024-01-22T01:58:32Z","published":"2024-01-22T01:58:32Z","title":"Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal\n Contrastive EHR Modelling with Hierarchical Regularisation","summary":" Predicting next visit diagnosis using Electronic Health Records (EHR) is an\nessential task in healthcare, critical for devising proactive future plans for\nboth healthcare providers and patients. Nonetheless, many preceding studies\nhave not sufficiently addressed the heterogeneous and hierarchical\ncharacteristics inherent in EHR data, inevitably leading to sub-optimal\nperformance. 
To this end, we propose NECHO, a novel medical code-centric\nmultimodal contrastive EHR learning framework with hierarchical regularisation.\nFirst, we integrate multifaceted information encompassing medical codes,\ndemographics, and clinical notes using a tailored network design and a pair of\nbimodal contrastive losses, all of which pivot around a medical code\nrepresentation. We also regularise modality-specific encoders using a parental\nlevel information in medical ontology to learn hierarchical structure of EHR\ndata. A series of experiments on MIMIC-III data demonstrates effectiveness of\nour approach.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2401.11648v1.pdf","comment":"Accepted to EACL 2024 (The 18th Conference of the European Chapter of\n the Association for Computational Linguistics)"},{"id":"http://arxiv.org/abs/2306.16001v2","updated":"2024-01-22T00:27:45Z","published":"2023-06-28T08:20:35Z","title":"Streamlining Social Media Information Extraction for Public Health\n Research with Deep Learning","summary":" Objective: Social media-based public health research is crucial for epidemic\nsurveillance, but most studies identify relevant corpora with keyword matching.\nThis study develops a system to streamline the process of curating colloquial\nmedical dictionaries. We demonstrate the pipeline by curating a UMLS-colloquial\nsymptom dictionary from COVID-19-related tweets as proof of concept. Methods:\nCOVID-19-related tweets from February 1, 2020, to April 30, 2022 were used. The\npipeline includes three modules: a named entity recognition module to detect\nsymptoms in tweets; an entity normalization module to aggregate detected\nentities; and a mapping module that iteratively maps entities to Unified\nMedical Language System concepts. A random 500 entity sample were drawn from\nthe final dictionary for accuracy validation. Additionally, we conducted a\nsymptom frequency distribution analysis to compare our dictionary to a\npre-defined lexicon from previous research. Results: We identified 498,480\nunique symptom entity expressions from the tweets. Pre-processing reduces the\nnumber to 18,226. The final dictionary contains 38,175 unique expressions of\nsymptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). Symptom\ndistribution analysis found that our dictionary detects more symptoms and is\neffective at identifying psychiatric disorders like anxiety and depression,\noften missed by pre-defined lexicons. Conclusion: This study advances public\nhealth research by implementing a novel, systematic pipeline for curating\nsymptom lexicons from social media data. The final lexicon's high accuracy,\nvalidated by medical professionals, underscores the potential of this\nmethodology to reliably interpret and categorize vast amounts of unstructured\nsocial media data into actionable medical insights across diverse linguistic\nand regional landscapes.\n","authors":["Yining Hua","Shixu Lin","Minghui Li","Yujie Zhang","Dinah Foer","Siwen Wang","Peilin Zhou","Li Zhou","Jie Yang"],"pdf_url":"https://arxiv.org/pdf/2306.16001v2.pdf","comment":"Updated full paper. 
Abstract presented at IEEE ICHI 2023 and AMIA\n Annual Symposium 2023"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.12217v1","updated":"2024-01-22T18:59:29Z","published":"2024-01-22T18:59:29Z","title":"Exploring Simple Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation models aim to accurately assign a\nsemantic label to each pixel in an image from a set of arbitrary\nopen-vocabulary texts. In order to learn such pixel-level alignment, current\napproaches typically rely on a combination of (i) image-level VL model (e.g.\nCLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this\npaper, we introduce S-Seg, a novel model that can achieve surprisingly strong\nperformance without depending on any of the above elements. S-Seg leverages\npseudo-mask and language to train a MaskFormer, and can be easily trained from\npublicly available image-text datasets. Contrary to prior works, our model\ndirectly trains for pixel-level features and language alignment. Once trained,\nS-Seg generalizes well to multiple testing datasets without requiring\nfine-tuning. In addition, S-Seg has the extra benefits of scalability with data\nand consistently improvement when augmented with self-training. We believe that\nour simple yet effective approach will serve as a solid baseline for future\nresearch.\n","authors":["Zihang Lai"],"pdf_url":"https://arxiv.org/pdf/2401.12217v1.pdf","comment":"Code is available at: https://github.com/zlai0/S-Seg"},{"id":"http://arxiv.org/abs/2401.12216v1","updated":"2024-01-22T18:59:12Z","published":"2024-01-22T18:59:12Z","title":"Mitigating Covariate Shift in Misspecified Regression with Applications\n to Reinforcement Learning","summary":" A pervasive phenomenon in machine learning applications is distribution\nshift, where training and deployment conditions for a machine learning model\ndiffer. As distribution shift typically results in a degradation in\nperformance, much attention has been devoted to algorithmic interventions that\nmitigate these detrimental effects. In this paper, we study the effect of\ndistribution shift in the presence of model misspecification, specifically\nfocusing on $L_{\\infty}$-misspecified regression and adversarial covariate\nshift, where the regression target remains fixed while the covariate\ndistribution changes arbitrarily. We show that empirical risk minimization, or\nstandard least squares regression, can result in undesirable misspecification\namplification where the error due to misspecification is amplified by the\ndensity ratio between the training and testing distributions. 
As our main\nresult, we develop a new algorithm -- inspired by robust optimization\ntechniques -- that avoids this undesirable behavior, resulting in no\nmisspecification amplification while still obtaining optimal statistical rates.\nAs applications, we use this regression procedure to obtain new guarantees in\noffline and online reinforcement learning with misspecification and establish\nnew separations between previously studied structural conditions and notions of\ncoverage.\n","authors":["Philip Amortila","Tongyi Cao","Akshay Krishnamurthy"],"pdf_url":"https://arxiv.org/pdf/2401.12216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13507v2","updated":"2024-01-22T18:54:52Z","published":"2023-08-25T17:33:05Z","title":"Large Language Models Should Ask Clarifying Questions to Increase\n Confidence in Generated Code","summary":" Large language models (LLMs) have significantly improved the ability to\nperform tasks in the field of code generation. However, there is still a gap\nbetween LLMs being capable coders and being top-tier software engineers. Based\non the observation that toplevel software engineers often ask clarifying\nquestions to reduce ambiguity in both requirements and coding solutions, I\nargue that the same should be applied to LLMs for code generation tasks. By\nasking probing questions in various topics before generating the final code,\nthe challenges of programming with LLMs, such as unclear intent specification,\nlack of computational thinking, and undesired code quality, may be alleviated.\nThis, in turn, increases confidence in the generated code. In this work, I\nexplore how to leverage better communication skills to achieve greater\nconfidence in generated code. I propose a communication-centered process that\nuses an LLM-generated communicator to identify issues with high ambiguity or\nlow confidence in problem descriptions and generated code. I then ask\nclarifying questions to obtain responses from users for refining the code.\n","authors":["Jie JW Wu"],"pdf_url":"https://arxiv.org/pdf/2308.13507v2.pdf","comment":"6 pages, 2 figures, 1 table. Accepted and presented at the 7th Annual\n Symposium on Machine Programming (MAPS 2023 Workshop, see\n https://mapsworkshop.github.io/). Reference: \"Wu, Jie JW. Large Language\n Models Should Ask Clarifying Questions to Increase Confidence in Generated\n Code. The 7th Annual Symposium on Machine Programming (MAPS 23), December 3,\n 2023, San Francisco, CA, USA\""},{"id":"http://arxiv.org/abs/2401.03506v3","updated":"2024-01-22T18:53:36Z","published":"2024-01-07T14:54:57Z","title":"DiarizationLM: Speaker Diarization Post-Processing with Large Language\n Models","summary":" In this paper, we introduce DiarizationLM, a framework to leverage large\nlanguage models (LLM) to post-process the outputs from a speaker diarization\nsystem. Various goals can be achieved with the proposed framework, such as\nimproving the readability of the diarized transcript, or reducing the word\ndiarization error rate (WDER). In this framework, the outputs of the automatic\nspeech recognition (ASR) and speaker diarization systems are represented as a\ncompact textual format, which is included in the prompt to an optionally\nfinetuned LLM. The outputs of the LLM can be used as the refined diarization\nresults with the desired enhancement. As a post-processing step, this framework\ncan be easily applied to any off-the-shelf ASR and speaker diarization systems\nwithout retraining existing components. 
Our experiments show that a finetuned\nPaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone\nconversation dataset, and rel. 44.9% on the Callhome English dataset.\n","authors":["Quan Wang","Yiling Huang","Guanlong Zhao","Evan Clark","Wei Xia","Hank Liao"],"pdf_url":"https://arxiv.org/pdf/2401.03506v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12207v1","updated":"2024-01-22T18:49:56Z","published":"2024-01-22T18:49:56Z","title":"Rate-Distortion-Perception Tradeoff Based on the\n Conditional-Distribution Perception Measure","summary":" We study the rate-distortion-perception (RDP) tradeoff for a memoryless\nsource model in the asymptotic limit of large block-lengths. Our perception\nmeasure is based on a divergence between the distributions of the source and\nreconstruction sequences conditioned on the encoder output, which was first\nproposed in [1], [2]. We consider the case when there is no shared randomness\nbetween the encoder and the decoder. For the case of discrete memoryless\nsources we derive a single-letter characterization of the RDP function, thus\nsettling a problem that remains open for the marginal metric introduced in Blau\nand Michaeli [3] (with no shared randomness). Our achievability scheme is based\non lossy source coding with a posterior reference map proposed in [4]. For the\ncase of continuous valued sources under squared error distortion measure and\nsquared quadratic Wasserstein perception measure we also derive a single-letter\ncharacterization and show that a noise-adding mechanism at the decoder suffices\nto achieve the optimal representation. For the case of zero perception loss, we\nshow that our characterization interestingly coincides with the results for the\nmarginal metric derived in [5], [6] and again demonstrate that zero perception\nloss can be achieved with a $3$-dB penalty in the minimum distortion. Finally\nwe specialize our results to the case of Gaussian sources. We derive the RDP\nfunction for vector Gaussian sources and propose a waterfilling type solution.\nWe also partially characterize the RDP function for a mixture of vector\nGaussians.\n","authors":["Sadaf Salehkalaibar","Jun Chen","Ashish Khisti","Wei Yu"],"pdf_url":"https://arxiv.org/pdf/2401.12207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12205v1","updated":"2024-01-22T18:46:30Z","published":"2024-01-22T18:46:30Z","title":"Retrieval-Guided Reinforcement Learning for Boolean Circuit Minimization","summary":" Logic synthesis, a pivotal stage in chip design, entails optimizing chip\nspecifications encoded in hardware description languages like Verilog into\nhighly efficient implementations using Boolean logic gates. The process\ninvolves a sequential application of logic minimization heuristics (``synthesis\nrecipe\"), with their arrangement significantly impacting crucial metrics such\nas area and delay. Addressing the challenge posed by the broad spectrum of\ndesign complexities - from variations of past designs (e.g., adders and\nmultipliers) to entirely novel configurations (e.g., innovative processor\ninstructions) - requires a nuanced `synthesis recipe` guided by human expertise\nand intuition. This study conducts a thorough examination of learning and\nsearch techniques for logic synthesis, unearthing a surprising revelation:\npre-trained agents, when confronted with entirely novel designs, may veer off\ncourse, detrimentally affecting the search trajectory. 
We present ABC-RL, a\nmeticulously tuned $\\alpha$ parameter that adeptly adjusts recommendations from\npre-trained agents during the search process. Computed based on similarity\nscores through nearest neighbor retrieval from the training dataset, ABC-RL\nyields superior synthesis recipes tailored for a wide array of hardware\ndesigns. Our findings showcase substantial enhancements in the\nQuality-of-result (QoR) of synthesized circuits, boasting improvements of up to\n24.8% compared to state-of-the-art techniques. Furthermore, ABC-RL achieves an\nimpressive up to 9x reduction in runtime (iso-QoR) when compared to current\nstate-of-the-art methodologies.\n","authors":["Animesh Basak Chowdhury","Marco Romanelli","Benjamin Tan","Ramesh Karri","Siddharth Garg"],"pdf_url":"https://arxiv.org/pdf/2401.12205v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2401.12202v1","updated":"2024-01-22T18:42:20Z","published":"2024-01-22T18:42:20Z","title":"OK-Robot: What Really Matters in Integrating Open-Knowledge Models for\n Robotics","summary":" Remarkable progress has been made in recent years in the fields of vision,\nlanguage, and robotics. We now have vision models capable of recognizing\nobjects based on language queries, navigation systems that can effectively\ncontrol mobile systems, and grasping models that can handle a wide range of\nobjects. Despite these advancements, general-purpose applications of robotics\nstill lag behind, even though they rely on these fundamental capabilities of\nrecognition, navigation, and grasping. In this paper, we adopt a systems-first\napproach to develop a new Open Knowledge-based robotics framework called\nOK-Robot. By combining Vision-Language Models (VLMs) for object detection,\nnavigation primitives for movement, and grasping primitives for object\nmanipulation, OK-Robot offers a integrated solution for pick-and-drop\noperations without requiring any training. To evaluate its performance, we run\nOK-Robot in 10 real-world home environments. The results demonstrate that\nOK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks,\nrepresenting a new state-of-the-art in Open Vocabulary Mobile Manipulation\n(OVMM) with nearly 1.8x the performance of prior work. On cleaner, uncluttered\nenvironments, OK-Robot's performance increases to 82%. However, the most\nimportant insight gained from OK-Robot is the critical role of nuanced details\nwhen combining Open Knowledge systems like VLMs with robotic modules. Videos of\nour experiments are available on our website: https://ok-robot.github.io\n","authors":["Peiqi Liu","Yaswanth Orru","Chris Paxton","Nur Muhammad Mahi Shafiullah","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2401.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12200v1","updated":"2024-01-22T18:39:40Z","published":"2024-01-22T18:39:40Z","title":"APT: Adaptive Pruning and Tuning Pretrained Language Models for\n Efficient Training and Inference","summary":" Fine-tuning and inference with large Language Models (LM) are generally known\nto be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces\ntraining memory by updating a small number of LM parameters but does not\nimprove inference efficiency. Structured pruning improves LM inference\nefficiency by removing consistent parameter blocks, yet often increases\ntraining memory and time. To improve both training and inference efficiency, we\nintroduce APT that adaptively prunes and tunes parameters for the LMs. 
At the\nearly stage of fine-tuning, APT dynamically adds salient tuning parameters for\nfast and accurate convergence while discarding unimportant parameters for\nefficiency. Compared to baselines, our experiments show that APT maintains up\nto 98% task performance when pruning RoBERTa and T5 models with 40% parameters\nleft while keeping 86.4% LLaMA models' performance with 70% parameters\nremained. Furthermore, APT speeds up LMs fine-tuning by up to 8x and reduces\nlarge LMs memory training footprint by up to 70%.\n","authors":["Bowen Zhao","Hannaneh Hajishirzi","Qingqing Cao"],"pdf_url":"https://arxiv.org/pdf/2401.12200v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2401.12187v1","updated":"2024-01-22T18:27:08Z","published":"2024-01-22T18:27:08Z","title":"WARM: On the Benefits of Weight Averaged Reward Models","summary":" Aligning large language models (LLMs) with human preferences through\nreinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit\nfailures in the reward model (RM) to achieve seemingly high rewards without\nmeeting the underlying objectives. We identify two primary challenges when\ndesigning RMs to mitigate reward hacking: distribution shifts during the RL\nprocess and inconsistencies in human preferences. As a solution, we propose\nWeight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then\naveraging them in the weight space. This strategy follows the observation that\nfine-tuned weights remain linearly mode connected when sharing the same\npre-training. By averaging weights, WARM improves efficiency compared to the\ntraditional ensembling of predictions, while improving reliability under\ndistribution shifts and robustness to preference inconsistencies. Our\nexperiments on summarization tasks, using best-of-N and RL methods, shows that\nWARM improves the overall quality and alignment of LLM predictions; for\nexample, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy\nRL fine-tuned with a single RM.\n","authors":["Alexandre Ramé","Nino Vieillard","Léonard Hussenot","Robert Dadashi","Geoffrey Cideron","Olivier Bachem","Johan Ferret"],"pdf_url":"https://arxiv.org/pdf/2401.12187v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2401.10305v2","updated":"2024-01-22T18:12:20Z","published":"2024-01-18T13:18:51Z","title":"Personality Trait Inference Via Mobile Phone Sensors: A Machine Learning\n Approach","summary":" This study provides evidence that personality can be reliably predicted from\nactivity data collected through mobile phone sensors. Employing a set of well\ninformed indicators calculable from accelerometer records and movement\npatterns, we were able to predict users' personality up to a 0.78 F1 score on a\ntwo class problem. Given the fast growing number of data collected from mobile\nphones, our novel personality indicators open the door to exciting avenues for\nfuture research in social sciences. Our results reveal distinct behavioral\npatterns that proved to be differentially predictive of big five personality\ntraits. They potentially enable cost effective, questionnaire free\ninvestigation of personality related questions at an unprecedented scale. We\nshow how a combination of rich behavioral data obtained with smartphone sensing\nand the use of machine learning techniques can help to advance personality\nresearch and can inform both practitioners and researchers about the different\nbehavioral patterns of personality. 
These findings have practical implications\nfor organizations harnessing mobile sensor data for personality assessment,\nguiding the refinement of more precise and efficient prediction models in the\nfuture.\n","authors":["Wun Yung Shaney Sze","Maryglen Pearl Herrero","Roger Garriga"],"pdf_url":"https://arxiv.org/pdf/2401.10305v2.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2401.12181v1","updated":"2024-01-22T18:11:01Z","published":"2024-01-22T18:11:01Z","title":"Universal Neurons in GPT2 Language Models","summary":" A basic question within the emerging field of mechanistic interpretability is\nthe degree to which neural networks learn the same underlying mechanisms. In\nother words, are neural mechanisms universal across different models? In this\nwork, we study the universality of individual neurons across GPT2 models\ntrained from different initial random seeds, motivated by the hypothesis that\nuniversal neurons are likely to be interpretable. In particular, we compute\npairwise correlations of neuron activations over 100 million tokens for every\nneuron pair across five different seeds and find that 1-5\\% of neurons are\nuniversal, that is, pairs of neurons which consistently activate on the same\ninputs. We then study these universal neurons in detail, finding that they\nusually have clear interpretations and taxonomize them into a small number of\nneuron families. We conclude by studying patterns in neuron weights to\nestablish several universal functional roles of neurons in simple circuits:\ndeactivating attention heads, changing the entropy of the next token\ndistribution, and predicting the next token to (not) be within a particular\nset.\n","authors":["Wes Gurnee","Theo Horsley","Zifan Carl Guo","Tara Rezaei Kheirkhah","Qinyi Sun","Will Hathaway","Neel Nanda","Dimitris Bertsimas"],"pdf_url":"https://arxiv.org/pdf/2401.12181v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12179v1","updated":"2024-01-22T18:10:10Z","published":"2024-01-22T18:10:10Z","title":"DITTO: Diffusion Inference-Time T-Optimization for Music Generation","summary":" We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose\nframe-work for controlling pre-trained text-to-music diffusion models at\ninference-time via optimizing initial noise latents. Our method can be used to\noptimize through any differentiable feature matching loss to achieve a target\n(stylized) output and leverages gradient checkpointing for memory efficiency.\nWe demonstrate a surprisingly wide-range of applications for music generation\nincluding inpainting, outpainting, and looping as well as intensity, melody,\nand musical structure control - all without ever fine-tuning the underlying\nmodel. When we compare our approach against related training, guidance, and\noptimization-based methods, we find DITTO achieves state-of-the-art performance\non nearly all tasks, including outperforming comparable approaches on\ncontrollability, audio quality, and computational efficiency, thus opening the\ndoor for high-quality, flexible, training-free control of diffusion models.\nSound examples can be found at https://DITTO-Music.github.io/web/.\n","authors":["Zachary Novack","Julian McAuley","Taylor Berg-Kirkpatrick","Nicholas J. 
Bryan"],"pdf_url":"https://arxiv.org/pdf/2401.12179v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11359v2","updated":"2024-01-22T18:01:37Z","published":"2022-05-23T14:45:34Z","title":"Towards Size-Independent Generalization Bounds for Deep Operator Nets","summary":" In recent times machine learning methods have made significant advances in\nbecoming a useful tool for analyzing physical systems. A particularly active\narea in this theme has been \"physics-informed machine learning\" which focuses\non using neural nets for numerically solving differential equations. In this\nwork, we aim to advance the theory of measuring out-of-sample error while\ntraining DeepONets -- which is among the most versatile ways to solve PDE\nsystems in one-shot.\n Firstly, for a class of DeepONets, we prove a bound on their Rademacher\ncomplexity which does not explicitly scale with the width of the nets involved.\nSecondly, we use this to show how the Huber loss can be chosen so that for\nthese DeepONet classes generalization error bounds can be obtained that have no\nexplicit dependence on the size of the nets. We note that our theoretical\nresults apply to any PDE being targeted to be solved by DeepONets.\n","authors":["Pulkit Gopalani","Sayar Karmakar","Dibyakanti Kumar","Anirbit Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2205.11359v2.pdf","comment":"27 pages, 5 figures; Added theorem on generalization error indicating\n benefits of training DeepONets on the Huber loss and corresponding\n experiments"},{"id":"http://arxiv.org/abs/2401.12168v1","updated":"2024-01-22T18:01:01Z","published":"2024-01-22T18:01:01Z","title":"SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning\n Capabilities","summary":" Understanding and reasoning about spatial relationships is a fundamental\ncapability for Visual Question Answering (VQA) and robotics. While Vision\nLanguage Models (VLM) have demonstrated remarkable performance in certain VQA\nbenchmarks, they still lack capabilities in 3D spatial reasoning, such as\nrecognizing quantitative relationships of physical objects like distances or\nsize differences. We hypothesize that VLMs' limited spatial reasoning\ncapability is due to the lack of 3D spatial knowledge in training data and aim\nto solve this problem by training VLMs with Internet-scale spatial reasoning\ndata. To this end, we present a system to facilitate this approach. We first\ndevelop an automatic 3D spatial VQA data generation framework that scales up to\n2 billion VQA examples on 10 million real-world images. We then investigate\nvarious factors in the training recipe, including data quality, training\npipeline, and VLM architecture. Our work features the first internet-scale 3D\nspatial reasoning dataset in metric space. By training a VLM on such data, we\nsignificantly enhance its ability on both qualitative and quantitative spatial\nVQA. Finally, we demonstrate that this VLM unlocks novel downstream\napplications in chain-of-thought spatial reasoning and robotics due to its\nquantitative estimation capability. 
Project website:\nhttps://spatial-vlm.github.io/\n","authors":["Boyuan Chen","Zhuo Xu","Sean Kirmani","Brian Ichter","Danny Driess","Pete Florence","Dorsa Sadigh","Leonidas Guibas","Fei Xia"],"pdf_url":"https://arxiv.org/pdf/2401.12168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08573v2","updated":"2024-01-22T17:54:58Z","published":"2024-01-16T18:58:36Z","title":"Benchmarking the Robustness of Image Watermarks","summary":" This paper investigates the weaknesses of image watermarking techniques. We\npresent WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel\nbenchmark for assessing watermark robustness, overcoming the limitations of\ncurrent evaluation methods.WAVES integrates detection and identification tasks,\nand establishes a standardized evaluation protocol comprised of a diverse range\nof stress tests. The attacks in WAVES range from traditional image distortions\nto advanced and novel variations of diffusive, and adversarial attacks. Our\nevaluation examines two pivotal dimensions: the degree of image quality\ndegradation and the efficacy of watermark detection after attacks. We develop a\nseries of Performance vs. Quality 2D plots, varying over several prominent\nimage similarity metrics, which are then aggregated in a heuristically novel\nmanner to paint an overall picture of watermark robustness and attack potency.\nOur comprehensive evaluation reveals previously undetected vulnerabilities of\nseveral modern watermarking algorithms. We envision WAVES as a toolkit for the\nfuture development of robust watermarking systems. The project is available at\nhttps://wavesbench.github.io/\n","authors":["Bang An","Mucong Ding","Tahseen Rabbani","Aakriti Agrawal","Yuancheng Xu","Chenghao Deng","Sicheng Zhu","Abdirisak Mohamed","Yuxin Wen","Tom Goldstein","Furong Huang"],"pdf_url":"https://arxiv.org/pdf/2401.08573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12149v1","updated":"2024-01-22T17:36:23Z","published":"2024-01-22T17:36:23Z","title":"Personalized Over-the-Air Federated Learning with Personalized\n Reconfigurable Intelligent Surfaces","summary":" Over-the-air federated learning (OTA-FL) provides bandwidth-efficient\nlearning by leveraging the inherent superposition property of wireless\nchannels. Personalized federated learning balances performance for users with\ndiverse datasets, addressing real-life data heterogeneity. We propose the first\npersonalized OTA-FL scheme through multi-task learning, assisted by personal\nreconfigurable intelligent surfaces (RIS) for each user. We take a cross-layer\napproach that optimizes communication and computation resources for global and\npersonalized tasks in time-varying channels with imperfect channel state\ninformation, using multi-task learning for non-i.i.d data. Our PROAR-PFed\nalgorithm adaptively designs power, local iterations, and RIS configurations.\nWe present convergence analysis for non-convex objectives and demonstrate that\nPROAR-PFed outperforms state-of-the-art on the Fashion-MNIST dataset.\n","authors":["Jiayu Mao","Aylin Yener"],"pdf_url":"https://arxiv.org/pdf/2401.12149v1.pdf","comment":"Copyright 2024 IEEE. Published in ICASSP 2024, 14-19 April, Seoul,\n Korea. Personal use of this material is permitted. 
However, permission to\n reprint/republish this material for advertising or promotional purposes or\n for creating new collective works for resale or redistribution to servers or\n lists, or to reuse any copyrighted component of this work in other works,\n must be obtained from the IEEE"},{"id":"http://arxiv.org/abs/2401.12133v1","updated":"2024-01-22T17:15:02Z","published":"2024-01-22T17:15:02Z","title":"VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear\n Responses in VR Stand-up Interactive Games","summary":" Understanding and recognizing emotions are important and challenging issues\nin the metaverse era. Understanding, identifying, and predicting fear, which is\none of the fundamental human emotions, in virtual reality (VR) environments\nplays an essential role in immersive game development, scene development, and\nnext-generation virtual human-computer interaction applications. In this\narticle, we used VR horror games as a medium to analyze fear emotions by\ncollecting multi-modal data (posture, audio, and physiological signals) from 23\nplayers. We used an LSTM-based model to predict fear with accuracies of 65.31%\nand 90.47% under 6-level classification (no fear and five different levels of\nfear) and 2-level classification (no fear and fear), respectively. We\nconstructed a multi-modal natural behavior dataset of immersive human fear\nresponses (VRMN-bD) and compared it with existing relevant advanced datasets.\nThe results show that our dataset has fewer limitations in terms of collection\nmethod, data scale and audience scope. We are unique and advanced in targeting\nmulti-modal datasets of fear and behavior in VR stand-up interactive\nenvironments. Moreover, we discussed the implications of this work for\ncommunities and applications. The dataset and pre-trained model are available\nat https://github.com/KindOPSTAR/VRMN-bD.\n","authors":["He Zhang","Xinyang Li","Yuanxi Sun","Xinyi Fu","Christine Qiu","John M. Carroll"],"pdf_url":"https://arxiv.org/pdf/2401.12133v1.pdf","comment":"Accepted to IEEE VR 2024"},{"id":"http://arxiv.org/abs/2401.12132v1","updated":"2024-01-22T17:14:47Z","published":"2024-01-22T17:14:47Z","title":"Evaluation of QCNN-LSTM for Disability Forecasting in Multiple Sclerosis\n Using Sequential Multisequence MRI","summary":" Introduction Quantum Convolutional Neural Network (QCNN)-Long Short-Term\nMemory (LSTM) models were studied to provide sequential relationships for each\ntimepoint in MRIs of patients with Multiple Sclerosis (MS). In this pilot\nstudy, we compared three QCNN-LSTM models for binary classification of MS\ndisability benchmarked against classical neural network architectures. Our\nhypothesis is that quantum models will provide competitive performance. Methods\nMatrix Product State (MPS), reverse Multistate Entanglement Renormalization\nAnsatz (MERA), and Tree-Tensor Network (TTN) circuits were paired with LSTM\nlayer to process near-annual MRI data of patients diagnosed with MS. These were\nbenchmarked against a Visual Geometry Group (VGG)-LSTM and a Video Vision\nTransformer (ViViT). Predicted logits were measured against ground truth labels\nof each patient's Extended Disability Severity Score (EDSS) using binary\ncross-entropy loss. Training/validation/holdout testing was partitioned using\n5-fold cross validation with a total split of 60:20:20. Levene's test of\nvariance was used to measure statistical difference and Student's t-test for\npaired model differences in mean. 
Results The MPS-LSTM, reverse MERA-LSTM, and\nTTN-LSTM had holdout testing ROC-AUC of 0.70, 0.77, and 0.81, respectively\n(p-value 0.915). VGG16-LSTM and ViViT performed similarly with ROC-AUC of 0.73\nand 0.77, respectively (p-value 0.631). Overall variance and mean were not\nstatistically significant (p-value 0.713), however, time to train was\nsignificantly faster for the QCNN-LSTMs (39.4 sec per fold vs. 224 and 218,\nrespectively, p-value <0.001). Conclusion QCNN-LSTM models perform\ncompetitively to their classical counterparts with greater efficiency in train\ntime. Clinically, these can add value in terms of efficiency to time-dependent\ndeep learning prediction of disease progression based upon medical imaging.\n","authors":["John D. Mayfield","Issam El Naqa"],"pdf_url":"https://arxiv.org/pdf/2401.12132v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12131v1","updated":"2024-01-22T17:13:50Z","published":"2024-01-22T17:13:50Z","title":"NeuroSynt: A Neuro-symbolic Portfolio Solver for Reactive Synthesis","summary":" We introduce NeuroSynt, a neuro-symbolic portfolio solver framework for\nreactive synthesis. At the core of the solver lies a seamless integration of\nneural and symbolic approaches to solving the reactive synthesis problem. To\nensure soundness, the neural engine is coupled with model checkers verifying\nthe predictions of the underlying neural models. The open-source implementation\nof NeuroSynt provides an integration framework for reactive synthesis in which\nnew neural and state-of-the-art symbolic approaches can be seamlessly\nintegrated. Extensive experiments demonstrate its efficacy in handling\nchallenging specifications, enhancing the state-of-the-art reactive synthesis\nsolvers, with NeuroSynt contributing novel solves in the current SYNTCOMP\nbenchmarks.\n","authors":["Matthias Cosler","Christopher Hahn","Ayham Omar","Frederik Schmitt"],"pdf_url":"https://arxiv.org/pdf/2401.12131v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06144v2","updated":"2024-01-22T17:11:57Z","published":"2023-11-30T23:31:33Z","title":"DFU: scale-robust diffusion model for zero-shot super-resolution image\n generation","summary":" Diffusion generative models have achieved remarkable success in generating\nimages with a fixed resolution. However, existing models have limited ability\nto generalize to different resolutions when training data at those resolutions\nare not available. Leveraging techniques from operator learning, we present a\nnovel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the\nscore operator by combining both spatial and spectral information at multiple\nresolutions. Comparisons of DFU to baselines demonstrate its scalability: 1)\nsimultaneously training on multiple resolutions improves FID over training at\nany single fixed resolution; 2) DFU generalizes beyond its training\nresolutions, allowing for coherent, high-fidelity generation at\nhigher-resolutions with the same model, i.e. 
zero-shot super-resolution\nimage-generation; 3) we propose a fine-tuning strategy to further enhance the\nzero-shot super-resolution image-generation capability of our model, leading to\na FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which no\nother method can come close to achieving.\n","authors":["Alex Havrilla","Kevin Rojas","Wenjing Liao","Molei Tao"],"pdf_url":"https://arxiv.org/pdf/2401.06144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12129v1","updated":"2024-01-22T17:11:01Z","published":"2024-01-22T17:11:01Z","title":"Out-of-Distribution Detection & Applications With Ablated Learned\n Temperature Energy","summary":" As deep neural networks become adopted in high-stakes domains, it is crucial\nto be able to identify when inference inputs are Out-of-Distribution (OOD) so\nthat users can be alerted of likely drops in performance and calibration\ndespite high confidence. Among many others, existing methods use the following\ntwo scores to do so without training on any apriori OOD examples: a learned\ntemperature and an energy score. In this paper we introduce Ablated Learned\nTemperature Energy (or \"AbeT\" for short), a method which combines these prior\nmethods in novel ways with effective modifications. Due to these contributions,\nAbeT lowers the False Positive Rate at $95\\%$ True Positive Rate (FPR@95) by\n$35.39\\%$ in classification (averaged across all ID and OOD datasets measured)\ncompared to state of the art without training networks in multiple stages or\nrequiring hyperparameters or test-time backward passes. We additionally provide\nempirical insights as to how our model learns to distinguish between\nIn-Distribution (ID) and OOD samples while only being explicitly trained on ID\nsamples via exposure to misclassified ID examples at training time. Lastly, we\nshow the efficacy of our method in identifying predicted bounding boxes and\npixels corresponding to OOD objects in object detection and semantic\nsegmentation, respectively - with an AUROC increase of $5.15\\%$ in object\ndetection and both a decrease in FPR@95 of $41.48\\%$ and an increase in AUPRC\nof $34.20\\%$ on average in semantic segmentation compared to previous state of\nthe art.\n","authors":["Will LeVine","Benjamin Pikus","Jacob Phillips","Berk Norman","Fernando Amat Gil","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2401.12129v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.15462v2","updated":"2024-01-22T17:02:16Z","published":"2023-09-27T07:57:37Z","title":"DTC: Deep Tracking Control","summary":" Legged locomotion is a complex control problem that requires both accuracy\nand robustness to cope with real-world challenges. Legged systems have\ntraditionally been controlled using trajectory optimization with inverse\ndynamics. Such hierarchical model-based methods are appealing due to intuitive\ncost function tuning, accurate planning, generalization, and most importantly,\nthe insightful understanding gained from more than one decade of extensive\nresearch. However, model mismatch and violation of assumptions are common\nsources of faulty operation. Simulation-based reinforcement learning, on the\nother hand, results in locomotion policies with unprecedented robustness and\nrecovery skills. Yet, all learning algorithms struggle with sparse rewards\nemerging from environments where valid footholds are rare, such as gaps or\nstepping stones. 
In this work, we propose a hybrid control architecture that\ncombines the advantages of both worlds to simultaneously achieve greater\nrobustness, foot-placement accuracy, and terrain generalization. Our approach\nutilizes a model-based planner to roll out a reference motion during training.\nA deep neural network policy is trained in simulation, aiming to track the\noptimized footholds. We evaluate the accuracy of our locomotion pipeline on\nsparse terrains, where pure data-driven methods are prone to fail. Furthermore,\nwe demonstrate superior robustness in the presence of slippery or deformable\nground when compared to model-based counterparts. Finally, we show that our\nproposed tracking controller generalizes across different trajectory\noptimization methods not seen during training. In conclusion, our work unites\nthe predictive capabilities and optimality guarantees of online planning with\nthe inherent robustness attributed to offline learning.\n","authors":["Fabian Jenelten","Junzhe He","Farbod Farshidian","Marco Hutter"],"pdf_url":"https://arxiv.org/pdf/2309.15462v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12113v1","updated":"2024-01-22T16:51:01Z","published":"2024-01-22T16:51:01Z","title":"Extracting Formulae in Many-Valued Logic from Deep Neural Networks","summary":" We propose a new perspective on deep ReLU networks, namely as circuit\ncounterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV)\ngeneralization of Boolean logic. An algorithm for extracting formulae in MV\nlogic from deep ReLU networks is presented. As the algorithm applies to\nnetworks with general, in particular also real-valued, weights, it can be used\nto extract logical formulae from deep ReLU networks trained on data.\n","authors":["Yani Zhang","Helmut Bölcskei"],"pdf_url":"https://arxiv.org/pdf/2401.12113v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12108v1","updated":"2024-01-22T16:45:15Z","published":"2024-01-22T16:45:15Z","title":"On-Time Delivery in Crowdshipping Systems: An Agent-Based Approach Using\n Streaming Data","summary":" In parcel delivery, the \"last mile\" from the parcel hub to the customer is\ncostly, especially for time-sensitive delivery tasks that have to be completed\nwithin hours after arrival. Recently, crowdshipping has attracted increased\nattention as a new alternative to traditional delivery modes. In crowdshipping,\nprivate citizens (\"the crowd\") perform short detours in their daily lives to\ncontribute to parcel delivery in exchange for small incentives. However,\nachieving desirable crowd behavior is challenging as the crowd is highly\ndynamic and consists of autonomous, self-interested individuals. Leveraging\ncrowdshipping for time-sensitive deliveries remains an open challenge. In this\npaper, we present an agent-based approach to on-time parcel delivery with\ncrowds. Our system performs data stream processing on the couriers' smartphone\nsensor data to predict delivery delays. Whenever a delay is predicted, the\nsystem attempts to forge an agreement for transferring the parcel from the\ncurrent deliverer to a more promising courier nearby. 
Our experiments show that\nthrough accurate delay predictions and purposeful task transfers many delays\ncan be prevented that would occur without our approach.\n","authors":["Jeremias Dötterl","Ralf Bruns","Jürgen Dunkel","Sascha Ossowski"],"pdf_url":"https://arxiv.org/pdf/2401.12108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12103v1","updated":"2024-01-22T16:38:33Z","published":"2024-01-22T16:38:33Z","title":"LearnedWMP: Workload Memory Prediction Using Distribution of Query\n Templates","summary":" In a modern DBMS, working memory is frequently the limiting factor when\nprocessing in-memory analytic query operations such as joins, sorting, and\naggregation. Existing resource estimation approaches for a DBMS estimate the\nresource consumption of a query by computing an estimate of each individual\ndatabase operator in the query execution plan. Such an approach is slow and\nerror-prone as it relies upon simplifying assumptions, such as uniformity and\nindependence of the underlying data. Additionally, the existing approach\nfocuses on individual queries separately and does not factor in other queries\nin the workload that may be executed concurrently. In this research, we are\ninterested in query performance optimization under concurrent execution of a\nbatch of queries (a workload). Specifically, we focus on predicting the memory\ndemand for a workload rather than providing separate estimates for each query\nwithin it. We introduce the problem of workload memory prediction and formalize\nit as a distribution regression problem. We propose Learned Workload Memory\nPrediction (LearnedWMP) to improve and simplify estimating the working memory\ndemands of workloads. Through a comprehensive experimental evaluation, we show\nthat LearnedWMP reduces the memory estimation error of the\nstate-of-the-practice method by up to 47.6%. Compared to an alternative\nsingle-query model, during training and inferencing, the LearnedWMP model and\nits variants were 3x to 10x faster. Moreover, LearnedWMP-based models were at\nleast 50% smaller in most cases. Overall, the results demonstrate the\nadvantages of the LearnedWMP approach and its potential for a broader impact on\nquery performance optimization.\n","authors":["Shaikh Quader","Andres Jaramillo","Sumona Mukhopadhyay","Ghadeer Abuoda","Calisto Zuzarte","David Kalmuk","Marin Litoiu","Manos Papagelis"],"pdf_url":"https://arxiv.org/pdf/2401.12103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17028v2","updated":"2024-01-22T16:25:13Z","published":"2023-05-26T15:36:59Z","title":"Better Batch for Deep Probabilistic Time Series Forecasting","summary":" Deep probabilistic time series forecasting has gained significant attention\ndue to its superior performance in nonlinear approximation and its ability to\nprovide valuable uncertainty quantification for decision-making tasks. However,\nmany existing models oversimplify the problem by assuming that the error\nprocess is time-independent, thereby overlooking the serial correlation in the\nerror process. To overcome this limitation, we propose an innovative training\nmethod that incorporates error autocorrelation to further enhance the accuracy\nof probabilistic forecasting. Our method involves constructing a mini-batch as\na collection of $D$ consecutive time series segments for model training and\nexplicitly learning a time-varying covariance matrix over each mini-batch that\nencodes the error correlation among adjacent time steps. 
The learned covariance\nmatrix can be used to improve prediction accuracy and enhance uncertainty\nquantification. We evaluate our method on two different neural forecasting\nmodels and multiple public datasets, and the experimental results confirm the\neffectiveness of the proposed approach in enhancing the performance of both\nmodels across a wide range of datasets, yielding notable improvements in\npredictive accuracy.\n","authors":["Vincent Zhihao Zheng","Seongjin Choi","Lijun Sun"],"pdf_url":"https://arxiv.org/pdf/2305.17028v2.pdf","comment":"9 pages, 3 figures, camera-ready version, The 27th International\n Conference on Artificial Intelligence and Statistics (AISTATS 2024)"},{"id":"http://arxiv.org/abs/2401.12086v1","updated":"2024-01-22T16:24:43Z","published":"2024-01-22T16:24:43Z","title":"West-of-N: Synthetic Preference Generation for Improved Reward Modeling","summary":" The success of reinforcement learning from human feedback (RLHF) in language\nmodel alignment is strongly dependent on the quality of the underlying reward\nmodel. In this paper, we present a novel approach to improve reward model\nquality by generating synthetic preference data, thereby augmenting the\ntraining dataset with on-policy, high-quality preference pairs. Motivated by\nthe promising results of Best-of-N sampling strategies in language model\ntraining, we extend their application to reward model training. This results in\na self-training strategy to generate preference pairs by selecting the best and\nworst candidates in a pool of responses to a given query. Empirically, we find\nthat this approach improves the performance of any reward model, with an effect\ncomparable to the addition of a similar quantity of human preference data. This\nwork opens up new avenues of research for improving RLHF for language model\nalignment, by offering synthetic preference generation as a solution to reward\nmodeling challenges.\n","authors":["Alizée Pace","Jonathan Mallinson","Eric Malmi","Sebastian Krause","Aliaksei Severyn"],"pdf_url":"https://arxiv.org/pdf/2401.12086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12079v1","updated":"2024-01-22T16:21:19Z","published":"2024-01-22T16:21:19Z","title":"Collaborative Reinforcement Learning Based Unmanned Aerial Vehicle (UAV)\n Trajectory Design for 3D UAV Tracking","summary":" In this paper, the problem of using one active unmanned aerial vehicle (UAV)\nand four passive UAVs to localize a 3D target UAV in real time is investigated.\nIn the considered model, each passive UAV receives reflection signals from the\ntarget UAV, which are initially transmitted by the active UAV. The received\nreflection signals allow each passive UAV to estimate the signal transmission\ndistance which will be transmitted to a base station (BS) for the estimation of\nthe position of the target UAV. Due to the movement of the target UAV, each\nactive/passive UAV must optimize its trajectory to continuously localize the\ntarget UAV. Meanwhile, since the accuracy of the distance estimation depends on\nthe signal-to-noise ratio of the transmission signals, the active UAV must\noptimize its transmit power. This problem is formulated as an optimization\nproblem whose goal is to jointly optimize the transmit power of the active UAV\nand trajectories of both active and passive UAVs so as to maximize the target\nUAV positioning accuracy. To solve this problem, a Z function decomposition\nbased reinforcement learning (ZD-RL) method is proposed. 
Compared to value\nfunction decomposition based RL (VD-RL), the proposed method can find the\nprobability distribution of the sum of future rewards to accurately estimate\nthe expected value of the sum of future rewards thus finding better transmit\npower of the active UAV and trajectories for both active and passive UAVs and\nimproving target UAV positioning accuracy. Simulation results show that the\nproposed ZD-RL method can reduce the positioning errors by up to 39.4% and\n64.6%, compared to VD-RL and independent deep RL methods, respectively.\n","authors":["Yujiao Zhu","Mingzhe Chen","Sihua Wang","Ye Hu","Yuchen Liu","Changchuan Yin"],"pdf_url":"https://arxiv.org/pdf/2401.12079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12070v1","updated":"2024-01-22T16:09:47Z","published":"2024-01-22T16:09:47Z","title":"Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated\n Text","summary":" Detecting text generated by modern large language models is thought to be\nhard, as both LLMs and humans can exhibit a wide range of complex behaviors.\nHowever, we find that a score based on contrasting two closely related language\nmodels is highly accurate at separating human-generated and machine-generated\ntext. Based on this mechanism, we propose a novel LLM detector that only\nrequires simple calculations using a pair of pre-trained LLMs. The method,\ncalled Binoculars, achieves state-of-the-art accuracy without any training\ndata. It is capable of spotting machine text from a range of modern LLMs\nwithout any model-specific modifications. We comprehensively evaluate\nBinoculars on a number of text sources and in varied situations. Over a wide\nrange of document types, Binoculars detects over 90% of generated samples from\nChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being\ntrained on any ChatGPT data.\n","authors":["Abhimanyu Hans","Avi Schwarzschild","Valeriia Cherepanova","Hamid Kazemi","Aniruddha Saha","Micah Goldblum","Jonas Geiping","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2401.12070v1.pdf","comment":"20 pages, code available at https://github.com/ahans30/Binoculars"},{"id":"http://arxiv.org/abs/2401.12069v1","updated":"2024-01-22T16:08:41Z","published":"2024-01-22T16:08:41Z","title":"Beyond TreeSHAP: Efficient Computation of Any-Order Shapley Interactions\n for Tree Ensembles","summary":" While shallow decision trees may be interpretable, larger ensemble models\nlike gradient-boosted trees, which often set the state of the art in machine\nlearning problems involving tabular data, still remain black box models. As a\nremedy, the Shapley value (SV) is a well-known concept in explainable\nartificial intelligence (XAI) research for quantifying additive feature\nattributions of predictions. The model-specific TreeSHAP methodology solves the\nexponential complexity for retrieving exact SVs from tree-based models.\nExpanding beyond individual feature attribution, Shapley interactions reveal\nthe impact of intricate feature interactions of any order. In this work, we\npresent TreeSHAP-IQ, an efficient method to compute any-order additive Shapley\ninteractions for predictions of tree-based models. TreeSHAP-IQ is supported by\na mathematical framework that exploits polynomial arithmetic to compute the\ninteraction scores in a single recursive traversal of the tree, akin to Linear\nTreeSHAP. 
We apply TreeSHAP-IQ on state-of-the-art tree ensembles and explore\ninteractions on well-established benchmark datasets.\n","authors":["Maximilian Muschalik","Fabian Fumagalli","Barbara Hammer","Eyke Hüllermeier"],"pdf_url":"https://arxiv.org/pdf/2401.12069v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12068v1","updated":"2024-01-22T16:05:30Z","published":"2024-01-22T16:05:30Z","title":"Resource-constrained stereo singing voice cancellation","summary":" We study the problem of stereo singing voice cancellation, a subtask of music\nsource separation, whose goal is to estimate an instrumental background from a\nstereo mix. We explore how to achieve performance similar to large\nstate-of-the-art source separation networks starting from a small, efficient\nmodel for real-time speech separation. Such a model is useful when memory and\ncompute are limited and singing voice processing has to run with limited\nlook-ahead. In practice, this is realised by adapting an existing mono model to\nhandle stereo input. Improvements in quality are obtained by tuning model\nparameters and expanding the training set. Moreover, we highlight the benefits\na stereo model brings by introducing a new metric which detects attenuation\ninconsistencies between channels. Our approach is evaluated using objective\noffline metrics and a large-scale MUSHRA trial, confirming the effectiveness of\nour techniques in stringent listening tests.\n","authors":["Clara Borrelli","James Rae","Dogac Basaran","Matt McVicar","Mehrez Souden","Matthias Mauch"],"pdf_url":"https://arxiv.org/pdf/2401.12068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12058v1","updated":"2024-01-22T15:50:32Z","published":"2024-01-22T15:50:32Z","title":"The Dimension Strikes Back with Gradients: Generalization of Gradient\n Methods in Stochastic Convex Optimization","summary":" We study the generalization performance of gradient methods in the\nfundamental stochastic convex optimization setting, focusing on its dimension\ndependence. First, for full-batch gradient descent (GD) we give a construction\nof a learning problem in dimension $d=O(n^2)$, where the canonical version of\nGD (tuned for optimal performance of the empirical risk) trained with $n$\ntraining examples converges, with constant probability, to an approximate\nempirical risk minimizer with $\\Omega(1)$ population excess risk. Our bound\ntranslates to a lower bound of $\\Omega (\\sqrt{d})$ on the number of training\nexamples required for standard GD to reach a non-trivial test error, answering\nan open question raised by Feldman (2016) and Amir, Koren, and Livni (2021b)\nand showing that a non-trivial dimension dependence is unavoidable.\nFurthermore, for standard one-pass stochastic gradient descent (SGD), we show\nthat an application of the same construction technique provides a similar\n$\\Omega(\\sqrt{d})$ lower bound for the sample complexity of SGD to reach a\nnon-trivial empirical error, despite achieving optimal test performance. 
This\nagain provides an exponential improvement in the dimension dependence compared\nto previous work (Koren, Livni, Mansour, and Sherman, 2022), resolving an open\nquestion left therein.\n","authors":["Matan Schliserman","Uri Sherman","Tomer Koren"],"pdf_url":"https://arxiv.org/pdf/2401.12058v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12055v1","updated":"2024-01-22T15:47:05Z","published":"2024-01-22T15:47:05Z","title":"NEUROSEC: FPGA-Based Neuromorphic Audio Security","summary":" Neuromorphic systems, inspired by the complexity and functionality of the\nhuman brain, have gained interest in academic and industrial attention due to\ntheir unparalleled potential across a wide range of applications. While their\ncapabilities herald innovation, it is imperative to underscore that these\ncomputational paradigms, analogous to their traditional counterparts, are not\nimpervious to security threats. Although the exploration of neuromorphic\nmethodologies for image and video processing has been rigorously pursued, the\nrealm of neuromorphic audio processing remains in its early stages. Our results\nhighlight the robustness and precision of our FPGA-based neuromorphic system.\nSpecifically, our system showcases a commendable balance between desired signal\nand background noise, efficient spike rate encoding, and unparalleled\nresilience against adversarial attacks such as FGSM and PGD. A standout feature\nof our framework is its detection rate of 94%, which, when compared to other\nmethodologies, underscores its greater capability in identifying and mitigating\nthreats within 5.39 dB, a commendable SNR ratio. Furthermore, neuromorphic\ncomputing and hardware security serve many sensor domains in mission-critical\nand privacy-preserving applications.\n","authors":["Murat Isik","Hiruna Vishwamith","Yusuf Sur","Kayode Inadagbo","I. Can Dikmen"],"pdf_url":"https://arxiv.org/pdf/2401.12055v1.pdf","comment":"Audio processing, FPGA, Hardware Security, Neuromorphic Computing"},{"id":"http://arxiv.org/abs/2401.12046v1","updated":"2024-01-22T15:38:29Z","published":"2024-01-22T15:38:29Z","title":"Fourier Transporter: Bi-Equivariant Robotic Manipulation in 3D","summary":" Many complex robotic manipulation tasks can be decomposed as a sequence of\npick and place actions. Training a robotic agent to learn this sequence over\nmany different starting conditions typically requires many iterations or\ndemonstrations, especially in 3D environments. In this work, we propose Fourier\nTransporter (\\ours{}) which leverages the two-fold $\\SE(d)\\times\\SE(d)$\nsymmetry in the pick-place problem to achieve much higher sample efficiency.\n\\ours{} is an open-loop behavior cloning method trained using expert\ndemonstrations to predict pick-place actions on new environments. \\ours{} is\nconstrained to incorporate symmetries of the pick and place actions\nindependently. Our method utilizes a fiber space Fourier transformation that\nallows for memory-efficient construction. 
We test our proposed network on the\nRLbench benchmark and achieve state-of-the-art results across various tasks.\n","authors":["Haojie Huang","Owen Howell","Xupeng Zhu","Dian Wang","Robin Walters","Robert Platt"],"pdf_url":"https://arxiv.org/pdf/2401.12046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08865v2","updated":"2024-01-22T15:30:08Z","published":"2024-01-16T22:36:23Z","title":"The Effect of Intrinsic Dataset Properties on Generalization: Unraveling\n Learning Differences Between Natural and Medical Images","summary":" This paper investigates discrepancies in how neural networks learn from\ndifferent imaging domains, which are commonly overlooked when adopting computer\nvision techniques from the domain of natural images to other specialized\ndomains such as medical images. Recent works have found that the generalization\nerror of a trained network typically increases with the intrinsic dimension\n($d_{data}$) of its training set. Yet, the steepness of this relationship\nvaries significantly between medical (radiological) and natural imaging\ndomains, with no existing theoretical explanation. We address this gap in\nknowledge by establishing and empirically validating a generalization scaling\nlaw with respect to $d_{data}$, and propose that the substantial scaling\ndiscrepancy between the two considered domains may be at least partially\nattributed to the higher intrinsic \"label sharpness\" ($K_F$) of medical imaging\ndatasets, a metric which we propose. Next, we demonstrate an additional benefit\nof measuring the label sharpness of a training set: it is negatively correlated\nwith the trained model's adversarial robustness, which notably leads to models\nfor medical images having a substantially higher vulnerability to adversarial\nattack. Finally, we extend our $d_{data}$ formalism to the related metric of\nlearned representation intrinsic dimension ($d_{repr}$), derive a\ngeneralization scaling law with respect to $d_{repr}$, and show that $d_{data}$\nserves as an upper bound for $d_{repr}$. Our theoretical results are supported\nby thorough experiments with six models and eleven natural and medical imaging\ndatasets over a range of training set sizes. Our findings offer insights into\nthe influence of intrinsic dataset properties on generalization, representation\nlearning, and robustness in deep neural networks.\n","authors":["Nicholas Konz","Maciej A. Mazurowski"],"pdf_url":"https://arxiv.org/pdf/2401.08865v2.pdf","comment":"ICLR 2024. Code:\n https://github.com/mazurowski-lab/intrinsic-properties"},{"id":"http://arxiv.org/abs/2401.12033v1","updated":"2024-01-22T15:19:18Z","published":"2024-01-22T15:19:18Z","title":"Momentum-SAM: Sharpness Aware Minimization without Computational\n Overhead","summary":" The recently proposed optimization algorithm for deep neural networks\nSharpness Aware Minimization (SAM) suggests perturbing parameters before\ngradient calculation by a gradient ascent step to guide the optimization into\nparameter space regions of flat loss. While significant generalization\nimprovements and thus reduction of overfitting could be demonstrated, the\ncomputational costs are doubled due to the additionally needed gradient\ncalculation, making SAM unfeasible in case of limited computationally\ncapacities. 
Motivated by Nesterov Accelerated Gradient (NAG), we propose\nMomentum-SAM (MSAM), which perturbs parameters in the direction of the\naccumulated momentum vector to achieve low sharpness without significant\ncomputational overhead or memory demands over SGD or Adam. We evaluate MSAM in\ndetail and reveal insights on separable mechanisms of NAG, SAM and MSAM\nregarding training optimization and generalization. Code is available at\nhttps://github.com/MarlonBecker/MSAM.\n","authors":["Marlon Becker","Frederick Altrock","Benjamin Risse"],"pdf_url":"https://arxiv.org/pdf/2401.12033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12024v1","updated":"2024-01-22T15:11:57Z","published":"2024-01-22T15:11:57Z","title":"Multimodal Visual-Tactile Representation Learning through\n Self-Supervised Contrastive Pre-Training","summary":" The rapidly evolving field of robotics necessitates methods that can\nfacilitate the fusion of multiple modalities. Specifically, when it comes to\ninteracting with tangible objects, effectively combining visual and tactile\nsensory data is key to understanding and navigating the complex dynamics of the\nphysical world, enabling a more nuanced and adaptable response to changing\nenvironments. Nevertheless, much of the earlier work in merging these two\nsensory modalities has relied on supervised methods utilizing datasets labeled\nby humans. This paper introduces MViTac, a novel methodology that leverages\ncontrastive learning to integrate vision and touch sensations in a\nself-supervised fashion. By availing itself of both sensory inputs, MViTac leverages\nintra- and inter-modality losses for learning representations, resulting in\nenhanced material property classification and more adept grasping prediction.\nThrough a series of experiments, we showcase the effectiveness of our method\nand its superiority over existing state-of-the-art self-supervised and\nsupervised techniques. In evaluating our methodology, we focus on two distinct\ntasks: material classification and grasping success prediction. Our results\nindicate that MViTac facilitates the development of improved modality encoders,\nyielding more robust representations as evidenced by linear probing\nassessments.\n","authors":["Vedant Dave","Fotios Lygerakis","Elmar Rueckert"],"pdf_url":"https://arxiv.org/pdf/2401.12024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13141v2","updated":"2024-01-22T15:07:26Z","published":"2023-12-20T16:02:25Z","title":"Augment on Manifold: Mixup Regularization with UMAP","summary":" Data augmentation techniques play an important role in enhancing the\nperformance of deep learning models. Despite their proven benefits in computer\nvision tasks, their application in other domains remains limited. This\npaper proposes a Mixup regularization scheme, referred to as UMAP Mixup,\ndesigned for ``on-manifold\" automated data augmentation for deep learning\npredictive models. The proposed approach ensures that the Mixup operations\nresult in synthesized samples that lie on the data manifold of the features and\nlabels by utilizing a dimensionality reduction technique known as uniform\nmanifold approximation and projection. 
Evaluations across diverse regression\ntasks show that UMAP Mixup is competitive with or outperforms other Mixup\nvariants, showing promise as an effective tool for enhancing the\ngeneralization performance of deep learning models.\n","authors":["Yousef El-Laham","Elizabeth Fons","Dillon Daudert","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13141v2.pdf","comment":"accepted paper to be published in the proceedings of ICASSP 2024"},{"id":"http://arxiv.org/abs/2311.14212v3","updated":"2024-01-22T15:05:30Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations. We introduce the term annotation sensitivity to refer to the\nimpact of annotation data collection methods on the annotations themselves and\non downstream model performance and predictions. We collect annotations of hate\nspeech and offensive language in five experimental conditions of an annotation\ninstrument, randomly assigning annotators to conditions. We then fine-tune BERT\nmodels on each of the five resulting datasets and evaluate model performance on\na holdout portion of each condition. We find considerable differences between\nthe conditions for 1) the share of hate speech/offensive language annotations,\n2) model performance, 3) model predictions, and 4) model learning curves. Our\nresults emphasize the crucial role played by the annotation instrument, which\nhas received little attention in the machine learning literature. We call for\nadditional research into how and why the instrument impacts the annotations to\ninform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v3.pdf","comment":"EMNLP 2023 Findings:\n https://aclanthology.org/2023.findings-emnlp.992/"},{"id":"http://arxiv.org/abs/2312.13152v2","updated":"2024-01-22T15:04:57Z","published":"2023-12-20T16:16:29Z","title":"Neural Stochastic Differential Equations with Change Points: A\n Generative Adversarial Approach","summary":" Stochastic differential equations (SDEs) have been widely used to model real\nworld random phenomena. Existing works mainly focus on the case where the time\nseries is modeled by a single SDE, which might be restrictive for modeling time\nseries with distributional shift. In this work, we propose a change point\ndetection algorithm for time series modeled as neural SDEs. Given a time series\ndataset, the proposed method jointly learns the unknown change points and the\nparameters of distinct neural SDE models corresponding to each change point.\nSpecifically, the SDEs are learned under the framework of generative\nadversarial networks (GANs) and the change points are detected based on the\noutput of the GAN discriminator in a forward pass. At each step of the proposed\nalgorithm, the change points and the SDE model parameters are updated in an\nalternating fashion. 
Numerical results on both synthetic and real datasets are\nprovided to validate the performance of our algorithm in comparison to\nclassical change point detection benchmarks, standard GAN-based neural SDEs,\nand other state-of-the-art deep generative models for time series data.\n","authors":["Zhongchang Sun","Yousef El-Laham","Svitlana Vyetrenko"],"pdf_url":"https://arxiv.org/pdf/2312.13152v2.pdf","comment":"accepted paper to be published in the proceedings of ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.12014v1","updated":"2024-01-22T15:00:32Z","published":"2024-01-22T15:00:32Z","title":"Robustness to distribution shifts of compressed networks for edge\n devices","summary":" It is necessary to develop efficient DNNs deployed on edge devices with\nlimited computation resources. However, the compressed networks often execute\nnew tasks in the target domain, which is different from the source domain where\nthe original network is trained. It is important to investigate the robustness\nof compressed networks in two types of data distribution shifts: domain shifts\nand adversarial perturbations. In this study, we discover that compressed\nmodels are less robust to distribution shifts than their original networks.\nInterestingly, larger networks are more vulnerable to losing robustness than\nsmaller ones, even when they are compressed to a similar size as the smaller\nnetworks. Furthermore, compact networks obtained by knowledge distillation are\nmuch more robust to distribution shifts than pruned networks. Finally,\npost-training quantization is a reliable method for achieving significant\nrobustness to distribution shifts, and it outperforms both pruned and distilled\nmodels in terms of robustness.\n","authors":["Lulan Shen","Ali Edalati","Brett Meyer","Warren Gross","James J. Clark"],"pdf_url":"https://arxiv.org/pdf/2401.12014v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12012v1","updated":"2024-01-22T14:59:11Z","published":"2024-01-22T14:59:11Z","title":"TurboSVM-FL: Boosting Federated Learning through SVM Aggregation for\n Lazy Clients","summary":" Federated learning is a distributed collaborative machine learning paradigm\nthat has gained strong momentum in recent years. In federated learning, a\ncentral server periodically coordinates models with clients and aggregates the\nmodels trained locally by clients without necessitating access to local data.\nDespite its potential, the implementation of federated learning continues to\nencounter several challenges, predominantly the slow convergence that is\nlargely due to data heterogeneity. The slow convergence becomes particularly\nproblematic in cross-device federated learning scenarios where clients may be\nstrongly limited by computing power and storage space, and hence counteracting\nmethods that induce additional computation or memory cost on the client side\nsuch as auxiliary objective terms and larger training iterations can be\nimpractical. In this paper, we propose a novel federated aggregation strategy,\nTurboSVM-FL, that poses no additional computation burden on the client side and\ncan significantly accelerate convergence for federated classification task,\nespecially when clients are \"lazy\" and train their models solely for few epochs\nfor next global aggregation. TurboSVM-FL extensively utilizes support vector\nmachine to conduct selective aggregation and max-margin spread-out\nregularization on class embeddings. 
We evaluate TurboSVM-FL on multiple\ndatasets including FEMNIST, CelebA, and Shakespeare using user-independent\nvalidation with non-iid data distribution. Our results show that TurboSVM-FL\ncan significantly outperform existing popular algorithms on convergence rate\nand reduce communication rounds while delivering better test metrics including\naccuracy, F1 score, and MCC.\n","authors":["Mengdi Wang","Anna Bodonhelyi","Efe Bozkir","Enkelejda Kasneci"],"pdf_url":"https://arxiv.org/pdf/2401.12012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12007v1","updated":"2024-01-22T14:55:01Z","published":"2024-01-22T14:55:01Z","title":"Tensor-view Topological Graph Neural Network","summary":" Graph classification is an important learning task for graph-structured data.\nGraph neural networks (GNNs) have recently gained growing attention in graph\nlearning and have shown significant improvements in many important graph\nproblems. Despite their state-of-the-art performances, existing GNNs only use\nlocal information from a very limited neighborhood around each node, suffering\nfrom loss of multi-modal information and overheads of excessive computation. To\naddress these issues, we propose a novel Tensor-view Topological Graph Neural\nNetwork (TTG-NN), a class of simple yet effective topological deep learning\nbuilt upon persistent homology, graph convolution, and tensor operations. This\nnew method incorporates tensor learning to simultaneously capture Tensor-view\nTopological (TT), as well as Tensor-view Graph (TG) structural information on\nboth local and global levels. Computationally, to fully exploit graph topology\nand structure, we propose two flexible TT and TG representation learning\nmodules that disentangle feature tensor aggregation and transformation and\nlearn to preserve multi-modal structure with less computation. Theoretically,\nwe derive high probability bounds on both the out-of-sample and in-sample mean\nsquared approximation errors for our proposed Tensor Transformation Layer\n(TTL). Real data experiments show that the proposed TTG-NN outperforms 20\nstate-of-the-art methods on various graph benchmarks.\n","authors":["Tao Wen","Elynn Chen","Yuzhou Chen"],"pdf_url":"https://arxiv.org/pdf/2401.12007v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2309.12701v2","updated":"2024-01-22T14:53:22Z","published":"2023-09-22T08:18:08Z","title":"Decision Tree Search as a Markov Decision Problem","summary":" Finding an optimal decision tree for a supervised learning task is a\nchallenging combinatorial problem to solve at scale. It was recently proposed\nto frame the problem as a Markov Decision Problem (MDP) and use deep\nreinforcement learning to tackle scaling. Unfortunately, these methods are not\ncompetitive with the current branch-and-bound state-of-the-art. We propose\ninstead to scale the resolution of such MDPs using an information-theoretic\ntests generating function that heuristically, and dynamically for every state,\nlimits the set of admissible test actions to a few good candidates. As a\nsolver, we show empirically that our algorithm is at the very least competitive\nwith branch-and-bound alternatives. As a machine learning tool, a key advantage\nof our approach is to solve for multiple complexity-performance trade-offs at\nvirtually no additional cost. 
With such a set of solutions, a user can then\nselect the tree that generalizes best and which has the interpretability level\nthat best suits their needs, which no current branch-and-bound method allows.\n","authors":["Hector Kohler","Riad Akrour","Philippe Preux"],"pdf_url":"https://arxiv.org/pdf/2309.12701v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12004v1","updated":"2024-01-22T14:53:21Z","published":"2024-01-22T14:53:21Z","title":"NLCG-Net: A Model-Based Zero-Shot Learning Framework for Undersampled\n Quantitative MRI Reconstruction","summary":" Typical quantitative MRI (qMRI) methods estimate parameter maps after image\nreconstructing, which is prone to biases and error propagation. We propose a\nNonlinear Conjugate Gradient (NLCG) optimizer for model-based T2/T1 estimation,\nwhich incorporates U-Net regularization trained in a scan-specific manner. This\nend-to-end method directly estimates qMRI maps from undersampled k-space data\nusing mono-exponential signal modeling with zero-shot scan-specific neural\nnetwork regularization to enable high fidelity T1 and T2 mapping. T2 and T1\nmapping results demonstrate the ability of the proposed NLCG-Net to improve\nestimation quality compared to subspace reconstruction at high accelerations.\n","authors":["Xinrui Jiang","Yohan Jun","Jaejin Cho","Mengze Gao","Xingwang Yong","Berkin Bilgic"],"pdf_url":"https://arxiv.org/pdf/2401.12004v1.pdf","comment":"8 pages, 5 figures, submitted to International Society for Magnetic\n Resonance in Medicine 2024"},{"id":"http://arxiv.org/abs/2401.12002v1","updated":"2024-01-22T14:52:34Z","published":"2024-01-22T14:52:34Z","title":"HgbNet: predicting hemoglobin level/anemia degree from EHR data","summary":" Anemia is a prevalent medical condition that typically requires invasive\nblood tests for diagnosis and monitoring. Electronic health records (EHRs) have\nemerged as valuable data sources for numerous medical studies. EHR-based\nhemoglobin level/anemia degree prediction is non-invasive and rapid but still\nfaces some challenges due to the fact that EHR data is typically an irregular\nmultivariate time series containing a significant number of missing values and\nirregular time intervals. To address these issues, we introduce HgbNet, a\nmachine learning-based prediction model that emulates clinicians'\ndecision-making processes for hemoglobin level/anemia degree prediction. The\nmodel incorporates a NanDense layer with a missing indicator to handle missing\nvalues and employs attention mechanisms to account for both local irregularity\nand global irregularity. We evaluate the proposed method using two real-world\ndatasets across two use cases. In our first use case, we predict hemoglobin\nlevel/anemia degree at moment T+1 by utilizing records from moments prior to\nT+1. In our second use case, we integrate all historical records with\nadditional selected test results at moment T+1 to predict hemoglobin\nlevel/anemia degree at the same moment, T+1. HgbNet outperforms the best\nbaseline results across all datasets and use cases. These findings demonstrate\nthe feasibility of estimating hemoglobin levels and anemia degree from EHR\ndata, positioning HgbNet as an effective non-invasive anemia diagnosis solution\nthat could potentially enhance the quality of life for millions of affected\nindividuals worldwide. 
To our knowledge, HgbNet is the first machine learning\nmodel leveraging EHR data for hemoglobin level/anemia degree prediction.\n","authors":["Zhuo Zhi","Moe Elbadawi","Adam Daneshmend","Mine Orlu","Abdul Basit","Andreas Demosthenous","Miguel Rodrigues"],"pdf_url":"https://arxiv.org/pdf/2401.12002v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.12000v1","updated":"2024-01-22T14:51:01Z","published":"2024-01-22T14:51:01Z","title":"Integrating Statistical Significance and Discriminative Power in Pattern\n Discovery","summary":" Pattern discovery plays a central role in both descriptive and predictive\ntasks across multiple domains. Actionable patterns must meet rigorous\nstatistical significance criteria and, in the presence of target variables,\nfurther uphold discriminative power. Our work addresses the underexplored area\nof guiding pattern discovery by integrating statistical significance and\ndiscriminative power criteria into state-of-the-art algorithms while preserving\npattern quality. We also address how pattern quality thresholds, imposed by\nsome algorithms, can be rectified to accommodate these additional criteria. To\ntest the proposed methodology, we select the triclustering task as the guiding\npattern discovery case and extend well-known greedy and multi-objective\noptimization triclustering algorithms, $\\delta$-Trimax and TriGen, that use\nvarious pattern quality criteria, such as Mean Squared Residual (MSR), Least\nSquared Lines (LSL), and Multi Slope Measure (MSL). Results from three case\nstudies show the role of the proposed methodology in discovering patterns with\npronounced improvements of discriminative power and statistical significance\nwithout quality deterioration, highlighting its importance in supervisedly\nguiding the search. Although the proposed methodology is motivated over\nmultivariate time series data, it can be straightforwardly extended to pattern\ndiscovery tasks involving multivariate, N-way (N>3), transactional, and\nsequential data structures.\n Availability: The code is freely available at\nhttps://github.com/JupitersMight/MOF_Triclustering under the MIT license.\n","authors":["Leonardo Alexandre","Rafael S. Costa","Rui Henriques"],"pdf_url":"https://arxiv.org/pdf/2401.12000v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11993v1","updated":"2024-01-22T14:46:41Z","published":"2024-01-22T14:46:41Z","title":"Expert-Driven Monitoring of Operational ML Models","summary":" We propose Expert Monitoring, an approach that leverages domain expertise to\nenhance the detection and mitigation of concept drift in machine learning (ML)\nmodels. Our approach supports practitioners by consolidating domain expertise\nrelated to concept drift-inducing events, making this expertise accessible to\non-call personnel, and enabling automatic adaptability with expert oversight.\n","authors":["Joran Leest","Claudia Raibulet","Ilias Gerostathopoulos","Patricia Lago"],"pdf_url":"https://arxiv.org/pdf/2401.11993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11985v1","updated":"2024-01-22T14:38:25Z","published":"2024-01-22T14:38:25Z","title":"Scaling Face Interaction Graph Networks to Real World Scenes","summary":" Accurately simulating real world object dynamics is essential for various\napplications such as robotics, engineering, graphics, and design. To better\ncapture complex real dynamics such as contact and friction, learned simulators\nbased on graph networks have recently shown great promise. 
However, applying\nthese learned simulators to real scenes comes with two major challenges: first,\nscaling learned simulators to handle the complexity of real world scenes which\ncan involve hundreds of objects each with complicated 3D shapes, and second,\nhandling inputs from perception rather than 3D state information. Here we\nintroduce a method which substantially reduces the memory required to run\ngraph-based learned simulators. Based on this memory-efficient simulation\nmodel, we then present a perceptual interface in the form of editable NeRFs\nwhich can convert real-world scenes into a structured representation that can\nbe processed by graph network simulator. We show that our method uses\nsubstantially less memory than previous graph-based simulators while retaining\ntheir accuracy, and that the simulators learned in synthetic environments can\nbe applied to real world scenes captured from multiple camera angles. This\npaves the way for expanding the application of learned simulators to settings\nwhere only perceptual information is available at inference time.\n","authors":["Tatiana Lopez-Guevara","Yulia Rubanova","William F. Whitney","Tobias Pfaff","Kimberly Stachenfeld","Kelsey R. Allen"],"pdf_url":"https://arxiv.org/pdf/2401.11985v1.pdf","comment":"16 pages, 12 figures"},{"id":"http://arxiv.org/abs/2401.11974v1","updated":"2024-01-22T14:26:02Z","published":"2024-01-22T14:26:02Z","title":"Cross-Validation Conformal Risk Control","summary":" Conformal risk control (CRC) is a recently proposed technique that applies\npost-hoc to a conventional point predictor to provide calibration guarantees.\nGeneralizing conformal prediction (CP), with CRC, calibration is ensured for a\nset predictor that is extracted from the point predictor to control a risk\nfunction such as the probability of miscoverage or the false negative rate. The\noriginal CRC requires the available data set to be split between training and\nvalidation data sets. This can be problematic when data availability is\nlimited, resulting in inefficient set predictors. In this paper, a novel CRC\nmethod is introduced that is based on cross-validation, rather than on\nvalidation as the original CRC. The proposed cross-validation CRC (CV-CRC)\nextends a version of the jackknife-minmax from CP to CRC, allowing for the\ncontrol of a broader range of risk functions. CV-CRC is proved to offer\ntheoretical guarantees on the average risk of the set predictor. Furthermore,\nnumerical experiments show that CV-CRC can reduce the average set size with\nrespect to CRC when the available data are limited.\n","authors":["Kfir M. Cohen","Sangwoo Park","Osvaldo Simeone","Shlomo Shamai"],"pdf_url":"https://arxiv.org/pdf/2401.11974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.08738v2","updated":"2024-01-22T14:17:27Z","published":"2024-01-16T18:31:23Z","title":"Machine Learning-Based Analysis of Ebola Virus' Impact on Gene\n Expression in Nonhuman Primates","summary":" This study introduces the Supervised Magnitude-Altitude Scoring (SMAS)\nmethodology, a machine learning-based approach, for analyzing gene expression\ndata obtained from nonhuman primates (NHPs) infected with Ebola virus (EBOV).\nWe utilize a comprehensive dataset of NanoString gene expression profiles from\nEbola-infected NHPs, deploying the SMAS system for nuanced host-pathogen\ninteraction analysis. 
SMAS effectively combines gene selection based on\nstatistical significance and expression changes, employing linear classifiers\nsuch as logistic regression to accurately differentiate between RT-qPCR\npositive and negative NHP samples. A key finding of our research is the\nidentification of IFI6 and IFI27 as critical biomarkers, demonstrating\nexceptional predictive performance with 100% accuracy and Area Under the Curve\n(AUC) metrics in classifying various stages of Ebola infection. Alongside IFI6\nand IFI27, genes, including MX1, OAS1, and ISG15, were significantly\nupregulated, highlighting their essential roles in the immune response to EBOV.\nOur results underscore the efficacy of the SMAS method in revealing complex\ngenetic interactions and response mechanisms during EBOV infection. This\nresearch provides valuable insights into EBOV pathogenesis and aids in\ndeveloping more precise diagnostic tools and therapeutic strategies to address\nEBOV infection in particular and viral infection in general.\n","authors":["Mostafa Rezapour","Muhammad Khalid Khan Niazi","Hao Lu","Aarthi Narayanan","Metin Nafi Gurcan"],"pdf_url":"https://arxiv.org/pdf/2401.08738v2.pdf","comment":"28 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2401.10451v2","updated":"2024-01-22T14:14:16Z","published":"2024-01-19T01:40:58Z","title":"Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian\n Optimization Approach","summary":" Solving large-scale capacity expansion problems (CEPs) is central to\ncost-effective decarbonization of regional-scale energy systems. To ensure the\nintended outcomes of CEPs, modeling uncertainty due to weather-dependent\nvariable renewable energy (VRE) supply and energy demand becomes crucially\nimportant. However, the resulting stochastic optimization models are often less\ncomputationally tractable than their deterministic counterparts. Here, we\npropose a learning-assisted approximate solution method to tractably solve\ntwo-stage stochastic CEPs. Our method identifies low-cost planning decisions by\nconstructing and solving a sequence of tractable temporally aggregated\nsurrogate problems. We adopt a Bayesian optimization approach to searching the\nspace of time series aggregation hyperparameters and compute approximate\nsolutions that minimize costs on a validation set of supply-demand projections.\nImportantly, we evaluate solved planning outcomes on a held-out set of test\nprojections. We apply our approach to generation and transmission expansion\nplanning for a joint power-gas system spanning New England. We show that our\napproach yields an estimated cost savings of up to 3.8% in comparison to\nbenchmark time series aggregation approaches.\n","authors":["Aron Brenner","Rahman Khorramfar","Dharik Mallapragada","Saurabh Amin"],"pdf_url":"https://arxiv.org/pdf/2401.10451v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11963v1","updated":"2024-01-22T14:06:37Z","published":"2024-01-22T14:06:37Z","title":"Bridging Evolutionary Algorithms and Reinforcement Learning: A\n Comprehensive Survey","summary":" Evolutionary Reinforcement Learning (ERL), which integrates Evolutionary\nAlgorithms (EAs) and Reinforcement Learning (RL) for optimization, has\ndemonstrated remarkable performance advancements. By fusing the strengths of\nboth approaches, ERL has emerged as a promising research direction. 
This survey\noffers a comprehensive overview of the diverse research branches in ERL.\nSpecifically, we systematically summarize recent advancements in relevant\nalgorithms and identify three primary research directions: EA-assisted\noptimization of RL, RL-assisted optimization of EA, and synergistic\noptimization of EA and RL. Following that, we conduct an in-depth analysis of\neach research direction, organizing multiple research branches. We elucidate\nthe problems that each branch aims to tackle and how the integration of EA and\nRL addresses these challenges. In conclusion, we discuss potential challenges\nand prospective future research directions across various research directions.\n","authors":["Pengyi Li","Jianye Hao","Hongyao Tang","Xian Fu","Yan Zheng","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2401.11963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11954v1","updated":"2024-01-22T13:54:26Z","published":"2024-01-22T13:54:26Z","title":"RUMBoost: Gradient Boosted Random Utility Models","summary":" This paper introduces the RUMBoost model, a novel discrete choice modelling\napproach that combines the interpretability and behavioural robustness of\nRandom Utility Models (RUMs) with the generalisation and predictive ability of\ndeep learning methods. We obtain the full functional form of non-linear utility\nspecifications by replacing each linear parameter in the utility functions of a\nRUM with an ensemble of gradient boosted regression trees. This enables\npiece-wise constant utility values to be imputed for all alternatives directly\nfrom the data for any possible combination of input variables. We introduce\nadditional constraints on the ensembles to ensure three crucial features of the\nutility specifications: (i) dependency of the utilities of each alternative on\nonly the attributes of that alternative, (ii) monotonicity of marginal\nutilities, and (iii) an intrinsically interpretable functional form, where the\nexact response of the model is known throughout the entire input space.\nFurthermore, we introduce an optimisation-based smoothing technique that\nreplaces the piece-wise constant utility values of alternative attributes with\nmonotonic piece-wise cubic splines to identify non-linear parameters with\ndefined gradient. We demonstrate the potential of the RUMBoost model compared\nto various ML and Random Utility benchmark models for revealed preference mode\nchoice data from London. The results highlight the great predictive performance\nand the direct interpretability of our proposed approach. Furthermore, the\nsmoothed attribute utility functions allow for the calculation of various\nbehavioural indicators and marginal utilities. Finally, we demonstrate the\nflexibility of our methodology by showing how the RUMBoost model can be\nextended to complex model specifications, including attribute interactions,\ncorrelation within alternative error terms and heterogeneity within the\npopulation.\n","authors":["Nicolas Salvadé","Tim Hillel"],"pdf_url":"https://arxiv.org/pdf/2401.11954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.04033v4","updated":"2024-01-22T13:53:09Z","published":"2021-09-09T04:48:54Z","title":"New Versions of Gradient Temporal Difference Learning","summary":" Sutton, Szepesv\\'{a}ri and Maei introduced the first gradient\ntemporal-difference (GTD) learning algorithms compatible with both linear\nfunction approximation and off-policy training. 
The goal of this paper is (a)\nto propose some variants of GTDs with extensive comparative analysis and (b) to\nestablish new theoretical analysis frameworks for the GTDs. These variants are\nbased on convex-concave saddle-point interpretations of GTDs, which effectively\nunify all the GTDs into a single framework, and provide simple stability\nanalysis based on recent results on primal-dual gradient dynamics. Finally,\nnumerical comparative analysis is given to evaluate these approaches.\n","authors":["Donghwan Lee","Han-Dong Lim","Jihoon Park","Okyong Choi"],"pdf_url":"https://arxiv.org/pdf/2109.04033v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03242v2","updated":"2024-01-22T13:40:16Z","published":"2023-11-06T16:31:09Z","title":"Approximating Langevin Monte Carlo with ResNet-like Neural Network\n architectures","summary":" We sample from a given target distribution by constructing a neural network\nwhich maps samples from a simple reference, e.g. the standard normal\ndistribution, to samples from the target. To that end, we propose using a\nneural network architecture inspired by the Langevin Monte Carlo (LMC)\nalgorithm. Based on LMC perturbation results, we show approximation rates of\nthe proposed architecture for smooth, log-concave target distributions measured\nin the Wasserstein-$2$ distance. The analysis heavily relies on the notion of\nsub-Gaussianity of the intermediate measures of the perturbed LMC process. In\nparticular, we derive bounds on the growth of the intermediate variance proxies\nunder different assumptions on the perturbations. Moreover, we propose an\narchitecture similar to deep residual neural networks and derive expressivity\nresults for approximating the sample to target distribution map.\n","authors":["Charles Miranda","Janina Schütte","David Sommer","Martin Eigel"],"pdf_url":"https://arxiv.org/pdf/2311.03242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10107v2","updated":"2024-01-22T13:36:12Z","published":"2024-01-18T16:18:18Z","title":"Comparison analysis between standard polysomnographic data and\n in-ear-EEG signals: A preliminary study","summary":" Study Objectives: Polysomnography (PSG) currently serves as the benchmark for\nevaluating sleep disorders. Its discomfort, impracticality for home-use, and\nintroduction of bias in sleep quality assessment necessitate the exploration of\nless invasive, cost-effective, and portable alternatives. One promising\ncontender is the in-ear-EEG sensor, which offers advantages in terms of\ncomfort, fixed electrode positions, resistance to electromagnetic interference,\nand user-friendliness. This study aims to establish a methodology to assess the\nsimilarity between the in-ear-EEG signal and standard PSG.\n Methods: We assess the agreement between the PSG and in-ear-EEG derived\nhypnograms. We extract features in the time- and frequency- domain from PSG and\nin-ear-EEG 30-second epochs. We only consider the epochs where the PSG-scorers\nand the in-ear-EEG-scorers were in agreement. We introduce a methodology to\nquantify the similarity between PSG derivations and the single-channel\nin-ear-EEG. The approach relies on a comparison of distributions of selected\nfeatures -- extracted for each sleep stage and subject on both PSG and the\nin-ear-EEG signals -- via a Jensen-Shannon Divergence Feature-based Similarity\nIndex (JSD-FSI).\n Results: We found a high intra-scorer variability, mainly due to the\nuncertainty the scorers had in evaluating the in-ear-EEG signals. 
We show that\nthe similarity between PSG and in-ear-EEG signals is high (JSD-FSI: 0.61 +/-\n0.06 in awake, 0.60 +/- 0.07 in NREM and 0.51 +/- 0.08 in REM), and in line\nwith the similarity values computed independently on standard\nPSG-channel-combinations.\n Conclusions: In-ear-EEG is a valuable solution for home-based sleep\nmonitoring, however further studies with a larger and more heterogeneous\ndataset are needed.\n","authors":["Gianpaolo Palo","Luigi Fiorillo","Giuliana Monachino","Michal Bechny","Mark Melnykowycz","Athina Tzovara","Valentina Agostini","Francesca Dalia Faraci"],"pdf_url":"https://arxiv.org/pdf/2401.10107v2.pdf","comment":"29 pages, 12 figures, 1 table"},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11940v1","updated":"2024-01-22T13:30:11Z","published":"2024-01-22T13:30:11Z","title":"Low-Tubal-Rank Tensor Recovery via Factorized Gradient Descent","summary":" This paper considers the problem of recovering a tensor with an underlying\nlow-tubal-rank structure from a small number of corrupted linear measurements.\nTraditional approaches tackling such a problem require the computation of\ntensor Singular Value Decomposition (t-SVD), that is a computationally\nintensive process, rendering them impractical for dealing with large-scale\ntensors. Aim to address this challenge, we propose an efficient and effective\nlow-tubal-rank tensor recovery method based on a factorization procedure akin\nto the Burer-Monteiro (BM) method. Precisely, our fundamental approach involves\ndecomposing a large tensor into two smaller factor tensors, followed by solving\nthe problem through factorized gradient descent (FGD). This strategy eliminates\nthe need for t-SVD computation, thereby reducing computational costs and\nstorage requirements. We provide rigorous theoretical analysis to ensure the\nconvergence of FGD under both noise-free and noisy situations. Additionally, it\nis worth noting that our method does not require the precise estimation of the\ntensor tubal-rank. Even in cases where the tubal-rank is slightly\noverestimated, our approach continues to demonstrate robust performance. 
A\nseries of experiments have been carried out to demonstrate that, as compared to\nother popular ones, our approach exhibits superior performance in multiple\nscenarios, in terms of the faster computational speed and the smaller\nconvergence error.\n","authors":["Zhiyu Liu","Zhi Han","Yandong Tang","Xi-Le Zhao","Yao Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11940v1.pdf","comment":"13 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11929v1","updated":"2024-01-22T13:15:40Z","published":"2024-01-22T13:15:40Z","title":"The Bigger the Better? Rethinking the Effective Model Scale in Long-term\n Time Series Forecasting","summary":" Long-term time series forecasting (LTSF) represents a critical frontier in\ntime series analysis, distinguished by its focus on extensive input sequences,\nin contrast to the constrained lengths typical of traditional approaches. While\nlonger sequences inherently convey richer information, potentially enhancing\npredictive precision, prevailing techniques often respond by escalating model\ncomplexity. These intricate models can inflate into millions of parameters,\nincorporating parameter-intensive elements like positional encodings,\nfeed-forward networks and self-attention mechanisms. This complexity, however,\nleads to prohibitive model scale, particularly given the time series data's\nsemantic simplicity. Motivated by the pursuit of parsimony, our research\nemploys conditional correlation and auto-correlation as investigative tools,\nrevealing significant redundancies within the input data. Leveraging these\ninsights, we introduce the HDformer, a lightweight Transformer variant enhanced\nwith hierarchical decomposition. This novel architecture not only inverts the\nprevailing trend toward model expansion but also accomplishes precise\nforecasting with drastically fewer computations and parameters. Remarkably,\nHDformer outperforms existing state-of-the-art LTSF models, while requiring\nover 99\\% fewer parameters. Through this work, we advocate a paradigm shift in\nLTSF, emphasizing the importance to tailor the model to the inherent dynamics\nof time series data-a timely reminder that in the realm of LTSF, bigger is not\ninvariably better.\n","authors":["Jinliang Deng","Xuan Song","Ivor W. Tsang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2401.11929v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09126v2","updated":"2024-01-22T13:14:33Z","published":"2023-10-13T14:14:43Z","title":"Physics-guided Noise Neural Proxy for Practical Low-light Raw Image\n Denoising","summary":" Recently, the mainstream practice for training low-light raw image denoising\nmethods has shifted towards employing synthetic data. Noise modeling, which\nfocuses on characterizing the noise distribution of real-world sensors,\nprofoundly influences the effectiveness and practicality of synthetic data.\nCurrently, physics-based noise modeling struggles to characterize the entire\nreal noise distribution, while learning-based noise modeling impractically\ndepends on paired real data. In this paper, we propose a novel strategy:\nlearning the noise model from dark frames instead of paired real data, to break\ndown the data dependency. Based on this strategy, we introduce an efficient\nphysics-guided noise neural proxy (PNNP) to approximate the real-world sensor\nnoise model. 
Specifically, we integrate physical priors into neural proxies and\nintroduce three efficient techniques: physics-guided noise decoupling (PND),\nphysics-guided proxy model (PPM), and differentiable distribution loss (DDL).\nPND decouples the dark frame into different components and handles different\nlevels of noise flexibly, which reduces the complexity of noise modeling. PPM\nincorporates physical priors to constrain the generated noise, which promotes\nthe accuracy of noise modeling. DDL provides explicit and reliable supervision\nfor noise distribution, which promotes the precision of noise modeling. PNNP\nexhibits powerful potential in characterizing the real noise distribution.\nExtensive experiments on public datasets demonstrate superior performance in\npractical low-light raw image denoising. The code will be available at\n\\url{https://github.com/fenghansen/PNNP}.\n","authors":["Hansen Feng","Lizhi Wang","Yiqi Huang","Yuzhi Wang","Lin Zhu","Hua Huang"],"pdf_url":"https://arxiv.org/pdf/2310.09126v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.10337v2","updated":"2024-01-22T12:33:43Z","published":"2024-01-18T19:02:00Z","title":"Noise Contrastive Estimation-based Matching Framework for Low-resource\n Security Attack Pattern Recognition","summary":" Tactics, Techniques and Procedures (TTPs) represent sophisticated attack\npatterns in the cybersecurity domain, described encyclopedically in textual\nknowledge bases. Identifying TTPs in cybersecurity writing, often called TTP\nmapping, is an important and challenging task. Conventional learning approaches\noften target the problem in the classical multi-class or multilabel\nclassification setting. This setting hinders the learning ability of the model\ndue to a large number of classes (i.e., TTPs), the inevitable skewness of the\nlabel distribution and the complex hierarchical structure of the label space.\nWe formulate the problem in a different learning paradigm, where the assignment\nof a text to a TTP label is decided by the direct semantic similarity between\nthe two, thus reducing the complexity of competing solely over the large\nlabeling space. To that end, we propose a neural matching architecture with an\neffective sampling-based learn-to-compare mechanism, facilitating the learning\nprocess of the matching model despite constrained resources.\n","authors":["Tu Nguyen","Nedim Srndic","Alexander Neth"],"pdf_url":"https://arxiv.org/pdf/2401.10337v2.pdf","comment":"accepted at EACL 2024, in ARR October 2023"},{"id":"http://arxiv.org/abs/2401.11888v1","updated":"2024-01-22T12:28:50Z","published":"2024-01-22T12:28:50Z","title":"Multimodal Deep Learning of Word-of-Mouth Text and Demographics to\n Predict Customer Rating: Handling Consumer Heterogeneity in Marketing","summary":" In the marketing field, understanding consumer heterogeneity, which is the\ninternal or psychological difference among consumers that cannot be captured by\nbehavioral logs, has long been a critical challenge. However, a number of\nconsumers today usually post their evaluation on the specific product on the\nonline platform, which can be the valuable source of such unobservable\ndifferences among consumers. Several previous studies have shown the validity\nof the analysis on text modality, but on the other hand, such analyses may not\nnecessarily demonstrate sufficient predictive accuracy for text alone, as they\nmay not include information readily available from cross-sectional data, such\nas consumer profile data. 
In addition, recent advances in machine learning\ntechniques, such as large-scale language models (LLMs) and multimodal learning\nhave made it possible to deal with the various kind of dataset simultaneously,\nincluding textual data and the traditional cross-sectional data, and the joint\nrepresentations can be effectively obtained from multiple modalities.\nTherefore, this study constructs a product evaluation model that takes into\naccount consumer heterogeneity by multimodal learning of online product reviews\nand consumer profile information. We also compare multiple models using\ndifferent modalities or hyper-parameters to demonstrate the robustness of\nmultimodal learning in marketing analysis.\n","authors":["Junichiro Niimi"],"pdf_url":"https://arxiv.org/pdf/2401.11888v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.15269v3","updated":"2024-01-22T12:26:44Z","published":"2022-06-30T13:20:48Z","title":"Deep Reinforcement Learning with Swin Transformers","summary":" Transformers are neural network models that utilize multiple layers of\nself-attention heads and have exhibited enormous potential in natural language\nprocessing tasks. Meanwhile, there have been efforts to adapt transformers to\nvisual tasks of machine learning, including Vision Transformers and Swin\nTransformers. Although some researchers use Vision Transformers for\nreinforcement learning tasks, their experiments remain at a small scale due to\nthe high computational cost. This article presents the first online\nreinforcement learning scheme that is based on Swin Transformers: Swin DQN. In\ncontrast to existing research, our novel approach demonstrate the superior\nperformance with experiments on 49 games in the Arcade Learning Environment.\nThe results show that our approach achieves significantly higher maximal\nevaluation scores than the baseline method in 45 of all the 49 games (92%), and\nhigher mean evaluation scores than the baseline method in 40 of all the 49\ngames (82%).\n","authors":["Li Meng","Morten Goodwin","Anis Yazidi","Paal Engelstad"],"pdf_url":"https://arxiv.org/pdf/2206.15269v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10393v2","updated":"2024-01-22T12:04:18Z","published":"2024-01-18T22:06:38Z","title":"Catastrophic Interference is Mitigated in Naturalistic Power-Law\n Learning Environments","summary":" Neural networks often suffer from catastrophic interference (CI): performance\non previously learned tasks drops off significantly when learning a new task.\nThis contrasts strongly with humans, who can sequentially learn new tasks\nwithout appreciably forgetting previous tasks. Prior work has explored various\ntechniques for mitigating CI such as regularization, rehearsal, generative\nreplay, and distillation methods. The current work takes a different approach,\none guided by cognitive science research showing that in naturalistic\nenvironments, the probability of encountering a task decreases as a power-law\nof the time since it was last performed. We argue that a realistic evaluation\nof techniques for the mitigation of CI should be performed in simulated\nnaturalistic learning environments. Thus, we evaluate the extent of mitigation\nof CI when training simple rehearsal-based methods in power-law environments\nsimilar to the ones humans face. Our work explores this novel rehearsal-based\napproach for a domain-incremental task: learning permutations in the MNIST\ntask. 
We compare our rehearsal environment with other baselines to show its\nefficacy in promoting continual learning. Additionally, we investigate whether\nthis environment shows forward facilitation, i.e., faster learning of later\ntasks. Next, we explore the robustness of our learning environment to the\nnumber of tasks, model size, and amount of data rehearsed after each task.\nNotably, our results show that the performance is comparable or superior to\nthat of models trained using popular regularization methods and also to\nrehearsals in non-power-law environments. The benefits of this training\nparadigm include simplicity and the lack of a need for extra neural circuitry.\nIn addition, because our method is orthogonal to other methods, future research\ncan combine training in power-law environments with other continual learning\nmechanisms.\n","authors":["Atith Gandhi","Raj Sanjay Shah","Vijay Marupudi","Sashank Varma"],"pdf_url":"https://arxiv.org/pdf/2401.10393v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06064v4","updated":"2024-01-22T12:04:06Z","published":"2023-05-18T13:59:02Z","title":"Neural Algorithmic Reasoning for Combinatorial Optimisation","summary":" Solving NP-hard/complete combinatorial problems with neural networks is a\nchallenging research area that aims to surpass classical approximate\nalgorithms. The long-term objective is to outperform hand-designed heuristics\nfor NP-hard/complete problems by learning to generate superior solutions solely\nfrom training data. Current neural-based methods for solving CO problems often\noverlook the inherent \"algorithmic\" nature of the problems. In contrast,\nheuristics designed for CO problems, e.g. TSP, frequently leverage\nwell-established algorithms, such as those for finding the minimum spanning\ntree. In this paper, we propose leveraging recent advancements in neural\nalgorithmic reasoning to improve the learning of CO problems. Specifically, we\nsuggest pre-training our neural model on relevant algorithms before training it\non CO instances. Our results demonstrate that by using this learning setup, we\nachieve superior performance compared to non-algorithmically informed deep\nlearning models.\n","authors":["Dobrik Georgiev","Danilo Numeroso","Davide Bacciu","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2306.06064v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04073v2","updated":"2024-01-22T12:00:58Z","published":"2023-05-06T15:26:22Z","title":"Explaining RL Decisions with Trajectories","summary":" Explanation is a key component for the adoption of reinforcement learning\n(RL) in many real-world decision-making problems. In the literature, the\nexplanation is often provided by saliency attribution to the features of the RL\nagent's state. In this work, we propose a complementary approach to these\nexplanations, particularly for offline RL, where we attribute the policy\ndecisions of a trained RL agent to the trajectories encountered by it during\ntraining. To do so, we encode trajectories in offline training data\nindividually as well as collectively (encoding a set of trajectories). We then\nattribute policy decisions to a set of trajectories in this encoded space by\nestimating the sensitivity of the decision with respect to that set. 
Further,\nwe demonstrate the effectiveness of the proposed approach in terms of quality\nof attributions as well as practical scalability in diverse environments that\ninvolve both discrete and continuous state and action spaces such as\ngrid-worlds, video games (Atari) and continuous control (MuJoCo). We also\nconduct a human study on a simple navigation task to observe how their\nunderstanding of the task compares with data attributed for a trained RL\npolicy. Keywords -- Explainable AI, Verifiability of AI Decisions, Explainable\nRL.\n","authors":["Shripad Vilasrao Deshmukh","Arpan Dasgupta","Balaji Krishnamurthy","Nan Jiang","Chirag Agarwal","Georgios Theocharous","Jayakumar Subramanian"],"pdf_url":"https://arxiv.org/pdf/2305.04073v2.pdf","comment":"Published at International Conference on Learning Representations\n (ICLR), 2023"},{"id":"http://arxiv.org/abs/2210.00108v3","updated":"2024-01-22T11:51:29Z","published":"2022-09-30T21:59:24Z","title":"ImpNet: Imperceptible and blackbox-undetectable backdoors in compiled\n neural networks","summary":" Early backdoor attacks against machine learning set off an arms race in\nattack and defence development. Defences have since appeared demonstrating some\nability to detect backdoors in models or even remove them. These defences work\nby inspecting the training data, the model, or the integrity of the training\nprocedure. In this work, we show that backdoors can be added during\ncompilation, circumventing any safeguards in the data preparation and model\ntraining stages. The attacker can not only insert existing weight-based\nbackdoors during compilation, but also a new class of weight-independent\nbackdoors, such as ImpNet. These backdoors are impossible to detect during the\ntraining or data preparation processes, because they are not yet present. Next,\nwe demonstrate that some backdoors, including ImpNet, can only be reliably\ndetected at the stage where they are inserted and removing them anywhere else\npresents a significant challenge. We conclude that ML model security requires\nassurance of provenance along the entire technical pipeline, including the\ndata, model architecture, compiler, and hardware specification.\n","authors":["Tim Clifford","Ilia Shumailov","Yiren Zhao","Ross Anderson","Robert Mullins"],"pdf_url":"https://arxiv.org/pdf/2210.00108v3.pdf","comment":"10 pages, 7 figures, to be published in IEEE Secure and Trustworthy\n Machine Learning 2024. For website see https://ml.backdoors.uk . For source\n code, see https://git.sr.ht/~tim-clifford/impnet_source"},{"id":"http://arxiv.org/abs/2401.11860v1","updated":"2024-01-22T11:29:44Z","published":"2024-01-22T11:29:44Z","title":"A Review of Physics-Informed Machine Learning Methods with Applications\n to Condition Monitoring and Anomaly Detection","summary":" This study presents a comprehensive overview of PIML techniques in the\ncontext of condition monitoring. The central concept driving PIML is the\nincorporation of known physical laws and constraints into machine learning\nalgorithms, enabling them to learn from available data while remaining\nconsistent with physical principles. Through fusing domain knowledge with\ndata-driven learning, PIML methods offer enhanced accuracy and interpretability\nin comparison to purely data-driven approaches. 
In this comprehensive survey,\ndetailed examinations are performed with regard to the methodology by which\nknown physical principles are integrated within machine learning frameworks, as\nwell as their suitability for specific tasks within condition monitoring.\nIncorporation of physical knowledge into the ML model may be realized in a\nvariety of methods, with each having its unique advantages and drawbacks. The\ndistinct advantages and limitations of each methodology for the integration of\nphysics within data-driven models are detailed, considering factors such as\ncomputational efficiency, model interpretability, and generalizability to\ndifferent systems in condition monitoring and fault detection. Several case\nstudies and works of literature utilizing this emerging concept are presented\nto demonstrate the efficacy of PIML in condition monitoring applications. From\nthe literature reviewed, the versatility and potential of PIML in condition\nmonitoring may be demonstrated. Novel PIML methods offer an innovative solution\nfor addressing the complexities of condition monitoring and associated\nchallenges. This comprehensive survey helps form the foundation for future work\nin the field. As the technology continues to advance, PIML is expected to play\na crucial role in enhancing maintenance strategies, system reliability, and\noverall operational efficiency in engineering systems.\n","authors":["Yuandi Wu","Brett Sicard","Stephen Andrew Gadsden"],"pdf_url":"https://arxiv.org/pdf/2401.11860v1.pdf","comment":"Paper has been submitted for review to the journal Expert Systems\n with Applications (December 31, 2023). 90 pages, 22 figures, 9 tables"},{"id":"http://arxiv.org/abs/2309.16034v2","updated":"2024-01-22T11:26:35Z","published":"2023-09-27T21:26:01Z","title":"Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale\n Localization","summary":" Advancements in nanotechnology and material science are paving the way toward\nnanoscale devices that combine sensing, computing, data and energy storage, and\nwireless communication. In precision medicine, these nanodevices show promise\nfor disease diagnostics, treatment, and monitoring from within the patients'\nbloodstreams. Assigning the location of a sensed biological event with the\nevent itself, which is the main proposition of flow-guided in-body nanoscale\nlocalization, would be immensely beneficial from the perspective of precision\nmedicine. The nanoscale nature of the nanodevices and the challenging\nenvironment that the bloodstream represents, result in current flow-guided\nlocalization approaches being constrained in their communication and\nenergy-related capabilities. The communication and energy constraints of the\nnanodevices result in different features of raw data for flow-guided\nlocalization, in turn affecting its performance. An analytical modeling of the\neffects of imperfect communication and constrained energy causing intermittent\noperation of the nanodevices on the raw data produced by the nanodevices would\nbe beneficial. Hence, we propose an analytical model of raw data for\nflow-guided localization, where the raw data is modeled as a function of\ncommunication and energy-related capabilities of the nanodevice. We evaluate\nthe model by comparing its output with the one obtained through the utilization\nof a simulator for objective evaluation of flow-guided localization, featuring\ncomparably higher level of realism. 
Our results across a number of scenarios\nand heterogeneous performance metrics indicate high similarity between the\nmodel and simulator-generated raw datasets.\n","authors":["Guillem Pascual","Filip Lemic","Carmen Delgado","Xavier Costa-Perez"],"pdf_url":"https://arxiv.org/pdf/2309.16034v2.pdf","comment":"6 pages, 7 figures, 4 tables, 16 references"},{"id":"http://arxiv.org/abs/2309.10688v3","updated":"2024-01-22T11:26:17Z","published":"2023-09-19T15:23:07Z","title":"On the different regimes of Stochastic Gradient Descent","summary":" Modern deep networks are trained with stochastic gradient descent (SGD) whose\nkey hyperparameters are the number of data considered at each step or batch\nsize $B$, and the step size or learning rate $\\eta$. For small $B$ and large\n$\\eta$, SGD corresponds to a stochastic evolution of the parameters, whose\nnoise amplitude is governed by the `temperature' $T\\equiv \\eta/B$. Yet this\ndescription is observed to break down for sufficiently large batches $B\\geq\nB^*$, or simplifies to gradient descent (GD) when the temperature is\nsufficiently small. Understanding where these cross-overs take place remains a\ncentral challenge. Here, we resolve these questions for a teacher-student\nperceptron classification model and show empirically that our key predictions\nstill apply to deep networks. Specifically, we obtain a phase diagram in the\n$B$-$\\eta$ plane that separates three dynamical phases: \\textit{(i)} a\nnoise-dominated SGD governed by temperature, \\textit{(ii)} a\nlarge-first-step-dominated SGD and \\textit{(iii)} GD. These different phases\nalso correspond to different regimes of generalization error. Remarkably, our\nanalysis reveals that the batch size $B^*$ separating regimes \\textit{(i)} and\n\\textit{(ii)} scale with the size $P$ of the training set, with an exponent\nthat characterizes the hardness of the classification problem.\n","authors":["Antonio Sclocchi","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2309.10688v3.pdf","comment":"Main: 8 pages, 4 figures; Appendix: 20 pages, 10 figures"},{"id":"http://arxiv.org/abs/2308.09647v2","updated":"2024-01-22T11:14:39Z","published":"2023-08-18T16:07:01Z","title":"Robust Uncertainty Quantification Using Conformalised Monte Carlo\n Prediction","summary":" Deploying deep learning models in safety-critical applications remains a very\nchallenging task, mandating the provision of assurances for the dependable\noperation of these models. Uncertainty quantification (UQ) methods estimate the\nmodel's confidence per prediction, informing decision-making by considering the\neffect of randomness and model misspecification. Despite the advances of\nstate-of-the-art UQ methods, they are computationally expensive or produce\nconservative prediction sets/intervals. We introduce MC-CP, a novel hybrid UQ\nmethod that combines a new adaptive Monte Carlo (MC) dropout method with\nconformal prediction (CP). MC-CP adaptively modulates the traditional MC\ndropout at runtime to save memory and computation resources, enabling\npredictions to be consumed by CP, yielding robust prediction sets/intervals.\nThroughout comprehensive experiments, we show that MC-CP delivers significant\nimprovements over advanced UQ methods, like MC dropout, RAPS and CQR, both in\nclassification and regression benchmarks. 
MC-CP can be easily added to existing\nmodels, making its deployment simple.\n","authors":["Daniel Bethell","Simos Gerasimou","Radu Calinescu"],"pdf_url":"https://arxiv.org/pdf/2308.09647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11849v1","updated":"2024-01-22T11:08:36Z","published":"2024-01-22T11:08:36Z","title":"Self-Labeling the Job Shop Scheduling Problem","summary":" In this work, we propose a Self-Supervised training strategy specifically\ndesigned for combinatorial problems. One of the main obstacles in applying\nsupervised paradigms to such problems is the requirement of expensive target\nsolutions as ground-truth, often produced with costly exact solvers. Inspired\nby Semi- and Self-Supervised learning, we show that it is possible to easily\ntrain generative models by sampling multiple solutions and using the best one\naccording to the problem objective as a pseudo-label. In this way, we\niteratively improve the model generation capability by relying only on its\nself-supervision, completely removing the need for optimality information. We\nprove the effectiveness of this Self-Labeling strategy on the Job Shop\nScheduling (JSP), a complex combinatorial problem that is receiving much\nattention from the Reinforcement Learning community. We propose a generative\nmodel based on the well-known Pointer Network and train it with our strategy.\nExperiments on two popular benchmarks demonstrate the potential of this\napproach as the resulting models outperform constructive heuristics and current\nstate-of-the-art Reinforcement Learning proposals.\n","authors":["Andrea Corsini","Angelo Porrello","Simone Calderara","Mauro Dell'Amico"],"pdf_url":"https://arxiv.org/pdf/2401.11849v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11844v1","updated":"2024-01-22T11:01:52Z","published":"2024-01-22T11:01:52Z","title":"Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field\n Crop Yield Prediction","summary":" Accurate crop yield prediction is of utmost importance for informed\ndecision-making in agriculture, aiding farmers, and industry stakeholders.\nHowever, this task is complex and depends on multiple factors, such as\nenvironmental conditions, soil properties, and management practices. Combining\nheterogeneous data views poses a fusion challenge, like identifying the\nview-specific contribution to the predictive task. We present a novel\nmulti-view learning approach to predict crop yield for different crops\n(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our\nmulti-view input data includes multi-spectral optical images from Sentinel-2\nsatellites and weather data as dynamic features during the crop growing season,\ncomplemented by static features like soil properties and topographic\ninformation. To effectively fuse the data, we introduce a Multi-view Gated\nFusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU)\nmodule. The view-encoders handle the heterogeneity of data sources with varying\ntemporal resolutions by learning a view-specific representation. These\nrepresentations are adaptively fused via a weighted sum. The fusion weights are\ncomputed for each sample by the GU using a concatenation of the\nview-representations. The MVGF model is trained at sub-field level with 10 m\nresolution pixels. Our evaluations show that the MVGF outperforms conventional\nmodels on the same task, achieving the best results by incorporating all the\ndata sources, unlike the usual fusion results in the literature. 
For Argentina,\nthe MVGF model achieves an R2 value of 0.68 at sub-field yield prediction,\nwhile at field level evaluation (comparing field averages), it reaches around\n0.80 across different countries. The GU module learned different weights based\non the country and crop-type, aligning with the variable significance of each\ndata source to the prediction task.\n","authors":["Francisco Mena","Deepak Pathak","Hiba Najjar","Cristhian Sanchez","Patrick Helber","Benjamin Bischke","Peter Habelitz","Miro Miranda","Jayanth Siddamsetty","Marlon Nuske","Marcela Charfuelan","Diego Arenas","Michaela Vollmer","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2401.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11840v1","updated":"2024-01-22T10:57:11Z","published":"2024-01-22T10:57:11Z","title":"Learning to Approximate Adaptive Kernel Convolution on Graphs","summary":" Various Graph Neural Networks (GNNs) have been successful in analyzing data\nin non-Euclidean spaces, however, they have limitations such as oversmoothing,\ni.e., information becomes excessively averaged as the number of hidden layers\nincreases. The issue stems from the intrinsic formulation of conventional graph\nconvolution where the nodal features are aggregated from a direct neighborhood\nper layer across the entire nodes in the graph. As setting different number of\nhidden layers per node is infeasible, recent works leverage a diffusion kernel\nto redefine the graph structure and incorporate information from farther nodes.\nUnfortunately, such approaches suffer from heavy diagonalization of a graph\nLaplacian or learning a large transform matrix. In this regards, we propose a\ndiffusion learning framework, where the range of feature aggregation is\ncontrolled by the scale of a diffusion kernel. For efficient computation, we\nderive closed-form derivatives of approximations of the graph convolution with\nrespect to the scale, so that node-wise range can be adaptively learned. With a\ndownstream classifier, the entire framework is made trainable in an end-to-end\nmanner. Our model is tested on various standard datasets for node-wise\nclassification for the state-of-the-art performance, and it is also validated\non a real-world brain network data for graph classifications to demonstrate its\npracticality for Alzheimer classification.\n","authors":["Jaeyoon Sim","Sooyeon Jeon","InJun Choi","Guorong Wu","Won Hwa Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11840v1.pdf","comment":"15 pages, Accepted to AAAI 2024"},{"id":"http://arxiv.org/abs/2401.11836v1","updated":"2024-01-22T10:52:22Z","published":"2024-01-22T10:52:22Z","title":"Privacy-Preserving Data Fusion for Traffic State Estimation: A Vertical\n Federated Learning Approach","summary":" This paper proposes a privacy-preserving data fusion method for traffic state\nestimation (TSE). Unlike existing works that assume all data sources to be\naccessible by a single trusted party, we explicitly address data privacy\nconcerns that arise in the collaboration and data sharing between multiple data\nowners, such as municipal authorities (MAs) and mobility providers (MPs). To\nthis end, we propose a novel vertical federated learning (FL) approach, FedTSE,\nthat enables multiple data owners to collaboratively train and apply a TSE\nmodel without having to exchange their private data. 
To enhance the\napplicability of the proposed FedTSE in common TSE scenarios with limited\navailability of ground-truth data, we further propose a privacy-preserving\nphysics-informed FL approach, i.e., FedTSE-PI, that integrates traffic models\ninto FL. Real-world data validation shows that the proposed methods can protect\nprivacy while yielding similar accuracy to the oracle method without privacy\nconsiderations.\n","authors":["Qiqing Wang","Kaidi Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18394v5","updated":"2024-01-22T10:44:50Z","published":"2023-05-28T12:34:07Z","title":"On Optimal Regularization Parameters via Bilevel Learning","summary":" Variational regularization is commonly used to solve linear inverse problems,\nand involves augmenting a data fidelity by a regularizer. The regularizer is\nused to promote a priori information and is weighted by a regularization\nparameter. Selection of an appropriate regularization parameter is critical,\nwith various choices leading to very different reconstructions. Classical\nstrategies used to determine a suitable parameter value include the discrepancy\nprinciple and the L-curve criterion, and in recent years a supervised machine\nlearning approach called bilevel learning has been employed. Bilevel learning\nis a powerful framework to determine optimal parameters and involves solving a\nnested optimization problem. While previous strategies enjoy various\ntheoretical results, the well-posedness of bilevel learning in this setting is\nstill an open question. In particular, a necessary property is positivity of\nthe determined regularization parameter. In this work, we provide a new\ncondition that better characterizes positivity of optimal regularization\nparameters than the existing theory. Numerical results verify and explore this\nnew condition for both small and high-dimensional problems.\n","authors":["Matthias J. Ehrhardt","Silvia Gazzola","Sebastian J. Scott"],"pdf_url":"https://arxiv.org/pdf/2305.18394v5.pdf","comment":"34 pages, 11 figures. Version for publication"},{"id":"http://arxiv.org/abs/2401.11825v1","updated":"2024-01-22T10:38:14Z","published":"2024-01-22T10:38:14Z","title":"Sparse discovery of differential equations based on multi-fidelity\n Gaussian process","summary":" Sparse identification of differential equations aims to compute the analytic\nexpressions from the observed data explicitly. However, there exist two primary\nchallenges. Firstly, it exhibits sensitivity to the noise in the observed data,\nparticularly for the derivatives computations. Secondly, existing literature\npredominantly concentrates on single-fidelity (SF) data, which imposes\nlimitations on its applicability due to the computational cost. In this paper,\nwe present two novel approaches to address these problems from the view of\nuncertainty quantification. We construct a surrogate model employing the\nGaussian process regression (GPR) to mitigate the effect of noise in the\nobserved data, quantify its uncertainty, and ultimately recover the equations\naccurately. Subsequently, we exploit the multi-fidelity Gaussian processes\n(MFGP) to address scenarios involving multi-fidelity (MF), sparse, and noisy\nobserved data. 
We demonstrate the robustness and effectiveness of our\nmethodologies through several numerical experiments.\n","authors":["Yuhuang Meng","Yue Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.11825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.07178v2","updated":"2024-01-22T10:31:56Z","published":"2023-12-12T11:22:31Z","title":"Beyond Expected Return: Accounting for Policy Reproducibility when\n Evaluating Reinforcement Learning Algorithms","summary":" Many applications in Reinforcement Learning (RL) usually have noise or\nstochasticity present in the environment. Beyond their impact on learning,\nthese uncertainties lead the exact same policy to perform differently, i.e.\nyield different return, from one roll-out to another. Common evaluation\nprocedures in RL summarise the consequent return distributions using solely the\nexpected return, which does not account for the spread of the distribution. Our\nwork defines this spread as the policy reproducibility: the ability of a policy\nto obtain similar performance when rolled out many times, a crucial property in\nsome real-world applications. We highlight that existing procedures that only\nuse the expected return are limited on two fronts: first an infinite number of\nreturn distributions with a wide range of performance-reproducibility\ntrade-offs can have the same expected return, limiting its effectiveness when\nused for comparing policies; second, the expected return metric does not leave\nany room for practitioners to choose the best trade-off value for considered\napplications. In this work, we address these limitations by recommending the\nuse of Lower Confidence Bound, a metric taken from Bayesian optimisation that\nprovides the user with a preference parameter to choose a desired\nperformance-reproducibility trade-off. We also formalise and quantify policy\nreproducibility, and demonstrate the benefit of our metrics using extensive\nexperiments of popular RL algorithms on common uncertain RL tasks.\n","authors":["Manon Flageat","Bryan Lim","Antoine Cully"],"pdf_url":"https://arxiv.org/pdf/2312.07178v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11817v1","updated":"2024-01-22T10:26:14Z","published":"2024-01-22T10:26:14Z","title":"Hallucination is Inevitable: An Innate Limitation of Large Language\n Models","summary":" Hallucination has been widely recognized to be a significant drawback for\nlarge language models (LLMs). There have been many works that attempt to reduce\nthe extent of hallucination. These efforts have mostly been empirical so far,\nwhich cannot answer the fundamental question whether it can be completely\neliminated. In this paper, we formalize the problem and show that it is\nimpossible to eliminate hallucination in LLMs. Specifically, we define a formal\nworld where hallucination is defined as inconsistencies between a computable\nLLM and a computable ground truth function. By employing results from learning\ntheory, we show that LLMs cannot learn all of the computable functions and will\ntherefore always hallucinate. Since the formal world is a part of the real\nworld which is much more complicated, hallucinations are also inevitable for\nreal world LLMs. Furthermore, for real world LLMs constrained by provable time\ncomplexity, we describe the hallucination-prone tasks and empirically validate\nour claims. 
Finally, using the formal world framework, we discuss the possible\nmechanisms and efficacies of existing hallucination mitigators as well as the\npractical implications on the safe deployment of LLMs.\n","authors":["Ziwei Xu","Sanjay Jain","Mohan Kankanhalli"],"pdf_url":"https://arxiv.org/pdf/2401.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11810v1","updated":"2024-01-22T10:14:45Z","published":"2024-01-22T10:14:45Z","title":"Generalization and Informativeness of Conformal Prediction","summary":" The safe integration of machine learning modules in decision-making processes\nhinges on their ability to quantify uncertainty. A popular technique to achieve\nthis goal is conformal prediction (CP), which transforms an arbitrary base\npredictor into a set predictor with coverage guarantees. While CP certifies the\npredicted set to contain the target quantity with a user-defined tolerance, it\ndoes not provide control over the average size of the predicted sets, i.e.,\nover the informativeness of the prediction. In this work, a theoretical\nconnection is established between the generalization properties of the base\npredictor and the informativeness of the resulting CP prediction sets. To this\nend, an upper bound is derived on the expected size of the CP set predictor\nthat builds on generalization error bounds for the base predictor. The derived\nupper bound provides insights into the dependence of the average size of the CP\nset predictor on the amount of calibration data, the target reliability, and\nthe generalization performance of the base predictor. The theoretical insights\nare validated using simple numerical regression and classification tasks.\n","authors":["Matteo Zecchin","Sangwoo Park","Osvaldo Simeone","Fredrik Hellström"],"pdf_url":"https://arxiv.org/pdf/2401.11810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.01168v3","updated":"2024-01-22T10:09:20Z","published":"2022-12-02T13:47:21Z","title":"Towards Cross Domain Generalization of Hamiltonian Representation via\n Meta Learning","summary":" Recent advances in deep learning for physics have focused on discovering\nshared representations of target systems by incorporating physics priors or\ninductive biases into neural networks. While effective, these methods are\nlimited to the system domain, where the type of system remains consistent and\nthus cannot ensure the adaptation to new, or unseen physical systems governed\nby different laws. For instance, a neural network trained on a mass-spring\nsystem cannot guarantee accurate predictions for the behavior of a two-body\nsystem or any other system with different physical laws. In this work, we take\na significant leap forward by targeting cross domain generalization within the\nfield of Hamiltonian dynamics. We model our system with a graph neural network\nand employ a meta learning algorithm to enable the model to gain experience\nover a distribution of tasks and make it adapt to new physics. Our approach\naims to learn a unified Hamiltonian representation that is generalizable across\nmultiple system domains, thereby overcoming the limitations of system-specific\nmodels. Our results demonstrate that the meta-trained model not only adapts\neffectively to new systems but also captures a generalized Hamiltonian\nrepresentation that is consistent across different physical domains. 
Overall,\nthrough the use of meta learning, we offer a framework that achieves cross\ndomain generalization, providing a step towards a unified model for\nunderstanding a wide array of dynamical systems via deep learning.\n","authors":["Yeongwoo Song","Hawoong Jeong"],"pdf_url":"https://arxiv.org/pdf/2212.01168v3.pdf","comment":"Conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2311.06558v2","updated":"2024-01-22T10:07:39Z","published":"2023-11-11T12:28:31Z","title":"Convolve and Conquer: Data Comparison with Wiener Filters","summary":" Quantitative evaluations of differences and/or similarities between data\nsamples define and shape optimisation problems associated with learning data\ndistributions. Current methods to compare data often suffer from limitations in\ncapturing such distributions or lack desirable mathematical properties for\noptimisation (e.g. smoothness, differentiability, or convexity). In this paper,\nwe introduce a new method to measure (dis)similarities between paired samples\ninspired by Wiener-filter theory. The convolutional nature of Wiener filters\nallows us to comprehensively compare data samples in a globally correlated way.\nWe validate our approach in four machine learning applications: data\ncompression, medical imaging imputation, translated classification, and\nnon-parametric generative modelling. Our results demonstrate increased\nresolution in reconstructed images with better perceptual quality and higher\ndata fidelity, as well as robustness against translations, compared to\nconventional mean-squared-error analogue implementations.\n","authors":["Deborah Pelacani Cruz","George Strong","Oscar Bates","Carlos Cueto","Jiashun Yao","Lluis Guasch"],"pdf_url":"https://arxiv.org/pdf/2311.06558v2.pdf","comment":"10 pages, 5 figures, Medical Imaging Meets Neurips Workshop"},{"id":"http://arxiv.org/abs/2401.11798v1","updated":"2024-01-22T09:54:49Z","published":"2024-01-22T09:54:49Z","title":"Knowledge Distillation on Spatial-Temporal Graph Convolutional Network\n for Traffic Prediction","summary":" Efficient real-time traffic prediction is crucial for reducing transportation\ntime. To predict traffic conditions, we employ a spatio-temporal graph neural\nnetwork (ST-GNN) to model our real-time traffic data as temporal graphs.\nDespite its capabilities, it often encounters challenges in delivering\nefficient real-time predictions for real-world traffic data. Recognizing the\nsignificance of timely prediction due to the dynamic nature of real-time data,\nwe employ knowledge distillation (KD) as a solution to enhance the execution\ntime of ST-GNNs for traffic prediction. In this paper, We introduce a cost\nfunction designed to train a network with fewer parameters (the student) using\ndistilled data from a complex network (the teacher) while maintaining its\naccuracy close to that of the teacher. We use knowledge distillation,\nincorporating spatial-temporal correlations from the teacher network to enable\nthe student to learn the complex patterns perceived by the teacher. However, a\nchallenge arises in determining the student network architecture rather than\nconsidering it inadvertently. To address this challenge, we propose an\nalgorithm that utilizes the cost function to calculate pruning scores,\naddressing small network architecture search issues, and jointly fine-tunes the\nnetwork resulting from each pruning stage using KD. Ultimately, we evaluate our\nproposed ideas on two real-world datasets, PeMSD7 and PeMSD8. 
The results\nindicate that our method can maintain the student's accuracy close to that of\nthe teacher, even with the retention of only $3\\%$ of network parameters.\n","authors":["Mohammad Izadi","Mehran Safayani","Abdolreza Mirzaei"],"pdf_url":"https://arxiv.org/pdf/2401.11798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12817v2","updated":"2024-01-22T09:44:18Z","published":"2023-10-19T15:12:44Z","title":"2D-3D Interlaced Transformer for Point Cloud Segmentation with\n Scene-Level Supervision","summary":" We present a Multimodal Interlaced Transformer (MIT) that jointly considers\n2D and 3D data for weakly supervised point cloud segmentation. Research studies\nhave shown that 2D and 3D features are complementary for point cloud\nsegmentation. However, existing methods require extra 2D annotations to achieve\n2D-3D information fusion. Considering the high annotation cost of point clouds,\neffective 2D and 3D feature fusion based on weakly supervised learning is in\ngreat demand. To this end, we propose a transformer model with two encoders and\none decoder for weakly supervised point cloud segmentation using only\nscene-level class tags. Specifically, the two encoders compute the\nself-attended features for 3D point clouds and 2D multi-view images,\nrespectively. The decoder implements interlaced 2D-3D cross-attention and\ncarries out implicit 2D and 3D feature fusion. We alternately switch the roles\nof queries and key-value pairs in the decoder layers. It turns out that the 2D\nand 3D features are iteratively enriched by each other. Experiments show that\nit performs favorably against existing weakly supervised point cloud\nsegmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The\nproject page will be available at https://jimmy15923.github.io/mit_web/.\n","authors":["Cheng-Kun Yang","Min-Hung Chen","Yung-Yu Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2310.12817v2.pdf","comment":"ICCV 2023 (main + supp). Website:\n https://jimmy15923.github.io/mit_web/"},{"id":"http://arxiv.org/abs/2401.11792v1","updated":"2024-01-22T09:44:16Z","published":"2024-01-22T09:44:16Z","title":"Safe and Generalized end-to-end Autonomous Driving System with\n Reinforcement Learning and Demonstrations","summary":" An intelligent driving system should be capable of dynamically formulating\nappropriate driving strategies based on the current environment and vehicle\nstatus, while ensuring the security and reliability of the system. However,\nexisting methods based on reinforcement learning and imitation learning suffer\nfrom low safety, poor generalization, and inefficient sampling. Additionally,\nthey cannot accurately predict future driving trajectories, and the accurate\nprediction of future driving trajectories is a precondition for making optimal\ndecisions. To solve these problems, in this paper, we introduce a Safe and\nGeneralized end-to-end Autonomous Driving System (SGADS) for complex and\nvarious scenarios. Our SGADS incorporates variational inference with\nnormalizing flows, enabling the intelligent vehicle to accurately predict\nfuture driving trajectories. Moreover, we propose the formulation of robust\nsafety constraints. Furthermore, we combine reinforcement learning with\ndemonstrations to augment search process of the agent. 
The experimental results\ndemonstrate that our SGADS can significantly improve safety performance,\nexhibit strong generalization, and enhance the training efficiency of\nintelligent vehicles in complex urban scenarios compared to existing methods.\n","authors":["Zuojin Tang","Xiaoyu Chen","YongQiang Li","Jianyu Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11791v1","updated":"2024-01-22T09:41:05Z","published":"2024-01-22T09:41:05Z","title":"SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic\n Segmentation","summary":" Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation\nmodels using training image data with only image-level supervision. Since\nprecise pixel-level annotations are not accessible, existing methods typically\nfocus on producing pseudo masks for training segmentation models by refining\nCAM-like heatmaps. However, the produced heatmaps may only capture\ndiscriminative image regions of target object categories or the associated\nco-occurring backgrounds. To address the issues, we propose a Semantic Prompt\nLearning for WSSS (SemPLeS) framework, which learns to effectively prompt the\nCLIP space to enhance the semantic alignment between the segmented regions and\nthe target object categories. More specifically, we propose Contrastive Prompt\nLearning and Class-associated Semantic Refinement to learn the prompts that\nadequately describe and suppress the image backgrounds associated with each\ntarget object category. In this way, our proposed framework is able to perform\nbetter semantic matching between object regions and the associated text labels,\nresulting in desired pseudo masks for training the segmentation model. The\nproposed SemPLeS framework achieves SOTA performance on the standard WSSS\nbenchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the\nsemantic visualization of our learned prompts. The codes will be released.\n","authors":["Ci-Siang Lin","Chien-Yi Wang","Yu-Chiang Frank Wang","Min-Hung Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11791v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06101v2","updated":"2024-01-22T09:27:30Z","published":"2023-11-10T15:09:04Z","title":"In-Context Learning for MIMO Equalization Using Transformer-Based\n Sequence Models","summary":" Large pre-trained sequence models, such as transformer-based architectures,\nhave been recently shown to have the capacity to carry out in-context learning\n(ICL). In ICL, a decision on a new input is made via a direct mapping of the\ninput and of a few examples from the given task, serving as the task's context,\nto the output variable. No explicit updates of the model parameters are needed\nto tailor the decision to a new task. Pre-training, which amounts to a form of\nmeta-learning, is based on the observation of examples from several related\ntasks. Prior work has shown ICL capabilities for linear regression. In this\nstudy, we leverage ICL to address the inverse problem of multiple-input and\nmultiple-output (MIMO) equalization based on a context given by pilot symbols.\nA task is defined by the unknown fading channel and by the signal-to-noise\nratio (SNR) level, which may be known. To highlight the practical potential of\nthe approach, we allow the presence of quantization of the received signals. 
We\ndemonstrate via numerical results that transformer-based ICL has a threshold\nbehavior, whereby, as the number of pre-training tasks grows, the performance\nswitches from that of a minimum mean squared error (MMSE) equalizer with a\nprior determined by the pre-trained tasks to that of an MMSE equalizer with the\ntrue data-generating prior.\n","authors":["Matteo Zecchin","Kai Yu","Osvaldo Simeone"],"pdf_url":"https://arxiv.org/pdf/2311.06101v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11772v1","updated":"2024-01-22T09:09:10Z","published":"2024-01-22T09:09:10Z","title":"LightDiC: A Simple yet Effective Approach for Large-scale Digraph\n Representation Learning","summary":" Most existing graph neural networks (GNNs) are limited to undirected graphs,\nwhose restricted scope of the captured relational information hinders their\nexpressive capabilities and deployments in real-world scenarios. Compared with\nundirected graphs, directed graphs (digraphs) fit the demand for modeling more\ncomplex topological systems by capturing more intricate relationships between\nnodes, such as formulating transportation and financial networks. While some\ndirected GNNs have been introduced, their inspiration mainly comes from deep\nlearning architectures, which lead to redundant complexity and computation,\nmaking them inapplicable to large-scale databases. To address these issues, we\npropose LightDiC, a scalable variant of the digraph convolution based on the\nmagnetic Laplacian. Since topology-related computations are conducted solely\nduring offline pre-processing, LightDiC achieves exceptional scalability,\nenabling downstream predictions to be trained separately without incurring\nrecursive computational costs. Theoretical analysis shows that LightDiC\nutilizes directed information to achieve message passing based on the complex\nfield, which corresponds to the proximal gradient descent process of the\nDirichlet energy optimization function from the perspective of digraph signal\ndenoising, ensuring its expressiveness. Experimental results demonstrate that\nLightDiC performs comparably well or even outperforms other SOTA methods in\nvarious downstream tasks, with fewer learnable parameters and higher training\nefficiency. Notably, LightDiC is the first DiGNN to provide satisfactory\nresults in the most representative large-scale database (ogbn-papers100M).\n","authors":["Xunkai Li","Meihao Liao","Zhengyu Wu","Daohan Su","Wentao Zhang","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11772v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2401.11768v1","updated":"2024-01-22T09:03:16Z","published":"2024-01-22T09:03:16Z","title":"ADA-GNN: Atom-Distance-Angle Graph Neural Network for Crystal Material\n Property Prediction","summary":" Property prediction is a fundamental task in crystal material research. To\nmodel atoms and structures, structures represented as graphs are widely used\nand graph learning-based methods have achieved significant progress. Bond\nangles and bond distances are two key structural information that greatly\ninfluence crystal properties. However, most of the existing works only consider\nbond distances and overlook bond angles. The main challenge lies in the time\ncost of handling bond angles, which leads to a significant increase in\ninference time. 
To solve this issue, we first propose a crystal structure\nmodeling based on dual scale neighbor partitioning mechanism, which uses a\nlarger scale cutoff for edge neighbors and a smaller scale cutoff for angle\nneighbors. Then, we propose a novel Atom-Distance-Angle Graph Neural Network\n(ADA-GNN) for property prediction tasks, which can process node information and\nstructural information separately. The accuracy of predictions and inference\ntime are improved with the dual scale modeling and the specially designed\narchitecture of ADA-GNN. The experimental results validate that our approach\nachieves state-of-the-art results in two large-scale material benchmark\ndatasets on property prediction tasks.\n","authors":["Jiao Huang","Qianli Xing","Jinglong Ji","Bo Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11768v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05023v2","updated":"2024-01-22T08:47:49Z","published":"2023-06-08T08:22:27Z","title":"Beyond Vanilla Variational Autoencoders: Detecting Posterior Collapse in\n Conditional and Hierarchical Variational Autoencoders","summary":" The posterior collapse phenomenon in variational autoencoder (VAE), where the\nvariational posterior distribution closely matches the prior distribution, can\nhinder the quality of the learned latent variables. As a consequence of\nposterior collapse, the latent variables extracted by the encoder in VAE\npreserve less information from the input data and thus fail to produce\nmeaningful representations as input to the reconstruction process in the\ndecoder. While this phenomenon has been an actively addressed topic related to\nVAE performance, the theory for posterior collapse remains underdeveloped,\nespecially beyond the standard VAE. In this work, we advance the theoretical\nunderstanding of posterior collapse to two important and prevalent yet less\nstudied classes of VAE: conditional VAE and hierarchical VAE. Specifically, via\na non-trivial theoretical analysis of linear conditional VAE and hierarchical\nVAE with two levels of latent, we prove that the cause of posterior collapses\nin these models includes the correlation between the input and output of the\nconditional VAE and the effect of learnable encoder variance in the\nhierarchical VAE. We empirically validate our theoretical findings for linear\nconditional and hierarchical VAE and demonstrate that these results are also\npredictive for non-linear cases with extensive experiments.\n","authors":["Hien Dang","Tho Tran","Tan Nguyen","Nhat Ho"],"pdf_url":"https://arxiv.org/pdf/2306.05023v2.pdf","comment":"International Conference on Learning Representations (ICLR) 2024"},{"id":"http://arxiv.org/abs/2401.11760v1","updated":"2024-01-22T08:45:29Z","published":"2024-01-22T08:45:29Z","title":"Towards Effective and General Graph Unlearning via Mutual Evolution","summary":" With the rapid advancement of AI applications, the growing needs for data\nprivacy and model robustness have highlighted the importance of machine\nunlearning, especially in thriving graph-based scenarios. However, most\nexisting graph unlearning strategies primarily rely on well-designed\narchitectures or manual process, rendering them less user-friendly and posing\nchallenges in terms of deployment efficiency. Furthermore, striking a balance\nbetween unlearning performance and framework generalization is also a pivotal\nconcern. 
To address the above issues, we propose \\underline{\\textbf{M}}utual\n\\underline{\\textbf{E}}volution \\underline{\\textbf{G}}raph\n\\underline{\\textbf{U}}nlearning (MEGU), a new mutual evolution paradigm that\nsimultaneously evolves the predictive and unlearning capacities of graph\nunlearning. By incorporating aforementioned two components, MEGU ensures\ncomplementary optimization in a unified training framework that aligns with the\nprediction and unlearning requirements. Extensive experiments on 9 graph\nbenchmark datasets demonstrate the superior performance of MEGU in addressing\nunlearning requirements at the feature, node, and edge levels. Specifically,\nMEGU achieves average performance improvements of 2.7\\%, 2.5\\%, and 3.2\\%\nacross these three levels of unlearning tasks when compared to state-of-the-art\nbaselines. Furthermore, MEGU exhibits satisfactory training efficiency,\nreducing time and space overhead by an average of 159.8x and 9.6x,\nrespectively, in comparison to retraining GNN from scratch.\n","authors":["Xunkai Li","Yulin Zhao","Zhengyu Wu","Wentao Zhang","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11760v1.pdf","comment":"Accepted by AAAI 2024 Oral"},{"id":"http://arxiv.org/abs/2401.09953v2","updated":"2024-01-22T08:32:02Z","published":"2024-01-18T12:58:53Z","title":"Through the Dual-Prism: A Spectral Perspective on Graph Data\n Augmentation for Graph Classification","summary":" Graph Neural Networks (GNNs) have become the preferred tool to process graph\ndata, with their efficacy being boosted through graph data augmentation\ntechniques. Despite the evolution of augmentation methods, issues like graph\nproperty distortions and restricted structural changes persist. This leads to\nthe question: Is it possible to develop more property-conserving and\nstructure-sensitive augmentation methods? Through a spectral lens, we\ninvestigate the interplay between graph properties, their augmentation, and\ntheir spectral behavior, and found that keeping the low-frequency eigenvalues\nunchanged can preserve the critical properties at a large scale when generating\naugmented graphs. These observations inform our introduction of the Dual-Prism\n(DP) augmentation method, comprising DP-Noise and DP-Mask, which adeptly\nretains essential graph properties while diversifying augmented graphs.\nExtensive experiments validate the efficiency of our approach, providing a new\nand promising direction for graph data augmentation.\n","authors":["Yutong Xia","Runpeng Yu","Yuxuan Liang","Xavier Bresson","Xinchao Wang","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2401.09953v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11755v1","updated":"2024-01-22T08:31:53Z","published":"2024-01-22T08:31:53Z","title":"FedGTA: Topology-aware Averaging for Federated Graph Learning","summary":" Federated Graph Learning (FGL) is a distributed machine learning paradigm\nthat enables collaborative training on large-scale subgraphs across multiple\nlocal systems. Existing FGL studies fall into two categories: (i) FGL\nOptimization, which improves multi-client training in existing machine learning\nmodels; (ii) FGL Model, which enhances performance with complex local models\nand multi-client interactions. However, most FGL optimization strategies are\ndesigned specifically for the computer vision domain and ignore graph\nstructure, presenting dissatisfied performance and slow convergence. 
Meanwhile,\ncomplex local model architectures in FGL Models studies lack scalability for\nhandling large-scale subgraphs and have deployment limitations. To address\nthese issues, we propose Federated Graph Topology-aware Aggregation (FedGTA), a\npersonalized optimization strategy that optimizes through topology-aware local\nsmoothing confidence and mixed neighbor features. During experiments, we deploy\nFedGTA in 12 multi-scale real-world datasets with the Louvain and Metis split.\nThis allows us to evaluate the performance and robustness of FedGTA across a\nrange of scenarios. Extensive experiments demonstrate that FedGTA achieves\nstate-of-the-art performance while exhibiting high scalability and efficiency.\nThe experiment includes ogbn-papers100M, the most representative large-scale\ngraph database so that we can verify the applicability of our method to\nlarge-scale graph learning. To the best of our knowledge, our study is the\nfirst to bridge large-scale graph learning with FGL using this optimization\nstrategy, contributing to the development of efficient and scalable FGL\nmethods.\n","authors":["Xunkai Li","Zhengyu Wu","Wentao Zhang","Yinlin Zhu","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11755v1.pdf","comment":"Accepted by VLDB 2024"},{"id":"http://arxiv.org/abs/2401.11750v1","updated":"2024-01-22T08:23:31Z","published":"2024-01-22T08:23:31Z","title":"AdaFGL: A New Paradigm for Federated Node Classification with Topology\n Heterogeneity","summary":" Recently, Federated Graph Learning (FGL) has attracted significant attention\nas a distributed framework based on graph neural networks, primarily due to its\ncapability to break data silos. Existing FGL studies employ community split on\nthe homophilous global graph by default to simulate federated semi-supervised\nnode classification settings. Such a strategy assumes the consistency of\ntopology between the multi-client subgraphs and the global graph, where\nconnected nodes are highly likely to possess similar feature distributions and\nthe same label. However, in real-world implementations, the varying\nperspectives of local data engineering result in various subgraph topologies,\nposing unique heterogeneity challenges in FGL. Unlike the well-known label\nNon-independent identical distribution (Non-iid) problems in federated\nlearning, FGL heterogeneity essentially reveals the topological divergence\namong multiple clients, namely homophily or heterophily. To simulate and handle\nthis unique challenge, we introduce the concept of structure Non-iid split and\nthen present a new paradigm called \\underline{Ada}ptive \\underline{F}ederated\n\\underline{G}raph \\underline{L}earning (AdaFGL), a decoupled two-step\npersonalized approach. To begin with, AdaFGL employs standard multi-client\nfederated collaborative training to acquire the federated knowledge extractor\nby aggregating uploaded models in the final round at the server. Then, each\nclient conducts personalized training based on the local subgraph and the\nfederated knowledge extractor. Extensive experiments on the 12 graph benchmark\ndatasets validate the superior performance of AdaFGL over state-of-the-art\nbaselines. 
Specifically, in terms of test accuracy, our proposed AdaFGL\noutperforms baselines by significant margins of 3.24\\% and 5.57\\% on community\nsplit and structure Non-iid split, respectively.\n","authors":["Xunkai Li","Zhengyu Wu","Wentao Zhang","Henan Sun","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11750v1.pdf","comment":"Accepted by ICDE 2024"},{"id":"http://arxiv.org/abs/2401.11748v1","updated":"2024-01-22T08:20:47Z","published":"2024-01-22T08:20:47Z","title":"GI-PIP: Do We Require Impractical Auxiliary Dataset for Gradient\n Inversion Attacks?","summary":" Deep gradient inversion attacks expose a serious threat to Federated Learning\n(FL) by accurately recovering private data from shared gradients. However, the\nstate-of-the-art heavily relies on impractical assumptions to access excessive\nauxiliary data, which violates the basic data partitioning principle of FL. In\nthis paper, a novel method, Gradient Inversion Attack using Practical Image\nPrior (GI-PIP), is proposed under a revised threat model. GI-PIP exploits\nanomaly detection models to capture the underlying distribution from fewer\ndata, while GAN-based methods consume significant more data to synthesize\nimages. The extracted distribution is then leveraged to regulate the attack\nprocess as Anomaly Score loss. Experimental results show that GI-PIP achieves a\n16.12 dB PSNR recovery using only 3.8\\% data of ImageNet, while GAN-based\nmethods necessitate over 70\\%. Moreover, GI-PIP exhibits superior capability on\ndistribution generalization compared to GAN-based methods. Our approach\nsignificantly alleviates the auxiliary data requirement on both amount and\ndistribution in gradient inversion attacks, hence posing more substantial\nthreat to real-world FL.\n","authors":["Yu sun","Gaojian Xiong","Xianxun Yao","Kailang Ma","Jian Cui"],"pdf_url":"https://arxiv.org/pdf/2401.11748v1.pdf","comment":"5pages, 5 figures, accepted to ICASSP 2024, not published yet"},{"id":"http://arxiv.org/abs/2401.10765v2","updated":"2024-01-22T08:17:42Z","published":"2024-01-19T15:37:11Z","title":"Starlit: Privacy-Preserving Federated Learning to Enhance Financial\n Fraud Detection","summary":" Federated Learning (FL) is a data-minimization approach enabling\ncollaborative model training across diverse clients with local data, avoiding\ndirect data exchange. However, state-of-the-art FL solutions to identify\nfraudulent financial transactions exhibit a subset of the following\nlimitations. They (1) lack a formal security definition and proof, (2) assume\nprior freezing of suspicious customers' accounts by financial institutions\n(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$\ncomputationally expensive modular exponentiation (where $n$ is the total number\nof financial institutions) or highly inefficient fully homomorphic encryption,\n(4) assume the parties have already completed the identity alignment phase,\nhence excluding it from the implementation, performance evaluation, and\nsecurity analysis, and (5) struggle to resist clients' dropouts. This work\nintroduces Starlit, a novel scalable privacy-preserving FL mechanism that\novercomes these limitations. It has various applications, such as enhancing\nfinancial fraud detection, mitigating terrorism, and enhancing digital health.\nWe implemented Starlit and conducted a thorough performance analysis using\nsynthetic data from a key player in global financial transactions. 
The\nevaluation indicates Starlit's scalability, efficiency, and accuracy.\n","authors":["Aydin Abadi","Bradley Doyle","Francesco Gini","Kieron Guinamard","Sasi Kumar Murakonda","Jack Liddell","Paul Mellor","Steven J. Murdoch","Mohammad Naseri","Hector Page","George Theodorakopoulos","Suzanne Weller"],"pdf_url":"https://arxiv.org/pdf/2401.10765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19604v3","updated":"2024-01-22T08:13:50Z","published":"2023-05-31T07:22:15Z","title":"Medication Recommendation via Domain Knowledge Informed Deep Learning","summary":" Medication recommendation is a fundamental yet crucial branch of healthcare,\nwhich provides opportunities to support clinical physicians with more accurate\nmedication prescriptions for patients with complex health conditions. Learning\nfrom electronic health records (EHR) to recommend medications is the most\ncommon way in previous studies. However, most of them neglect incorporating\ndomain knowledge according to the clinical manifestations in the EHR of the\npatient. To address these issues, we propose a novel \\textbf{D}omain\n\\textbf{K}nowledge \\textbf{I}nformed \\textbf{Net}work (DKINet) to integrate\ndomain knowledge with observable clinical manifestations of the patient, which\nis the first dynamic domain knowledge informed framework toward medication\nrecommendation. In particular, we first design a knowledge-driven encoder to\ncapture the domain information and then develop a data-driven encoder to\nintegrate domain knowledge into the observable EHR. To endow the model with the\ncapability of temporal decision, we design an explicit medication encoder for\nlearning the longitudinal dependence of the patient. Extensive experiments on\nthree publicly available datasets verify the superiority of our method. The\ncode will be public upon acceptance.\n","authors":["Sicen Liu","Xiaolong Wang","Xianbing Zhao","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2305.19604v3.pdf","comment":"11 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11740v1","updated":"2024-01-22T07:37:25Z","published":"2024-01-22T07:37:25Z","title":"Multi-level Cross-modal Alignment for Image Clustering","summary":" Recently, the cross-modal pretraining model has been employed to produce\nmeaningful pseudo-labels to supervise the training of an image clustering\nmodel. However, numerous erroneous alignments in a cross-modal pre-training\nmodel could produce poor-quality pseudo-labels and degrade clustering\nperformance. To solve the aforementioned issue, we propose a novel\n\\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in\na cross-modal pretraining model for downstream tasks, by building a smaller but\nbetter semantic space and aligning the images and texts in three levels, i.e.,\ninstance-level, prototype-level, and semantic-level. Theoretical results show\nthat our proposed method converges, and suggests effective means to reduce the\nexpected clustering risk of our method. 
Experimental results on five benchmark\ndatasets clearly show the superiority of our new method.\n","authors":["Liping Qiu","Qin Zhang","Xiaojun Chen","Shaotian Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11740v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11739v1","updated":"2024-01-22T07:34:06Z","published":"2024-01-22T07:34:06Z","title":"EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models","summary":" Diffusion models have recently received increasing research attention for\ntheir remarkable transfer abilities in semantic segmentation tasks. However,\ngenerating fine-grained segmentation masks with diffusion models often requires\nadditional training on annotated datasets, leaving it unclear to what extent\npre-trained diffusion models alone understand the semantic relations of their\ngenerated images. To address this question, we leverage the semantic knowledge\nextracted from Stable Diffusion (SD) and aim to develop an image segmentor\ncapable of generating fine-grained segmentation maps without any additional\ntraining. The primary difficulty stems from the fact that semantically\nmeaningful feature maps typically exist only in the spatially lower-dimensional\nlayers, which poses a challenge in directly extracting pixel-level semantic\nrelations from these feature maps. To overcome this issue, our framework\nidentifies semantic correspondences between image pixels and spatial locations\nof low-dimensional feature maps by exploiting SD's generation process and\nutilizes them for constructing image-resolution segmentation maps. In extensive\nexperiments, the produced segmentation maps are demonstrated to be well\ndelineated and capture detailed parts of the images, indicating the existence\nof highly accurate pixel-level semantic knowledge in diffusion models.\n","authors":["Koichi Namekata","Amirmojtaba Sabour","Sanja Fidler","Seung Wook Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11739v1.pdf","comment":"ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/"},{"id":"http://arxiv.org/abs/2401.11736v1","updated":"2024-01-22T07:24:15Z","published":"2024-01-22T07:24:15Z","title":"Attention on Personalized Clinical Decision Support System: Federated\n Learning Approach","summary":" Health management has become a primary problem as new kinds of diseases and\ncomplex symptoms are introduced to a rapidly growing modern society. Building a\nbetter and smarter healthcare infrastructure is one of the ultimate goals of a\nsmart city. To the best of our knowledge, neural network models are already\nemployed to assist healthcare professionals in achieving this goal. Typically,\ntraining a neural network requires a rich amount of data but heterogeneous and\nvulnerable properties of clinical data introduce a challenge for the\ntraditional centralized network. Moreover, adding new inputs to a medical\ndatabase requires re-training an existing model from scratch. To tackle these\nchallenges, we proposed a deep learning-based clinical decision support system\ntrained and managed under a federated learning paradigm. We focused on a novel\nstrategy to guarantee the safety of patient privacy and overcome the risk of\ncyberattacks while enabling large-scale clinical data mining. As a result, we\ncan leverage rich clinical data for training each local neural network without\nthe need for exchanging the confidential data of patients. Moreover, we\nimplemented the proposed scheme as a sequence-to-sequence model architecture\nintegrating the attention mechanism. 
Thus, our objective is to provide a\npersonalized clinical decision support system with evolvable characteristics\nthat can deliver accurate solutions and assist healthcare professionals in\nmedical diagnosing.\n","authors":["Chu Myaet Thwal","Kyi Thar","Ye Lin Tun","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11736v1.pdf","comment":"Published in IEEE BigComp 2021"},{"id":"http://arxiv.org/abs/2401.11731v1","updated":"2024-01-22T07:19:16Z","published":"2024-01-22T07:19:16Z","title":"Fast and Scalable Network Slicing by Integrating Deep Learning with\n Lagrangian Methods","summary":" Network slicing is a key technique in 5G and beyond for efficiently\nsupporting diverse services. Many network slicing solutions rely on deep\nlearning to manage complex and high-dimensional resource allocation problems.\nHowever, deep learning models suffer limited generalization and adaptability to\ndynamic slicing configurations. In this paper, we propose a novel framework\nthat integrates constrained optimization methods and deep learning models,\nresulting in strong generalization and superior approximation capability. Based\non the proposed framework, we design a new neural-assisted algorithm to\nallocate radio resources to slices to maximize the network utility under\ninter-slice resource constraints. The algorithm exhibits high scalability,\naccommodating varying numbers of slices and slice configurations with ease. We\nimplement the proposed solution in a system-level network simulator and\nevaluate its performance extensively by comparing it to state-of-the-art\nsolutions including deep reinforcement learning approaches. The numerical\nresults show that our solution obtains near-optimal quality-of-service\nsatisfaction and promising generalization performance under different network\nslicing scenarios.\n","authors":["Tianlun Hu","Qi Liao","Qiang Liu","Antonio Massaro","Georg Carle"],"pdf_url":"https://arxiv.org/pdf/2401.11731v1.pdf","comment":"6 pages, 5 figures, IEEE Global Communications Conference 2023"},{"id":"http://arxiv.org/abs/2305.00418v3","updated":"2024-01-22T07:09:17Z","published":"2023-04-30T07:28:06Z","title":"An Empirical Study of Using Large Language Models for Unit Test\n Generation","summary":" A code generation model generates code by taking a prompt from a code\ncomment, existing code, or a combination of both. Although code generation\nmodels (e.g., GitHub Copilot) are increasingly being adopted in practice, it is\nunclear whether they can successfully be used for unit test generation without\nfine-tuning for a strongly typed language like Java. To fill this gap, we\ninvestigated how well three models (Codex, GPT-3.5-Turbo, and StarCoder) can\ngenerate unit tests. We used two benchmarks (HumanEval and Evosuite SF110) to\ninvestigate the effect of context generation on the unit test generation\nprocess. We evaluated the models based on compilation rates, test correctness,\ntest coverage, and test smells. We found that the Codex model achieved above\n80% coverage for the HumanEval dataset, but no model had more than 2% coverage\nfor the EvoSuite SF110 benchmark. The generated tests also suffered from test\nsmells, such as Duplicated Asserts and Empty Tests.\n","authors":["Mohammed Latif Siddiq","Joanna C. S. 
Santos","Ridwanul Hasan Tanvir","Noshin Ulfat","Fahmid Al Rifat","Vinicius Carvalho Lopes"],"pdf_url":"https://arxiv.org/pdf/2305.00418v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11726v1","updated":"2024-01-22T07:07:32Z","published":"2024-01-22T07:07:32Z","title":"Detecting Out-of-Distribution Samples via Conditional Distribution\n Entropy with Optimal Transport","summary":" When deploying a trained machine learning model in the real world, it is\ninevitable to receive inputs from out-of-distribution (OOD) sources. For\ninstance, in continual learning settings, it is common to encounter OOD samples\ndue to the non-stationarity of a domain. More generally, when we have access to\na set of test inputs, the existing rich line of OOD detection solutions,\nespecially the recent promise of distance-based methods, falls short in\neffectively utilizing the distribution information from training samples and\ntest inputs. In this paper, we argue that empirical probability distributions\nthat incorporate geometric information from both training samples and test\ninputs can be highly beneficial for OOD detection in the presence of test\ninputs available. To address this, we propose to model OOD detection as a\ndiscrete optimal transport problem. Within the framework of optimal transport,\nwe propose a novel score function known as the \\emph{conditional distribution\nentropy} to quantify the uncertainty of a test input being an OOD sample. Our\nproposal inherits the merits of certain distance-based methods while\neliminating the reliance on distribution assumptions, a-prior knowledge, and\nspecific training mechanisms. Extensive experiments conducted on benchmark\ndatasets demonstrate that our method outperforms its competitors in OOD\ndetection.\n","authors":["Chuanwen Feng","Wenlong Chen","Ao Ke","Yilong Ren","Xike Xie","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11726v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11720v1","updated":"2024-01-22T06:47:00Z","published":"2024-01-22T06:47:00Z","title":"Graph Condensation: A Survey","summary":" The burgeoning volume of graph data poses significant challenges in storage,\ntransmission, and particularly the training of graph neural networks (GNNs). To\naddress these challenges, graph condensation (GC) has emerged as an innovative\nsolution. GC focuses on synthesizing a compact yet highly representative graph,\non which GNNs can achieve performance comparable to trained on the large\noriginal graph. The notable efficacy of GC and its broad prospects have\ngarnered significant attention and spurred extensive research. This survey\npaper provides an up-to-date and systematic overview of GC, organizing existing\nresearch into four categories aligned with critical GC evaluation criteria:\neffectiveness, generalization, fairness, and efficiency. To facilitate an\nin-depth and comprehensive understanding of GC, we examine various methods\nunder each category and thoroughly discuss two essential components within GC:\noptimization strategies and condensed graph generation. 
Additionally, we\nintroduce the applications of GC in a variety of fields, and highlight the\npresent challenges and novel insights in GC, promoting advancements in future\nresearch.\n","authors":["Xinyi Gao","Junliang Yu","Wei Jiang","Tong Chen","Wentao Zhang","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2401.11720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11698v1","updated":"2024-01-22T05:44:43Z","published":"2024-01-22T05:44:43Z","title":"Admission Prediction in Undergraduate Applications: an Interpretable\n Deep Learning Approach","summary":" This article addresses the challenge of validating the admission committee's\ndecisions for undergraduate admissions. In recent years, the traditional review\nprocess has struggled to handle the overwhelmingly large amount of applicants'\ndata. Moreover, this traditional assessment often leads to human bias, which\nmight result in discrimination among applicants. Although classical machine\nlearning-based approaches exist that aim to verify the quantitative assessment\nmade by the application reviewers, these methods lack scalability and suffer\nfrom performance issues when a large volume of data is in place. In this\ncontext, we propose deep learning-based classifiers, namely Feed-Forward and\nInput Convex neural networks, which overcome the challenges faced by the\nexisting methods. Furthermore, we give additional insights into our model by\nincorporating an interpretability module, namely LIME. Our training and test\ndatasets comprise applicants' data with a wide range of variables and\ninformation. Our models achieve higher accuracy compared to the best-performing\ntraditional machine learning-based approach by a considerable margin of 3.03\\%.\nAdditionally, we show the sensitivity of different features and their relative\nimpacts on the overall admission decision using the LIME technique.\n","authors":["Amisha Priyadarshini","Barbara Martinez-Neda","Sergio Gago-Masague"],"pdf_url":"https://arxiv.org/pdf/2401.11698v1.pdf","comment":"This paper has been accepted for Transdisciplinary AI 2023 conference"},{"id":"http://arxiv.org/abs/2401.11694v1","updated":"2024-01-22T05:26:18Z","published":"2024-01-22T05:26:18Z","title":"Parametric Matrix Models","summary":" We present a general class of machine learning algorithms called parametric\nmatrix models. Parametric matrix models are based on matrix equations, and the\ndesign is motivated by the efficiency of reduced basis methods for\napproximating solutions of parametric equations. The dependent variables can be\ndefined implicitly or explicitly, and the equations may use algebraic,\ndifferential, or integral relations. Parametric matrix models can be trained\nwith empirical data only, and no high-fidelity model calculations are needed.\nWhile originally designed for scientific computing, parametric matrix models\nare universal function approximators that can be applied to general machine\nlearning problems. After introducing the underlying theory, we apply parametric\nmatrix models to a series of different challenges that show their performance\nfor a wide range of problems. For all the challenges tested here, parametric\nmatrix models produce accurate results within a computational framework that\nallows for parameter extrapolation and interpretability.\n","authors":["Patrick Cook","Danny Jammooa","Morten Hjorth-Jensen","Daniel D. 
Lee","Dean Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10371v2","updated":"2024-01-22T05:24:17Z","published":"2024-01-18T20:35:47Z","title":"Langevin Unlearning: A New Perspective of Noisy Gradient Descent for\n Machine Unlearning","summary":" Machine unlearning has raised significant interest with the adoption of laws\nensuring the ``right to be forgotten''. Researchers have provided a\nprobabilistic notion of approximate unlearning under a similar definition of\nDifferential Privacy (DP), where privacy is defined as statistical\nindistinguishability to retraining from scratch. We propose Langevin\nunlearning, an unlearning framework based on noisy gradient descent with\nprivacy guarantees for approximate unlearning problems. Langevin unlearning\nunifies the DP learning process and the privacy-certified unlearning process\nwith many algorithmic benefits. These include approximate certified unlearning\nfor non-convex problems, complexity saving compared to retraining, sequential\nand batch unlearning for multiple unlearning requests. We verify the\npracticality of Langevin unlearning by studying its privacy-utility-complexity\ntrade-off via experiments on benchmark datasets, and also demonstrate its\nsuperiority against gradient-decent-plus-output-perturbation based approximate\nunlearning.\n","authors":["Eli Chien","Haoyu Wang","Ziang Chen","Pan Li"],"pdf_url":"https://arxiv.org/pdf/2401.10371v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11687v1","updated":"2024-01-22T04:54:42Z","published":"2024-01-22T04:54:42Z","title":"TIM: An Efficient Temporal Interaction Module for Spiking Transformer","summary":" Spiking Neural Networks (SNNs), as the third generation of neural networks,\nhave gained prominence for their biological plausibility and computational\nefficiency, especially in processing diverse datasets. The integration of\nattention mechanisms, inspired by advancements in neural network architectures,\nhas led to the development of Spiking Transformers. These have shown promise in\nenhancing SNNs' capabilities, particularly in the realms of both static and\nneuromorphic datasets. Despite their progress, a discernible gap exists in\nthese systems, specifically in the Spiking Self Attention (SSA) mechanism's\neffectiveness in leveraging the temporal processing potential of SNNs. To\naddress this, we introduce the Temporal Interaction Module (TIM), a novel,\nconvolution-based enhancement designed to augment the temporal data processing\nabilities within SNN architectures. TIM's integration into existing SNN\nframeworks is seamless and efficient, requiring minimal additional parameters\nwhile significantly boosting their temporal information handling capabilities.\nThrough rigorous experimentation, TIM has demonstrated its effectiveness in\nexploiting temporal information, leading to state-of-the-art performance across\nvarious neuromorphic datasets.\n","authors":["Sicheng Shen","Dongcheng Zhao","Guobin Shen","Yi Zeng"],"pdf_url":"https://arxiv.org/pdf/2401.11687v1.pdf","comment":"10pages,6figures"},{"id":"http://arxiv.org/abs/2310.03298v3","updated":"2024-01-22T04:39:36Z","published":"2023-10-05T03:56:09Z","title":"A Latent Variable Approach for Non-Hierarchical Multi-Fidelity Adaptive\n Sampling","summary":" Multi-fidelity (MF) methods are gaining popularity for enhancing surrogate\nmodeling and design optimization by incorporating data from various\nlow-fidelity (LF) models. 
While most existing MF methods assume a fixed\ndataset, adaptive sampling methods that dynamically allocate resources among\nfidelity models can achieve higher efficiency in the exploring and exploiting\nthe design space. However, most existing MF methods rely on the hierarchical\nassumption of fidelity levels or fail to capture the intercorrelation between\nmultiple fidelity levels and utilize it to quantify the value of the future\nsamples and navigate the adaptive sampling. To address this hurdle, we propose\na framework hinged on a latent embedding for different fidelity models and the\nassociated pre-posterior analysis to explicitly utilize their correlation for\nadaptive sampling. In this framework, each infill sampling iteration includes\ntwo steps: We first identify the location of interest with the greatest\npotential improvement using the high-fidelity (HF) model, then we search for\nthe next sample across all fidelity levels that maximize the improvement per\nunit cost at the location identified in the first step. This is made possible\nby a single Latent Variable Gaussian Process (LVGP) model that maps different\nfidelity models into an interpretable latent space to capture their\ncorrelations without assuming hierarchical fidelity levels. The LVGP enables us\nto assess how LF sampling candidates will affect HF response with pre-posterior\nanalysis and determine the next sample with the best benefit-to-cost ratio.\nThrough test cases, we demonstrate that the proposed method outperforms the\nbenchmark methods in both MF global fitting (GF) and Bayesian Optimization (BO)\nproblems in convergence rate and robustness. Moreover, the method offers the\nflexibility to switch between GF and BO by simply changing the acquisition\nfunction.\n","authors":["Yi-Ping Chen","Liwei Wang","Yigitcan Comlek","Wei Chen"],"pdf_url":"https://arxiv.org/pdf/2310.03298v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13158v2","updated":"2024-01-22T03:47:17Z","published":"2023-07-24T22:52:02Z","title":"Multi-UAV Speed Control with Collision Avoidance and Handover-aware Cell\n Association: DRL with Action Branching","summary":" This paper presents a deep reinforcement learning solution for optimizing\nmulti-UAV cell-association decisions and their moving velocity on a 3D aerial\nhighway. The objective is to enhance transportation and communication\nperformance, including collision avoidance, connectivity, and handovers. The\nproblem is formulated as a Markov decision process (MDP) with UAVs' states\ndefined by velocities and communication data rates. We propose a neural\narchitecture with a shared decision module and multiple network branches, each\ndedicated to a specific action dimension in a 2D transportation-communication\nspace. This design efficiently handles the multi-dimensional action space,\nallowing independence for individual action dimensions. We introduce two\nmodels, Branching Dueling Q-Network (BDQ) and Branching Dueling Double Deep\nQ-Network (Dueling DDQN), to demonstrate the approach. 
Simulation results show\na significant improvement of 18.32% compared to existing benchmarks.\n","authors":["Zijiang Yan","Wael Jaafar","Bassant Selim","Hina Tabassum"],"pdf_url":"https://arxiv.org/pdf/2307.13158v2.pdf","comment":"IEEE Globecom 2023 Accepted"},{"id":"http://arxiv.org/abs/2401.11679v1","updated":"2024-01-22T03:44:35Z","published":"2024-01-22T03:44:35Z","title":"Simulating Nighttime Visible Satellite Imagery of Tropical Cyclones\n Using Conditional Generative Adversarial Networks","summary":" Visible (VIS) imagery of satellites has various important applications in\nmeteorology, including monitoring Tropical Cyclones (TCs). However, it is\nunavailable at night because of the lack of sunlight. This study presents a\nConditional Generative Adversarial Networks (CGAN) model that generates highly\naccurate nighttime visible reflectance using infrared (IR) bands and sunlight\ndirection parameters as input. The model was trained and validated using target\narea observations of the Advanced Himawari Imager (AHI) in the daytime. This\nstudy also presents the first nighttime model validation using the Day/Night\nBand (DNB) of the Visible/Infrared Imager Radiometer Suite (VIIRS). The daytime\nstatistical results of the Structural Similarity Index Measure (SSIM), Peak\nSignal-to-Noise Ratio (PSNR), Root Mean Square Error (RMSE), Correlation\nCoefficient (CC), and Bias are 0.885, 28.3, 0.0428, 0.984, and -0.0016\nrespectively, completely surpassing the model performance of previous studies.\nThe nighttime statistical results of SSIM, PSNR, RMSE, and CC are 0.821, 24.4,\n0.0643, and 0.969 respectively, which are slightly negatively impacted by the\nparallax between satellites. We performed full-disk model validation which\nproves our model could also be readily applied in the tropical ocean without\nTCs in the northern hemisphere. This model contributes to the nighttime\nmonitoring of meteorological phenomena by providing accurate AI-generated\nvisible imagery with adjustable virtual sunlight directions.\n","authors":["Jinghuai Yao","Puyuan Du","Yucheng Zhao","Yubo Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.01841v3","updated":"2024-01-22T03:43:34Z","published":"2024-01-03T17:19:54Z","title":"Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov\n Decision Processes","summary":" A fundamental (and largely open) challenge in sequential decision-making is\ndealing with non-stationary environments, where exogenous environmental\nconditions change over time. Such problems are traditionally modeled as\nnon-stationary Markov decision processes (NSMDP). However, existing approaches\nfor decision-making in NSMDPs have two major shortcomings: first, they assume\nthat the updated environmental dynamics at the current time are known (although\nfuture dynamics can change); and second, planning is largely pessimistic, i.e.,\nthe agent acts ``safely'' to account for the non-stationary evolution of the\nenvironment. We argue that both these assumptions are invalid in practice --\nupdated environmental conditions are rarely known, and as the agent interacts\nwith the environment, it can learn about the updated dynamics and avoid being\npessimistic, at least in states whose dynamics it is confident about. We\npresent a heuristic search algorithm called \\textit{Adaptive Monte Carlo Tree\nSearch (ADA-MCTS)} that addresses these challenges. 
We show that the agent can\nlearn the updated dynamics of the environment over time and then act as it\nlearns, i.e., if the agent is in a region of the state space about which it has\nupdated knowledge, it can avoid being pessimistic. To quantify ``updated\nknowledge,'' we disintegrate the aleatoric and epistemic uncertainty in the\nagent's updated belief and show how the agent can use these estimates for\ndecision-making. We compare the proposed approach with the multiple\nstate-of-the-art approaches in decision-making across multiple well-established\nopen-source problems and empirically show that our approach is faster and\nhighly adaptive without sacrificing safety.\n","authors":["Baiting Luo","Yunuo Zhang","Abhishek Dubey","Ayan Mukhopadhyay"],"pdf_url":"https://arxiv.org/pdf/2401.01841v3.pdf","comment":"Accepted for publication at the International Conference on\n Autonomous Agents and MultiAgent Systems (AAMAS), 2024"},{"id":"http://arxiv.org/abs/2401.11671v1","updated":"2024-01-22T03:09:00Z","published":"2024-01-22T03:09:00Z","title":"RTA-Former: Reverse Transformer Attention for Polyp Segmentation","summary":" Polyp segmentation is a key aspect of colorectal cancer prevention, enabling\nearly detection and guiding subsequent treatments. Intelligent diagnostic\ntools, including deep learning solutions, are widely explored to streamline and\npotentially automate this process. However, even with many powerful network\narchitectures, there still comes the problem of producing accurate edge\nsegmentation. In this paper, we introduce a novel network, namely RTA-Former,\nthat employs a transformer model as the encoder backbone and innovatively\nadapts Reverse Attention (RA) with a transformer stage in the decoder for\nenhanced edge segmentation. The results of the experiments illustrate that\nRTA-Former achieves state-of-the-art (SOTA) performance in five polyp\nsegmentation datasets. The strong capability of RTA-Former holds promise in\nimproving the accuracy of Transformer-based polyp segmentation, potentially\nleading to better clinical decisions and patient outcomes. Our code will be\npublicly available on GitHub.\n","authors":["Zhikai Li","Murong Yi","Ali Uneri","Sihan Niu","Craig Jones"],"pdf_url":"https://arxiv.org/pdf/2401.11671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11669v1","updated":"2024-01-22T03:07:24Z","published":"2024-01-22T03:07:24Z","title":"An Improved Grey Wolf Optimization Algorithm for Heart Disease\n Prediction","summary":" This paper presents a unique solution to challenges in medical image\nprocessing by incorporating an adaptive curve grey wolf optimization (ACGWO)\nalgorithm into neural network backpropagation. Neural networks show potential\nin medical data but suffer from issues like overfitting and lack of\ninterpretability due to imbalanced and scarce data. Traditional Gray Wolf\nOptimization (GWO) also has its drawbacks, such as a lack of population\ndiversity and premature convergence. This paper addresses these problems by\nintroducing an adaptive algorithm, enhancing the standard GWO with a sigmoid\nfunction. This algorithm was extensively compared to four leading algorithms\nusing six well-known test functions, outperforming them effectively. Moreover,\nby utilizing the ACGWO, we increase the robustness and generalization of the\nneural network, resulting in more interpretable predictions. 
Applied to the\npublicly accessible Cleveland Heart Disease dataset, our technique surpasses\nten other methods, achieving 86.8% accuracy, indicating its potential for\nefficient heart disease prediction in the clinical setting.\n","authors":["Sihan Niu","Yifan Zhou","Zhikai Li","Shuyao Huang","Yujun Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11669v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11667v1","updated":"2024-01-22T02:59:27Z","published":"2024-01-22T02:59:27Z","title":"INCPrompt: Task-Aware incremental Prompting for Rehearsal-Free\n Class-incremental Learning","summary":" This paper introduces INCPrompt, an innovative continual learning solution\nthat effectively addresses catastrophic forgetting. INCPrompt's key innovation\nlies in its use of adaptive key-learner and task-aware prompts that capture\ntask-relevant information. This unique combination encapsulates general\nknowledge across tasks and encodes task-specific knowledge. Our comprehensive\nevaluation across multiple continual learning benchmarks demonstrates\nINCPrompt's superiority over existing algorithms, showing its effectiveness in\nmitigating catastrophic forgetting while maintaining high performance. These\nresults highlight the significant impact of task-aware incremental prompting on\ncontinual learning performance.\n","authors":["Zhiyuan Wang","Xiaoyang Qu","Jing Xiao","Bokui Chen","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11667v1.pdf","comment":"Accepted by the 49th IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2401.11666v1","updated":"2024-01-22T02:58:53Z","published":"2024-01-22T02:58:53Z","title":"P2DT: Mitigating Forgetting in task-incremental Learning with\n progressive prompt Decision Transformer","summary":" Catastrophic forgetting poses a substantial challenge for managing\nintelligent agents controlled by a large model, causing performance degradation\nwhen these agents face new tasks. In our work, we propose a novel solution -\nthe Progressive Prompt Decision Transformer (P2DT). This method enhances a\ntransformer-based model by dynamically appending decision tokens during new\ntask training, thus fostering task-specific policies. Our approach mitigates\nforgetting in continual and offline reinforcement learning scenarios. Moreover,\nP2DT leverages trajectories collected via traditional reinforcement learning\nfrom all tasks and generates new task-specific tokens during training, thereby\nretaining knowledge from previous studies. Preliminary results demonstrate that\nour model effectively alleviates catastrophic forgetting and scales well with\nincreasing task environments.\n","authors":["Zhiyuan Wang","Xiaoyang Qu","Jing Xiao","Bokui Chen","Jianzong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11666v1.pdf","comment":"Accepted by the 49th IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2024)"},{"id":"http://arxiv.org/abs/2212.00325v2","updated":"2024-01-22T02:56:53Z","published":"2022-12-01T07:19:17Z","title":"HashVFL: Defending Against Data Reconstruction Attacks in Vertical\n Federated Learning","summary":" Vertical Federated Learning (VFL) is a trending collaborative machine\nlearning model training solution. Existing industrial frameworks employ secure\nmulti-party computation techniques such as homomorphic encryption to ensure\ndata security and privacy. 
Despite these efforts, studies have revealed that\ndata leakage remains a risk in VFL due to the correlations between intermediate\nrepresentations and raw data. Neural networks can accurately capture these\ncorrelations, allowing an adversary to reconstruct the data. This emphasizes\nthe need for continued research into securing VFL systems.\n Our work shows that hashing is a promising solution to counter data\nreconstruction attacks. The one-way nature of hashing makes it difficult for an\nadversary to recover data from hash codes. However, implementing hashing in VFL\npresents new challenges, including vanishing gradients and information loss. To\naddress these issues, we propose HashVFL, which integrates hashing and\nsimultaneously achieves learnability, bit balance, and consistency.\n Experimental results indicate that HashVFL effectively maintains task\nperformance while defending against data reconstruction attacks. It also brings\nadditional benefits in reducing the degree of label leakage, mitigating\nadversarial attacks, and detecting abnormal inputs. We hope our work will\ninspire further research into the potential applications of HashVFL.\n","authors":["Pengyu Qiu","Xuhong Zhang","Shouling Ji","Chong Fu","Xing Yang","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2212.00325v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11665v1","updated":"2024-01-22T02:54:58Z","published":"2024-01-22T02:54:58Z","title":"Accelerating Approximate Thompson Sampling with Underdamped Langevin\n Monte Carlo","summary":" Approximate Thompson sampling with Langevin Monte Carlo broadens its reach\nfrom Gaussian posterior sampling to encompass more general smooth posteriors.\nHowever, it still encounters scalability issues in high-dimensional problems\nwhen demanding high accuracy. To address this, we propose an approximate\nThompson sampling strategy, utilizing underdamped Langevin Monte Carlo, where\nthe latter is the go-to workhorse for simulations of high-dimensional\nposteriors. Based on the standard smoothness and log-concavity conditions, we\nstudy the accelerated posterior concentration and sampling using a specific\npotential function. This design improves the sample complexity for realizing\nlogarithmic regrets from $\\mathcal{\\tilde O}(d)$ to $\\mathcal{\\tilde\nO}(\\sqrt{d})$. The scalability and robustness of our algorithm are also\nempirically validated through synthetic experiments in high-dimensional bandit\nproblems.\n","authors":["Haoyang Zheng","Wei Deng","Christian Moya","Guang Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11665v1.pdf","comment":"50 pages, 1 figure, to appear in AISTATS 2024"},{"id":"http://arxiv.org/abs/2401.11664v1","updated":"2024-01-22T02:50:38Z","published":"2024-01-22T02:50:38Z","title":"Zero-Space Cost Fault Tolerance for Transformer-based Language Models on\n ReRAM","summary":" Resistive Random Access Memory (ReRAM) has emerged as a promising platform\nfor deep neural networks (DNNs) due to its support for parallel in-situ\nmatrix-vector multiplication. However, hardware failures, such as\nstuck-at-fault defects, can result in significant prediction errors during\nmodel inference. While additional crossbars can be used to address these\nfailures, they come with storage overhead and are not efficient in terms of\nspace, energy, and cost. In this paper, we propose a fault protection mechanism\nthat incurs zero space cost. 
Our approach includes: 1) differentiable structure\npruning of rows and columns to reduce model redundancy, 2) weight duplication\nand voting for robust output, and 3) embedding duplicated most significant bits\n(MSBs) into the model weight. We evaluate our method on nine tasks of the GLUE\nbenchmark with the BERT model, and experimental results prove its\neffectiveness.\n","authors":["Bingbing Li","Geng Yuan","Zigeng Wang","Shaoyi Huang","Hongwu Peng","Payman Behnam","Wujie Wen","Hang Liu","Caiwen Ding"],"pdf_url":"https://arxiv.org/pdf/2401.11664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11835v2","updated":"2024-01-22T02:48:48Z","published":"2023-12-19T04:03:47Z","title":"Provably Convergent Federated Trilevel Learning","summary":" Trilevel learning, also called trilevel optimization (TLO), has been\nrecognized as a powerful modelling tool for hierarchical decision process and\nwidely applied in many machine learning applications, such as robust neural\narchitecture search, hyperparameter optimization, and domain adaptation.\nTackling TLO problems has presented a great challenge due to their nested\ndecision-making structure. In addition, existing works on TLO face the\nfollowing key challenges: 1) they all focus on the non-distributed setting,\nwhich may lead to privacy breach; 2) they do not offer any non-asymptotic\nconvergence analysis which characterizes how fast an algorithm converges. To\naddress the aforementioned challenges, this paper proposes an asynchronous\nfederated trilevel optimization method to solve TLO problems. The proposed\nmethod utilizes $\\mu$-cuts to construct a hyper-polyhedral approximation for\nthe TLO problem and solve it in an asynchronous manner. We demonstrate that the\nproposed $\\mu$-cuts are applicable to not only convex functions but also a wide\nrange of non-convex functions that meet the $\\mu$-weakly convex assumption.\nFurthermore, we theoretically analyze the non-asymptotic convergence rate for\nthe proposed method by showing its iteration complexity to obtain\n$\\epsilon$-stationary point is upper bounded by\n$\\mathcal{O}(\\frac{1}{\\epsilon^2})$. Extensive experiments on real-world\ndatasets have been conducted to elucidate the superiority of the proposed\nmethod, e.g., it has a faster convergence rate with a maximum acceleration of\napproximately 80$\\%$.\n","authors":["Yang Jiao","Kai Yang","Tiancheng Wu","Chengtao Jian","Jianwei Huang"],"pdf_url":"https://arxiv.org/pdf/2312.11835v2.pdf","comment":"Accepted at AAAI 2024"},{"id":"http://arxiv.org/abs/2305.16789v2","updated":"2024-01-22T02:47:50Z","published":"2023-05-26T09:59:48Z","title":"Modulate Your Spectrum in Self-Supervised Learning","summary":" Whitening loss offers a theoretical guarantee against feature collapse in\nself-supervised learning (SSL) with joint embedding architectures. Typically,\nit involves a hard whitening approach, transforming the embedding and applying\nloss to the whitened output. In this work, we introduce Spectral Transformation\n(ST), a framework to modulate the spectrum of embedding and to seek for\nfunctions beyond whitening that can avoid dimensional collapse. We show that\nwhitening is a special instance of ST by definition, and our empirical\ninvestigations unveil other ST instances capable of preventing collapse.\nAdditionally, we propose a novel ST instance named IterNorm with trace loss\n(INTL). 
Theoretical analysis confirms INTL's efficacy in preventing collapse\nand modulating the spectrum of embedding toward equal-eigenvalues during\noptimization. Our experiments on ImageNet classification and COCO object\ndetection demonstrate INTL's potential in learning superior representations.\nThe code is available at https://github.com/winci-ai/INTL.\n","authors":["Xi Weng","Yunhao Ni","Tengwei Song","Jie Luo","Rao Muhammad Anwer","Salman Khan","Fahad Shahbaz Khan","Lei Huang"],"pdf_url":"https://arxiv.org/pdf/2305.16789v2.pdf","comment":"Accepted at ICLR 2024. The code is available at\n https://github.com/winci-ai/intl"},{"id":"http://arxiv.org/abs/2401.11660v1","updated":"2024-01-22T02:33:38Z","published":"2024-01-22T02:33:38Z","title":"Differentiable Tree Search in Latent State Space","summary":" In decision-making problems with limited training data, policy functions\napproximated using deep neural networks often exhibit suboptimal performance.\nAn alternative approach involves learning a world model from the limited data\nand determining actions through online search. However, the performance is\nadversely affected by compounding errors arising from inaccuracies in the\nlearnt world model. While methods like TreeQN have attempted to address these\ninaccuracies by incorporating algorithmic structural biases into their\narchitectures, the biases they introduce are often weak and insufficient for\ncomplex decision-making tasks. In this work, we introduce Differentiable Tree\nSearch (DTS), a novel neural network architecture that significantly\nstrengthens the inductive bias by embedding the algorithmic structure of a\nbest-first online search algorithm. DTS employs a learnt world model to conduct\na fully differentiable online search in latent state space. The world model is\njointly optimised with the search algorithm, enabling the learning of a robust\nworld model and mitigating the effect of model inaccuracies. We address\npotential Q-function discontinuities arising from naive incorporation of\nbest-first search by adopting a stochastic tree expansion policy, formulating\nsearch tree expansion as a decision-making task, and introducing an effective\nvariance reduction technique for the gradient computation. We evaluate DTS in\nan offline-RL setting with a limited training data scenario on Procgen games\nand grid navigation task, and demonstrate that DTS outperforms popular\nmodel-free and model-based baselines.\n","authors":["Dixant Mittal","Wee Sun Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06333v2","updated":"2024-01-22T02:22:12Z","published":"2023-10-10T06:03:51Z","title":"Learning bounded-degree polytrees with known skeleton","summary":" We establish finite-sample guarantees for efficient proper learning of\nbounded-degree polytrees, a rich class of high-dimensional probability\ndistributions and a subclass of Bayesian networks, a widely-studied type of\ngraphical model. Recently, Bhattacharyya et al. (2021) obtained finite-sample\nguarantees for recovering tree-structured Bayesian networks, i.e., 1-polytrees.\nWe extend their results by providing an efficient algorithm which learns\n$d$-polytrees in polynomial time and sample complexity for any bounded $d$ when\nthe underlying undirected graph (skeleton) is known. 
We complement our\nalgorithm with an information-theoretic sample complexity lower bound, showing\nthat the dependence on the dimension and target accuracy parameters are nearly\ntight.\n","authors":["Davin Choo","Joy Qiping Yang","Arnab Bhattacharyya","Clément L. Canonne"],"pdf_url":"https://arxiv.org/pdf/2310.06333v2.pdf","comment":"Fixed some typos. Added some discussions. Accepted to ALT 2024"},{"id":"http://arxiv.org/abs/2401.11652v1","updated":"2024-01-22T02:17:36Z","published":"2024-01-22T02:17:36Z","title":"OnDev-LCT: On-Device Lightweight Convolutional Transformers towards\n federated learning","summary":" Federated learning (FL) has emerged as a promising approach to\ncollaboratively train machine learning models across multiple edge devices\nwhile preserving privacy. The success of FL hinges on the efficiency of\nparticipating models and their ability to handle the unique challenges of\ndistributed learning. While several variants of Vision Transformer (ViT) have\nshown great potential as alternatives to modern convolutional neural networks\n(CNNs) for centralized training, the unprecedented size and higher\ncomputational demands hinder their deployment on resource-constrained edge\ndevices, challenging their widespread application in FL. Since client devices\nin FL typically have limited computing resources and communication bandwidth,\nmodels intended for such devices must strike a balance between model size,\ncomputational efficiency, and the ability to adapt to the diverse and non-IID\ndata distributions encountered in FL. To address these challenges, we propose\nOnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks\nwith limited training data and resources. Our models incorporate image-specific\ninductive biases through the LCT tokenizer by leveraging efficient depthwise\nseparable convolutions in residual linear bottleneck blocks to extract local\nfeatures, while the multi-head self-attention (MHSA) mechanism in the LCT\nencoder implicitly facilitates capturing global representations of images.\nExtensive experiments on benchmark image datasets indicate that our models\noutperform existing lightweight vision models while having fewer parameters and\nlower computational demands, making them suitable for FL scenarios with data\nheterogeneity and communication bottlenecks.\n","authors":["Chu Myaet Thwal","Minh N. H. Nguyen","Ye Lin Tun","Seong Tae Kim","My T. Thai","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11652v1.pdf","comment":"Published in Neural Networks"},{"id":"http://arxiv.org/abs/2312.02277v2","updated":"2024-01-22T02:03:50Z","published":"2023-12-04T19:00:07Z","title":"ALEXR: An Optimal Single-Loop Algorithm for Convex Finite-Sum Coupled\n Compositional Stochastic Optimization","summary":" This paper revisits a class of convex Finite-Sum Coupled Compositional\nStochastic Optimization (cFCCO) problems with many applications, including\ngroup distributionally robust optimization (GDRO), learning with imbalanced\ndata, reinforcement learning, and learning to rank. To better solve these\nproblems, we introduce an efficient single-loop primal-dual block-coordinate\nproximal algorithm, dubbed ALEXR. This algorithm leverages block-coordinate\nstochastic mirror ascent updates for the dual variable and stochastic proximal\ngradient descent updates for the primal variable. 
We establish the convergence\nrates of ALEXR in both convex and strongly convex cases under smoothness and\nnon-smoothness conditions of involved functions, which not only improve the\nbest rates in previous works on smooth cFCCO problems but also expand the realm\nof cFCCO for solving more challenging non-smooth problems such as the dual form\nof GDRO. Finally, we present lower complexity bounds to demonstrate that the\nconvergence rates of ALEXR are optimal among first-order block-coordinate\nstochastic algorithms for the considered class of cFCCO problems.\n","authors":["Bokun Wang","Tianbao Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02277v2.pdf","comment":"Fixed several typos; Added some numerical experiments"},{"id":"http://arxiv.org/abs/2401.11648v1","updated":"2024-01-22T01:58:32Z","published":"2024-01-22T01:58:32Z","title":"Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal\n Contrastive EHR Modelling with Hierarchical Regularisation","summary":" Predicting next visit diagnosis using Electronic Health Records (EHR) is an\nessential task in healthcare, critical for devising proactive future plans for\nboth healthcare providers and patients. Nonetheless, many preceding studies\nhave not sufficiently addressed the heterogeneous and hierarchical\ncharacteristics inherent in EHR data, inevitably leading to sub-optimal\nperformance. To this end, we propose NECHO, a novel medical code-centric\nmultimodal contrastive EHR learning framework with hierarchical regularisation.\nFirst, we integrate multifaceted information encompassing medical codes,\ndemographics, and clinical notes using a tailored network design and a pair of\nbimodal contrastive losses, all of which pivot around a medical code\nrepresentation. We also regularise modality-specific encoders using a parental\nlevel information in medical ontology to learn hierarchical structure of EHR\ndata. A series of experiments on MIMIC-III data demonstrates effectiveness of\nour approach.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2401.11648v1.pdf","comment":"Accepted to EACL 2024 (The 18th Conference of the European Chapter of\n the Association for Computational Linguistics)"},{"id":"http://arxiv.org/abs/2401.11647v1","updated":"2024-01-22T01:57:31Z","published":"2024-01-22T01:57:31Z","title":"LW-FedSSL: Resource-efficient Layer-wise Federated Self-supervised\n Learning","summary":" Many recent studies integrate federated learning (FL) with self-supervised\nlearning (SSL) to take advantage of raw training data distributed across edge\ndevices. However, edge devices often struggle with high computation and\ncommunication costs imposed by SSL and FL algorithms. To tackle this hindrance,\nwe propose LW-FedSSL, a layer-wise federated self-supervised learning approach\nthat allows edge devices to incrementally train one layer of the model at a\ntime. LW-FedSSL comprises server-side calibration and representation alignment\nmechanisms to maintain comparable performance with end-to-end FedSSL while\nsignificantly lowering clients' resource requirements. The server-side\ncalibration mechanism takes advantage of the resource-rich server in an FL\nenvironment to assist in global model training. Meanwhile, the representation\nalignment mechanism encourages closeness between representations of FL local\nmodels and those of the global model. Our experiments show that LW-FedSSL has a\n$3.3 \\times$ lower memory requirement and a $3.2 \\times$ cheaper communication\ncost than its end-to-end counterpart. 
We also explore a progressive training\nstrategy called Prog-FedSSL that outperforms end-to-end training with a similar\nmemory requirement and a $1.8 \\times$ cheaper communication cost.\n","authors":["Ye Lin Tun","Chu Myaet Thwal","Le Quang Huy","Minh N. H. Nguyen","Choong Seon Hong"],"pdf_url":"https://arxiv.org/pdf/2401.11647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11646v1","updated":"2024-01-22T01:45:34Z","published":"2024-01-22T01:45:34Z","title":"Nonparametric Estimation via Variance-Reduced Sketching","summary":" Nonparametric models are of great interest in various scientific and\nengineering disciplines. Classical kernel methods, while numerically robust and\nstatistically sound in low-dimensional settings, become inadequate in\nhigher-dimensional settings due to the curse of dimensionality. In this paper,\nwe introduce a new framework called Variance-Reduced Sketching (VRS),\nspecifically designed to estimate density functions and nonparametric\nregression functions in higher dimensions with a reduced curse of\ndimensionality. Our framework conceptualizes multivariable functions as\ninfinite-size matrices, and facilitates a new sketching technique motivated by\nnumerical linear algebra literature to reduce the variance in estimation\nproblems. We demonstrate the robust numerical performance of VRS through a\nseries of simulated experiments and real-world data applications. Notably, VRS\nshows remarkable improvement over existing neural network estimators and\nclassical kernel methods in numerous density estimation and nonparametric\nregression models. Additionally, we offer theoretical justifications for VRS to\nsupport its ability to deliver nonparametric estimation with a reduced curse of\ndimensionality.\n","authors":["Yuehaw Khoo","Yifan Peng","Daren Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11646v1.pdf","comment":"64 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.16113v2","updated":"2024-01-22T01:38:12Z","published":"2023-12-20T08:16:53Z","title":"Task-Driven Causal Feature Distillation: Towards Trustworthy Risk\n Prediction","summary":" Since artificial intelligence has seen tremendous recent successes in many\nareas, it has sparked great interest in its potential for trustworthy and\ninterpretable risk prediction. However, most models lack causal reasoning and\nstruggle with class imbalance, leading to poor precision and recall. To address\nthis, we propose a Task-Driven Causal Feature Distillation model (TDCFD) to\ntransform original feature values into causal feature attributions for the\nspecific risk prediction task. The causal feature attribution helps describe\nhow much contribution the value of this feature can make to the risk prediction\nresult. After the causal feature distillation, a deep neural network is applied\nto produce trustworthy prediction results with causal interpretability and high\nprecision/recall. 
We evaluate the performance of our TDCFD method on several\nsynthetic and real datasets, and the results demonstrate its superiority over\nthe state-of-the-art methods regarding precision, recall, interpretability, and\ncausality.\n","authors":["Zhixuan Chu","Mengxuan Hu","Qing Cui","Longfei Li","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2312.16113v2.pdf","comment":"Proceedings of the 2024 AAAI Conference on Artificial Intelligence"},{"id":"http://arxiv.org/abs/2109.01636v4","updated":"2024-01-22T01:23:23Z","published":"2021-09-03T17:28:04Z","title":"Empirical Study of Named Entity Recognition Performance Using\n Distribution-aware Word Embedding","summary":" With the fast development of Deep Learning techniques, Named Entity\nRecognition (NER) is becoming more and more important in the information\nextraction task. The greatest difficulty that the NER task faces is to keep the\ndetectability even when types of NE and documents are unfamiliar. Realizing\nthat the specificity information may contain potential meanings of a word and\ngenerate semantic-related features for word embedding, we develop a\ndistribution-aware word embedding and implement three different methods to make\nuse of the distribution information in a NER framework. And the result shows\nthat the performance of NER will be improved if the word specificity is\nincorporated into existing NER methods.\n","authors":["Xin Chen","Qi Zhao","Xinyang Liu"],"pdf_url":"https://arxiv.org/pdf/2109.01636v4.pdf","comment":"Want to correct"},{"id":"http://arxiv.org/abs/2401.01084v2","updated":"2024-01-22T01:16:24Z","published":"2024-01-02T07:56:17Z","title":"Global Convergence of Natural Policy Gradient with Hessian-aided\n Momentum Variance Reduction","summary":" Natural policy gradient (NPG) and its variants are widely-used policy search\nmethods in reinforcement learning. Inspired by prior work, a new NPG variant\ncoined NPG-HM is developed in this paper, which utilizes the Hessian-aided\nmomentum technique for variance reduction, while the sub-problem is solved via\nthe stochastic gradient descent method. It is shown that NPG-HM can achieve the\nglobal last iterate $\\epsilon$-optimality with a sample complexity of\n$\\mathcal{O}(\\epsilon^{-2})$, which is the best known result for natural policy\ngradient type methods under the generic Fisher non-degenerate policy\nparameterizations. The convergence analysis is built upon a relaxed weak\ngradient dominance property tailored for NPG under the compatible function\napproximation framework, as well as a neat way to decompose the error when\nhandling the sub-problem. Moreover, numerical experiments on Mujoco-based\nenvironments demonstrate the superior performance of NPG-HM over other\nstate-of-the-art policy gradient methods.\n","authors":["Jie Feng","Ke Wei","Jinchi Chen"],"pdf_url":"https://arxiv.org/pdf/2401.01084v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17778v3","updated":"2024-01-22T00:54:30Z","published":"2023-06-30T16:31:14Z","title":"Look, Remember and Reason: Grounded reasoning in videos with language\n models","summary":" Multi-modal language models (LM) have recently shown promising performance in\nhigh-level reasoning tasks on videos. However, existing methods still fall\nshort in tasks like causal or compositional spatiotemporal reasoning over\nactions, in which model predictions need to be grounded in fine-grained\nlow-level details, such as object motions and object interactions. 
In this\nwork, we propose training an LM end-to-end on low-level surrogate tasks,\nincluding object detection, re-identification, and tracking, to endow the model\nwith the required low-level visual capabilities. We show that a two-stream\nvideo encoder with spatiotemporal attention is effective at capturing the\nrequired static and motion-based cues in the video. By leveraging the LM's\nability to perform the low-level surrogate tasks, we can cast reasoning in\nvideos as the three-step process of Look, Remember, Reason wherein visual\ninformation is extracted using low-level visual skills step-by-step and then\nintegrated to arrive at a final answer. We demonstrate the effectiveness of our\nframework on diverse visual reasoning tasks from the ACRE, CATER,\nSomething-Else and STAR datasets. Our approach is trainable end-to-end and\nsurpasses state-of-the-art task-specific methods across these tasks by a large\nmargin.\n","authors":["Apratim Bhattacharyya","Sunny Panchal","Mingu Lee","Reza Pourreza","Pulkit Madan","Roland Memisevic"],"pdf_url":"https://arxiv.org/pdf/2306.17778v3.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2306.09136v3","updated":"2024-01-22T00:51:05Z","published":"2023-06-15T13:49:30Z","title":"Finite-Time Logarithmic Bayes Regret Upper Bounds","summary":" We derive the first finite-time logarithmic Bayes regret upper bounds for\nBayesian bandits. In a multi-armed bandit, we obtain $O(c_\\Delta \\log n)$ and\n$O(c_h \\log^2 n)$ upper bounds for an upper confidence bound algorithm, where\n$c_h$ and $c_\\Delta$ are constants depending on the prior distribution and the\ngaps of bandit instances sampled from it, respectively. The latter bound\nasymptotically matches the lower bound of Lai (1987). Our proofs are a major\ntechnical departure from prior works, while being simple and general. To show\nthe generality of our techniques, we apply them to linear bandits. Our results\nprovide insights on the value of prior in the Bayesian setting, both in the\nobjective and as a side information given to the learner. They significantly\nimprove upon existing $\\tilde{O}(\\sqrt{n})$ bounds, which have become standard\nin the literature despite the logarithmic lower bound of Lai (1987).\n","authors":["Alexia Atsidakou","Branislav Kveton","Sumeet Katariya","Constantine Caramanis","Sujay Sanghavi"],"pdf_url":"https://arxiv.org/pdf/2306.09136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.13118v2","updated":"2024-01-22T00:50:55Z","published":"2023-12-20T15:37:50Z","title":"LRS: Enhancing Adversarial Transferability through Lipschitz Regularized\n Surrogate","summary":" The transferability of adversarial examples is of central importance to\ntransfer-based black-box adversarial attacks. Previous works for generating\ntransferable adversarial examples focus on attacking \\emph{given} pretrained\nsurrogate models while the connections between surrogate models and adversarial\ntrasferability have been overlooked. In this paper, we propose {\\em Lipschitz\nRegularized Surrogate} (LRS) for transfer-based black-box attacks, a novel\napproach that transforms surrogate models towards favorable adversarial\ntransferability. Using such transformed surrogate models, any existing\ntransfer-based black-box attack can run without any change, yet achieving much\nbetter performance. 
Specifically, we impose Lipschitz regularization on the\nloss landscape of surrogate models to enable a smoother and more controlled\noptimization process for generating more transferable adversarial examples. In\naddition, this paper also sheds light on the connection between the inner\nproperties of surrogate models and adversarial transferability, where three\nfactors are identified: smaller local Lipschitz constant, smoother loss\nlandscape, and stronger adversarial robustness. We evaluate our proposed LRS\napproach by attacking state-of-the-art standard deep neural networks and\ndefense models. The results demonstrate significant improvement on the attack\nsuccess rates and transferability. Our code is available at\nhttps://github.com/TrustAIoT/LRS.\n","authors":["Tao Wu","Tie Luo","Donald C. Wunsch"],"pdf_url":"https://arxiv.org/pdf/2312.13118v2.pdf","comment":"AAAI 2024 main track. Code available on Github (see abstract).\n Appendix is included in this updated version"},{"id":"http://arxiv.org/abs/2206.14358v2","updated":"2024-01-22T00:38:08Z","published":"2022-06-29T01:57:44Z","title":"Using Twitter Data to Understand Public Perceptions of Approved versus\n Off-label Use for COVID-19-related Medications","summary":" Understanding public discourse on emergency use of unproven therapeutics is\ncrucial for monitoring safe use and combating misinformation. We developed a\nnatural language processing-based pipeline to comprehend public perceptions of\nand stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter\nover time. This retrospective study included 609,189 US-based tweets from\nJanuary 29, 2020, to November 30, 2021, about four drugs that garnered\nsignificant public attention during the COVID-19 pandemic: (1)\nHydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2)\nMolnupiravir and Remdesivir, FDA-approved treatments for eligible patients.\nTime-trend analysis was employed to understand popularity trends and related\nevents. Content and demographic analyses were conducted to explore potential\nrationales behind people's stances on each drug. Time-trend analysis indicated\nthat Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir\nand Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and\nIvermectin discussions were highly politicized, related to conspiracy theories,\nhearsay, and celebrity influences. The distribution of stances between the two\nmajor US political parties was significantly different (P < .001); Republicans\nwere more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than\nDemocrats. People with healthcare backgrounds tended to oppose\nHydroxychloroquine (7%) more than the general population, while the general\npopulation was more likely to support Ivermectin (14%). Our study found that\nsocial media users have varying perceptions and stances on off-label versus\nFDA-authorized drug use at different stages of COVID-19. This indicates that\nhealth systems, regulatory agencies, and policymakers should design tailored\nstrategies to monitor and reduce misinformation to promote safe drug use.\n","authors":["Yining Hua","Hang Jiang","Shixu Lin","Jie Yang","Joseph M. Plasek","David W. 
Bates","Li Zhou"],"pdf_url":"https://arxiv.org/pdf/2206.14358v2.pdf","comment":"Full paper published in JAMIA"},{"id":"http://arxiv.org/abs/2310.17168v2","updated":"2024-01-22T00:12:20Z","published":"2023-10-26T05:49:13Z","title":"Learning an Inventory Control Policy with General Inventory Arrival\n Dynamics","summary":" In this paper we address the problem of learning and backtesting inventory\ncontrol policies in the presence of general arrival dynamics -- which we term\nas a quantity-over-time arrivals model (QOT). We also allow for order\nquantities to be modified as a post-processing step to meet vendor constraints\nsuch as order minimum and batch size constraints -- a common practice in real\nsupply chains. To the best of our knowledge this is the first work to handle\neither arbitrary arrival dynamics or an arbitrary downstream post-processing of\norder quantities. Building upon recent work (Madeka et al., 2022) we similarly\nformulate the periodic review inventory control problem as an exogenous\ndecision process, where most of the state is outside the control of the agent.\nMadeka et al., 2022 show how to construct a simulator that replays historic\ndata to solve this class of problem. In our case, we incorporate a deep\ngenerative model for the arrivals process as part of the history replay. By\nformulating the problem as an exogenous decision process, we can apply results\nfrom Madeka et al., 2022 to obtain a reduction to supervised learning. Via\nsimulation studies we show that this approach yields statistically significant\nimprovements in profitability over production baselines. Using data from a\nreal-world A/B test, we show that Gen-QOT generalizes well to off-policy data\nand that the resulting buying policy outperforms traditional inventory\nmanagement systems in real world settings.\n","authors":["Sohrab Andaz","Carson Eisenach","Dhruv Madeka","Kari Torkkola","Randy Jia","Dean Foster","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2310.17168v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.00647v2","updated":"2024-01-22T18:53:48Z","published":"2023-10-01T12:02:59Z","title":"Beyond Task Performance: Evaluating and Reducing the Flaws of Large\n Multimodal Models with In-Context Learning","summary":" Following the success of Large Language Models (LLMs), Large Multimodal\nModels (LMMs), such as the Flamingo model and its subsequent competitors, have\nstarted to emerge as natural steps towards generalist agents. However,\ninteracting with recent LMMs reveals major limitations that are hardly captured\nby the current evaluation benchmarks. Indeed, task performances (e.g., VQA\naccuracy) alone do not provide enough clues to understand their real\ncapabilities, limitations, and to which extent such models are aligned to human\nexpectations. To refine our understanding of those flaws, we deviate from the\ncurrent evaluation paradigm, and (1) evaluate 10 recent open-source LMMs from\n3B up to 80B parameter scale, on 5 different axes; hallucinations, abstention,\ncompositionality, explainability and instruction following. Our evaluation on\nthese axes reveals major flaws in LMMs. While the current go-to solution to\nalign these models is based on training, such as instruction tuning or RLHF, we\nrather (2) explore the training-free in-context learning (ICL) as a solution,\nand study how it affects these limitations. 
Based on our ICL study, (3) we push\nICL further and propose new multimodal ICL variants such as; Multitask-ICL,\nChain-of-Hindsight-ICL, and Self-Correcting-ICL. Our findings are as follows.\n(1) Despite their success, LMMs have flaws that remain unsolved with scaling\nalone. (2) The effect of ICL on LMMs flaws is nuanced; despite its\neffectiveness for improved explainability, answer abstention, ICL only slightly\nimproves instruction following, does not improve compositional abilities, and\nactually even amplifies hallucinations. (3) The proposed ICL variants are\npromising as post-hoc approaches to efficiently tackle some of those flaws. The\ncode is available here: https://github.com/mshukor/EvALign-ICL.\n","authors":["Mustafa Shukor","Alexandre Rame","Corentin Dancette","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2310.00647v2.pdf","comment":"ICLR 2024. Project Page: https://evalign-icl.github.io/"},{"id":"http://arxiv.org/abs/2401.11943v1","updated":"2024-01-22T13:33:53Z","published":"2024-01-22T13:33:53Z","title":"Benchmarking Large Multimodal Models against Common Corruptions","summary":" This technical report aims to fill a deficiency in the assessment of large\nmultimodal models (LMMs) by specifically examining the self-consistency of\ntheir outputs when subjected to common corruptions. We investigate the\ncross-modal interactions between text, image, and speech, encompassing four\nessential generation tasks: text-to-image, image-to-text, text-to-speech, and\nspeech-to-text. We create a comprehensive benchmark, named MMCBench, that\ncovers more than 100 popular LMMs (totally over 150 model checkpoints). A\nthorough evaluation under common corruptions is critical for practical\ndeployment and facilitates a better understanding of the reliability of\ncutting-edge LMMs. The benchmarking code is available at\nhttps://github.com/sail-sg/MMCBench\n","authors":["Jiawei Zhang","Tianyu Pang","Chao Du","Yi Ren","Bo Li","Min Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11943v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2401.11818v1","updated":"2024-01-22T10:26:52Z","published":"2024-01-22T10:26:52Z","title":"MInD: Improving Multimodal Sentiment Analysis via Multimodal Information\n Disentanglement","summary":" Learning effective joint representations has been a central task in\nmultimodal sentiment analysis. Previous methods focus on leveraging the\ncorrelations between different modalities and enhancing performance through\nsophisticated fusion techniques. However, challenges still exist due to the\ninherent heterogeneity of distinct modalities, which may lead to distributional\ngap, impeding the full exploitation of inter-modal information and resulting in\nredundancy and impurity in the information extracted from features. To address\nthis problem, we introduce the Multimodal Information Disentanglement (MInD)\napproach. MInD decomposes the multimodal inputs into a modality-invariant\ncomponent, a modality-specific component, and a remnant noise component for\neach modality through a shared encoder and multiple private encoders. The\nshared encoder aims to explore the shared information and commonality across\nmodalities, while the private encoders are deployed to capture the distinctive\ninformation and characteristic features. These representations thus furnish a\ncomprehensive perspective of the multimodal data, facilitating the fusion\nprocess instrumental for subsequent prediction tasks. 
Furthermore, MInD\nimproves the learned representations by explicitly modeling the task-irrelevant\nnoise in an adversarial manner. Experimental evaluations conducted on benchmark\ndatasets, including CMU-MOSI, CMU-MOSEI, and UR-Funny, demonstrate MInD's\nsuperior performance over existing state-of-the-art methods in both multimodal\nemotion recognition and multimodal humor detection tasks.\n","authors":["Weichen Dai","Xingyu Li","Pengbo Hu","Zeyu Wang","Ji Qi","Jianlin Peng","Yi Zhou"],"pdf_url":"https://arxiv.org/pdf/2401.11818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11764v1","updated":"2024-01-22T08:59:09Z","published":"2024-01-22T08:59:09Z","title":"Identity-Driven Multimedia Forgery Detection via Reference Assistance","summary":" Recent advancements in technologies, such as the 'deepfake' technique, have\npaved the way for the generation of various media forgeries. In response to the\npotential hazards of these media forgeries, many researchers engage in\nexploring detection methods, increasing the demand for high-quality media\nforgery datasets. Despite this, existing datasets have certain limitations.\nFirstly, most of datasets focus on the manipulation of visual modality and\nusually lack diversity, as only a few forgery approaches are considered.\nSecondly, the quality of media is often inadequate in clarity and naturalness.\nMeanwhile, the size of the dataset is also limited. Thirdly, while many\nreal-world forgeries are driven by identity, the identity information of the\nsubject in media is frequently neglected. For detection, identity information\ncould be an essential clue to boost accuracy. Moreover, official media\nconcerning certain identities on the Internet can serve as prior knowledge,\naiding both the audience and forgery detectors in determining the true\nidentity. Therefore, we propose an identity-driven multimedia forgery dataset,\nIDForge, which contains 249,138 video shots. All video shots are sourced from\n324 wild videos collected of 54 celebrities from the Internet. The fake video\nshots involve 9 types of manipulation across visual, audio and textual\nmodalities. Additionally, IDForge provides extra 214,438 real video shots as a\nreference set for the 54 celebrities. Correspondingly, we design an effective\nmultimedia detection network, Reference-assisted Multimodal Forgery Detection\nNetwork (R-MFDN). Through extensive experiments on the proposed dataset, we\ndemonstrate the effectiveness of R-MFDN on the multimedia detection task.\n","authors":["Junhao Xu","Jingjing Chen","Xue Song","Feng Han","Haijun Shan","Yugang Jiang"],"pdf_url":"https://arxiv.org/pdf/2401.11764v1.pdf","comment":null}]},"2024-01-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. 
We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11626v1","updated":"2024-01-21T23:37:33Z","published":"2024-01-21T23:37:33Z","title":"Freely Long-Thinking Transformer (FraiLT)","summary":" Freely Long-Thinking Transformer (FraiLT) is an improved transformer model\ndesigned to enhance processing capabilities without scaling up size. It\nutilizes a recursive approach, iterating over a subset of layers multiple\ntimes, and introduces iteration encodings to maintain awareness across these\ncycles. Iteration encoding allows FraiLT to achieve the interpretive depth of\nlarger models in a compact form. When evaluated on a synthetic story dataset,\nFraiLT outperformed larger models, showcasing its ability to deliver\nhigh-quality performance while reducing memory demands. This model represents a\nstep forward towards more efficient and accessible language models.\n","authors":["Akbay Tabak"],"pdf_url":"https://arxiv.org/pdf/2401.11626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v1","updated":"2024-01-21T23:34:42Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["an Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11601v1","updated":"2024-01-21T21:21:51Z","published":"2024-01-21T21:21:51Z","title":"Robust Evaluation Measures for Evaluating Social Biases in Masked\n Language Models","summary":" Many evaluation measures are used to evaluate social biases in masked\nlanguage models (MLMs). 
However, we find that these previously proposed\nevaluation measures are lacking robustness in scenarios with limited datasets.\nThis is because these measures are obtained by comparing the\npseudo-log-likelihood (PLL) scores of the stereotypical and anti-stereotypical\nsamples using an indicator function. The disadvantage is the limited mining of\nthe PLL score sets without capturing its distributional information. In this\npaper, we represent a PLL score set as a Gaussian distribution and use Kullback\nLeibler (KL) divergence and Jensen Shannon (JS) divergence to construct\nevaluation measures for the distributions of stereotypical and\nanti-stereotypical PLL scores. Experimental results on the publicly available\ndatasets StereoSet (SS) and CrowS-Pairs (CP) show that our proposed measures\nare significantly more robust and interpretable than those proposed previously.\n","authors":["Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2401.11601v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.01361v2","updated":"2024-01-21T21:01:12Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. 
We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v2.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo and\n datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for more details"},{"id":"http://arxiv.org/abs/2309.12244v2","updated":"2024-01-21T16:30:35Z","published":"2023-09-21T16:43:17Z","title":"ChaCha: Leveraging Large Language Models to Prompt Children to Share\n Their Emotions about Personal Events","summary":" Children typically learn to identify and express emotions through sharing\ntheir stories and feelings with others, particularly their family. However, it\nis challenging for parents or siblings to have emotional communication with\nchildren since children are still developing their communication skills. We\npresent ChaCha, a chatbot that encourages and guides children to share personal\nevents and associated emotions. ChaCha combines a state machine and large\nlanguage models (LLMs) to keep the dialogue on track while carrying on\nfree-form conversations. Through an exploratory study with 20 children (aged\n8-12), we examine how ChaCha prompts children to share personal events and\nguides them to describe associated emotions. Participants perceived ChaCha as a\nclose friend and shared their stories on various topics, such as family trips\nand personal achievements. Based on the findings, we discuss opportunities for\nleveraging LLMs to design child-friendly chatbots to support children in\nsharing emotions.\n","authors":["Woosuk Seo","Chanmo Yang","Young-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2309.12244v2.pdf","comment":"16 pages, 5 figures, 2 tables; Accepted at ACM CHI 2024"},{"id":"http://arxiv.org/abs/2401.09074v2","updated":"2024-01-21T15:15:30Z","published":"2024-01-17T09:23:59Z","title":"Code Simulation Challenges for Large Language Models","summary":" We investigate the extent to which Large Language Models (LLMs) can simulate\nthe execution of computer code and algorithms. We begin by looking at straight\nline programs, and show that current LLMs demonstrate poor performance even\nwith such simple programs -- performance rapidly degrades with the length of\ncode. We then investigate the ability of LLMs to simulate programs that contain\ncritical paths and redundant instructions. We also go beyond straight line\nprogram simulation with sorting algorithms and nested loops, and we show the\ncomputational complexity of a routine directly affects the ability of an LLM to\nsimulate its execution. 
We observe that LLMs execute instructions sequentially\nand with a low error margin only for short programs or standard procedures.\nLLMs' code simulation is in tension with their pattern recognition and\nmemorisation capabilities: on tasks where memorisation is detrimental, we\npropose a novel prompting method to simulate code execution line by line.\nEmpirically, our new Chain of Simulation (CoSm) method improves on the standard\nChain of Thought prompting approach by avoiding the pitfalls of memorisation.\n","authors":["Emanuele La Malfa","Christoph Weinhuber","Orazio Torre","Fangru Lin","Anthony Cohn","Nigel Shadbolt","Michael Wooldridge"],"pdf_url":"https://arxiv.org/pdf/2401.09074v2.pdf","comment":"main paper (10 pages) + Appendix (11 pages)"},{"id":"http://arxiv.org/abs/2302.12584v2","updated":"2024-01-21T14:51:26Z","published":"2023-02-24T11:44:24Z","title":"VivesDebate-Speech: A Corpus of Spoken Argumentation to Leverage Audio\n Features for Argument Mining","summary":" In this paper, we describe VivesDebate-Speech, a corpus of spoken\nargumentation created to leverage audio features for argument mining tasks. The\ncreation of this corpus represents an important contribution to the\nintersection of speech processing and argument mining communities, and one of\nthe most complete publicly available resources in this topic. Moreover, we have\nperformed a set of first-of-their-kind experiments which show an improvement\nwhen integrating audio features into the argument mining pipeline. The provided\nresults can be used as a baseline for future research.\n","authors":["Ramon Ruiz-Dolz","Javier Iranzo-Sánchez"],"pdf_url":"https://arxiv.org/pdf/2302.12584v2.pdf","comment":"5 pages; EMNLP 2023 Accepted Version"},{"id":"http://arxiv.org/abs/2203.14647v2","updated":"2024-01-21T14:39:30Z","published":"2022-03-28T11:09:07Z","title":"Automatic Debate Evaluation with Argumentation Semantics and Natural\n Language Argument Graph Networks","summary":" The lack of annotated data on professional argumentation and complete\nargumentative debates has led to the oversimplification and the inability of\napproaching more complex natural language processing tasks. Such is the case of\nthe automatic debate evaluation. In this paper, we propose an original hybrid\nmethod to automatically evaluate argumentative debates. For that purpose, we\ncombine concepts from argumentation theory such as argumentation frameworks and\nsemantics, with Transformer-based architectures and neural graph networks.\nFurthermore, we obtain promising results that lay the basis on an unexplored\nnew instance of the automatic analysis of natural language arguments.\n","authors":["Ramon Ruiz-Dolz","Stella Heras","Ana García-Fornes"],"pdf_url":"https://arxiv.org/pdf/2203.14647v2.pdf","comment":"EMNLP 2023 Accepted Version"},{"id":"http://arxiv.org/abs/2401.11505v1","updated":"2024-01-21T14:30:20Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. 
To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. Code and\nmodels are available at https://github.com/kakaobrain/CheXGPT.\n","authors":["Jawook Gu","Han-Cheol Cho","Jiho Kim","Kihyun You","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.11504v1","updated":"2024-01-21T14:28:41Z","published":"2024-01-21T14:28:41Z","title":"With Greater Text Comes Greater Necessity: Inference-Time Training Helps\n Long Text Generation","summary":" Long text generation, such as novel writing or discourse-level translation\nwith extremely long contexts, presents significant challenges to current\nlanguage models. Existing methods mainly focus on extending the model's context\nwindow through strategies like length extrapolation. However, these approaches\ndemand substantial hardware resources during the training and/or inference\nphases. Our proposed method, Temp-Lora, introduces an alternative concept.\nInstead of relying on the KV cache to store all context information, Temp-Lora\nembeds this information directly into the model's parameters. In the process of\nlong text generation, we use a temporary Lora module, progressively trained\nwith text generated previously. This approach not only efficiently preserves\ncontextual knowledge but also prevents any permanent alteration to the model's\nparameters given that the module is discarded post-generation. Extensive\nexperiments on the PG19 language modeling benchmark and the GuoFeng\ndiscourse-level translation benchmark validate the effectiveness of Temp-Lora.\nOur results show that: 1) Temp-Lora substantially enhances generation quality\nfor long texts, as indicated by a 13.2% decrease in perplexity on a subset of\nPG19, and a 29.6% decrease in perplexity along with a 53.2% increase in BLEU\nscore on GuoFeng, 2) Temp-Lora is compatible with and enhances most existing\nlong text generation methods, and 3) Temp-Lora can greatly reduce computational\ncosts by shortening the context window. While ensuring a slight improvement in\ngeneration quality (a decrease of 3.8% in PPL), it enables a reduction of 70.5%\nin the FLOPs required for inference and a 51.5% decrease in latency.\n","authors":["Y. Wang","D. Ma","D. Cai"],"pdf_url":"https://arxiv.org/pdf/2401.11504v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v4","updated":"2024-01-21T13:38:20Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). 
PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving substantial memory and time costs compared to vanilla\nPT and its variants, without changing trainable parameter sizes. Through\nextensive experiments on 23 natural language processing (NLP) and\nvision-language (VL) tasks, we demonstrate that DePT outperforms\nstate-of-the-art PEFT approaches, including the full fine-tuning baseline, in\nsome scenarios. Additionally, we empirically show that DEPT grows more\nefficient as the model size increases. Our further study reveals that DePT\nintegrates seamlessly with parameter-efficient transfer learning in the\nfew-shot learning setting and highlights its adaptability to various model\narchitectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v4.pdf","comment":"ICLR 2024. Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2401.11487v1","updated":"2024-01-21T13:18:20Z","published":"2024-01-21T13:18:20Z","title":"Towards Better Inclusivity: A Diverse Tweet Corpus of English Varieties","summary":" The prevalence of social media presents a growing opportunity to collect and\nanalyse examples of English varieties. Whilst usage of these varieties was -\nand, in many cases, still is - used only in spoken contexts or hard-to-access\nprivate messages, social media sites like Twitter provide a platform for users\nto communicate informally in a scrapeable format. Notably, Indian English\n(Hinglish), Singaporean English (Singlish), and African-American English (AAE)\ncan be commonly found online. These varieties pose a challenge to existing\nnatural language processing (NLP) tools as they often differ orthographically\nand syntactically from standard English for which the majority of these tools\nare built. NLP models trained on standard English texts produced biased\noutcomes for users of underrepresented varieties. Some research has aimed to\novercome the inherent biases caused by unrepresentative data through techniques\nlike data augmentation or adjusting training models.\n We aim to address the issue of bias at its root - the data itself. We curate\na dataset of tweets from countries with high proportions of underserved English\nvariety speakers, and propose an annotation framework of six categorical\nclassifications along a pseudo-spectrum that measures the degree of standard\nEnglish and that thereby indirectly aims to surface the manifestations of\nEnglish varieties in these tweets. Following best annotation practices, our\ngrowing corpus features 170,800 tweets taken from 7 countries, labeled by\nannotators who are from those countries and can communicate in\nregionally-dominant varieties of English. 
Our corpus highlights the accuracy\ndiscrepancies in pre-trained language identifiers between western English and\nnon-western (i.e., less standard) English varieties. We hope to contribute to\nthe growing literature identifying and reducing the implicit demographic\ndiscrepancies in NLP.\n","authors":["Nhi Pham","Lachlan Pham","Adam L. Meyers"],"pdf_url":"https://arxiv.org/pdf/2401.11487v1.pdf","comment":"10 pages (including limitations, references and appendices), 2\n figures"},{"id":"http://arxiv.org/abs/2310.15823v3","updated":"2024-01-21T12:40:48Z","published":"2023-10-24T13:23:57Z","title":"Rosetta Stone at KSAA-RD Shared Task: A Hop From Language Modeling To\n Word--Definition Alignment","summary":" A Reverse Dictionary is a tool enabling users to discover a word based on its\nprovided definition, meaning, or description. Such a technique proves valuable\nin various scenarios, aiding language learners who possess a description of a\nword without its identity, and benefiting writers seeking precise terminology.\nThese scenarios often encapsulate what is referred to as the\n\"Tip-of-the-Tongue\" (TOT) phenomena. In this work, we present our winning\nsolution for the Arabic Reverse Dictionary shared task. This task focuses on\nderiving a vector representation of an Arabic word from its accompanying\ndescription. The shared task encompasses two distinct subtasks: the first\ninvolves an Arabic definition as input, while the second employs an English\ndefinition. For the first subtask, our approach relies on an ensemble of\nfinetuned Arabic BERT-based models, predicting the word embedding for a given\ndefinition. The final representation is obtained through averaging the output\nembeddings from each model within the ensemble. In contrast, the most effective\nsolution for the second subtask involves translating the English test\ndefinitions into Arabic and applying them to the finetuned models originally\ntrained for the first subtask. This straightforward method achieves the highest\nscore across both subtasks.\n","authors":["Ahmed ElBakry","Mohamed Gabr","Muhammad ElNokrashy","Badr AlKhamissi"],"pdf_url":"https://arxiv.org/pdf/2310.15823v3.pdf","comment":"Proceedings of ArabicNLP 2023"},{"id":"http://arxiv.org/abs/2401.11467v1","updated":"2024-01-21T11:42:18Z","published":"2024-01-21T11:42:18Z","title":"Over-Reasoning and Redundant Calculation of Large Language Models","summary":" Large language models (LLMs) can solve problems step-by-step. While this\nchain-of-thought (CoT) reasoning boosts LLMs' performance, it is unclear if\nLLMs \\textit{know} when to use CoT and whether those CoT are always necessary\nto answer the question. This paper shows that LLMs tend to generate redundant\ncalculations and reasoning on a manually constructed math QA dataset,\nGSM8K-Zero. GSM8K-Zero is constructed such that the questions can be answered\nwithout any calculations, but LLMs, including Llama-2 models and Claude-2, tend\nto generate lengthy and unnecessary calculations to answer the questions. We\nalso conduct experiments to explain why LLMs generate redundant calculations\nand reasonings. GSM8K-Zero is publicly available at\nhttps://github.com/d223302/Over-Reasoning-of-LLMs and\nhttps://huggingface.co/datasets/dcml0714/GSM8K-Zero.\n","authors":["Cheng-Han Chiang","Hung-yi Lee"],"pdf_url":"https://arxiv.org/pdf/2401.11467v1.pdf","comment":"EACL 2024 main conference paper. 
Camera-ready version"},{"id":"http://arxiv.org/abs/2401.11463v1","updated":"2024-01-21T11:04:30Z","published":"2024-01-21T11:04:30Z","title":"Estimating the Usefulness of Clarifying Questions and Answers for\n Conversational Search","summary":" While the body of research directed towards constructing and generating\nclarifying questions in mixed-initiative conversational search systems is vast,\nresearch aimed at processing and comprehending users' answers to such questions\nis scarce. To this end, we present a simple yet effective method for processing\nanswers to clarifying questions, moving away from previous work that simply\nappends answers to the original query and thus potentially degrades retrieval\nperformance. Specifically, we propose a classifier for assessing usefulness of\nthe prompted clarifying question and an answer given by the user. Useful\nquestions or answers are further appended to the conversation history and\npassed to a transformer-based query rewriting module. Results demonstrate\nsignificant improvements over strong non-mixed-initiative baselines.\nFurthermore, the proposed approach mitigates the performance drops when non\nuseful questions and answers are utilized.\n","authors":["Ivan Sekulić","Weronika Łajewska","Krisztian Balog","Fabio Crestani"],"pdf_url":"https://arxiv.org/pdf/2401.11463v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11458v1","updated":"2024-01-21T10:46:23Z","published":"2024-01-21T10:46:23Z","title":"Linear Alignment: A Closed-form Solution for Aligning Human Preferences\n without Tuning and Feedback","summary":" The success of AI assistants based on Language Models (LLMs) hinges on\nReinforcement Learning from Human Feedback (RLHF) to comprehend and align with\nuser intentions. However, traditional alignment algorithms, such as PPO, are\nhampered by complex annotation and training requirements. This reliance limits\nthe applicability of RLHF and hinders the development of professional\nassistants tailored to diverse human preferences. In this work, we introduce\n\\textit{Linear Alignment}, a novel algorithm that aligns language models with\nhuman preferences in one single inference step, eliminating the reliance on\ndata annotation and model training. Linear alignment incorporates a new\nparameterization for policy optimization under divergence constraints, which\nenables the extraction of optimal policy in a closed-form manner and\nfacilitates the direct estimation of the aligned response. Extensive\nexperiments on both general and personalized preference datasets demonstrate\nthat linear alignment significantly enhances the performance and efficiency of\nLLM alignment across diverse scenarios. 
Our code and dataset will be published\non \\url{https://github.com/Wizardcoast/Linear_Alignment.git}.\n","authors":["Songyang Gao","Qiming Ge","Wei Shen","Shihan Dou","Junjie Ye","Xiao Wang","Rui Zheng","Yicheng Zou","Zhi Chen","Hang Yan","Qi Zhang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2401.11458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11452v1","updated":"2024-01-21T10:15:36Z","published":"2024-01-21T10:15:36Z","title":"Towards Reliable and Factual Response Generation: Detecting Unanswerable\n Questions in Information-Seeking Conversations","summary":" Generative AI models face the challenge of hallucinations that can undermine\nusers' trust in such systems. We approach the problem of conversational\ninformation seeking as a two-step process, where relevant passages in a corpus\nare identified first and then summarized into a final system response. This way\nwe can automatically assess if the answer to the user's question is present in\nthe corpus. Specifically, our proposed method employs a sentence-level\nclassifier to detect if the answer is present, then aggregates these\npredictions on the passage level, and eventually across the top-ranked passages\nto arrive at a final answerability estimate. For training and evaluation, we\ndevelop a dataset based on the TREC CAsT benchmark that includes answerability\nlabels on the sentence, passage, and ranking levels. We demonstrate that our\nproposed method represents a strong baseline and outperforms a state-of-the-art\nLLM on the answerability prediction task.\n","authors":["Weronika Łajewska","Krisztian Balog"],"pdf_url":"https://arxiv.org/pdf/2401.11452v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2312.11532v2","updated":"2024-01-21T09:30:36Z","published":"2023-12-15T15:01:10Z","title":"Topic-VQ-VAE: Leveraging Latent Codebooks for Flexible Topic-Guided\n Document Generation","summary":" This paper introduces a novel approach for topic modeling utilizing latent\ncodebooks from Vector-Quantized Variational Auto-Encoder~(VQ-VAE), discretely\nencapsulating the rich information of the pre-trained embeddings such as the\npre-trained language model. From the novel interpretation of the latent\ncodebooks and embeddings as conceptual bag-of-words, we propose a new\ngenerative topic model called Topic-VQ-VAE~(TVQ-VAE) which inversely generates\nthe original documents related to the respective latent codebook. The TVQ-VAE\ncan visualize the topics with various generative distributions including the\ntraditional BoW distribution and the autoregressive image generation. 
Our\nexperimental results on document analysis and image generation demonstrate that\nTVQ-VAE effectively captures the topic context which reveals the underlying\nstructures of the dataset and supports flexible forms of document generation.\nOfficial implementation of the proposed TVQ-VAE is available at\nhttps://github.com/clovaai/TVQ-VAE.\n","authors":["YoungJoon Yoo","Jongwon Choi"],"pdf_url":"https://arxiv.org/pdf/2312.11532v2.pdf","comment":"Published in the 38th annual AAAI conference on Artificial\n Intelligence"},{"id":"http://arxiv.org/abs/2401.11431v1","updated":"2024-01-21T08:43:24Z","published":"2024-01-21T08:43:24Z","title":"Majority or Minority: Data Imbalance Learning Method for Named Entity\n Recognition","summary":" Data imbalance presents a significant challenge in various machine learning\n(ML) tasks, particularly named entity recognition (NER) within natural language\nprocessing (NLP). NER exhibits a data imbalance with a long-tail distribution,\nfeaturing numerous minority classes (i.e., entity classes) and a single\nmajority class (i.e., O-class). The imbalance leads to the misclassifications\nof the entity classes as the O-class. To tackle the imbalance, we propose a\nsimple and effective learning method, named majority or minority (MoM)\nlearning. MoM learning incorporates the loss computed only for samples whose\nground truth is the majority class (i.e., the O-class) into the loss of the\nconventional ML model. Evaluation experiments on four NER datasets (Japanese\nand English) showed that MoM learning improves prediction performance of the\nminority classes, without sacrificing the performance of the majority class and\nis more effective than widely known and state-of-the-art methods. We also\nevaluated MoM learning using frameworks as sequential labeling and machine\nreading comprehension, which are commonly used in NER. Furthermore, MoM\nlearning has achieved consistent performance improvements regardless of\nlanguage, model, or framework.\n","authors":["Sota Nemoto","Shunsuke Kitada","Hitoshi Iyatomi"],"pdf_url":"https://arxiv.org/pdf/2401.11431v1.pdf","comment":"6 pages, 1 figures, 6 tables"},{"id":"http://arxiv.org/abs/2302.06419v2","updated":"2024-01-21T07:41:02Z","published":"2023-02-10T02:55:52Z","title":"AV-data2vec: Self-supervised Learning of Audio-Visual Speech\n Representations with Contextualized Target Representations","summary":" Self-supervision has shown great potential for audio-visual speech\nrecognition by vastly reducing the amount of labeled data required to build\ngood systems. However, existing methods are either not entirely end-to-end or\ndo not train joint representations of both modalities. In this paper, we\nintroduce AV-data2vec which addresses these challenges and builds audio-visual\nrepresentations based on predicting contextualized representations which has\nbeen successful in the uni-modal case. The model uses a shared transformer\nencoder for both audio and video and can combine both modalities to improve\nspeech recognition. 
Results on LRS3 show that AV-data2vec consistently\noutperforms existing methods under all settings with the same amount of data\nand model size.\n","authors":["Jiachen Lian","Alexei Baevski","Wei-Ning Hsu","Michael Auli"],"pdf_url":"https://arxiv.org/pdf/2302.06419v2.pdf","comment":"2023 ASRU"},{"id":"http://arxiv.org/abs/2401.10015v2","updated":"2024-01-21T06:51:25Z","published":"2024-01-18T14:33:01Z","title":"Towards Hierarchical Spoken Language Dysfluency Modeling","summary":" Speech disfluency modeling is the bottleneck for both speech therapy and\nlanguage learning. However, there is no effective AI solution to systematically\ntackle this problem. We solidify the concept of disfluent speech and disfluent\nspeech modeling. We then present Hierarchical Unconstrained Disfluency Modeling\n(H-UDM) approach, the hierarchical extension of UDM that addresses both\ndisfluency transcription and detection to eliminate the need for extensive\nmanual annotation. Our experimental findings serve as clear evidence of the\neffectiveness and reliability of the methods we have introduced, encompassing\nboth transcription and detection tasks.\n","authors":["Jiachen Lian","Gopala Anumanchipalli"],"pdf_url":"https://arxiv.org/pdf/2401.10015v2.pdf","comment":"2024 EACL. Hierarchical extension of our previous workshop paper\n arXiv:2312.12810"},{"id":"http://arxiv.org/abs/2401.11408v1","updated":"2024-01-21T06:10:03Z","published":"2024-01-21T06:10:03Z","title":"SEBERTNets: Sequence Enhanced BERT Networks for Event Entity Extraction\n Tasks Oriented to the Finance Field","summary":" Event extraction lies at the core of investment analysis and asset\nmanagement in the financial field, and thus has received much attention. The\n2019 China conference on knowledge graph and semantic computing (CCKS)\nchallenge sets up an evaluation competition for the event entity extraction task\noriented to the finance field. In this task, we mainly focus on how to extract\nthe event entity accurately, and recall all the corresponding event entities\neffectively. In this paper, we propose a novel model, Sequence Enhanced BERT\nNetworks (SEBERTNets for short), which can inherit the advantages of\nBERT while capturing sequence semantic information. In addition, motivated\nby recommendation systems, we propose Hybrid Sequence Enhanced BERT Networks\n(HSEBERTNets for short), which uses a multi-channel recall method to recall all\nthe corresponding event entities. The experimental results show that the F1\nscore of SEBERTNets is 0.905 in the first stage, and the F1 score of\nHSEBERTNets is 0.934 in the first stage, which demonstrates the effectiveness\nof our methods.\n","authors":["Congqing He","Xiangyu Zhu","Yuquan Le","Yuzhong Liu","Jianhong Yin"],"pdf_url":"https://arxiv.org/pdf/2401.11408v1.pdf","comment":"CCKS 2019"},{"id":"http://arxiv.org/abs/2312.07930v2","updated":"2024-01-21T05:22:22Z","published":"2023-12-13T06:57:00Z","title":"Towards Optimal Statistical Watermarking","summary":" We study statistical watermarking by formulating it as a hypothesis testing\nproblem, a general framework which subsumes all previous statistical\nwatermarking methods. Key to our formulation is a coupling of the output tokens\nand the rejection region, realized by pseudo-random generators in practice,\nthat allows non-trivial trade-off between the Type I error and Type II error.\nWe characterize the Uniformly Most Powerful (UMP) watermark in the general\nhypothesis testing setting and the minimax Type II error in the model-agnostic\nsetting. 
In the common scenario where the output is a sequence of $n$ tokens,\nwe establish nearly matching upper and lower bounds on the number of i.i.d.\ntokens required to guarantee small Type I and Type II errors. Our rate of\n$\\Theta(h^{-1} \\log (1/h))$ with respect to the average entropy per token $h$\nhighlights potentials for improvement from the rate of $h^{-2}$ in the previous\nworks. Moreover, we formulate the robust watermarking problem where users are\nallowed to perform a class of perturbations on the generated texts, and\ncharacterize the optimal type II error of robust UMP tests via a linear\nprogramming problem. To the best of our knowledge, this is the first systematic\nstatistical treatment on the watermarking problem with near-optimal rates in\nthe i.i.d. setting, which might be of interest for future works.\n","authors":["Baihe Huang","Banghua Zhu","Hanlin Zhu","Jason D. Lee","Jiantao Jiao","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2312.07930v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11403v1","updated":"2024-01-21T04:54:45Z","published":"2024-01-21T04:54:45Z","title":"MolTailor: Tailoring Chemical Molecular Representation to Specific Tasks\n via Text Prompts","summary":" Deep learning is now widely used in drug discovery, providing significant\nacceleration and cost reduction. As the most fundamental building block,\nmolecular representation is essential for predicting molecular properties to\nenable various downstream applications. Most existing methods attempt to\nincorporate more information to learn better representations. However, not all\nfeatures are equally important for a specific task. Ignoring this would\npotentially compromise the training efficiency and predictive accuracy. To\naddress this issue, we propose a novel approach, which treats language models\nas an agent and molecular pretraining models as a knowledge base. The agent\naccentuates task-relevant features in the molecular representation by\nunderstanding the natural language description of the task, just as a tailor\ncustomizes clothes for clients. Thus, we call this approach MolTailor.\nEvaluations demonstrate MolTailor's superior performance over baselines,\nvalidating the efficacy of enhancing relevance for molecular representation\nlearning. This illustrates the potential of language model guided optimization\nto better exploit and unleash the capabilities of existing powerful molecular\nrepresentation methods. Our codes and appendix are available at\nhttps://github.com/SCIR-HI/MolTailor.\n","authors":["Haoqiang Guo","Sendong Zhao","Haochun Wang","Yanrui Du","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2401.11403v1.pdf","comment":"Accepted by AAAI 2024"},{"id":"http://arxiv.org/abs/2310.02255v3","updated":"2024-01-21T03:47:06Z","published":"2023-10-03T17:57:24Z","title":"MathVista: Evaluating Mathematical Reasoning of Foundation Models in\n Visual Contexts","summary":" Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit\nimpressive problem-solving skills in many tasks and domains, but their ability\nin mathematical reasoning in visual contexts has not been systematically\nstudied. To bridge this gap, we present MathVista, a benchmark designed to\ncombine challenges from diverse mathematical and visual tasks. It consists of\n6,141 examples, derived from 28 existing multimodal datasets involving\nmathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and\nPaperQA). 
Completing these tasks requires fine-grained, deep visual\nunderstanding and compositional reasoning, which all state-of-the-art\nfoundation models find challenging. With MathVista, we have conducted a\ncomprehensive, quantitative evaluation of 12 prominent foundation models. The\nbest-performing GPT-4V model achieves an overall accuracy of 49.9%,\nsubstantially outperforming Bard, the second-best performer, by 15.1%. Our\nin-depth analysis reveals that the superiority of GPT-4V is mainly attributed\nto its enhanced visual perception and mathematical reasoning. However, GPT-4V\nstill falls short of human performance by 10.4%, as it often struggles to\nunderstand complex figures and perform rigorous reasoning. This significant gap\nunderscores the critical role that MathVista will play in the development of\ngeneral-purpose AI agents capable of tackling mathematically intensive and\nvisually rich real-world tasks. We further explore the new ability of\nself-verification, the application of self-consistency, and the interactive\nchatbot capabilities of GPT-4V, highlighting its promising potential for future\nresearch. The project is available at https://mathvista.github.io/.\n","authors":["Pan Lu","Hritik Bansal","Tony Xia","Jiacheng Liu","Chunyuan Li","Hannaneh Hajishirzi","Hao Cheng","Kai-Wei Chang","Michel Galley","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2310.02255v3.pdf","comment":"116 pages, 120 figures. Accepted to ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11389v1","updated":"2024-01-21T03:37:47Z","published":"2024-01-21T03:37:47Z","title":"MedLM: Exploring Language Models for Medical Question Answering Systems","summary":" In the face of rapidly expanding online medical literature, automated systems\nfor aggregating and summarizing information are becoming increasingly crucial\nfor healthcare professionals and patients. Large Language Models (LLMs), with\ntheir advanced generative capabilities, have shown promise in various NLP\ntasks, and their potential in the healthcare domain, particularly for\nClosed-Book Generative QnA, is significant. However, the performance of these\nmodels in domain-specific tasks such as medical Q&A remains largely unexplored.\nThis study aims to fill this gap by comparing the performance of general and\nmedical-specific distilled LMs for medical Q&A. We aim to evaluate the\neffectiveness of fine-tuning domain-specific LMs and compare the performance of\ndifferent families of Language Models. The study will address critical\nquestions about these models' reliability, comparative performance, and\neffectiveness in the context of medical Q&A. The findings will provide valuable\ninsights into the suitability of different LMs for specific applications in the\nmedical domain.\n","authors":["Niraj Yagnik","Jay Jhaveri","Vivek Sharma","Gabriel Pila","Asma Ben","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2401.11389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.10189v2","updated":"2024-01-21T03:37:41Z","published":"2024-01-18T18:20:15Z","title":"Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through\n Text Reconstruction","summary":" Fine-grained few-shot entity extraction in the chemical domain faces two\nunique challenges. First, compared with entity extraction tasks in the general\ndomain, sentences from chemical papers usually contain more entities. Moreover,\nentity extraction models usually have difficulty extracting entities of\nlong-tailed types. 
In this paper, we propose Chem-FINESE, a novel\nsequence-to-sequence (seq2seq) based few-shot entity extraction approach, to\naddress these two challenges. Our Chem-FINESE has two components: a seq2seq\nentity extractor to extract named entities from the input sentence and a\nseq2seq self-validation module to reconstruct the original input sentence from\nextracted entities. Inspired by the fact that a good entity extraction system\nneeds to extract entities faithfully, our new self-validation module leverages\nentity extraction results to reconstruct the original input sentence. Besides,\nwe design a new contrastive loss to reduce excessive copying during the\nextraction process. Finally, we release ChemNER+, a new fine-grained chemical\nentity extraction dataset that is annotated by domain experts with the ChemNER\nschema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets\nshow that our newly proposed framework has contributed up to 8.26% and 6.84%\nabsolute F1-score gains respectively.\n","authors":["Qingyun Wang","Zixuan Zhang","Hongxiang Li","Xuan Liu","Jiawei Han","Heng Ji","Huimin Zhao"],"pdf_url":"https://arxiv.org/pdf/2401.10189v2.pdf","comment":"16 pages. Accepted by Findings of the Association for Computational\n Linguistics: EACL 2024. Code and resources are available at\n https://github.com/EagleW/Chem-FINESE"},{"id":"http://arxiv.org/abs/2401.11382v1","updated":"2024-01-21T03:15:05Z","published":"2024-01-21T03:15:05Z","title":"Using Large Language Model for End-to-End Chinese ASR and NER","summary":" Mapping speech tokens to the same feature space as text tokens has become the\nparadigm for the integration of speech modality into decoder-only large\nlanguage models (LLMs). An alternative approach is to use an encoder-decoder\narchitecture that incorporates speech features through cross-attention. This\napproach, however, has received less attention in the literature. In this work,\nwe connect the Whisper encoder with ChatGLM3 and provide in-depth comparisons\nof these two approaches using Chinese automatic speech recognition (ASR) and\nname entity recognition (NER) tasks. We evaluate them not only by conventional\nmetrics like the F1 score but also by a novel fine-grained taxonomy of ASR-NER\nerrors. Our experiments reveal that encoder-decoder architecture outperforms\ndecoder-only architecture with a short context, while decoder-only architecture\nbenefits from a long context as it fully exploits all layers of the LLM. By\nusing LLM, we significantly reduced the entity omission errors and improved the\nentity ASR accuracy compared to the Conformer baseline. Additionally, we\nobtained a state-of-the-art (SOTA) F1 score of 0.805 on the AISHELL-NER test\nset by using chain-of-thought (CoT) NER which first infers long-form ASR\ntranscriptions and then predicts NER labels.\n","authors":["Yuang Li","Jiawei Yu","Yanqing Zhao","Min Zhang","Mengxin Ren","Xiaofeng Zhao","Xiaosong Qiao","Chang Su","Miaomiao Ma","Hao Yang"],"pdf_url":"https://arxiv.org/pdf/2401.11382v1.pdf","comment":"5 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.11374v1","updated":"2024-01-21T02:29:12Z","published":"2024-01-21T02:29:12Z","title":"Language Models as Hierarchy Encoders","summary":" Interpreting hierarchical structures latent in language is a key limitation\nof current language models (LMs). While previous research has implicitly\nleveraged these hierarchies to enhance LMs, approaches for their explicit\nencoding are yet to be explored. 
To address this, we introduce a novel approach\nto re-train transformer encoder-based LMs as Hierarchy Transformer encoders\n(HiTs), harnessing the expansive nature of hyperbolic space. Our method\nsituates the output embedding space of pre-trained LMs within a Poincar\\'e ball\nwith a curvature that adapts to the embedding dimension, followed by\nre-training on hyperbolic cluster and centripetal losses. These losses are\ndesigned to effectively cluster related entities (input as texts) and organise\nthem hierarchically. We evaluate HiTs against pre-trained and fine-tuned LMs,\nfocusing on their capabilities in simulating transitive inference, predicting\nsubsumptions, and transferring knowledge across hierarchies. The results\ndemonstrate that HiTs consistently outperform both pre-trained and fine-tuned\nLMs in these tasks, underscoring the effectiveness and transferability of our\nre-trained hierarchy encoders.\n","authors":["Yuan He","Zhangdie Yuan","Jiaoyan Chen","Ian Horrocks"],"pdf_url":"https://arxiv.org/pdf/2401.11374v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11373v1","updated":"2024-01-21T02:25:29Z","published":"2024-01-21T02:25:29Z","title":"Finding a Needle in the Adversarial Haystack: A Targeted Paraphrasing\n Approach For Uncovering Edge Cases with Minimal Distribution Distortion","summary":" Adversarial attacks against NLP Deep Learning models are a significant\nconcern. In particular, adversarial samples exploit the model's sensitivity to\nsmall input changes. While these changes appear insignificant on the semantics\nof the input sample, they result in significant decay in model performance. In\nthis paper, we propose Targeted Paraphrasing via RL (TPRL), an approach to\nautomatically learn a policy to generate challenging samples that most likely\nimprove the model's performance. TPRL leverages FLAN T5, a language model, as a\ngenerator and employs a self learned policy using a proximal policy gradient to\ngenerate the adversarial examples automatically. TPRL's reward is based on the\nconfusion induced in the classifier, preserving the original text meaning\nthrough a Mutual Implication score. We demonstrate and evaluate TPRL's\neffectiveness in discovering natural adversarial attacks and improving model\nperformance through extensive experiments on four diverse NLP classification\ntasks via Automatic and Human evaluation. TPRL outperforms strong baselines,\nexhibits generalizability across classifiers and datasets, and combines the\nstrengths of language modeling and reinforcement learning to generate diverse\nand influential adversarial examples.\n","authors":["Aly M. Kassem","Sherif Saad"],"pdf_url":"https://arxiv.org/pdf/2401.11373v1.pdf","comment":"EACL 2024 - Main conference"},{"id":"http://arxiv.org/abs/2401.11365v1","updated":"2024-01-21T01:37:25Z","published":"2024-01-21T01:37:25Z","title":"Confidence Preservation Property in Knowledge Distillation Abstractions","summary":" Social media platforms prevent malicious activities by detecting harmful\ncontent of posts and comments. To that end, they employ large-scale deep neural\nnetwork language models for sentiment analysis and content understanding. Some\nmodels, like BERT, are complex, and have numerous parameters, which makes them\nexpensive to operate and maintain. To overcome these deficiencies, industry\nexperts employ a knowledge distillation compression technique, where a\ndistilled model is trained to reproduce the classification behavior of the\noriginal model. 
The distillation process terminates when the distillation\nloss function reaches the stopping criterion. This function is mainly designed\nto ensure that the original and the distilled models exhibit similar\nclassification behaviors. However, besides classification accuracy, there are\nadditional properties of the original model that the distilled model should\npreserve to be considered as an appropriate abstraction. In this work, we\nexplore whether distilled TinyBERT models preserve confidence values of the\noriginal BERT models, and investigate how this confidence preservation property\ncould guide tuning hyperparameters of the distillation process.\n","authors":["Dmitry Vengertsev","Elena Sherman"],"pdf_url":"https://arxiv.org/pdf/2401.11365v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11361v1","updated":"2024-01-21T01:18:08Z","published":"2024-01-21T01:18:08Z","title":"Revolutionizing API Documentation through Summarization","summary":" This study tackles the challenges associated with interpreting Application\nProgramming Interface (API) documentation, an integral aspect of software\ndevelopment. Official API documentation, while essential, can be lengthy and\nchallenging to navigate, prompting developers to seek unofficial sources such\nas Stack Overflow. Leveraging the vast user-generated content on Stack\nOverflow, including code snippets and discussions, we employ BERTopic and\nextractive summarization to automatically generate concise and informative API\nsummaries. These summaries encompass key insights like general usage, common\ndeveloper issues, and potential solutions, sourced from the wealth of knowledge\non Stack Overflow. Software developers evaluate these summaries for\nperformance, coherence, and interoperability, providing valuable feedback on\nthe practicality of our approach.\n","authors":["AmirHossein Naghshzan","Sylvie Ratte"],"pdf_url":"https://arxiv.org/pdf/2401.11361v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2308.09070"},{"id":"http://arxiv.org/abs/2401.11356v1","updated":"2024-01-21T00:58:31Z","published":"2024-01-21T00:58:31Z","title":"ProLex: A Benchmark for Language Proficiency-oriented Lexical\n Substitution","summary":" Lexical Substitution discovers appropriate substitutes for a given target\nword in a context sentence. However, the task fails to consider substitutes\nthat are of equal or higher proficiency than the target, an aspect that could\nbe beneficial for language learners looking to improve their writing. To bridge\nthis gap, we propose a new task, language proficiency-oriented lexical\nsubstitution. We also introduce ProLex, a novel benchmark designed to assess\nsystems' ability to generate not only appropriate substitutes but also\nsubstitutes that demonstrate better language proficiency. Besides the\nbenchmark, we propose models that can automatically perform the new task. 
We\nshow that our best model, a Llama2-13B model fine-tuned with task-specific\nsynthetic data, outperforms ChatGPT by an average of 3.2% in F-score and\nachieves comparable results with GPT-4 on ProLex.\n","authors":["Xuanming Zhang","Zixun Chen","Zhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11356v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.05105v2","updated":"2024-01-21T23:04:32Z","published":"2023-03-09T08:24:02Z","title":"MaskDiff: Modeling Mask Distribution with Diffusion Probabilistic Model\n for Few-Shot Instance Segmentation","summary":" Few-shot instance segmentation extends the few-shot learning paradigm to the\ninstance segmentation task, which tries to segment instance objects from a\nquery image with a few annotated examples of novel categories. Conventional\napproaches have attempted to address the task via prototype learning, known as\npoint estimation. However, this mechanism depends on prototypes (\\eg mean of\n$K-$shot) for prediction, leading to performance instability. To overcome the\ndisadvantage of the point estimation mechanism, we propose a novel approach,\ndubbed MaskDiff, which models the underlying conditional distribution of a\nbinary mask, which is conditioned on an object region and $K-$shot information.\nInspired by augmentation approaches that perturb data with Gaussian noise for\npopulating low data density regions, we model the mask distribution with a\ndiffusion probabilistic model. We also propose to utilize classifier-free\nguided mask sampling to integrate category information into the binary mask\ngeneration process. Without bells and whistles, our proposed method\nconsistently outperforms state-of-the-art methods on both base and novel\nclasses of the COCO dataset while simultaneously being more stable than\nexisting methods. The source code is available at:\nhttps://github.com/minhquanlecs/MaskDiff.\n","authors":["Minh-Quan Le","Tam V. Nguyen","Trung-Nghia Le","Thanh-Toan Do","Minh N. 
Do","Minh-Triet Tran"],"pdf_url":"https://arxiv.org/pdf/2303.05105v2.pdf","comment":"Accepted at AAAI 2024 (oral presentation)"},{"id":"http://arxiv.org/abs/2401.11617v1","updated":"2024-01-21T22:50:44Z","published":"2024-01-21T22:50:44Z","title":"A Survey on African Computer Vision Datasets, Topics and Researchers","summary":" Computer vision encompasses a range of tasks such as object detection,\nsemantic segmentation, and 3D reconstruction. Despite its relevance to African\ncommunities, research in this field within Africa represents only 0.06% of\ntop-tier publications over the past decade. This study undertakes a thorough\nanalysis of 63,000 Scopus-indexed computer vision publications from Africa,\nspanning from 2012 to 2022. The aim is to provide a survey of African computer\nvision topics, datasets and researchers. A key aspect of our study is the\nidentification and categorization of African Computer Vision datasets using\nlarge language models that automatically parse abstracts of these publications.\nWe also provide a compilation of unofficial African Computer Vision datasets\ndistributed through challenges or data hosting platforms, and provide a full\ntaxonomy of dataset categories. Our survey also pinpoints computer vision\ntopics trends specific to different African regions, indicating their unique\nfocus areas. Additionally, we carried out an extensive survey to capture the\nviews of African researchers on the current state of computer vision research\nin the continent and the structural barriers they believe need urgent\nattention. In conclusion, this study catalogs and categorizes Computer Vision\ndatasets and topics contributed or initiated by African institutions and\nidentifies barriers to publishing in top-tier Computer Vision venues. This\nsurvey underscores the importance of encouraging African researchers and\ninstitutions in advancing computer vision research in the continent. It also\nstresses on the need for research topics to be more aligned with the needs of\nAfrican communities.\n","authors":["Abdul-Hakeem Omotayo","Ashery Mbilinyi","Lukman Ismaila","Houcemeddine Turki","Mahmoud Abdien","Karim Gamal","Idriss Tondji","Yvan Pimi","Naome A. Etori","Marwa M. Matar","Clifford Broni-Bediako","Abigail Oppong","Mai Gamal","Eman Ehab","Gbetondji Dovonon","Zainab Akinjobi","Daniel Ajisafe","Oluwabukola G. Adegboro","Mennatullah Siam"],"pdf_url":"https://arxiv.org/pdf/2401.11617v1.pdf","comment":"Under Review, Community Work of Ro'ya Grassroots,\n https://ro-ya-cv4africa.github.io/homepage/. arXiv admin note: text overlap\n with arXiv:2305.06773"},{"id":"http://arxiv.org/abs/2311.03500v2","updated":"2024-01-21T22:04:28Z","published":"2023-11-06T20:18:26Z","title":"Predicting Age from White Matter Diffusivity with Residual Learning","summary":" Imaging findings inconsistent with those expected at specific chronological\nage ranges may serve as early indicators of neurological disorders and\nincreased mortality risk. Estimation of chronological age, and deviations from\nexpected results, from structural MRI data has become an important task for\ndeveloping biomarkers that are sensitive to such deviations. Complementary to\nstructural analysis, diffusion tensor imaging (DTI) has proven effective in\nidentifying age-related microstructural changes within the brain white matter,\nthereby presenting itself as a promising additional modality for brain age\nprediction. 
Although early studies have sought to harness DTI's advantages for\nage estimation, there is no evidence that the success of this prediction is\nowed to the unique microstructural and diffusivity features that DTI provides,\nrather than the macrostructural features that are also available in DTI data.\nTherefore, we seek to develop white-matter-specific age estimation to capture\ndeviations from normal white matter aging. Specifically, we deliberately\ndisregard the macrostructural information when predicting age from DTI scalar\nimages, using two distinct methods. The first method relies on extracting only\nmicrostructural features from regions of interest. The second applies 3D\nresidual neural networks (ResNets) to learn features directly from the images,\nwhich are non-linearly registered and warped to a template to minimize\nmacrostructural variations. When tested on unseen data, the first method yields\nmean absolute error (MAE) of 6.11 years for cognitively normal participants and\nMAE of 6.62 years for cognitively impaired participants, while the second\nmethod achieves MAE of 4.69 years for cognitively normal participants and MAE\nof 4.96 years for cognitively impaired participants. We find that the ResNet\nmodel captures subtler, non-macrostructural features for brain age prediction.\n","authors":["Chenyu Gao","Michael E. Kim","Ho Hin Lee","Qi Yang","Nazirah Mohd Khairi","Praitayini Kanakaraj","Nancy R. Newlin","Derek B. Archer","Angela L. Jefferson","Warren D. Taylor","Brian D. Boyd","Lori L. Beason-Held","Susan M. Resnick","The BIOCARD Study Team","Yuankai Huo","Katherine D. Van Schaik","Kurt G. Schilling","Daniel Moyer","Ivana Išgum","Bennett A. Landman"],"pdf_url":"https://arxiv.org/pdf/2311.03500v2.pdf","comment":"SPIE Medical Imaging: Image Processing. San Diego, CA. February 2024\n (accepted as poster presentation)"},{"id":"http://arxiv.org/abs/2401.11605v1","updated":"2024-01-21T21:49:49Z","published":"2024-01-21T21:49:49Z","title":"Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass\n Diffusion Transformers","summary":" We present the Hourglass Diffusion Transformer (HDiT), an image generative\nmodel that exhibits linear scaling with pixel count, supporting training at\nhigh-resolution (e.g. $1024 \\times 1024$) directly in pixel-space. Building on\nthe Transformer architecture, which is known to scale to billions of\nparameters, it bridges the gap between the efficiency of convolutional U-Nets\nand the scalability of Transformers. HDiT trains successfully without typical\nhigh-resolution training techniques such as multiscale architectures, latent\nautoencoders or self-conditioning. We demonstrate that HDiT performs\ncompetitively with existing models on ImageNet $256^2$, and sets a new\nstate-of-the-art for diffusion models on FFHQ-$1024^2$.\n","authors":["Katherine Crowson","Stefan Andreas Baumann","Alex Birch","Tanishq Mathew Abraham","Daniel Z. Kaplan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2401.11605v1.pdf","comment":"20 pages, 13 figures, project page and code available at\n https://crowsonkb.github.io/hourglass-diffusion-transformers/"},{"id":"http://arxiv.org/abs/2401.11598v1","updated":"2024-01-21T21:04:05Z","published":"2024-01-21T21:04:05Z","title":"TetraLoss: Improving the Robustness of Face Recognition against Morphing\n Attacks","summary":" Face recognition systems are widely deployed in high-security applications\nsuch as for biometric verification at border controls. 
Despite their high\naccuracy on pristine data, it is well-known that digital manipulations, such as\nface morphing, pose a security threat to face recognition systems. Malicious\nactors can exploit the facilities offered by the identity document issuance\nprocess to obtain identity documents containing morphed images. Thus, subjects\nwho contributed to the creation of the morphed image can with high probability\nuse the identity document to bypass automated face recognition systems. In\nrecent years, no-reference (i.e., single image) and differential morphing\nattack detectors have been proposed to tackle this risk. These systems are\ntypically evaluated in isolation from the face recognition system that they\nhave to operate jointly with and do not consider the face recognition process.\nContrary to most existing works, we present a novel method for adapting deep\nlearning-based face recognition systems to be more robust against face morphing\nattacks. To this end, we introduce TetraLoss, a novel loss function that learns\nto separate morphed face images from its contributing subjects in the embedding\nspace while still preserving high biometric verification performance. In a\ncomprehensive evaluation, we show that the proposed method can significantly\nenhance the original system while also significantly outperforming other tested\nbaseline methods.\n","authors":["Mathias Ibsen","Lázaro J. González-Soler","Christian Rathgeb","Christoph Busch"],"pdf_url":"https://arxiv.org/pdf/2401.11598v1.pdf","comment":"Accepted to the IEEE International Conference on Automatic Face &\n Gesture Recognition 2024 (FG'24)"},{"id":"http://arxiv.org/abs/2310.01361v2","updated":"2024-01-21T21:01:12Z","published":"2023-10-02T17:23:48Z","title":"GenSim: Generating Robotic Simulation Tasks via Large Language Models","summary":" Collecting large amounts of real-world interaction data to train general\nrobotic policies is often prohibitively expensive, thus motivating the use of\nsimulation data. However, existing methods for data generation have generally\nfocused on scene-level diversity (e.g., object instances and poses) rather than\ntask-level diversity, due to the human effort required to come up with and\nverify novel tasks. This has made it challenging for policies trained on\nsimulation data to demonstrate significant task-level generalization. In this\npaper, we propose to automatically generate rich simulation environments and\nexpert demonstrations by exploiting a large language models' (LLM) grounding\nand coding ability. Our approach, dubbed GenSim, has two modes: goal-directed\ngeneration, wherein a target task is given to the LLM and the LLM proposes a\ntask curriculum to solve the target task, and exploratory generation, wherein\nthe LLM bootstraps from previous tasks and iteratively proposes novel tasks\nthat would be helpful in solving more complex tasks. We use GPT4 to expand the\nexisting benchmark by ten times to over 100 tasks, on which we conduct\nsupervised finetuning and evaluate several LLMs including finetuned GPTs and\nCode Llama on code generation for robotic simulation tasks. Furthermore, we\nobserve that LLMs-generated simulation programs can enhance task-level\ngeneralization significantly when used for multitask policy training. 
We\nfurther find that with minimal sim-to-real adaptation, the multitask policies\npretrained on GPT4-generated simulation tasks exhibit stronger transfer to\nunseen long-horizon tasks in the real world and outperform baselines by 25%.\nSee the project website (https://liruiw.github.io/gensim) for code, demos, and\nvideos.\n","authors":["Lirui Wang","Yiyang Ling","Zhecheng Yuan","Mohit Shridhar","Chen Bao","Yuzhe Qin","Bailin Wang","Huazhe Xu","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01361v2.pdf","comment":"See our project website (https://liruiw.github.io/gensim), demo and\n datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code\n (https://github.com/liruiw/GenSim) for more details"},{"id":"http://arxiv.org/abs/2401.11582v1","updated":"2024-01-21T20:10:02Z","published":"2024-01-21T20:10:02Z","title":"Thermal Image Calibration and Correction using Unpaired Cycle-Consistent\n Adversarial Networks","summary":" Unmanned aerial vehicles (UAVs) offer a flexible and cost-effective solution\nfor wildfire monitoring. However, their widespread deployment during wildfires\nhas been hindered by a lack of operational guidelines and concerns about\npotential interference with aircraft systems. Consequently, the progress in\ndeveloping deep-learning models for wildfire detection and characterization\nusing aerial images is constrained by the limited availability, size, and\nquality of existing datasets. This paper introduces a solution aimed at\nenhancing the quality of current aerial wildfire datasets to align with\nadvancements in camera technology. The proposed approach offers a solution to\ncreate a comprehensive, standardized large-scale image dataset. This paper\npresents a pipeline based on CycleGAN to enhance wildfire datasets and a novel\nfusion method that integrates paired RGB images as attribute conditioning in\nthe generators of both directions, improving the accuracy of the generated\nimages.\n","authors":["Hossein Rajoli","Pouya Afshin","Fatemeh Afghah"],"pdf_url":"https://arxiv.org/pdf/2401.11582v1.pdf","comment":"This paper has been accepted at the Asilomar 2023 Conference and will\n be published"},{"id":"http://arxiv.org/abs/2303.05123v3","updated":"2024-01-21T18:11:49Z","published":"2023-03-09T09:12:21Z","title":"Dominating Set Database Selection for Visual Place Recognition","summary":" This paper presents an approach for creating a visual place recognition (VPR)\ndatabase for localization in indoor environments from RGBD scanning sequences.\nThe proposed approach is formulated as a minimization problem in terms of\ndominating set algorithm for graph, constructed from spatial information, and\nreferred as DominatingSet. Our algorithm shows better scene coverage in\ncomparison to other methodologies that are used for database creation. Also, we\ndemonstrate that using DominatingSet, a database size could be up to 250-1400\ntimes smaller than the original scanning sequence while maintaining a recall\nrate of more than 80% on testing sequences. We evaluated our algorithm on\n7-scenes and BundleFusion datasets and an additionally recorded sequence in a\nhighly repetitive office setting. In addition, the database selection can\nproduce weakly-supervised labels for fine-tuning neural place recognition\nalgorithms to particular settings, improving even more their accuracy. The\npaper also presents a fully automated pipeline for VPR database creation from\nRGBD scanning sequences, as well as a set of metrics for VPR database\nevaluation. 
The code and released data are available on our web-page~ --\nhttps://prime-slam.github.io/place-recognition-db/\n","authors":["Anastasiia Kornilova","Ivan Moskalenko","Timofei Pushkin","Fakhriddin Tojiboev","Rahim Tariverdizadeh","Gonzalo Ferrer"],"pdf_url":"https://arxiv.org/pdf/2303.05123v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11544v1","updated":"2024-01-21T16:59:44Z","published":"2024-01-21T16:59:44Z","title":"Hierarchical Prompts for Rehearsal-free Continual Learning","summary":" Continual learning endeavors to equip the model with the capability to\nintegrate current task knowledge while mitigating the forgetting of past task\nknowledge. Inspired by prompt tuning, prompt-based methods maintain a frozen\nbackbone and train with slight learnable prompts to minimize the catastrophic\nforgetting that arises due to updating a large number of backbone parameters.\nNonetheless, these learnable prompts tend to concentrate on the discriminatory\nknowledge of the current task while ignoring past task knowledge, leading to\nthat learnable prompts still suffering from catastrophic forgetting. This paper\nintroduces a novel rehearsal-free paradigm for continual learning termed\nHierarchical Prompts (H-Prompts), comprising three categories of prompts --\nclass prompt, task prompt, and general prompt. To effectively depict the\nknowledge of past classes, class prompt leverages Bayesian Distribution\nAlignment to model the distribution of classes in each task. To reduce the\nforgetting of past task knowledge, task prompt employs Cross-task Knowledge\nExcavation to amalgamate the knowledge encapsulated in the learned class\nprompts of past tasks and current task knowledge. Furthermore, general prompt\nutilizes Generalized Knowledge Exploration to deduce highly generalized\nknowledge in a self-supervised manner. Evaluations on two benchmarks\nsubstantiate the efficacy of the proposed H-Prompts, exemplified by an average\naccuracy of 87.8% in Split CIFAR-100 and 70.6% in Split ImageNet-R.\n","authors":["Yukun Zuo","Hantao Yao","Lu Yu","Liansheng Zhuang","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2401.11544v1.pdf","comment":"Submitted to TPAMI"},{"id":"http://arxiv.org/abs/2401.11543v1","updated":"2024-01-21T16:55:40Z","published":"2024-01-21T16:55:40Z","title":"How Robust Are Energy-Based Models Trained With Equilibrium Propagation?","summary":" Deep neural networks (DNNs) are easily fooled by adversarial perturbations\nthat are imperceptible to humans. Adversarial training, a process where\nadversarial examples are added to the training set, is the current\nstate-of-the-art defense against adversarial attacks, but it lowers the model's\naccuracy on clean inputs, is computationally expensive, and offers less\nrobustness to natural noise. In contrast, energy-based models (EBMs), which\nwere designed for efficient implementation in neuromorphic hardware and\nphysical systems, incorporate feedback connections from each layer to the\nprevious layer, yielding a recurrent, deep-attractor architecture which we\nhypothesize should make them naturally robust. Our work is the first to explore\nthe robustness of EBMs to both natural corruptions and adversarial attacks,\nwhich we do using the CIFAR-10 and CIFAR-100 datasets. 
We demonstrate that EBMs\nare more robust than transformers and display comparable robustness to\nadversarially-trained DNNs on gradient-based (white-box) attacks, query-based\n(black-box) attacks, and natural perturbations without sacrificing clean\naccuracy, and without the need for adversarial training or additional training\ntechniques.\n","authors":["Siddharth Mansingh","Michal Kucer","Garrett Kenyon","Juston Moore","Michael Teti"],"pdf_url":"https://arxiv.org/pdf/2401.11543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11541v1","updated":"2024-01-21T16:46:04Z","published":"2024-01-21T16:46:04Z","title":"Multi-View Neural 3D Reconstruction of Micro-/Nanostructures with Atomic\n Force Microscopy","summary":" Atomic Force Microscopy (AFM) is a widely employed tool for micro-/nanoscale\ntopographic imaging. However, conventional AFM scanning struggles to\nreconstruct complex 3D micro-/nanostructures precisely due to limitations such\nas incomplete sample topography capturing and tip-sample convolution artifacts.\nHere, we propose a multi-view neural-network-based framework with AFM\n(MVN-AFM), which accurately reconstructs surface models of intricate\nmicro-/nanostructures. Unlike previous works, MVN-AFM does not depend on any\nspecially shaped probes or costly modifications to the AFM system. To achieve\nthis, MVN-AFM uniquely employs an iterative method to align multi-view data and\neliminate AFM artifacts simultaneously. Furthermore, we pioneer the application\nof neural implicit surface reconstruction in nanotechnology and achieve\nmarkedly improved results. Extensive experiments show that MVN-AFM effectively\neliminates artifacts present in raw AFM images and reconstructs various\nmicro-/nanostructures including complex geometrical microstructures printed via\nTwo-photon Lithography and nanoparticles such as PMMA nanospheres and ZIF-67\nnanocrystals. This work presents a cost-effective tool for micro-/nanoscale 3D\nanalysis.\n","authors":["Shuo Chen","Mao Peng","Yijin Li","Bing-Feng Ju","Hujun Bao","Yuan-Liu Chen","Guofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11541v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16301v2","updated":"2024-01-21T16:27:06Z","published":"2023-09-28T09:54:10Z","title":"Gated Cross-Attention Network for Depth Completion","summary":" Depth completion is a popular research direction in the field of depth\nestimation. The fusion of color and depth features is the current critical\nchallenge in this task, mainly due to the asymmetry between the rich scene\ndetails in color images and the sparse pixels in depth maps. To tackle this\nissue, we design an efficient Gated Cross-Attention Network that propagates\nconfidence via a gating mechanism, simultaneously extracting and refining key\ninformation in both color and depth branches to achieve local spatial feature\nfusion. Additionally, we employ an attention network based on the Transformer\nin low-dimensional space to effectively fuse global features and increase the\nnetwork's receptive field. With a simple yet efficient gating mechanism, our\nproposed method achieves fast and accurate depth completion without the need\nfor additional branches or post-processing steps. At the same time, we use the\nRay Tune mechanism with the AsyncHyperBandScheduler scheduler and the\nHyperOptSearch algorithm to automatically search for the optimal number of\nmodule iterations, which also allows us to achieve performance comparable to\nstate-of-the-art methods. 
We conduct experiments on both indoor and outdoor\nscene datasets. Our fast network achieves Pareto-optimal solutions in terms of\ntime and accuracy, and at the time of submission, our accurate network ranks\nfirst among all published papers on the KITTI official website in terms of\naccuracy.\n","authors":["Xiaogang Jia","Songlei Jian","Yusong Tan","Yonggang Che","Wei Chen","Zhengfa Liang"],"pdf_url":"https://arxiv.org/pdf/2309.16301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13472v3","updated":"2024-01-21T16:14:44Z","published":"2023-03-23T17:43:17Z","title":"Promptable Game Models: Text-Guided Game Simulation via Masked Diffusion\n Models","summary":" Neural video game simulators emerged as powerful tools to generate and edit\nvideos. Their idea is to represent games as the evolution of an environment's\nstate driven by the actions of its agents. While such a paradigm enables users\nto play a game action-by-action, its rigidity precludes more semantic forms of\ncontrol. To overcome this limitation, we augment game models with prompts\nspecified as a set of natural language actions and desired states. The result-a\nPromptable Game Model (PGM)-makes it possible for a user to play the game by\nprompting it with high- and low-level action sequences. Most captivatingly, our\nPGM unlocks the director's mode, where the game is played by specifying goals\nfor the agents in the form of a prompt. This requires learning \"game AI\",\nencapsulated by our animation model, to navigate the scene using high-level\nconstraints, play against an adversary, and devise a strategy to win a point.\nTo render the resulting state, we use a compositional NeRF representation\nencapsulated in our synthesis model. To foster future research, we present\nnewly collected, annotated and calibrated Tennis and Minecraft datasets. Our\nmethod significantly outperforms existing neural video game simulators in terms\nof rendering quality and unlocks applications beyond the capabilities of the\ncurrent state of the art. Our framework, data, and models are available at\nhttps://snap-research.github.io/promptable-game-models/.\n","authors":["Willi Menapace","Aliaksandr Siarohin","Stéphane Lathuilière","Panos Achlioptas","Vladislav Golyanik","Sergey Tulyakov","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.13472v3.pdf","comment":"ACM Transactions on Graphics \\c{opyright} Copyright is held by the\n owner/author(s) 2023. This is the author's version of the work. It is posted\n here for your personal use. Not for redistribution. The definitive Version of\n Record was published in ACM Transactions on Graphics,\n http://dx.doi.org/10.1145/3635705"},{"id":"http://arxiv.org/abs/2401.11535v1","updated":"2024-01-21T16:14:04Z","published":"2024-01-21T16:14:04Z","title":"Deformable Endoscopic Tissues Reconstruction with Gaussian Splatting","summary":" Surgical 3D reconstruction is a critical area of research in robotic surgery,\nwith recent works adopting variants of dynamic radiance fields to achieve\nsuccess in 3D reconstruction of deformable tissues from single-viewpoint\nvideos. However, these methods often suffer from time-consuming optimization or\ninferior quality, limiting their adoption in downstream tasks. 
Inspired by 3D\nGaussian Splatting, a recent trending 3D representation, we present EndoGS,\napplying Gaussian Splatting for deformable endoscopic tissue reconstruction.\nSpecifically, our approach incorporates deformation fields to handle dynamic\nscenes, depth-guided supervision to optimize 3D targets with a single\nviewpoint, and a spatial-temporal weight mask to mitigate tool occlusion. As a\nresult, EndoGS reconstructs and renders high-quality deformable endoscopic\ntissues from a single-viewpoint video, estimated depth maps, and labeled tool\nmasks. Experiments on DaVinci robotic surgery videos demonstrate that EndoGS\nachieves superior rendering quality. Code is available at\nhttps://github.com/HKU-MedAI/EndoGS.\n","authors":["Lingting Zhu","Zhao Wang","Zhenchao Jin","Guying Lin","Lequan Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11535v1.pdf","comment":"Work in progress. 10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2401.11519v1","updated":"2024-01-21T15:22:15Z","published":"2024-01-21T15:22:15Z","title":"CaBuAr: California Burned Areas dataset for delineation","summary":" Forest wildfires represent one of the catastrophic events that, over the last\ndecades, caused huge environmental and humanitarian damages. In addition to a\nsignificant amount of carbon dioxide emission, they are a source of risk to\nsociety in both short-term (e.g., temporary city evacuation due to fire) and\nlong-term (e.g., higher risks of landslides) cases. Consequently, the\navailability of tools to support local authorities in automatically identifying\nburned areas plays an important role in the continuous monitoring requirement\nto alleviate the aftereffects of such catastrophic events. The great\navailability of satellite acquisitions coupled with computer vision techniques\nrepresents an important step in developing such tools. This paper introduces a\nnovel open dataset that tackles the burned area delineation problem, a binary\nsegmentation problem applied to satellite imagery. The presented resource\nconsists of pre- and post-fire Sentinel-2 L2A acquisitions of California forest\nfires that took place starting in 2015. Raster annotations were generated from\nthe data released by California's Department of Forestry and Fire Protection.\nMoreover, in conjunction with the dataset, we release three different baselines\nbased on spectral indexes analyses, SegFormer, and U-Net models.\n","authors":["Daniele Rege Cambrin","Luca Colomba","Paolo Garza"],"pdf_url":"https://arxiv.org/pdf/2401.11519v1.pdf","comment":"Accepted at the IEEE Geoscience and Remote Sensing Magazine"},{"id":"http://arxiv.org/abs/2401.11511v1","updated":"2024-01-21T14:48:38Z","published":"2024-01-21T14:48:38Z","title":"MobileARLoc: On-device Robust Absolute Localisation for Pervasive\n Markerless Mobile AR","summary":" Recent years have seen significant improvement in absolute camera pose\nestimation, paving the way for pervasive markerless Augmented Reality (AR).\nHowever, accurate absolute pose estimation techniques are computation- and\nstorage-heavy, requiring computation offloading. As such, AR systems rely on\nvisual-inertial odometry (VIO) to track the device's relative pose between\nrequests to the server. However, VIO suffers from drift, requiring frequent\nabsolute repositioning. This paper introduces MobileARLoc, a new framework for\non-device large-scale markerless mobile AR that combines an absolute pose\nregressor (APR) with a local VIO tracking system. 
Absolute pose regressors\n(APRs) provide fast on-device pose estimation at the cost of reduced accuracy.\nTo address APR accuracy and reduce VIO drift, MobileARLoc creates a feedback\nloop where VIO pose estimations refine the APR predictions. The VIO system\nidentifies reliable predictions of APR, which are then used to compensate for\nthe VIO drift. We comprehensively evaluate MobileARLoc through dataset\nsimulations. MobileARLoc halves the error compared to the underlying APR and\nachieve fast (80\\,ms) on-device inference speed.\n","authors":["Changkun Liu","Yukun Zhao","Tristan Braud"],"pdf_url":"https://arxiv.org/pdf/2401.11511v1.pdf","comment":"Accepted for publication at the 3rd edition of the Pervasive and\n Resource-Constrained AI (PerConAI) workshop (co-located with PerCom 2024).\n arXiv admin note: substantial text overlap with arXiv:2308.05394"},{"id":"http://arxiv.org/abs/2401.11499v1","updated":"2024-01-21T14:09:49Z","published":"2024-01-21T14:09:49Z","title":"Self-Supervised Bird's Eye View Motion Prediction with Cross-Modality\n Signals","summary":" Learning the dense bird's eye view (BEV) motion flow in a self-supervised\nmanner is an emerging research for robotics and autonomous driving. Current\nself-supervised methods mainly rely on point correspondences between point\nclouds, which may introduce the problems of fake flow and inconsistency,\nhindering the model's ability to learn accurate and realistic motion. In this\npaper, we introduce a novel cross-modality self-supervised training framework\nthat effectively addresses these issues by leveraging multi-modality data to\nobtain supervision signals. We design three innovative supervision signals to\npreserve the inherent properties of scene motion, including the masked Chamfer\ndistance loss, the piecewise rigidity loss, and the temporal consistency loss.\nThrough extensive experiments, we demonstrate that our proposed self-supervised\nframework outperforms all previous self-supervision methods for the motion\nprediction task.\n","authors":["Shaoheng Fang","Zuhong Liu","Mingyu Wang","Chenxin Xu","Yiqi Zhong","Siheng Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11499v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11492v1","updated":"2024-01-21T13:45:52Z","published":"2024-01-21T13:45:52Z","title":"Edge-Enabled Real-time Railway Track Segmentation","summary":" Accurate and rapid railway track segmentation can assist automatic train\ndriving and is a key step in early warning to fixed or moving obstacles on the\nrailway track. However, certain existing algorithms tailored for track\nsegmentation often struggle to meet the requirements of real-time and\nefficiency on resource-constrained edge devices. Considering this challenge, we\npropose an edge-enabled real-time railway track segmentation algorithm, which\nis optimized to be suitable for edge applications by optimizing the network\nstructure and quantizing the model after training. Initially, Ghost convolution\nis introduced to reduce the complexity of the backbone, thereby achieving the\nextraction of key information of the interested region at a lower cost. To\nfurther reduce the model complexity and calculation, a new lightweight\ndetection head is proposed to achieve the best balance between accuracy and\nefficiency. 
Subsequently, we introduce quantization techniques to map the\nmodel's floating-point weights and activation values into lower bit-width\nfixed-point representations, reducing computational demands and memory\nfootprint, ultimately accelerating the model's inference. Finally, we draw\ninspiration from GPU parallel programming principles to expedite the\npre-processing and post-processing stages of the algorithm by doing parallel\nprocessing. The approach is evaluated with public and challenging dataset\nRailSem19 and tested on Jetson Nano. Experimental results demonstrate that our\nenhanced algorithm achieves an accuracy level of 83.3% while achieving a\nreal-time inference rate of 25 frames per second when the input size is\n480x480, thereby effectively meeting the requirements for real-time and\nhigh-efficiency operation.\n","authors":["Chen Chenglin","Wang Fei","Yang Min","Qin Yong","Bai Yun"],"pdf_url":"https://arxiv.org/pdf/2401.11492v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05173v4","updated":"2024-01-21T13:38:20Z","published":"2023-09-11T00:02:05Z","title":"DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning","summary":" Prompt tuning (PT), where a small amount of trainable soft (continuous)\nprompt vectors is affixed to the input of language models (LM), has shown\npromising results across various tasks and models for parameter-efficient\nfine-tuning (PEFT). PT stands out from other PEFT approaches because it\nmaintains competitive performance with fewer trainable parameters and does not\ndrastically scale up its parameters as the model size expands. However, PT\nintroduces additional soft prompt tokens, leading to longer input sequences,\nwhich significantly impacts training and inference time and memory usage due to\nthe Transformer's quadratic complexity. Particularly concerning for Large\nLanguage Models (LLMs) that face heavy daily querying. To address this issue,\nwe propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt\ninto a shorter soft prompt and a pair of low-rank matrices that are then\noptimised with two different learning rates. This allows DePT to achieve better\nperformance while saving substantial memory and time costs compared to vanilla\nPT and its variants, without changing trainable parameter sizes. Through\nextensive experiments on 23 natural language processing (NLP) and\nvision-language (VL) tasks, we demonstrate that DePT outperforms\nstate-of-the-art PEFT approaches, including the full fine-tuning baseline, in\nsome scenarios. Additionally, we empirically show that DEPT grows more\nefficient as the model size increases. Our further study reveals that DePT\nintegrates seamlessly with parameter-efficient transfer learning in the\nfew-shot learning setting and highlights its adaptability to various model\narchitectures and sizes.\n","authors":["Zhengxiang Shi","Aldo Lipani"],"pdf_url":"https://arxiv.org/pdf/2309.05173v4.pdf","comment":"ICLR 2024. Code is available at https://github.com/ZhengxiangShi/DePT"},{"id":"http://arxiv.org/abs/2303.11681v4","updated":"2024-01-21T13:35:44Z","published":"2023-03-21T08:43:15Z","title":"DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic\n Segmentation Using Diffusion Models","summary":" Collecting and annotating images with pixel-wise labels is time-consuming and\nlaborious. In contrast, synthetic data can be freely available using a\ngenerative model (e.g., DALL-E, Stable Diffusion). 
In this paper, we show that\nit is possible to automatically obtain accurate semantic masks of synthetic\nimages generated by the Off-the-shelf Stable Diffusion model, which uses only\ntext-image pairs during training. Our approach, called DiffuMask, exploits the\npotential of the cross-attention map between text and image, which is natural\nand seamless to extend the text-driven image synthesis to semantic mask\ngeneration. DiffuMask uses text-guided cross-attention information to localize\nclass/word-specific regions, which are combined with practical techniques to\ncreate a novel high-resolution and class-discriminative pixel-wise mask. The\nmethods help to reduce data collection and annotation costs obviously.\nExperiments demonstrate that the existing segmentation methods trained on\nsynthetic data of DiffuMask can achieve a competitive performance over the\ncounterpart of real data (VOC 2012, Cityscapes). For some classes (e.g., bird),\nDiffuMask presents promising performance, close to the stateof-the-art result\nof real data (within 3% mIoU gap). Moreover, in the open-vocabulary\nsegmentation (zero-shot) setting, DiffuMask achieves a new SOTA result on\nUnseen class of VOC 2012. The project website can be found at\nhttps://weijiawu.github.io/DiffusionMask/.\n","authors":["Weijia Wu","Yuzhong Zhao","Mike Zheng Shou","Hong Zhou","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2303.11681v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11489v1","updated":"2024-01-21T13:30:02Z","published":"2024-01-21T13:30:02Z","title":"MapChange: Enhancing Semantic Change Detection with Temporal-Invariant\n Historical Maps Based on Deep Triplet Network","summary":" Semantic Change Detection (SCD) is recognized as both a crucial and\nchallenging task in the field of image analysis. Traditional methods for SCD\nhave predominantly relied on the comparison of image pairs. However, this\napproach is significantly hindered by substantial imaging differences, which\narise due to variations in shooting times, atmospheric conditions, and angles.\nSuch discrepancies lead to two primary issues: the under-detection of minor yet\nsignificant changes, and the generation of false alarms due to temporal\nvariances. These factors often result in unchanged objects appearing markedly\ndifferent in multi-temporal images. In response to these challenges, the\nMapChange framework has been developed. This framework introduces a novel\nparadigm that synergizes temporal-invariant historical map data with\ncontemporary high-resolution images. By employing this combination, the\ntemporal variance inherent in conventional image pair comparisons is\neffectively mitigated. The efficacy of the MapChange framework has been\nempirically validated through comprehensive testing on two public datasets.\nThese tests have demonstrated the framework's marked superiority over existing\nstate-of-the-art SCD methods.\n","authors":["Yinhe Liu","Sunan Shi","Zhuo Zheng","Jue Wang","Shiqi Tian","Yanfei Zhong"],"pdf_url":"https://arxiv.org/pdf/2401.11489v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01738v4","updated":"2024-01-21T13:27:31Z","published":"2023-08-03T12:58:23Z","title":"Enhancing Visibility in Nighttime Haze Images Using Guided APSF and\n Gradient Adaptive Convolution","summary":" Visibility in hazy nighttime scenes is frequently reduced by multiple\nfactors, including low light, intense glow, light scattering, and the presence\nof multicolored light sources. 
Existing nighttime dehazing methods often\nstruggle with handling glow or low-light conditions, resulting in either\nexcessively dark visuals or unsuppressed glow outputs. In this paper, we\nenhance the visibility from a single nighttime haze image by suppressing glow\nand enhancing low-light regions. To handle glow effects, our framework learns\nfrom the rendered glow pairs. Specifically, a light source aware network is\nproposed to detect light sources of night images, followed by the APSF\n(Atmospheric Point Spread Function)-guided glow rendering. Our framework is\nthen trained on the rendered images, resulting in glow suppression. Moreover,\nwe utilize gradient-adaptive convolution, to capture edges and textures in hazy\nscenes. By leveraging extracted edges and textures, we enhance the contrast of\nthe scene without losing important structural details. To boost low-light\nintensity, our network learns an attention map, then adjusted by gamma\ncorrection. This attention has high values on low-light regions and low values\non haze and glow regions. Extensive evaluation on real nighttime haze images,\ndemonstrates the effectiveness of our method. Our experiments demonstrate that\nour method achieves a PSNR of 30.38dB, outperforming state-of-the-art methods\nby 13% on GTA5 nighttime haze dataset. Our data and code is available at\nhttps://github.com/jinyeying/nighttime_dehaze.\n","authors":["Yeying Jin","Beibei Lin","Wending Yan","Yuan Yuan","Wei Ye","Robby T. Tan"],"pdf_url":"https://arxiv.org/pdf/2308.01738v4.pdf","comment":"Accepted to ACM'MM2023, https://github.com/jinyeying/nighttime_dehaze"},{"id":"http://arxiv.org/abs/2308.10610v2","updated":"2024-01-21T13:23:10Z","published":"2023-08-21T10:20:46Z","title":"Ultrafast and Ultralight Network-Based Intelligent System for Real-time\n Diagnosis of Ear diseases in Any Devices","summary":" Traditional ear disease diagnosis heavily depends on experienced specialists\nand specialized equipment, frequently resulting in misdiagnoses, treatment\ndelays, and financial burdens for some patients. Utilizing deep learning models\nfor efficient ear disease diagnosis has proven effective and affordable.\nHowever, existing research overlooked model inference speed and parameter size\nrequired for deployment. To tackle these challenges, we constructed a\nlarge-scale dataset comprising eight ear disease categories and normal ear\ncanal samples from two hospitals. Inspired by ShuffleNetV2, we developed\nBest-EarNet, an ultrafast and ultralight network enabling real-time ear disease\ndiagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature\nFusion Module which can capture global and local spatial information\nsimultaneously and guide the network to focus on crucial regions within feature\nmaps at various levels, mitigating low accuracy issues. Moreover, our network\nuses multiple auxiliary classification heads for efficient parameter\noptimization. With 0.77M parameters, Best-EarNet achieves an average frames per\nsecond of 80 on CPU. Employing transfer learning and five-fold cross-validation\nwith 22,581 images from Hospital-1, the model achieves an impressive 95.23%\naccuracy. External testing on 1,652 images from Hospital-2 validates its\nperformance, yielding 92.14% accuracy. Compared to state-of-the-art networks,\nBest-EarNet establishes a new state-of-the-art (SOTA) in practical\napplications. Most importantly, we developed an intelligent diagnosis system\ncalled Ear Keeper, which can be deployed on common electronic devices. 
By\nmanipulating a compact electronic otoscope, users can perform comprehensive\nscanning and diagnosis of the ear canal using real-time video. This study\nprovides a novel paradigm for ear endoscopy and other medical endoscopic image\nrecognition applications.\n","authors":["Yubiao Yue","Xinyu Zeng","Xiaoqiang Shi","Meiping Zhang","Haihua Liang","Fan Zhang","Yanmei Chen","Zefeng Xie","Wenrui Wu","Zhenzhang Li"],"pdf_url":"https://arxiv.org/pdf/2308.10610v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11485v1","updated":"2024-01-21T13:16:33Z","published":"2024-01-21T13:16:33Z","title":"ColorVideoVDP: A visual difference predictor for image, video and\n display distortions","summary":" ColorVideoVDP is a video and image quality metric that models spatial and\ntemporal aspects of vision, for both luminance and color. The metric is built\non novel psychophysical models of chromatic spatiotemporal contrast sensitivity\nand cross-channel contrast masking. It accounts for the viewing conditions,\ngeometric, and photometric characteristics of the display. It was trained to\npredict common video streaming distortions (e.g. video compression, rescaling,\nand transmission errors), and also 8 new distortion types related to AR/VR\ndisplays (e.g. light source and waveguide non-uniformities). To address the\nlatter application, we collected our novel XR-Display-Artifact-Video quality\ndataset (XR-DAVID), comprised of 336 distorted videos. Extensive testing on\nXR-DAVID, as well as several datasets from the literature, indicate a\nsignificant gain in prediction performance compared to existing metrics.\nColorVideoVDP opens the doors to many novel applications which require the\njoint automated spatiotemporal assessment of luminance and color distortions,\nincluding video streaming, display specification and design, visual comparison\nof results, and perceptually-guided quality optimization.\n","authors":["Rafal K. Mantiuk","Param Hanji","Maliha Ashraf","Yuta Asano","Alexandre Chapiro"],"pdf_url":"https://arxiv.org/pdf/2401.11485v1.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2401.04614v2","updated":"2024-01-21T12:56:32Z","published":"2024-01-09T15:36:07Z","title":"Generic Knowledge Boosted Pre-training For Remote Sensing Images","summary":" Deep learning models are essential for scene classification, change\ndetection, land cover segmentation, and other remote sensing image\nunderstanding tasks. Most backbones of existing remote sensing deep learning\nmodels are typically initialized by pre-trained weights obtained from ImageNet\npre-training (IMP). However, domain gaps exist between remote sensing images\nand natural images (e.g., ImageNet), making deep learning models initialized by\npre-trained weights of IMP perform poorly for remote sensing image\nunderstanding. Although some pre-training methods are studied in the remote\nsensing community, current remote sensing pre-training methods face the problem\nof vague generalization by only using remote sensing images. In this paper, we\npropose a novel remote sensing pre-training framework, Generic Knowledge\nBoosted Remote Sensing Pre-training (GeRSP), to learn robust representations\nfrom remote sensing and natural images for remote sensing understanding tasks.\nGeRSP contains two pre-training branches: (1) A self-supervised pre-training\nbranch is adopted to learn domain-related representations from unlabeled remote\nsensing images. 
(2) A supervised pre-training branch is integrated into GeRSP\nfor general knowledge learning from labeled natural images. Moreover, GeRSP\ncombines two pre-training branches using a teacher-student architecture to\nsimultaneously learn representations with general and special knowledge, which\ngenerates a powerful pre-trained model for deep learning model initialization.\nFinally, we evaluate GeRSP and other remote sensing pre-training methods on\nthree downstream tasks, i.e., object detection, semantic segmentation, and\nscene classification. The extensive experimental results consistently\ndemonstrate that GeRSP can effectively learn robust representations in a\nunified manner, improving the performance of remote sensing downstream tasks.\n","authors":["Ziyue Huang","Mingming Zhang","Yuan Gong","Qingjie Liu","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2401.04614v2.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01632v3","updated":"2024-01-21T12:50:08Z","published":"2023-12-04T05:24:45Z","title":"GaussianHead: High-fidelity Head Avatars with Learnable Gaussian\n Derivation","summary":" Constructing vivid 3D head avatars for given subjects and realizing a series\nof animations on them is valuable yet challenging. This paper presents\nGaussianHead, which models the actional human head with anisotropic 3D\nGaussians. In our framework, a motion deformation field and multi-resolution\ntri-plane are constructed respectively to deal with the head's dynamic geometry\nand complex texture. Notably, we impose an exclusive derivation scheme on each\nGaussian, which generates its multiple doppelgangers through a set of learnable\nparameters for position transformation. With this design, we can compactly and\naccurately encode the appearance information of Gaussians, even those fitting\nthe head's particular components with sophisticated structures. In addition, an\ninherited derivation strategy for newly added Gaussians is adopted to\nfacilitate training acceleration. Extensive experiments show that our method\ncan produce high-fidelity renderings, outperforming state-of-the-art approaches\nin reconstruction, cross-identity reenactment, and novel view synthesis tasks.\nOur code is available at: https://github.com/chiehwangs/gaussian-head.\n","authors":["Jie Wang","Jiu-Cheng Xie","Xianyan Li","Feng Xu","Chi-Man Pun","Hao Gao"],"pdf_url":"https://arxiv.org/pdf/2312.01632v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04089v2","updated":"2024-01-21T12:32:04Z","published":"2023-09-08T02:58:17Z","title":"Toward Sufficient Spatial-Frequency Interaction for Gradient-aware\n Underwater Image Enhancement","summary":" Underwater images suffer from complex and diverse degradation, which\ninevitably affects the performance of underwater visual tasks. However, most\nexisting learning-based Underwater image enhancement (UIE) methods mainly\nrestore such degradations in the spatial domain, and rarely pay attention to\nthe fourier frequency information. In this paper, we develop a novel UIE\nframework based on spatial-frequency interaction and gradient maps, namely\nSFGNet, which consists of two stages. Specifically, in the first stage, we\npropose a dense spatial-frequency fusion network (DSFFNet), mainly including\nour designed dense fourier fusion block and dense spatial fusion block,\nachieving sufficient spatial-frequency interaction by cross connections between\nthese two blocks. 
In the second stage, we propose a gradient-aware corrector\n(GAC) to further enhance perceptual details and geometric structures of images\nby gradient map. Experimental results on two real-world underwater image\ndatasets show that our approach can successfully enhance underwater images, and\nachieves competitive performance in visual quality improvement. The code is\navailable at https://github.com/zhihefang/SFGNet.\n","authors":["Chen Zhao","Weiling Cai","Chenyu Dong","Ziqi Zeng"],"pdf_url":"https://arxiv.org/pdf/2309.04089v2.pdf","comment":"Accepted by ICASSP 2024"},{"id":"http://arxiv.org/abs/2401.11470v1","updated":"2024-01-21T11:55:42Z","published":"2024-01-21T11:55:42Z","title":"Exploring Missing Modality in Multimodal Egocentric Datasets","summary":" Multimodal video understanding is crucial for analyzing egocentric videos,\nwhere integrating multiple sensory signals significantly enhances action\nrecognition and moment localization. However, practical applications often\ngrapple with incomplete modalities due to factors like privacy concerns,\nefficiency demands, or hardware malfunctions. Addressing this, our study delves\ninto the impact of missing modalities on egocentric action recognition,\nparticularly within transformer-based models. We introduce a novel concept\n-Missing Modality Token (MMT)-to maintain performance even when modalities are\nabsent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and\nEpic-Sounds datasets. Our method mitigates the performance loss, reducing it\nfrom its original $\\sim 30\\%$ drop to only $\\sim 10\\%$ when half of the test\nset is modal-incomplete. Through extensive experimentation, we demonstrate the\nadaptability of MMT to different training scenarios and its superiority in\nhandling missing modalities compared to current methods. Our research\ncontributes a comprehensive analysis and an innovative approach, opening\navenues for more resilient multimodal systems in real-world settings.\n","authors":["Merey Ramazanova","Alejandro Pardo","Humam Alwassel","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2401.11470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14835v2","updated":"2024-01-21T11:13:31Z","published":"2022-11-27T14:18:40Z","title":"CLID: Controlled-Length Image Descriptions with Limited Data","summary":" Controllable image captioning models generate human-like image descriptions,\nenabling some kind of control over the generated captions. This paper focuses\non controlling the caption length, i.e. a short and concise description or a\nlong and detailed one. Since existing image captioning datasets contain mostly\nshort captions, generating long captions is challenging. To address the\nshortage of long training examples, we propose to enrich the dataset with\nvarying-length self-generated captions. These, however, might be of varying\nquality and are thus unsuitable for conventional training. We introduce a novel\ntraining strategy that selects the data points to be used at different times\nduring the training. Our method dramatically improves the length-control\nabilities, while exhibiting SoTA performance in terms of caption quality. 
Our\napproach is general and is shown to be applicable also to paragraph generation.\n","authors":["Elad Hirsch","Ayellet Tal"],"pdf_url":"https://arxiv.org/pdf/2211.14835v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11464v1","updated":"2024-01-21T11:12:00Z","published":"2024-01-21T11:12:00Z","title":"Task-specific regularization loss towards model calibration for reliable\n lung cancer detection","summary":" Lung cancer is one of the significant causes of cancer-related deaths\nglobally. Early detection and treatment improve the chances of survival.\nTraditionally CT scans have been used to extract the most significant lung\ninfection information and diagnose cancer. This process is carried out manually\nby an expert radiologist. The imbalance in the radiologists-to-population ratio\nin a country like India implies significant work pressure on them and thus\nraises the need to automate a few of their responsibilities. The tendency of\nmodern-day Deep Neural networks to make overconfident mistakes limit their\nusage to detect cancer. In this paper, we propose a new task-specific loss\nfunction to calibrate the neural network to reduce the risk of overconfident\nmistakes. We use the state-of-the-art Multi-class Difference in Confidence and\nAccuracy (MDCA) loss in conjunction with the proposed task-specific loss\nfunction to achieve the same. We also integrate post-hoc calibration by\nperforming temperature scaling on top of the train-time calibrated model. We\ndemonstrate 5.98% improvement in the Expected Calibration Error (ECE) and a\n17.9% improvement in Maximum Calibration Error (MCE) as compared to the\nbest-performing SOTA algorithm.\n","authors":["Mehar Prateek Kalra","Mansi Singhal","Rohan Raju Dhanakashirur"],"pdf_url":"https://arxiv.org/pdf/2401.11464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2106.04852v2","updated":"2024-01-21T10:54:55Z","published":"2021-06-09T07:20:54Z","title":"Deep Tiny Network for Recognition-Oriented Face Image Quality Assessment","summary":" Face recognition has made significant progress in recent years due to deep\nconvolutional neural networks (CNN). In many face recognition (FR) scenarios,\nface images are acquired from a sequence with huge intra-variations. These\nintra-variations, which are mainly affected by the low-quality face images,\ncause instability of recognition performance. Previous works have focused on\nad-hoc methods to select frames from a video or use face image quality\nassessment (FIQA) methods, which consider only a particular or combination of\nseveral distortions.\n In this work, we present an efficient non-reference image quality assessment\nfor FR that directly links image quality assessment (IQA) and FR. More\nspecifically, we propose a new measurement to evaluate image quality without\nany reference. Based on the proposed quality measurement, we propose a deep\nTiny Face Quality network (tinyFQnet) to learn a quality prediction function\nfrom data.\n We evaluate the proposed method for different powerful FR models on two\nclassical video-based (or template-based) benchmark: IJB-B and YTF. 
Extensive\nexperiments show that, although the tinyFQnet is much smaller than the others,\nthe proposed method outperforms state-of-the-art quality assessment methods in\nterms of effectiveness and efficiency.\n","authors":["Baoyun Peng","Min Liu","Zhaoning Zhang","Kai Xu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2106.04852v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11453v1","updated":"2024-01-21T10:20:46Z","published":"2024-01-21T10:20:46Z","title":"Inter-Domain Mixup for Semi-Supervised Domain Adaptation","summary":" Semi-supervised domain adaptation (SSDA) aims to bridge source and target\ndomain distributions, with a small number of target labels available, achieving\nbetter classification performance than unsupervised domain adaptation (UDA).\nHowever, existing SSDA work fails to make full use of label information from\nboth source and target domains for feature alignment across domains, resulting\nin label mismatch in the label space during model testing. This paper presents\na novel SSDA approach, Inter-domain Mixup with Neighborhood Expansion (IDMNE),\nto tackle this issue. Firstly, we introduce a cross-domain feature alignment\nstrategy, Inter-domain Mixup, that incorporates label information into model\nadaptation. Specifically, we employ sample-level and manifold-level data mixing\nto generate compatible training samples. These newly established samples,\ncombined with reliable and actual label information, display diversity and\ncompatibility across domains, while such extra supervision thus facilitates\ncross-domain feature alignment and mitigates label mismatch. Additionally, we\nutilize Neighborhood Expansion to leverage high-confidence pseudo-labeled\nsamples in the target domain, diversifying the label information of the target\ndomain and thereby further increasing the performance of the adaptation model.\nAccordingly, the proposed approach outperforms existing state-of-the-art\nmethods, achieving significant accuracy improvements on popular SSDA\nbenchmarks, including DomainNet, Office-Home, and Office-31.\n","authors":["Jichang Li","Guanbin Li","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11453v1.pdf","comment":"Publisted to Elsevier PR2024, available at\n https://www.sciencedirect.com/science/article/pii/S0031320323007203?via%3Dihub"},{"id":"http://arxiv.org/abs/2401.11448v1","updated":"2024-01-21T09:57:56Z","published":"2024-01-21T09:57:56Z","title":"Adaptive Betweenness Clustering for Semi-Supervised Domain Adaptation","summary":" Compared to unsupervised domain adaptation, semi-supervised domain adaptation\n(SSDA) aims to significantly improve the classification performance and\ngeneralization capability of the model by leveraging the presence of a small\namount of labeled data from the target domain. Several SSDA approaches have\nbeen developed to enable semantic-aligned feature confusion between labeled (or\npseudo labeled) samples across domains; nevertheless, owing to the scarcity of\nsemantic label information of the target domain, they were arduous to fully\nrealize their potential. In this study, we propose a novel SSDA approach named\nGraph-based Adaptive Betweenness Clustering (G-ABC) for achieving categorical\ndomain alignment, which enables cross-domain semantic alignment by mandating\nsemantic transfer from labeled data of both the source and target domains to\nunlabeled target samples. 
In particular, a heterogeneous graph is initially\nconstructed to reflect the pairwise relationships between labeled samples from\nboth domains and unlabeled ones of the target domain. Then, to degrade the\nnoisy connectivity in the graph, connectivity refinement is conducted by\nintroducing two strategies, namely Confidence Uncertainty based Node Removal\nand Prediction Dissimilarity based Edge Pruning. Once the graph has been\nrefined, Adaptive Betweenness Clustering is introduced to facilitate semantic\ntransfer by using across-domain betweenness clustering and within-domain\nbetweenness clustering, thereby propagating semantic label information from\nlabeled samples across domains to unlabeled target data. Extensive experiments\non three standard benchmark datasets, namely DomainNet, Office-Home, and\nOffice-31, indicated that our method outperforms previous state-of-the-art SSDA\napproaches, demonstrating the superiority of the proposed G-ABC algorithm.\n","authors":["Jichang Li","Guanbin Li","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11448v1.pdf","comment":"16 pages, 9 figures, published to IEEE TIP"},{"id":"http://arxiv.org/abs/2401.11439v1","updated":"2024-01-21T09:39:11Z","published":"2024-01-21T09:39:11Z","title":"General Flow as Foundation Affordance for Scalable Robot Learning","summary":" We address the challenge of acquiring real-world manipulation skills with a\nscalable framework.Inspired by the success of large-scale auto-regressive\nprediction in Large Language Models (LLMs), we hold the belief that identifying\nan appropriate prediction target capable of leveraging large-scale datasets is\ncrucial for achieving efficient and universal learning. Therefore, we propose\nto utilize flow, which represents the future trajectories of 3D points on\nobjects of interest, as an ideal prediction target in robot learning. To\nexploit scalable data resources, we turn our attention to cross-embodiment\ndatasets. We develop, for the first time, a language-conditioned prediction\nmodel directly from large-scale RGBD human video datasets. Our predicted flow\noffers actionable geometric and physics guidance, thus facilitating stable\nzero-shot skill transfer in real-world scenarios.We deploy our method with a\npolicy based on closed-loop flow prediction. Remarkably, without any additional\ntraining, our method achieves an impressive 81% success rate in human-to-robot\nskill transfer, covering 18 tasks in 6 scenes. Our framework features the\nfollowing benefits: (1) scalability: leveraging cross-embodiment data\nresources; (2) universality: multiple object categories, including rigid,\narticulated, and soft bodies; (3) stable skill transfer: providing actionable\nguidance with a small inference domain-gap. These lead to a new pathway towards\nscalable general robot learning. Data, code, and model weights will be made\npublicly available.\n","authors":["Chengbo Yuan","Chuan Wen","Tong Zhang","Yang Gao"],"pdf_url":"https://arxiv.org/pdf/2401.11439v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11436v1","updated":"2024-01-21T09:16:29Z","published":"2024-01-21T09:16:29Z","title":"Geometric Prior Guided Feature Representation Learning for Long-Tailed\n Classification","summary":" Real-world data are long-tailed, the lack of tail samples leads to a\nsignificant limitation in the generalization ability of the model. 
Although\nnumerous approaches of class re-balancing perform well for moderate class\nimbalance problems, additional knowledge needs to be introduced to help the\ntail class recover the underlying true distribution when the observed\ndistribution from a few tail samples does not represent its true distribution\nproperly, thus allowing the model to learn valuable information outside the\nobserved domain. In this work, we propose to leverage the geometric information\nof the feature distribution of the well-represented head class to guide the\nmodel to learn the underlying distribution of the tail class. Specifically, we\nfirst systematically define the geometry of the feature distribution and the\nsimilarity measures between the geometries, and discover four phenomena\nregarding the relationship between the geometries of different feature\ndistributions. Then, based on four phenomena, feature uncertainty\nrepresentation is proposed to perturb the tail features by utilizing the\ngeometry of the head class feature distribution. It aims to make the perturbed\nfeatures cover the underlying distribution of the tail class as much as\npossible, thus improving the model's generalization performance in the test\ndomain. Finally, we design a three-stage training scheme enabling feature\nuncertainty modeling to be successfully applied. Experiments on\nCIFAR-10/100-LT, ImageNet-LT, and iNaturalist2018 show that our proposed\napproach outperforms other similar methods on most metrics. In addition, the\nexperimental phenomena we discovered are able to provide new perspectives and\ntheoretical foundations for subsequent studies.\n","authors":["Yanbiao Ma","Licheng Jiao","Fang Liu","Shuyuan Yang","Xu Liu","Puhua Chen"],"pdf_url":"https://arxiv.org/pdf/2401.11436v1.pdf","comment":"This work was accepted by the IJCV"},{"id":"http://arxiv.org/abs/2401.09496v2","updated":"2024-01-21T08:51:37Z","published":"2024-01-17T01:37:17Z","title":"Learning to Generalize over Subpartitions for Heterogeneity-aware Domain\n Adaptive Nuclei Segmentation","summary":" Annotation scarcity and cross-modality/stain data distribution shifts are two\nmajor obstacles hindering the application of deep learning models for nuclei\nanalysis, which holds a broad spectrum of potential applications in digital\npathology. Recently, unsupervised domain adaptation (UDA) methods have been\nproposed to mitigate the distributional gap between different imaging\nmodalities for unsupervised nuclei segmentation in histopathology images.\nHowever, existing UDA methods are built upon the assumption that data\ndistributions within each domain should be uniform. Based on the\nover-simplified supposition, they propose to align the histopathology target\ndomain with the source domain integrally, neglecting severe intra-domain\ndiscrepancy over subpartitions incurred by mixed cancer types and sampling\norgans. In this paper, for the first time, we propose to explicitly consider\nthe heterogeneity within the histopathology domain and introduce open compound\ndomain adaptation (OCDA) to resolve the crux. In specific, a two-stage\ndisentanglement framework is proposed to acquire domain-invariant feature\nrepresentations at both image and instance levels. The holistic design\naddresses the limitations of existing OCDA approaches which struggle to capture\ninstance-wise variations. 
Two regularization strategies are specifically\ndevised herein to leverage the rich subpartition-specific characteristics in\nhistopathology images and facilitate subdomain decomposition. Moreover, we\npropose a dual-branch nucleus shape and structure preserving module to prevent\nnucleus over-generation and deformation in the synthesized images. Experimental\nresults on both cross-modality and cross-stain scenarios over a broad range of\ndiverse datasets demonstrate the superiority of our method compared with\nstate-of-the-art UDA and OCDA methods.\n","authors":["Jianan Fan","Dongnan Liu","Hang Chang","Weidong Cai"],"pdf_url":"https://arxiv.org/pdf/2401.09496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11430v1","updated":"2024-01-21T08:35:25Z","published":"2024-01-21T08:35:25Z","title":"Exploring Diffusion Time-steps for Unsupervised Representation Learning","summary":" Representation learning is all about discovering the hidden modular\nattributes that generate the data faithfully. We explore the potential of\nDenoising Diffusion Probabilistic Model (DM) in unsupervised learning of the\nmodular attributes. We build a theoretical framework that connects the\ndiffusion time-steps and the hidden attributes, which serves as an effective\ninductive bias for unsupervised learning. Specifically, the forward diffusion\nprocess incrementally adds Gaussian noise to samples at each time-step, which\nessentially collapses different samples into similar ones by losing attributes,\ne.g., fine-grained attributes such as texture are lost with less noise added\n(i.e., early time-steps), while coarse-grained ones such as shape are lost by\nadding more noise (i.e., late time-steps). To disentangle the modular\nattributes, at each time-step t, we learn a t-specific feature to compensate\nfor the newly lost attribute, and the set of all 1,...,t-specific features,\ncorresponding to the cumulative set of lost attributes, are trained to make up\nfor the reconstruction error of a pre-trained DM at time-step t. On CelebA,\nFFHQ, and Bedroom datasets, the learned feature significantly improves\nattribute classification and enables faithful counterfactual generation, e.g.,\ninterpolating only one specified attribute between two images, validating the\ndisentanglement quality. Codes are in https://github.com/yue-zhongqi/diti.\n","authors":["Zhongqi Yue","Jiankun Wang","Qianru Sun","Lei Ji","Eric I-Chao Chang","Hanwang Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11430v1.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11425v1","updated":"2024-01-21T08:18:45Z","published":"2024-01-21T08:18:45Z","title":"Grayscale Image Colorization with GAN and CycleGAN in Different Image\n Domain","summary":" Automatic colorization of grayscale image has been a challenging task.\nPrevious research have applied supervised methods in conquering this problem [\n1]. In this paper, we reproduces a GAN-based coloring model, and experiments\none of its variant. We also proposed a CycleGAN based model and experiments\nthose methods on various datasets. 
The result shows that the proposed CycleGAN\nmodel does well in human-face coloring and comic coloring, but lack the ability\nto diverse colorization.\n","authors":["Chen Liang","Yunchen Sheng","Yichen Mo"],"pdf_url":"https://arxiv.org/pdf/2401.11425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11421v1","updated":"2024-01-21T07:57:04Z","published":"2024-01-21T07:57:04Z","title":"Enhancing the vision-language foundation model with key semantic\n knowledge-emphasized report refinement","summary":" Recently, vision-language representation learning has made remarkable\nadvancements in building up medical foundation models, holding immense\npotential for transforming the landscape of clinical research and medical care.\nThe underlying hypothesis is that the rich knowledge embedded in radiology\nreports can effectively assist and guide the learning process, reducing the\nneed for additional labels. However, these reports tend to be complex and\nsometimes even consist of redundant descriptions that make the representation\nlearning too challenging to capture the key semantic information. This paper\ndevelops a novel iterative vision-language representation learning framework by\nproposing a key semantic knowledge-emphasized report refinement method.\nParticularly, raw radiology reports are refined to highlight the key\ninformation according to a constructed clinical dictionary and two\nmodel-optimized knowledge-enhancement metrics. The iterative framework is\ndesigned to progressively learn, starting from gaining a general understanding\nof the patient's condition based on raw reports and gradually refines and\nextracts critical information essential to the fine-grained analysis tasks. The\neffectiveness of the proposed framework is validated on various downstream\nmedical image analysis tasks, including disease classification,\nregion-of-interest segmentation, and phrase grounding. Our framework surpasses\nseven state-of-the-art methods in both fine-tuning and zero-shot settings,\ndemonstrating its encouraging potential for different clinical applications.\n","authors":["Cheng Li","Weijian Huang","Hao Yang","Jiarun Liu","Shanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11420v1","updated":"2024-01-21T07:48:39Z","published":"2024-01-21T07:48:39Z","title":"Embedded Hyperspectral Band Selection with Adaptive Optimization for\n Image Semantic Segmentation","summary":" Hyperspectral band selection plays a pivotal role in remote sensing and image\nanalysis, aiming to identify the most informative spectral bands while\nminimizing computational overhead. In this paper, we introduce a pioneering\napproach for hyperspectral band selection that offers an embedded solution,\nmaking it well-suited for resource-constrained or real-time applications. Our\nproposed method, embedded Hyperspectral Band Selection (EHBS), excels in\nselecting the best bands without the need for prior processing, seamlessly\nintegrating with the downstream task model. This is achieved through the\nadaptation of the Stochastic Gates (STG) algorithm, originally designed for\nfeature selection, for hyperspectral band selection in the context of image\nsemantic segmentation and the integration of a dynamic optimizer, DoG, which\nremoves the need for the required tuning the learning rate. 
To assess the\nperformance of our method, we introduce a novel metric for evaluating band\nselection methods across different target numbers of selected bands quantified\nby the Area Under the Curve (AUC). We conduct experiments on two distinct\nsemantic-segmentation hyperspectral benchmark datasets, demonstrating its\nsuperiority in terms of its resulting accuracy and its ease of use compared to\nmany common and state-of-the-art methods. Furthermore, our contributions extend\nbeyond the realm of hyperspectral band selection. The adaptability of our\napproach to other tasks, especially those involving grouped features, opens up\npromising avenues for broader applications within the realm of deep learning,\nsuch as feature selection for feature groups. The demonstrated success on the\ntested datasets and the potential for application to a variety of tasks\nunderscore the value of our method as a substantial addition to the field of\ncomputer vision.\n","authors":["Yaniv Zimmer","Oren Glickman"],"pdf_url":"https://arxiv.org/pdf/2401.11420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17005v3","updated":"2024-01-21T07:36:52Z","published":"2023-11-28T17:59:04Z","title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark","summary":" With the rapid development of Multi-modal Large Language Models (MLLMs), a\nnumber of diagnostic benchmarks have recently emerged to evaluate the\ncomprehension capabilities of these models. However, most benchmarks\npredominantly assess spatial understanding in the static image tasks, while\noverlooking temporal understanding in the dynamic video tasks. To alleviate\nthis issue, we introduce a comprehensive Multi-modal Video understanding\nBenchmark, namely MVBench, which covers 20 challenging video tasks that cannot\nbe effectively solved with a single frame. Specifically, we first introduce a\nnovel static-to-dynamic method to define these temporal-related tasks. By\ntransforming various static tasks into dynamic ones, we enable the systematic\ngeneration of video tasks that require a broad spectrum of temporal skills,\nranging from perception to cognition. Then, guided by the task definition, we\nautomatically convert public video annotations into multiple-choice QA to\nevaluate each task. On one hand, such a distinct paradigm allows us to build\nMVBench efficiently, without much manual intervention. On the other hand, it\nguarantees evaluation fairness with ground-truth video annotations, avoiding\nthe biased scoring of LLMs. Moreover, we further develop a robust video MLLM\nbaseline, i.e., VideoChat2, by progressive multi-modal training with diverse\ninstruction-tuning data. The extensive results on our MVBench reveal that, the\nexisting MLLMs are far from satisfactory in temporal understanding, while our\nVideoChat2 largely surpasses these leading models by over 15% on MVBench. 
All\nmodels and data are available at https://github.com/OpenGVLab/Ask-Anything.\n","authors":["Kunchang Li","Yali Wang","Yinan He","Yizhuo Li","Yi Wang","Yi Liu","Zun Wang","Jilan Xu","Guo Chen","Ping Luo","Limin Wang","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.17005v3.pdf","comment":"18 pages, 7 figures, 19 tables"},{"id":"http://arxiv.org/abs/2401.09671v2","updated":"2024-01-21T07:27:25Z","published":"2024-01-18T01:07:00Z","title":"Towards Identifiable Unsupervised Domain Translation: A Diversified\n Distribution Matching Approach","summary":" Unsupervised domain translation (UDT) aims to find functions that convert\nsamples from one domain (e.g., sketches) to another domain (e.g., photos)\nwithout changing the high-level semantic meaning (also referred to as\n``content''). The translation functions are often sought by probability\ndistribution matching of the transformed source domain and target domain.\nCycleGAN stands as arguably the most representative approach among this line of\nwork. However, it was noticed in the literature that CycleGAN and variants\ncould fail to identify the desired translation functions and produce\ncontent-misaligned translations. This limitation arises due to the presence of\nmultiple translation functions -- referred to as ``measure-preserving\nautomorphism\" (MPA) -- in the solution space of the learning criteria. Despite\nawareness of such identifiability issues, solutions have remained elusive. This\nstudy delves into the core identifiability inquiry and introduces an MPA\nelimination theory. Our analysis shows that MPA is unlikely to exist, if\nmultiple pairs of diverse cross-domain conditional distributions are matched by\nthe learning function. Our theory leads to a UDT learner using distribution\nmatching over auxiliary variable-induced subsets of the domains -- other than\nover the entire data domains as in the classical approaches. The proposed\nframework is the first to rigorously establish translation identifiability\nunder reasonable UDT settings, to our best knowledge. Experiments corroborate\nwith our theoretical claims.\n","authors":["Sagar Shrestha","Xiao Fu"],"pdf_url":"https://arxiv.org/pdf/2401.09671v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11414v1","updated":"2024-01-21T06:47:33Z","published":"2024-01-21T06:47:33Z","title":"S$^3$M-Net: Joint Learning of Semantic Segmentation and Stereo Matching\n for Autonomous Driving","summary":" Semantic segmentation and stereo matching are two essential components of 3D\nenvironmental perception systems for autonomous driving. Nevertheless,\nconventional approaches often address these two problems independently,\nemploying separate models for each task. This approach poses practical\nlimitations in real-world scenarios, particularly when computational resources\nare scarce or real-time performance is imperative. Hence, in this article, we\nintroduce S$^3$M-Net, a novel joint learning framework developed to perform\nsemantic segmentation and stereo matching simultaneously. Specifically,\nS$^3$M-Net shares the features extracted from RGB images between both tasks,\nresulting in an improved overall scene understanding capability. This feature\nsharing process is realized using a feature fusion adaption (FFA) module, which\neffectively transforms the shared features into semantic space and subsequently\nfuses them with the encoded disparity features. 
The entire joint learning\nframework is trained by minimizing a novel semantic consistency-guided (SCG)\nloss, which places emphasis on the structural consistency in both tasks.\nExtensive experimental results conducted on the vKITTI2 and KITTI datasets\ndemonstrate the effectiveness of our proposed joint learning framework and its\nsuperior performance compared to other state-of-the-art single-task networks.\nOur project webpage is accessible at mias.group/S3M-Net.\n","authors":["Zhiyuan Wu","Yi Feng","Chuang-Wei Liu","Fisher Yu","Qijun Chen","Rui Fan"],"pdf_url":"https://arxiv.org/pdf/2401.11414v1.pdf","comment":"accepted to IEEE Trans. on Intelligent Vehicles (T-IV)"},{"id":"http://arxiv.org/abs/2401.11406v1","updated":"2024-01-21T05:50:39Z","published":"2024-01-21T05:50:39Z","title":"Adversarial Augmentation Training Makes Action Recognition Models More\n Robust to Realistic Video Distribution Shifts","summary":" Despite recent advances in video action recognition achieving strong\nperformance on existing benchmarks, these models often lack robustness when\nfaced with natural distribution shifts between training and test data. We\npropose two novel evaluation methods to assess model resilience to such\ndistribution disparity. One method uses two different datasets collected from\ndifferent sources and uses one for training and validation, and the other for\ntesting. More precisely, we created dataset splits of HMDB-51 or UCF-101 for\ntraining, and Kinetics-400 for testing, using the subset of the classes that\nare overlapping in both train and test datasets. The other proposed method\nextracts the feature mean of each class from the target evaluation dataset's\ntraining data (i.e. class prototype) and estimates test video prediction as a\ncosine similarity score between each sample to the class prototypes of each\ntarget class. This procedure does not alter model weights using the target\ndataset and it does not require aligning overlapping classes of two different\ndatasets, thus is a very efficient method to test the model robustness to\ndistribution shifts without prior knowledge of the target distribution. We\naddress the robustness problem by adversarial augmentation training -\ngenerating augmented views of videos that are \"hard\" for the classification\nmodel by applying gradient ascent on the augmentation parameters - as well as\n\"curriculum\" scheduling the strength of the video augmentations. We\nexperimentally demonstrate the superior performance of the proposed adversarial\naugmentation approach over baselines across three state-of-the-art action\nrecognition models - TSM, Video Swin Transformer, and Uniformer. The presented\nwork provides critical insight into model robustness to distribution shifts and\npresents effective techniques to enhance video action recognition performance\nin a real-world deployment.\n","authors":["Kiyoon Kim","Shreyank N Gowda","Panagiotis Eustratiadis","Antreas Antoniou","Robert B Fisher"],"pdf_url":"https://arxiv.org/pdf/2401.11406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11737v2","updated":"2024-01-21T04:55:06Z","published":"2023-08-22T18:57:07Z","title":"Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape","summary":" Accurately estimating the 3D pose and shape is an essential step towards\nunderstanding animal behavior, and can potentially benefit many downstream\napplications, such as wildlife conservation. 
However, research in this area is\nheld back by the lack of a comprehensive and diverse dataset with high-quality\n3D pose and shape annotations. In this paper, we propose Animal3D, the first\ncomprehensive dataset for mammal animal 3D pose and shape estimation. Animal3D\nconsists of 3379 images collected from 40 mammal species, high-quality\nannotations of 26 keypoints, and importantly the pose and shape parameters of\nthe SMAL model. All annotations were labeled and checked manually in a\nmulti-stage process to ensure highest quality results. Based on the Animal3D\ndataset, we benchmark representative shape and pose estimation models at: (1)\nsupervised learning from only the Animal3D data, (2) synthetic to real transfer\nfrom synthetically generated images, and (3) fine-tuning human pose and shape\nestimation models. Our experimental results demonstrate that predicting the 3D\nshape and pose of animals across species remains a very challenging task,\ndespite significant advances in human pose estimation. Our results further\ndemonstrate that synthetic pre-training is a viable strategy to boost the model\nperformance. Overall, Animal3D opens new directions for facilitating future\nresearch in animal 3D pose and shape estimation, and is publicly available.\n","authors":["Jiacong Xu","Yi Zhang","Jiawei Peng","Wufei Ma","Artur Jesslen","Pengliang Ji","Qixin Hu","Jiehua Zhang","Qihao Liu","Jiahao Wang","Wei Ji","Chen Wang","Xiaoding Yuan","Prakhar Kaushik","Guofeng Zhang","Jie Liu","Yushan Xie","Yawen Cui","Alan Yuille","Adam Kortylewski"],"pdf_url":"https://arxiv.org/pdf/2308.11737v2.pdf","comment":"11 pages, 5 figures, link to the dataset:\n https://xujiacong.github.io/Animal3D/"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.11632v1","updated":"2024-01-21T23:56:57Z","published":"2024-01-21T23:56:57Z","title":"What Are We Optimizing For? A Human-centric Evaluation Of Deep\n Learning-based Recommender Systems","summary":" Deep learning-based (DL) models in recommender systems (RecSys) have gained\nsignificant recognition for their remarkable accuracy in predicting user\npreferences. However, their performance often lacks a comprehensive evaluation\nfrom a human-centric perspective, which encompasses various dimensions beyond\nsimple interest matching. In this work, we have developed a robust\nhuman-centric evaluation framework that incorporates seven diverse metrics to\nassess the quality of recommendations generated by five recent open-sourced DL\nmodels. Our evaluation datasets consist of both offline benchmark data and\npersonalized online recommendation feedback collected from 445 real users. We\nfind that (1) different DL models have different pros and cons in the\nmulti-dimensional metrics that we test with; (2) users generally want a\ncombination of accuracy with at least one another human values in the\nrecommendation; (3) the degree of combination of different values needs to be\ncarefully experimented to user preferred level.\n","authors":["Ruixuan Sun","Avinash Akella","Xinyi Wu","Ruoyan Kong","Joseph A. 
Konstan"],"pdf_url":"https://arxiv.org/pdf/2401.11632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11624v1","updated":"2024-01-21T23:34:42Z","published":"2024-01-21T23:34:42Z","title":"In-context Learning with Retrieved Demonstrations for Language Models: A\n Survey","summary":" Language models, especially pre-trained large language models, have showcased\nremarkable abilities as few-shot in-context learners (ICL), adept at adapting\nto new tasks with just a few demonstrations in the input context. However, the\nmodel's ability to perform ICL is sensitive to the choice of the few-shot\ndemonstrations. Instead of using a fixed set of demonstrations, one recent\ndevelopment is to retrieve demonstrations tailored to each input query. The\nimplementation of demonstration retrieval is relatively straightforward,\nleveraging existing databases and retrieval systems. This not only improves the\nefficiency and scalability of the learning process but also has been shown to\nreduce biases inherent in manual example selection. In light of the encouraging\nresults and growing research in ICL with retrieved demonstrations, we conduct\nan extensive review of studies in this area. In this survey, we discuss and\ncompare different design choices for retrieval models, retrieval training\nprocedures, and inference algorithms.\n","authors":["an Luo","Xin Xu","Yue Liu","Panupong Pasupat","Mehran Kazemi"],"pdf_url":"https://arxiv.org/pdf/2401.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11509v1","updated":"2024-01-21T14:35:54Z","published":"2024-01-21T14:35:54Z","title":"Simple Domain Adaptation for Sparse Retrievers","summary":" In Information Retrieval, and more generally in Natural Language Processing,\nadapting models to specific domains is conducted through fine-tuning. Despite\nthe successes achieved by this method and its versatility, the need for\nhuman-curated and labeled data makes it impractical to transfer to new tasks,\ndomains, and/or languages when training data doesn't exist. Using the model\nwithout training (zero-shot) is another option that however suffers an\neffectiveness cost, especially in the case of first-stage retrievers. Numerous\nresearch directions have emerged to tackle these issues, most of them in the\ncontext of adapting to a task or a language. However, the literature is scarcer\nfor domain (or topic) adaptation. In this paper, we address this issue of\ncross-topic discrepancy for a sparse first-stage retriever by transposing a\nmethod initially designed for language adaptation. By leveraging pre-training\non the target data to learn domain-specific knowledge, this technique\nalleviates the need for annotated data and expands the scope of domain\nadaptation. Despite their relatively good generalization ability, we show that\neven sparse retrievers can benefit from our simple domain adaptation method.\n","authors":["Mathias Vast","Yuxuan Zong","Basile Van Cooten","Benjamin Piwowarski","Laure Soulier"],"pdf_url":"https://arxiv.org/pdf/2401.11509v1.pdf","comment":"Accepted at ECIR 2024"},{"id":"http://arxiv.org/abs/2401.11506v1","updated":"2024-01-21T14:33:52Z","published":"2024-01-21T14:33:52Z","title":"Enhancing Recommendation Diversity by Re-ranking with Large Language\n Models","summary":" It has long been recognized that it is not enough for a Recommender System\n(RS) to provide recommendations based only on their relevance to users. 
Among\nmany other criteria, the set of recommendations may need to be diverse in order\nto handle uncertainty and offer a meaningful choice. The literature reports\nmany ways of measuring diversity and ways of improving the diversity of a set\nof recommendations, most notably by re-ranking and selecting from a larger set\nof candidate recommendations. Driven by promising insights from the literature\non how to incorporate versatile Large Language Models (LLMs) into the RS\npipeline, in this paper, we show how LLMs can be used for diversity re-ranking.\n We begin with an informal study that verifies that LLMs can be used for\nre-ranking tasks and do have some understanding of the concept of diversity.\nThen, we design a more rigorous methodology where LLMs are prompted to generate\na diverse ranking from a candidate ranking using various prompt templates with\ndifferent re-ranking instructions in a zero-shot fashion. We conduct\ncomprehensive experiments testing state-of-the-art conversational LLMs from the\nGPT and Llama families. We compare their re-ranking capabilities with random\nre-ranking and various traditional re-ranking methods from the literature (MMR,\nxQuAD and RxQuAD). We find that LLM-based re-ranking outperforms random\nre-ranking across all the metrics that we use but does not perform as well as\nthe traditional re-ranking methods. We gain insight into prompt design for this\ntask (e.g.\\ on the whole, it is better to prompt for diversity rather than a\nbalance of diversity and relevance). Given that no special knowledge\nengineering is needed, we conclude that LLM-based re-ranking is a promising\napproach, and we highlight directions for future research. We open-source the\ncode of our experiments for reproducibility.\n","authors":["Diego Carraro","Derek Bridge"],"pdf_url":"https://arxiv.org/pdf/2401.11506v1.pdf","comment":"32 pages, 2 figures"},{"id":"http://arxiv.org/abs/2401.11505v1","updated":"2024-01-21T14:30:20Z","published":"2024-01-21T14:30:20Z","title":"CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray\n Report Labeling","summary":" Free-text radiology reports present a rich data source for various medical\ntasks, but effectively labeling these texts remains challenging. Traditional\nrule-based labeling methods fall short of capturing the nuances of diverse\nfree-text patterns. Moreover, models using expert-annotated data are limited by\ndata scarcity and pre-defined classes, impacting their performance, flexibility\nand scalability. To address these issues, our study offers three main\ncontributions: 1) We demonstrate the potential of GPT as an adept labeler using\ncarefully designed prompts. 2) Utilizing only the data labeled by GPT, we\ntrained a BERT-based labeler, CheX-GPT, which operates faster and more\nefficiently than its GPT counterpart. 3) To benchmark labeler performance, we\nintroduced a publicly available expert-annotated test set, MIMIC-500,\ncomprising 500 cases from the MIMIC validation set. Our findings demonstrate\nthat CheX-GPT not only excels in labeling accuracy over existing models, but\nalso showcases superior efficiency, flexibility, and scalability, supported by\nour introduction of the MIMIC-500 dataset for robust benchmarking. 
Code and\nmodels are available at https://github.com/kakaobrain/CheXGPT.\n","authors":["Jawook Gu","Han-Cheol Cho","Jiho Kim","Kihyun You","Eun Kyoung Hong","Byungseok Roh"],"pdf_url":"https://arxiv.org/pdf/2401.11505v1.pdf","comment":"16 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.11478v1","updated":"2024-01-21T12:51:28Z","published":"2024-01-21T12:51:28Z","title":"D2K: Turning Historical Data into Retrievable Knowledge for Recommender\n Systems","summary":" A vast amount of user behavior data is constantly accumulating on today's\nlarge recommendation platforms, recording users' various interests and tastes.\nPreserving knowledge from the old data while new data continually arrives is a\nvital problem for recommender systems. Existing approaches generally seek to\nsave the knowledge implicitly in the model parameters. However, such a\nparameter-centric approach lacks scalability and flexibility -- the capacity is\nhard to scale, and the knowledge is inflexible to utilize. Hence, in this work,\nwe propose a framework that turns massive user behavior data to retrievable\nknowledge (D2K). It is a data-centric approach that is model-agnostic and easy\nto scale up. Different from only storing unary knowledge such as the user-side\nor item-side information, D2K propose to store ternary knowledge for\nrecommendation, which is determined by the complete recommendation factors --\nuser, item, and context. The knowledge retrieved by target samples can be\ndirectly used to enhance the performance of any recommendation algorithms.\nSpecifically, we introduce a Transformer-based knowledge encoder to transform\nthe old data into knowledge with the user-item-context cross features. A\npersonalized knowledge adaptation unit is devised to effectively exploit the\ninformation from the knowledge base by adapting the retrieved knowledge to the\ntarget samples. Extensive experiments on two public datasets show that D2K\nsignificantly outperforms existing baselines and is compatible with a major\ncollection of recommendation algorithms.\n","authors":["Jiarui Qin","Weiwen Liu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2401.11478v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2401.11463v1","updated":"2024-01-21T11:04:30Z","published":"2024-01-21T11:04:30Z","title":"Estimating the Usefulness of Clarifying Questions and Answers for\n Conversational Search","summary":" While the body of research directed towards constructing and generating\nclarifying questions in mixed-initiative conversational search systems is vast,\nresearch aimed at processing and comprehending users' answers to such questions\nis scarce. To this end, we present a simple yet effective method for processing\nanswers to clarifying questions, moving away from previous work that simply\nappends answers to the original query and thus potentially degrades retrieval\nperformance. Specifically, we propose a classifier for assessing usefulness of\nthe prompted clarifying question and an answer given by the user. Useful\nquestions or answers are further appended to the conversation history and\npassed to a transformer-based query rewriting module. 
Results demonstrate\nsignificant improvements over strong non-mixed-initiative baselines.\nFurthermore, the proposed approach mitigates the performance drops when non\nuseful questions and answers are utilized.\n","authors":["Ivan Sekulić","Weronika Łajewska","Krisztian Balog","Fabio Crestani"],"pdf_url":"https://arxiv.org/pdf/2401.11463v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11452v1","updated":"2024-01-21T10:15:36Z","published":"2024-01-21T10:15:36Z","title":"Towards Reliable and Factual Response Generation: Detecting Unanswerable\n Questions in Information-Seeking Conversations","summary":" Generative AI models face the challenge of hallucinations that can undermine\nusers' trust in such systems. We approach the problem of conversational\ninformation seeking as a two-step process, where relevant passages in a corpus\nare identified first and then summarized into a final system response. This way\nwe can automatically assess if the answer to the user's question is present in\nthe corpus. Specifically, our proposed method employs a sentence-level\nclassifier to detect if the answer is present, then aggregates these\npredictions on the passage level, and eventually across the top-ranked passages\nto arrive at a final answerability estimate. For training and evaluation, we\ndevelop a dataset based on the TREC CAsT benchmark that includes answerability\nlabels on the sentence, passage, and ranking levels. We demonstrate that our\nproposed method represents a strong baseline and outperforms a state-of-the-art\nLLM on the answerability prediction task.\n","authors":["Weronika Łajewska","Krisztian Balog"],"pdf_url":"https://arxiv.org/pdf/2401.11452v1.pdf","comment":"This is the author's version of the work. The definitive version is\n published in: Proceedings of the 46th European Conference on Information\n Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland"},{"id":"http://arxiv.org/abs/2401.11441v1","updated":"2024-01-21T09:42:24Z","published":"2024-01-21T09:42:24Z","title":"On-Device Recommender Systems: A Comprehensive Survey","summary":" Recommender systems have been widely deployed in various real-world\napplications to help users identify content of interest from massive amounts of\ninformation. Traditional recommender systems work by collecting user-item\ninteraction data in a cloud-based data center and training a centralized model\nto perform the recommendation service. However, such cloud-based recommender\nsystems (CloudRSs) inevitably suffer from excessive resource consumption,\nresponse latency, as well as privacy and security risks concerning both data\nand models. Recently, driven by the advances in storage, communication, and\ncomputation capabilities of edge devices, there has been a shift of focus from\nCloudRSs to on-device recommender systems (DeviceRSs), which leverage the\ncapabilities of edge devices to minimize centralized data storage requirements,\nreduce the response latency caused by communication overheads, and enhance user\nprivacy and security by localizing data processing and model training. 
Despite\nthe rapid rise of DeviceRSs, there is a clear absence of timely literature\nreviews that systematically introduce, categorize and contrast these methods.\nTo bridge this gap, we aim to provide a comprehensive survey of DeviceRSs,\ncovering three main aspects: (1) the deployment and inference of DeviceRSs (2)\nthe training and update of DeviceRSs (3) the security and privacy of DeviceRSs.\nFurthermore, we provide a fine-grained and systematic taxonomy of the methods\ninvolved in each aspect, followed by a discussion regarding challenges and\nfuture research directions. This is the first comprehensive survey on DeviceRSs\nthat covers a spectrum of tasks to fit various needs. We believe this survey\nwill help readers effectively grasp the current research status in this field,\nequip them with relevant technical foundations, and stimulate new research\nideas for developing DeviceRSs.\n","authors":["Hongzhi Yin","Liang Qu","Tong Chen","Wei Yuan","Ruiqi Zheng","Jing Long","Xin Xia","Yuhui Shi","Chengqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2401.11441v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2401.11632v1","updated":"2024-01-21T23:56:57Z","published":"2024-01-21T23:56:57Z","title":"What Are We Optimizing For? A Human-centric Evaluation Of Deep\n Learning-based Recommender Systems","summary":" Deep learning-based (DL) models in recommender systems (RecSys) have gained\nsignificant recognition for their remarkable accuracy in predicting user\npreferences. However, their performance often lacks a comprehensive evaluation\nfrom a human-centric perspective, which encompasses various dimensions beyond\nsimple interest matching. In this work, we have developed a robust\nhuman-centric evaluation framework that incorporates seven diverse metrics to\nassess the quality of recommendations generated by five recent open-sourced DL\nmodels. Our evaluation datasets consist of both offline benchmark data and\npersonalized online recommendation feedback collected from 445 real users. We\nfind that (1) different DL models have different pros and cons in the\nmulti-dimensional metrics that we test with; (2) users generally want a\ncombination of accuracy with at least one another human values in the\nrecommendation; (3) the degree of combination of different values needs to be\ncarefully experimented to user preferred level.\n","authors":["Ruixuan Sun","Avinash Akella","Xinyi Wu","Ruoyan Kong","Joseph A. Konstan"],"pdf_url":"https://arxiv.org/pdf/2401.11632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11631v1","updated":"2024-01-21T23:54:05Z","published":"2024-01-21T23:54:05Z","title":"Text-to-Image Cross-Modal Generation: A Systematic Review","summary":" We review research on generating visual data from text from the angle of\n\"cross-modal generation.\" This point of view allows us to draw parallels\nbetween various methods geared towards working on input text and producing\nvisual output, without limiting the analysis to narrow sub-areas. It also\nresults in the identification of common templates in the field, which are then\ncompared and contrasted both within pools of similar methods and across lines\nof research. We provide a breakdown of text-to-image generation into various\nflavors of image-from-text methods, video-from-text methods, image editing,\nself-supervised and graph-based approaches. 
In this discussion, we focus on\nresearch papers published at 8 leading machine learning conferences in the\nyears 2016-2022, also incorporating a number of relevant papers not matching\nthe outlined search criteria. The conducted review suggests a significant\nincrease in the number of papers published in the area and highlights research\ngaps and potential lines of investigation. To our knowledge, this is the first\nreview to systematically look at text-to-image generation from the perspective\nof \"cross-modal generation.\"\n","authors":["Maciej Żelaszczyk","Jacek Mańdziuk"],"pdf_url":"https://arxiv.org/pdf/2401.11631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11630v1","updated":"2024-01-21T23:50:46Z","published":"2024-01-21T23:50:46Z","title":"Reframing Offline Reinforcement Learning as a Regression Problem","summary":" The study proposes the reformulation of offline reinforcement learning as a\nregression problem that can be solved with decision trees. Aiming to predict\nactions based on input states, return-to-go (RTG), and timestep information, we\nobserve that with gradient-boosted trees, the agent training and inference are\nvery fast, the former taking less than a minute. Despite the simplification\ninherent in this reformulated problem, our agent demonstrates performance that\nis at least on par with established methods. This assertion is validated by\ntesting it across standard datasets associated with D4RL Gym-MuJoCo tasks. We\nfurther discuss the agent's ability to generalize by testing it on two extreme\ncases, how it learns to model the return distributions effectively even with\nhighly skewed expert datasets, and how it exhibits robust performance in\nscenarios with sparse/delayed rewards.\n","authors":["Prajwal Koirala","Cody Fleming"],"pdf_url":"https://arxiv.org/pdf/2401.11630v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11627v1","updated":"2024-01-21T23:41:32Z","published":"2024-01-21T23:41:32Z","title":"Tight Verification of Probabilistic Robustness in Bayesian Neural\n Networks","summary":" We introduce two algorithms for computing tight guarantees on the\nprobabilistic robustness of Bayesian Neural Networks (BNNs). Computing\nrobustness guarantees for BNNs is a significantly more challenging task than\nverifying the robustness of standard Neural Networks (NNs) because it requires\nsearching the parameters' space for safe weights. Moreover, tight and complete\napproaches for the verification of standard NNs, such as those based on\nMixed-Integer Linear Programming (MILP), cannot be directly used for the\nverification of BNNs because of the polynomial terms resulting from the\nconsecutive multiplication of variables encoding the weights. Our algorithms\nefficiently and effectively search the parameters' space for safe weights by\nusing iterative expansion and the network's gradient and can be used with any\nverification algorithm of choice for BNNs. 
In addition to proving that our\nalgorithms compute tighter bounds than the SoA, we also evaluate our algorithms\nagainst the SoA on standard benchmarks, such as MNIST and CIFAR10, showing that\nour algorithms compute bounds up to 40% tighter than the SoA.\n","authors":["Ben Batten","Mehran Hosseini","Alessio Lomuscio"],"pdf_url":"https://arxiv.org/pdf/2401.11627v1.pdf","comment":"Accepted at AISTATS 2024"},{"id":"http://arxiv.org/abs/2401.11626v1","updated":"2024-01-21T23:37:33Z","published":"2024-01-21T23:37:33Z","title":"Freely Long-Thinking Transformer (FraiLT)","summary":" Freely Long-Thinking Transformer (FraiLT) is an improved transformer model\ndesigned to enhance processing capabilities without scaling up size. It\nutilizes a recursive approach, iterating over a subset of layers multiple\ntimes, and introduces iteration encodings to maintain awareness across these\ncycles. Iteration encoding allows FraiLT to achieve the interpretive depth of\nlarger models in a compact form. When evaluated on a synthetic story dataset,\nFraiLT outperformed larger models, showcasing its ability to deliver\nhigh-quality performance while reducing memory demands. This model represents a\nstep forward towards more efficient and accessible language models.\n","authors":["Akbay Tabak"],"pdf_url":"https://arxiv.org/pdf/2401.11626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11618v1","updated":"2024-01-21T22:55:26Z","published":"2024-01-21T22:55:26Z","title":"Efficient local linearity regularization to overcome catastrophic\n overfitting","summary":" Catastrophic overfitting (CO) in single-step adversarial training (AT)\nresults in abrupt drops in the adversarial test accuracy (even down to 0%). For\nmodels trained with multi-step AT, it has been observed that the loss function\nbehaves locally linearly with respect to the input, this is however lost in\nsingle-step AT. To address CO in single-step AT, several methods have been\nproposed to enforce local linearity of the loss via regularization. However,\nthese regularization terms considerably slow down training due to Double\nBackpropagation. Instead, in this work, we introduce a regularization term,\ncalled ELLE, to mitigate CO effectively and efficiently in classical AT\nevaluations, as well as some more difficult regimes, e.g., large adversarial\nperturbations and long training schedules. Our regularization term can be\ntheoretically linked to curvature of the loss function and is computationally\ncheaper than previous methods by avoiding Double Backpropagation. Our thorough\nexperimental validation demonstrates that our work does not suffer from CO,\neven in challenging settings where previous works suffer from it. We also\nnotice that adapting our regularization parameter during training (ELLE-A)\ngreatly improves the performance, specially in large $\\epsilon$ setups. Our\nimplementation is available in https://github.com/LIONS-EPFL/ELLE .\n","authors":["Elias Abad Rocamora","Fanghui Liu","Grigorios G. Chrysos","Pablo M. Olmos","Volkan Cevher"],"pdf_url":"https://arxiv.org/pdf/2401.11618v1.pdf","comment":"Accepted in ICLR 2024"},{"id":"http://arxiv.org/abs/2310.19491v2","updated":"2024-01-21T22:35:34Z","published":"2023-10-30T12:28:53Z","title":"Generator Identification for Linear SDEs with Additive and\n Multiplicative Noise","summary":" In this paper, we present conditions for identifying the generator of a\nlinear stochastic differential equation (SDE) from the distribution of its\nsolution process with a given fixed initial state. 
These identifiability\nconditions are crucial in causal inference using linear SDEs as they enable the\nidentification of the post-intervention distributions from its observational\ndistribution. Specifically, we derive a sufficient and necessary condition for\nidentifying the generator of linear SDEs with additive noise, as well as a\nsufficient condition for identifying the generator of linear SDEs with\nmultiplicative noise. We show that the conditions derived for both types of\nSDEs are generic. Moreover, we offer geometric interpretations of the derived\nidentifiability conditions to enhance their understanding. To validate our\ntheoretical results, we perform a series of simulations, which support and\nsubstantiate the established findings.\n","authors":["Yuanyuan Wang","Xi Geng","Wei Huang","Biwei Huang","Mingming Gong"],"pdf_url":"https://arxiv.org/pdf/2310.19491v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11611v1","updated":"2024-01-21T22:18:29Z","published":"2024-01-21T22:18:29Z","title":"Continuous Field Reconstruction from Sparse Observations with Implicit\n Neural Networks","summary":" Reliably reconstructing physical fields from sparse sensor data is a\nchallenge that frequently arises in many scientific domains. In practice, the\nprocess generating the data often is not understood to sufficient accuracy.\nTherefore, there is a growing interest in using the deep neural network route\nto address the problem. This work presents a novel approach that learns a\ncontinuous representation of the physical field using implicit neural\nrepresentations (INRs). Specifically, after factorizing spatiotemporal\nvariability into spatial and temporal components using the separation of\nvariables technique, the method learns relevant basis functions from sparsely\nsampled irregular data points to develop a continuous representation of the\ndata. In experimental evaluations, the proposed model outperforms recent INR\nmethods, offering superior reconstruction quality on simulation data from a\nstate-of-the-art climate model and a second dataset that comprises ultra-high\nresolution satellite-based sea surface temperature fields.\n","authors":["Xihaier Luo","Wei Xu","Yihui Ren","Shinjae Yoo","Balu Nadiga"],"pdf_url":"https://arxiv.org/pdf/2401.11611v1.pdf","comment":"25 pages,21 figures"},{"id":"http://arxiv.org/abs/2401.11609v1","updated":"2024-01-21T22:11:29Z","published":"2024-01-21T22:11:29Z","title":"Graph Edits for Counterfactual Explanations: A Unified GNN Approach","summary":" Counterfactuals have been established as a popular explainability technique\nwhich leverages a set of minimal edits to alter the prediction of a classifier.\nWhen considering conceptual counterfactuals, the edits requested should\ncorrespond to salient concepts present in the input data. At the same time,\nconceptual distances are defined by knowledge graphs, ensuring the optimality\nof conceptual edits. 
In this work, we extend previous endeavors on conceptual\ncounterfactuals by introducing \\textit{graph edits as counterfactual\nexplanations}: should we represent input data as graphs, which is the shortest\ngraph edit path that results in an alternative classification label as provided\nby a black-box classifier?\n","authors":["Nikolaos Chaidos","Angeliki Dimitriou","Maria Lymperaiou","Giorgos Stamou"],"pdf_url":"https://arxiv.org/pdf/2401.11609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.07364v2","updated":"2024-01-21T22:08:20Z","published":"2024-01-14T20:41:36Z","title":"PDE Generalization of In-Context Operator Networks: A Study on 1D Scalar\n Nonlinear Conservation Laws","summary":" Can we build a single large model for a wide range of PDE-related scientific\nlearning tasks? Can this model generalize to new PDEs, even of new forms,\nwithout any fine-tuning? In-context operator learning and the corresponding\nmodel In-Context Operator Networks (ICON) represent an initial exploration of\nthese questions. The capability of ICON regarding the first question has been\ndemonstrated previously. In this paper, we present a detailed methodology for\nsolving PDE problems with ICON, and show how a single ICON model can make\nforward and reverse predictions for different equations with different strides,\nprovided with appropriately designed data prompts. We show the positive\nevidence to the second question, i.e., ICON can generalize well to some PDEs\nwith new forms without any fine-tuning. This is exemplified through a study on\n1D scalar nonlinear conservation laws, a family of PDEs with temporal\nevolution. We also show how to broaden the range of problems that an ICON model\ncan address, by transforming functions and equations to ICON's capability\nscope. We believe that the progress in this paper is a significant step towards\nthe goal of training a foundation model for PDE-related tasks under the\nin-context operator learning framework.\n","authors":["Liu Yang","Stanley J. Osher"],"pdf_url":"https://arxiv.org/pdf/2401.07364v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11608v1","updated":"2024-01-21T22:01:34Z","published":"2024-01-21T22:01:34Z","title":"$\\texttt{immrax}$: A Parallelizable and Differentiable Toolbox for\n Interval Analysis and Mixed Monotone Reachability in JAX","summary":" We present an implementation of interval analysis and mixed monotone interval\nreachability analysis as function transforms in Python, fully composable with\nthe computational framework JAX. The resulting toolbox inherits several key\nfeatures from JAX, including computational efficiency through Just-In-Time\nCompilation, GPU acceleration for quick parallelized computations, and\nAutomatic Differentiability. We demonstrate the toolbox's performance on\nseveral case studies, including a reachability problem on a vehicle model\ncontrolled by a neural network, and a robust closed-loop optimal control\nproblem for a swinging pendulum.\n","authors":["Akash Harapanahalli","Saber Jafarpour","Samuel Coogan"],"pdf_url":"https://arxiv.org/pdf/2401.11608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11605v1","updated":"2024-01-21T21:49:49Z","published":"2024-01-21T21:49:49Z","title":"Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass\n Diffusion Transformers","summary":" We present the Hourglass Diffusion Transformer (HDiT), an image generative\nmodel that exhibits linear scaling with pixel count, supporting training at\nhigh-resolution (e.g. 
$1024 \\times 1024$) directly in pixel-space. Building on\nthe Transformer architecture, which is known to scale to billions of\nparameters, it bridges the gap between the efficiency of convolutional U-Nets\nand the scalability of Transformers. HDiT trains successfully without typical\nhigh-resolution training techniques such as multiscale architectures, latent\nautoencoders or self-conditioning. We demonstrate that HDiT performs\ncompetitively with existing models on ImageNet $256^2$, and sets a new\nstate-of-the-art for diffusion models on FFHQ-$1024^2$.\n","authors":["Katherine Crowson","Stefan Andreas Baumann","Alex Birch","Tanishq Mathew Abraham","Daniel Z. Kaplan","Enrico Shippole"],"pdf_url":"https://arxiv.org/pdf/2401.11605v1.pdf","comment":"20 pages, 13 figures, project page and code available at\n https://crowsonkb.github.io/hourglass-diffusion-transformers/"},{"id":"http://arxiv.org/abs/2312.02063v2","updated":"2024-01-21T21:41:32Z","published":"2023-12-04T17:19:37Z","title":"The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet\n Transits","summary":" This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase\nFolding and Convolutional Neural Network (CNN) system to detect exoplanets\nusing the transit method. We devise a fast folding algorithm parallelized on a\nGPU to amplify low signal-to-noise ratio transit signals, allowing a search at\nhigh precision and speed. A CNN trained on two million synthetic light curves\nreports a score indicating the likelihood of a planetary signal at each period.\nWhile the GPFC method has broad applicability across period ranges, this\nresearch specifically focuses on detecting ultra-short-period planets with\norbital periods less than one day. GPFC improves on speed by three orders of\nmagnitude over the predominant Box-fitting Least Squares (BLS) method. Our\nsimulation results show GPFC achieves $97%$ training accuracy, higher true\npositive rate at the same false positive rate of detection, and higher\nprecision at the same recall rate when compared to BLS. GPFC recovers $100\\%$\nof known ultra-short-period planets in $\\textit{Kepler}$ light curves from a\nblind search. These results highlight the promise of GPFC as an alternative\napproach to the traditional BLS algorithm for finding new transiting exoplanets\nin data taken with $\\textit{Kepler}$ and other space transit missions such as\nK2, TESS and future PLATO and Earth 2.0.\n","authors":["Kaitlyn Wang","Jian Ge","Kevin Willis","Kevin Wang","Yinan Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.02063v2.pdf","comment":"16 pages, 19 figures; Accepted for publication in the peer-reviewed\n journal, Monthly Notices of the Royal Astronomical Society (MNRAS), on\n January 20, 2024"}]},"2024-01-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2305.14189v3","updated":"2024-01-20T22:29:15Z","published":"2023-05-23T16:11:00Z","title":"Beyond Shared Vocabulary: Increasing Representational Word Similarities\n across Languages for Multilingual Machine Translation","summary":" Using a vocabulary that is shared across languages is common practice in\nMultilingual Neural Machine Translation (MNMT). In addition to its simple\ndesign, shared tokens play an important role in positive knowledge transfer,\nassuming that shared tokens refer to similar meanings across languages.\nHowever, when word overlap is small, especially due to different writing\nsystems, transfer is inhibited. 
In this paper, we define word-level information\ntransfer pathways via word equivalence classes and rely on graph networks to\nfuse word embeddings across languages. Our experiments demonstrate the\nadvantages of our approach: 1) embeddings of words with similar meanings are\nbetter aligned across languages, 2) our method achieves consistent BLEU\nimprovements of up to 2.3 points for high- and low-resource MNMT, and 3) less\nthan 1.0\\% additional trainable parameters are required with a limited increase\nin computational costs, while inference time remains identical to the baseline.\nWe release the codebase to the community.\n","authors":["Di Wu","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2305.14189v3.pdf","comment":"15 pages, 3 figures"},{"id":"http://arxiv.org/abs/2401.07510v3","updated":"2024-01-20T22:08:18Z","published":"2024-01-15T07:21:16Z","title":"Developing ChatGPT for Biology and Medicine: A Complete Review of\n Biomedical Question Answering","summary":" ChatGPT explores a strategic blueprint of question answering (QA) in\ndelivering medical diagnosis, treatment recommendations, and other healthcare\nsupport. This is achieved through the increasing incorporation of medical\ndomain data via natural language processing (NLP) and multimodal paradigms. By\ntransitioning the distribution of text, images, videos, and other modalities\nfrom the general domain to the medical domain, these techniques have expedited\nthe progress of medical domain question answering (MDQA). They bridge the gap\nbetween human natural language and sophisticated medical domain knowledge or\nexpert manual annotations, handling large-scale, diverse, unbalanced, or even\nunlabeled data analysis scenarios in medical contexts. Central to our focus is\nthe utilizing of language models and multimodal paradigms for medical question\nanswering, aiming to guide the research community in selecting appropriate\nmechanisms for their specific medical research requirements. Specialized tasks\nsuch as unimodal-related question answering, reading comprehension, reasoning,\ndiagnosis, relation extraction, probability modeling, and others, as well as\nmultimodal-related tasks like vision question answering, image caption,\ncross-modal retrieval, report summarization, and generation, are discussed in\ndetail. Each section delves into the intricate specifics of the respective\nmethod under consideration. This paper highlights the structures and\nadvancements of medical domain explorations against general domain methods,\nemphasizing their applications across different tasks and datasets. It also\noutlines current challenges and opportunities for future medical domain\nresearch, paving the way for continued innovation and application in this\nrapidly evolving field.\n","authors":["Qing Li","Lei Li","Yu Li"],"pdf_url":"https://arxiv.org/pdf/2401.07510v3.pdf","comment":"50 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.02317v3","updated":"2024-01-20T21:16:09Z","published":"2023-12-04T19:58:07Z","title":"GNN2R: Weakly-Supervised Rationale-Providing Question Answering over\n Knowledge Graphs","summary":" Most current methods for multi-hop question answering (QA) over knowledge\ngraphs (KGs) only provide final conclusive answers without explanations, such\nas a set of KG entities that is difficult for normal users to review and\ncomprehend. This issue severely limits the application of KG-based QA in\nreal-world scenarios. 
However, it is non-trivial to solve due to two\nchallenges: First, annotations of reasoning chains of multi-hop questions,\nwhich could serve as supervision for explanation generation, are usually\nlacking. Second, it is difficult to maintain high efficiency when explicit KG\ntriples need to be retrieved to generate explanations. In this paper, we\npropose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to\nsolve this issue. GNN2R can provide both final answers and reasoning subgraphs\nas a rationale behind final answers efficiently with only weak supervision that\nis available through question-final answer pairs. We extensively evaluated\nGNN2R with detailed analyses in experiments. The results demonstrate that, in\nterms of effectiveness, efficiency, and quality of generated explanations,\nGNN2R outperforms existing state-of-the-art methods that are applicable to this\ntask. Our code and pre-trained models are available at\nhttps://github.com/ruijie-wang-uzh/GNN2R.\n","authors":["Ruijie Wang","Luca Rossetto","Michael Cochez","Abraham Bernstein"],"pdf_url":"https://arxiv.org/pdf/2312.02317v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11323v1","updated":"2024-01-20T20:55:21Z","published":"2024-01-20T20:55:21Z","title":"Analyzing Task-Encoding Tokens in Large Language Models","summary":" In-context learning (ICL) has become an effective solution for few-shot\nlearning in natural language processing. Past work has found that, during this\nprocess, representations of the last prompt token are utilized to store task\nreasoning procedures, thereby explaining the working mechanism of in-context\nlearning. In this paper, we seek to locate and analyze other task-encoding\ntokens whose representations store task reasoning procedures. Supported by\nexperiments that ablate the representations of different token types, we find\nthat template and stopword tokens are the most prone to be task-encoding\ntokens. In addition, we demonstrate experimentally that lexical cues,\nrepetition, and text formats are the main distinguishing characteristics of\nthese tokens. Our work provides additional insights into how large language\nmodels (LLMs) leverage task reasoning procedures in ICL and suggests that\nfuture work may involve using task-encoding tokens to improve the computational\nefficiency of LLMs at inference time and their ability to handle long\nsequences.\n","authors":["Yu Bai","Heyan Huang","Cesare Spinoso-Di Piano","Marc-Antoine Rondeau","Sanxing Chen","Yang Gao","Jackie Chi Kit Cheung"],"pdf_url":"https://arxiv.org/pdf/2401.11323v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.11316v1","updated":"2024-01-20T20:25:17Z","published":"2024-01-20T20:25:17Z","title":"PRILoRA: Pruned and Rank-Increasing Low-Rank Adaptation","summary":" With the proliferation of large pre-trained language models (PLMs),\nfine-tuning all model parameters becomes increasingly inefficient, particularly\nwhen dealing with numerous downstream tasks that entail substantial training\nand storage costs. Several approaches aimed at achieving parameter-efficient\nfine-tuning (PEFT) have been proposed. Among them, Low-Rank Adaptation (LoRA)\nstands out as an archetypal method, incorporating trainable rank decomposition\nmatrices into each target module. Nevertheless, LoRA does not consider the\nvarying importance of each layer. 
To address these challenges, we introduce\nPRILoRA, which linearly allocates a different rank for each layer, in an\nincreasing manner, and performs pruning throughout the training process,\nconsidering both the temporary magnitude of weights and the accumulated\nstatistics of the input to any given layer. We validate the effectiveness of\nPRILoRA through extensive experiments on eight GLUE benchmarks, setting a new\nstate of the art.\n","authors":["Nadav Benedek","Lior Wolf"],"pdf_url":"https://arxiv.org/pdf/2401.11316v1.pdf","comment":"EACL 2024"},{"id":"http://arxiv.org/abs/2401.11305v1","updated":"2024-01-20T19:32:56Z","published":"2024-01-20T19:32:56Z","title":"Progress in Privacy Protection: A Review of Privacy Preserving\n Techniques in Recommender Systems, Edge Computing, and Cloud Computing","summary":" As digital technology evolves, the increasing use of connected devices brings\nboth challenges and opportunities in the areas of mobile crowdsourcing, edge\ncomputing, and recommender systems. This survey focuses on these dynamic\nfields, emphasizing the critical need for privacy protection in our\nincreasingly data-oriented world. It explores the latest trends in these\ninterconnected areas, with a special emphasis on privacy and data security. Our\nmethod involves an in-depth analysis of various academic works, which helps us\nto gain a comprehensive understanding of these sectors and their shifting focus\ntowards privacy concerns. We present new insights and mark a significant\nadvancement in addressing privacy issues within these technologies. The survey\nis a valuable resource for researchers, industry practitioners, and policy\nmakers, offering an extensive overview of these fields and their related\nprivacy challenges, catering to a wide audience in the modern digital era.\n","authors":["Syed Raza Bashir","Shaina Raza","Vojislav Misic"],"pdf_url":"https://arxiv.org/pdf/2401.11305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04925v3","updated":"2024-01-20T17:23:31Z","published":"2024-01-10T04:37:38Z","title":"The Impact of Reasoning Step Length on Large Language Models","summary":" Chain of Thought (CoT) is significant in improving the reasoning abilities of\nlarge language models (LLMs). However, the correlation between the\neffectiveness of CoT and the length of reasoning steps in prompts remains\nlargely unknown. To shed light on this, we have conducted several empirical\nexperiments to explore the relations. Specifically, we design experiments that\nexpand and compress the rationale reasoning steps within CoT demonstrations,\nwhile keeping all other factors constant. We have the following key findings.\nFirst, the results indicate that lengthening the reasoning steps in prompts,\neven without adding new information into the prompt, considerably enhances\nLLMs' reasoning abilities across multiple datasets. Alternatively, shortening\nthe reasoning steps, even while preserving the key information, significantly\ndiminishes the reasoning abilities of models. This finding highlights the\nimportance of the number of steps in CoT prompts and provides practical\nguidance to make better use of LLMs' potential in complex problem-solving\nscenarios. Second, we also investigated the relationship between the\nperformance of CoT and the rationales used in demonstrations. Surprisingly, the\nresult shows that even incorrect rationales can yield favorable outcomes if\nthey maintain the requisite length of inference. 
Third, we observed that the\nadvantages of increasing reasoning steps are task-dependent: simpler tasks\nrequire fewer steps, whereas complex tasks gain significantly from longer\ninference sequences.\n","authors":["Mingyu Jin","Qinkai Yu","Dong Shu","Haiyan Zhao","Wenyue Hua","Yanda Meng","Yongfeng Zhang","Mengnan Du"],"pdf_url":"https://arxiv.org/pdf/2401.04925v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11268v1","updated":"2024-01-20T16:48:55Z","published":"2024-01-20T16:48:55Z","title":"Word-Level ASR Quality Estimation for Efficient Corpus Sampling and\n Post-Editing through Analyzing Attentions of a Reference-Free Metric","summary":" In the realm of automatic speech recognition (ASR), the quest for models that\nnot only perform with high accuracy but also offer transparency in their\ndecision-making processes is crucial. The potential of quality estimation (QE)\nmetrics is introduced and evaluated as a novel tool to enhance explainable\nartificial intelligence (XAI) in ASR systems. Through experiments and analyses,\nthe capabilities of the NoRefER (No Reference Error Rate) metric are explored\nin identifying word-level errors to aid post-editors in refining ASR\nhypotheses. The investigation also extends to the utility of NoRefER in the\ncorpus-building process, demonstrating its effectiveness in augmenting datasets\nwith insightful annotations. The diagnostic aspects of NoRefER are examined,\nrevealing its ability to provide valuable insights into model behaviors and\ndecision patterns. This has proven beneficial for prioritizing hypotheses in\npost-editing workflows and fine-tuning ASR models. The findings suggest that\nNoRefER is not merely a tool for error detection but also a comprehensive\nframework for enhancing ASR systems' transparency, efficiency, and\neffectiveness. To ensure the reproducibility of the results, all source codes\nof this study are made publicly available.\n","authors":["Golara Javadi","Kamer Ali Yuksel","Yunsu Kim","Thiago Castro Ferreira","Mohamed Al-Badrashiny"],"pdf_url":"https://arxiv.org/pdf/2401.11268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11248v1","updated":"2024-01-20T15:02:33Z","published":"2024-01-20T15:02:33Z","title":"Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense\n Passage Retrieval","summary":" Masked auto-encoder pre-training has emerged as a prevalent technique for\ninitializing and enhancing dense retrieval systems. It generally utilizes\nadditional Transformer decoder blocks to provide sustainable supervision\nsignals and compress contextual information into dense representations.\nHowever, the underlying reasons for the effectiveness of such a pre-training\ntechnique remain unclear. The usage of additional Transformer-based decoders\nalso incurs significant computational costs. In this study, we aim to shed\nlight on this issue by revealing that masked auto-encoder (MAE) pre-training\nwith enhanced decoding significantly improves the term coverage of input tokens\nin dense representations, compared to vanilla BERT checkpoints. Building upon\nthis observation, we propose a modification to the traditional MAE by replacing\nthe decoder of a masked auto-encoder with a completely simplified Bag-of-Word\nprediction task. 
This modification enables the efficient compression of lexical\nsignals into dense representations through unsupervised pre-training.\nRemarkably, our proposed method achieves state-of-the-art retrieval performance\non several large-scale retrieval benchmarks without requiring any additional\nparameters, which provides a 67% training speed-up compared to standard masked\nauto-encoder pre-training with enhanced decoding.\n","authors":["Guangyuan Ma","Xing Wu","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2401.11248v1.pdf","comment":"Working in progress. Our code will be available at\n https://github.com/ma787639046/bowdpr"},{"id":"http://arxiv.org/abs/2312.03122v3","updated":"2024-01-20T15:02:20Z","published":"2023-12-05T20:41:34Z","title":"Assertion Enhanced Few-Shot Learning: Instructive Technique for Large\n Language Models to Generate Educational Explanations","summary":" Human educators possess an intrinsic ability to anticipate and seek\neducational explanations from students, which drives them to pose\nthought-provoking questions when students cannot articulate these explanations\nindependently. We aim to imbue Intelligent Tutoring Systems with this ability\nusing few-shot learning capability of Large Language Models. Our work proposes\na novel prompting technique, Assertion Enhanced Few-Shot Learning, to\nfacilitate the generation of accurate, detailed oriented educational\nexplanations. Our central hypothesis is that, in educational domain, few-shot\ndemonstrations are necessary but not a sufficient condition for quality\nexplanation generation. We conducted a study involving 12 in-service teachers,\ncomparing our approach to Traditional Few-Shot Learning. The results show that\nAssertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and\nyields higher-quality explanations, as evaluated by teachers. We also conduct a\nqualitative ablation study to factor the impact of assertions to provide\neducator-friendly prompting guidelines for generating explanations in their\ndomain of interest.\n","authors":["Tasmia Shahriar","Kelly Ramos","Noboru Matsuda"],"pdf_url":"https://arxiv.org/pdf/2312.03122v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.09333v2","updated":"2024-01-20T15:01:01Z","published":"2024-01-17T16:57:18Z","title":"Machines Do See Color: A Guideline to Classify Different Forms of Racist\n Discourse in Large Corpora","summary":" Current methods to identify and classify racist language in text rely on\nsmall-n qualitative approaches or large-n approaches focusing exclusively on\novert forms of racist discourse. This article provides a step-by-step\ngeneralizable guideline to identify and classify different forms of racist\ndiscourse in large corpora. In our approach, we start by conceptualizing racism\nand its different manifestations. We then contextualize these racist\nmanifestations to the time and place of interest, which allows researchers to\nidentify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a\ncross-lingual model for supervised text classification with a cutting-edge\ncontextual understanding of text. We show that XLM-R and XLM-R-Racismo, our\npretrained model, outperform other state-of-the-art approaches in classifying\nracism in large corpora. 
We illustrate our approach using a corpus of tweets\nrelating to the Ecuadorian ind\\'igena community between 2018 and 2021.\n","authors":["Diana Davila Gordillo","Joan Timoneda","Sebastian Vallejo Vera"],"pdf_url":"https://arxiv.org/pdf/2401.09333v2.pdf","comment":"37 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2401.11246v1","updated":"2024-01-20T14:59:43Z","published":"2024-01-20T14:59:43Z","title":"Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented\n Generation in Niche Domains, Exemplified by Korean Medicine","summary":" We propose a natural language prompt-based retrieval augmented generation\n(Prompt-RAG), a novel approach to enhance the performance of generative large\nlanguage models (LLMs) in niche domains. Conventional RAG methods mostly\nrequire vector embeddings, yet the suitability of generic LLM-based embedding\nrepresentations for specialized domains remains uncertain. To explore and\nexemplify this point, we compared vector embeddings from Korean Medicine (KM)\nand Conventional Medicine (CM) documents, finding that KM document embeddings\ncorrelated more with token overlaps and less with human-assessed document\nrelatedness, in contrast to CM embeddings. Prompt-RAG, distinct from\nconventional RAG models, operates without the need for embedding vectors. Its\nperformance was assessed through a Question-Answering (QA) chatbot application,\nwhere responses were evaluated for relevance, readability, and informativeness.\nThe results showed that Prompt-RAG outperformed existing models, including\nChatGPT and conventional vector embedding-based RAGs, in terms of relevance and\ninformativeness. Despite challenges like content structuring and response\nlatency, the advancements in LLMs are expected to encourage the use of\nPrompt-RAG, making it a promising tool for other domains in need of RAG\nmethods.\n","authors":["Bongsu Kang","Jundong Kim","Tae-Rim Yun","Chang-Eop Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11246v1.pdf","comment":"26 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.16326v2","updated":"2024-01-20T14:33:54Z","published":"2023-05-10T13:40:06Z","title":"Large language models in biomedical natural language processing:\n benchmarks, baselines, and recommendations","summary":" Biomedical literature is growing rapidly, making it challenging to curate and\nextract knowledge manually. Biomedical natural language processing (BioNLP)\ntechniques that can automatically extract information from biomedical\nliterature help alleviate this burden. Recently, large Language Models (LLMs),\nsuch as GPT-3 and GPT-4, have gained significant attention for their impressive\nperformance. However, their effectiveness in BioNLP tasks and impact on method\ndevelopment and downstream users remain understudied. This pilot study (1)\nestablishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and\none-shot settings in eight BioNLP datasets across four applications: named\nentity recognition, relation extraction, multi-label document classification,\nand semantic similarity and reasoning, (2) examines the errors produced by the\nLLMs and categorized the errors into three types: missingness, inconsistencies,\nand unwanted artificial content, and (3) provides suggestions for using LLMs in\nBioNLP applications. 
We make the datasets, baselines, and results publicly\navailable to the community via\nhttps://github.com/qingyu-qc/gpt_bionlp_benchmark.\n","authors":["Qingyu Chen","Jingcheng Du","Yan Hu","Vipina Kuttichi Keloth","Xueqing Peng","Kalpana Raja","Rui Zhang","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.17080v2","updated":"2024-01-20T14:08:16Z","published":"2023-12-28T15:49:43Z","title":"MR-GSM8K: A Meta-Reasoning Revolution in Large Language Model Evaluation","summary":" In this work, we introduce a novel evaluation paradigm for Large Language\nModels, one that challenges them to engage in meta-reasoning. This approach\naddresses critical shortcomings in existing math problem-solving benchmarks,\ntraditionally used to evaluate the cognitive capabilities of agents. Our\nparadigm shifts the focus from result-oriented assessments, which often\noverlook the reasoning process, to a more holistic evaluation that effectively\ndifferentiates the cognitive capabilities among models. For example, in our\nbenchmark, GPT-4 demonstrates a performance five times better than GPT3-5. The\nsignificance of this new paradigm lies in its ability to reveal potential\ncognitive deficiencies in LLMs that current benchmarks, such as GSM8K, fail to\nuncover due to their saturation and lack of effective differentiation among\nvarying reasoning abilities. Our comprehensive analysis includes several\nstate-of-the-art math models from both open-source and closed-source\ncommunities, uncovering fundamental deficiencies in their training and\nevaluation approaches. This paper not only advocates for a paradigm shift in\nthe assessment of LLMs but also contributes to the ongoing discourse on the\ntrajectory towards Artificial General Intelligence (AGI). By promoting the\nadoption of meta-reasoning evaluation methods similar to ours, we aim to\nfacilitate a more accurate assessment of the true cognitive abilities of LLMs.\n","authors":["Zhongshen Zeng","Pengguang Chen","Shu Liu","Haiyun Jiang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.17080v2.pdf","comment":"Code: https://github.com/dvlab-research/MR-GSM8K"},{"id":"http://arxiv.org/abs/2401.05949v3","updated":"2024-01-20T13:46:33Z","published":"2024-01-11T14:38:19Z","title":"Universal Vulnerabilities in Large Language Models: In-context Learning\n Backdoor Attacks","summary":" In-context learning, a paradigm bridging the gap between pre-training and\nfine-tuning, has demonstrated high efficacy in several NLP tasks, especially in\nfew-shot settings. Unlike traditional fine-tuning methods, in-context learning\nadapts pre-trained models to unseen tasks without updating any parameters.\nDespite being widely applied, in-context learning is vulnerable to malicious\nattacks. In this work, we raise security concerns regarding this paradigm. Our\nstudies demonstrate that an attacker can manipulate the behavior of large\nlanguage models by poisoning the demonstration context, without the need for\nfine-tuning the model. Specifically, we have designed a new backdoor attack\nmethod, named ICLAttack, to target large language models based on in-context\nlearning. Our method encompasses two types of attacks: poisoning demonstration\nexamples and poisoning prompts, which can make models behave in accordance with\npredefined intentions. ICLAttack does not require additional fine-tuning to\nimplant a backdoor, thus preserving the model's generality. 
Furthermore, the\npoisoned examples are correctly labeled, enhancing the natural stealth of our\nattack method. Extensive experimental results across several language models,\nranging in size from 1.3B to 40B parameters, demonstrate the effectiveness of\nour attack method, exemplified by a high average attack success rate of 95.0%\nacross the three datasets on OPT models. Our findings highlight the\nvulnerabilities of language models, and we hope this work will raise awareness\nof the possible security threats associated with in-context learning.\n","authors":["Shuai Zhao","Meihuizi Jia","Luu Anh Tuan","Jinming Wen"],"pdf_url":"https://arxiv.org/pdf/2401.05949v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.04620v3","updated":"2024-01-20T13:04:29Z","published":"2024-01-09T15:44:44Z","title":"Agent Alignment in Evolving Social Norms","summary":" Agents based on Large Language Models (LLMs) are increasingly permeating\nvarious domains of human production and life, highlighting the importance of\naligning them with human values. The current alignment of AI systems primarily\nfocuses on passively aligning LLMs through human intervention. However, agents\npossess characteristics like receiving environmental feedback and\nself-evolution, rendering the LLM alignment methods inadequate. In response, we\npropose an evolutionary framework for agent evolution and alignment, named\nEvolutionaryAgent, which transforms agent alignment into a process of evolution\nand selection under the principle of survival of the fittest. In an environment\nwhere social norms continuously evolve, agents better adapted to the current\nsocial norms will have a higher probability of survival and proliferation,\nwhile those inadequately aligned dwindle over time. Experimental results\nassessing the agents from multiple perspectives in aligning with social norms\ndemonstrate that EvolutionaryAgent can align progressively better with the\nevolving social norms while maintaining its proficiency in general tasks.\nEffectiveness tests conducted on various open and closed-source LLMs as the\nfoundation for agents also prove the applicability of our approach.\n","authors":["Shimin Li","Tianxiang Sun","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.04620v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2401.09003v2","updated":"2024-01-20T12:43:37Z","published":"2024-01-17T06:48:16Z","title":"Augmenting Math Word Problems via Iterative Question Composing","summary":" Despite recent progress in improving the mathematical reasoning ability of\nlarge language models(LLMs), solving competition-level math problems without\nthe use of external tools remains challenging for open-source LLMs. In this\nwork, we introduce the MMIQC dataset, a mixture of processed web data and\nsynthetic question-response pairs, to equip base models with better\nmathematical reasoning skills. In different model sizes, the models fine-tuned\non MMIQC consistently outperform their counterparts by a clear margin on MATH\ntest set. Notably, DeepSeek-67B-MMIQC achieves a 41.0% accuracy, 4.2% higher\nthan the previous open-source SOTA. Our experiments also show that a large part\nof the improvement can be attributed to our novel augmentation method\nIQC(Iterative Question Composing), where we iteratively ask an LLM to compose\nnew questions from the given seed problems and do rejection sampling from\nanother LLM. 
MMIQC has now been released on\nhttps://huggingface.co/datasets/Vivacem/MMIQC.\n","authors":["Haoxiong Liu","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2401.09003v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11218v1","updated":"2024-01-20T12:00:40Z","published":"2024-01-20T12:00:40Z","title":"End-to-End Argument Mining over Varying Rhetorical Structures","summary":" Rhetorical Structure Theory implies no single discourse interpretation of a\ntext, and the limitations of RST parsers further exacerbate inconsistent\nparsing of similar structures. Therefore, it is important to take into account\nthat the same argumentative structure can be found in semantically similar\ntexts with varying rhetorical structures. In this work, the differences between\nparaphrases within the same argument scheme are evaluated from a rhetorical\nperspective. The study proposes a deep dependency parsing model to assess the\nconnection between rhetorical and argument structures. The model utilizes\nrhetorical relations; RST structures of paraphrases serve as training data\naugmentations. The method allows for end-to-end argumentation analysis using a\nrhetorical tree instead of a word sequence. It is evaluated on the bilingual\nMicrotexts corpus, and the first results on fully-fledged argument parsing for\nthe Russian version of the corpus are reported. The results suggest that\nargument mining can benefit from multiple variants of discourse structure.\n","authors":["Elena Chistova"],"pdf_url":"https://arxiv.org/pdf/2401.11218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11207v1","updated":"2024-01-20T10:42:15Z","published":"2024-01-20T10:42:15Z","title":"Unfair TOS: An Automated Approach using Customized BERT","summary":" Terms of Service (ToS) form an integral part of any agreement as it defines\nthe legal relationship between a service provider and an end-user. Not only do\nthey establish and delineate reciprocal rights and responsibilities, but they\nalso provide users with information on essential aspects of contracts that\npertain to the use of digital spaces. These aspects include a wide range of\ntopics, including limitation of liability, data protection, etc. Users tend to\naccept the ToS without going through it before using any application or\nservice. Such ignorance puts them in a potentially weaker situation in case any\naction is required. Existing methodologies for the detection or classification\nof unfair clauses are however obsolete and show modest performance. In this\nresearch paper, we present SOTA(State of The Art) results on unfair clause\ndetection from ToS documents based on unprecedented Fine-tuning BERT in\nintegration with SVC(Support Vector Classifier). The study shows proficient\nperformance with a macro F1-score of 0.922 at unfair clause detection, and\nsuperior performance is also shown in the classification of unfair clauses by\neach tag. Further, a comparative analysis is performed by answering research\nquestions on the Transformer models utilized. 
In order to further research and\nexperimentation, the code and results are made available on\nhttps://github.com/batking24/Unfair-TOS-An-Automated-Approach-based-on-Fine-tuning-BERT-in-conjunction-with-ML.\n","authors":["Bathini Sai Akash","Akshara Kupireddy","Lalita Bhanu Murthy"],"pdf_url":"https://arxiv.org/pdf/2401.11207v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11206v1","updated":"2024-01-20T10:41:03Z","published":"2024-01-20T10:41:03Z","title":"InferAligner: Inference-Time Alignment for Harmlessness through\n Cross-Model Guidance","summary":" With the rapid development of large language models (LLMs), they are not only\nused as general-purpose AI assistants but are also customized through further\nfine-tuning to meet the requirements of different applications. A pivotal\nfactor in the success of current LLMs is the alignment process. Current\nalignment methods, such as supervised fine-tuning (SFT) and reinforcement\nlearning from human feedback (RLHF), focus on training-time alignment and are\noften complex and cumbersome to implement. Therefore, we develop\n\textbf{InferAligner}, a novel inference-time alignment method that utilizes\ncross-model guidance for harmlessness alignment. InferAligner utilizes safety\nsteering vectors extracted from a safety-aligned model to modify the activations\nof the target model when responding to harmful inputs, thereby guiding the\ntarget model to provide harmless responses. Experimental results show that our\nmethod can be very effectively applied to domain-specific models in finance,\nmedicine, and mathematics, as well as to multimodal large language models\n(MLLMs) such as LLaVA. It significantly diminishes the Attack Success Rate\n(ASR) of both harmful instructions and jailbreak attacks, while maintaining\nalmost unchanged performance in downstream tasks.\n","authors":["Pengyu Wang","Dong Zhang","Linyang Li","Chenkun Tan","Xinghao Wang","Ke Ren","Botian Jiang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2401.11206v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11185v1","updated":"2024-01-20T09:49:59Z","published":"2024-01-20T09:49:59Z","title":"How the Advent of Ubiquitous Large Language Models both Stymie and\n Turbocharge Dynamic Adversarial Question Generation","summary":" Dynamic adversarial question generation, where humans write examples to stump\na model, aims to create examples that are realistic and informative. However,\nthe advent of large language models (LLMs) has been a double-edged sword for\nhuman authors: more people are interested in seeing and pushing the limits of\nthese models, but because the models are so much stronger an opponent, they are\nharder to defeat. To understand how these models impact the adversarial question\nwriting process, we enrich the writing guidance with LLMs and retrieval models\nfor the authors to reason why their questions are not adversarial. While\nauthors could create interesting, challenging adversarial questions, they\nsometimes resort to tricks that result in poor questions that are ambiguous,\nsubjective, or confusing not just to a computer but also to humans. 
To address\nthese issues, we propose new metrics and incentives for eliciting good,\nchallenging questions and present a new dataset of adversarially authored\nquestions.\n","authors":["Yoo Yeon Sung","Ishani Mondal","Jordan Boyd-Graber"],"pdf_url":"https://arxiv.org/pdf/2401.11185v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04691v4","updated":"2024-01-20T09:36:41Z","published":"2023-10-07T05:37:41Z","title":"EMO: Earth Mover Distance Optimization for Auto-Regressive Language\n Modeling","summary":" Neural language models are probabilistic models of human text. They are\npredominantly trained using maximum likelihood estimation (MLE), which is\nequivalent to minimizing the forward cross-entropy between the empirical data\ndistribution and the model distribution. However, various degeneration\nphenomena are still widely observed when decoding from the distributions\nlearned by such models. We establish that the forward cross-entropy is\nsuboptimal as a distance metric for aligning human and model distribution due\nto its (1) recall-prioritization, (2) negative diversity ignorance, and (3)\ntrain-test mismatch. In this paper, we propose Earth Mover Distance\nOptimization (EMO) for auto-regressive language modeling. EMO capitalizes on\nthe inherent properties of earth mover distance to address the aforementioned\nchallenges. Due to the high complexity of direct computation, we further\nintroduce a feasible upper bound for EMO to ease end-to-end training. Upon\nextensive evaluation of language models trained using EMO and MLE, we find that\nEMO demonstrates a consistently better language modeling performance than MLE\nacross domains. Moreover, EMO demonstrates noteworthy enhancements in\ndownstream performance with minimal fine-tuning on merely 25,000 sentences.\nThis highlights the tremendous potential of EMO as a lightweight calibration\nmethod for enhancing large-scale pre-trained language models.\n","authors":["Siyu Ren","Zhiyong Wu","Kenny Q. Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.04691v4.pdf","comment":"To appear at ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11143v1","updated":"2024-01-20T06:42:32Z","published":"2024-01-20T06:42:32Z","title":"Gaussian Adaptive Attention is All You Need: Robust Contextual\n Representations Across Multiple Modalities","summary":" We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a\nnovel probabilistic attention framework, and the Gaussian Adaptive Transformer\n(GAT), designed to enhance information aggregation across multiple modalities,\nincluding Speech, Text and Vision. GAAM integrates learnable mean and variance\ninto its attention mechanism, implemented in a Multi-Headed framework enabling\nit to collectively model any Probability Distribution for dynamic recalibration\nof feature significance. This method demonstrates significant improvements,\nespecially with highly non-stationary data, surpassing the state-of-the-art\nattention techniques in model performance (up to approximately +20% in\naccuracy) by identifying key elements within the feature space. GAAM's\ncompatibility with dot-product-based attention models and relatively low number\nof parameters showcases its adaptability and potential to boost existing\nattention frameworks. Empirically, GAAM exhibits superior adaptability and\nefficacy across a diverse range of tasks, including emotion recognition in\nspeech, image classification, and text classification, thereby establishing its\nrobustness and versatility in handling multi-modal data. 
Furthermore, we\nintroduce the Importance Factor (IF), a new learning-based metric that enhances\nthe explainability of models trained with GAAM-based methods. Overall, GAAM\nrepresents an advancement towards development of better performing and more\nexplainable attention models across multiple modalities.\n","authors":["Georgios Ioannides","Aman Chadha","Aaron Elkins"],"pdf_url":"https://arxiv.org/pdf/2401.11143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.15407v2","updated":"2024-01-20T06:26:33Z","published":"2023-12-24T04:50:57Z","title":"A Comprehensive Analysis of the Effectiveness of Large Language Models\n as Automatic Dialogue Evaluators","summary":" Automatic evaluation is an integral aspect of dialogue system research. The\ntraditional reference-based NLG metrics are generally found to be unsuitable\nfor dialogue assessment. Consequently, recent studies have suggested various\nunique, reference-free neural metrics that better align with human evaluations.\nNotably among them, large language models (LLMs), particularly the\ninstruction-tuned variants like ChatGPT, are shown to be promising substitutes\nfor human judges. Yet, existing works on utilizing LLMs for automatic dialogue\nevaluation are limited in their scope in terms of the number of meta-evaluation\ndatasets, mode of evaluation, coverage of LLMs, etc. Hence, it remains\ninconclusive how effective these LLMs are. To this end, we conduct a\ncomprehensive study on the application of LLMs for automatic dialogue\nevaluation. Specifically, we analyze the multi-dimensional evaluation\ncapability of 30 recently emerged LLMs at both turn and dialogue levels, using\na comprehensive set of 12 meta-evaluation datasets. Additionally, we probe the\nrobustness of the LLMs in handling various adversarial perturbations at both\nturn and dialogue levels. Finally, we explore how model-level and\ndimension-level ensembles impact the evaluation performance. All resources are\navailable at https://github.com/e0397123/comp-analysis.\n","authors":["Chen Zhang","Luis Fernando D'Haro","Yiming Chen","Malu Zhang","Haizhou Li"],"pdf_url":"https://arxiv.org/pdf/2312.15407v2.pdf","comment":"An extended version of AAAI-2024 camera-ready paper (appendix\n included, 16 pages)"},{"id":"http://arxiv.org/abs/2401.11120v1","updated":"2024-01-20T05:10:46Z","published":"2024-01-20T05:10:46Z","title":"Enhancing Large Language Models for Clinical Decision Support by\n Incorporating Clinical Practice Guidelines","summary":" Background Large Language Models (LLMs), enhanced with Clinical Practice\nGuidelines (CPGs), can significantly improve Clinical Decision Support (CDS).\nHowever, methods for incorporating CPGs into LLMs are not well studied. Methods\nWe develop three distinct methods for incorporating CPGs into LLMs: Binary\nDecision Tree (BDT), Program-Aided Graph Construction (PAGC), and\nChain-of-Thought-Few-Shot Prompting (CoT-FSP). To evaluate the effectiveness of\nthe proposed methods, we create a set of synthetic patient descriptions and\nconduct both automatic and human evaluation of the responses generated by four\nLLMs: GPT-4, GPT-3.5 Turbo, LLaMA, and PaLM 2. Zero-Shot Prompting (ZSP) was\nused as the baseline method. We focus on CDS for COVID-19 outpatient treatment\nas the case study. Results All four LLMs exhibit improved performance when\nenhanced with CPGs compared to the baseline ZSP. BDT outperformed both CoT-FSP\nand PAGC in automatic evaluation. All of the proposed methods demonstrated high\nperformance in human evaluation. 
Conclusion LLMs enhanced with CPGs demonstrate\nsuperior performance, as compared to plain LLMs with ZSP, in providing accurate\nrecommendations for COVID-19 outpatient treatment, which also highlights the\npotential for broader applications beyond the case study.\n","authors":["David Oniani","Xizhi Wu","Shyam Visweswaran","Sumit Kapoor","Shravan Kooragayalu","Katelyn Polanska","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2401.11120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10873v2","updated":"2024-01-20T03:58:10Z","published":"2023-10-16T22:53:54Z","title":"IDEAL: Influence-Driven Selective Annotations Empower In-Context\n Learners in Large Language Models","summary":" In-context learning is a promising paradigm that utilizes in-context examples\nas prompts for the predictions of large language models. These prompts are\ncrucial for achieving strong performance. However, since the prompts need to be\nsampled from a large volume of annotated examples, finding the right prompt may\nresult in high annotation costs. To address this challenge, this paper\nintroduces an influence-driven selective annotation method that aims to\nminimize annotation costs while improving the quality of in-context examples.\nThe essence of our method is to select a pivotal subset from a large-scale\nunlabeled data pool to annotate for the subsequent sampling of prompts.\nSpecifically, a directed graph is first constructed to represent unlabeled\ndata. Afterward, the influence of candidate unlabeled subsets is quantified\nwith a diffusion process. A simple yet effective greedy algorithm for unlabeled\ndata selection is lastly introduced. It iteratively selects the data if it\nprovides a maximum marginal gain with respect to quantified influence. Compared\nwith previous efforts on selective annotations, our influence-driven method\nworks in an end-to-end manner, avoids an intractable explicit balance between\ndata diversity and representativeness, and enjoys theoretical support.\nExperiments confirm the superiority of the proposed method on various\nbenchmarks, achieving better performance under lower time consumption during\nsubset selection. The project page is available at\nhttps://skzhang1.github.io/IDEAL/.\n","authors":["Shaokun Zhang","Xiaobo Xia","Zhaoqing Wang","Ling-Hao Chen","Jiale Liu","Qingyun Wu","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2310.10873v2.pdf","comment":"Accepted by ICLR 2024"},{"id":"http://arxiv.org/abs/2401.11107v1","updated":"2024-01-20T03:55:17Z","published":"2024-01-20T03:55:17Z","title":"Exploiting Duality in Open Information Extraction with Predicate Prompt","summary":" Open information extraction (OpenIE) aims to extract the schema-free triplets\nin the form of (\\emph{subject}, \\emph{predicate}, \\emph{object}) from a given\nsentence. Compared with general information extraction (IE), OpenIE poses more\nchallenges for the IE models, {especially when multiple complicated triplets\nexist in a sentence. 
To extract these complicated triplets more effectively, in\nthis paper we propose a novel generative OpenIE model, namely \\emph{DualOIE},\nwhich achieves a dual task at the same time as extracting some triplets from\nthe sentence, i.e., converting the triplets into the sentence.} Such dual task\nencourages the model to correctly recognize the structure of the given sentence\nand thus is helpful to extract all potential triplets from the sentence.\nSpecifically, DualOIE extracts the triplets in two steps: 1) first extracting a\nsequence of all potential predicates, 2) then using the predicate sequence as a\nprompt to induce the generation of triplets. Our experiments on two benchmarks\nand our dataset constructed from Meituan demonstrate that DualOIE achieves the\nbest performance among the state-of-the-art baselines. Furthermore, the online\nA/B test on Meituan platform shows that 0.93\\% improvement of QV-CTR and 0.56\\%\nimprovement of UV-CTR have been obtained when the triplets extracted by DualOIE\nwere leveraged in Meituan's search system.\n","authors":["Zhen Chen","Jingping Liu","Deqing Yang","Yanghua Xiao","Huimin Xu","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2401.11107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08577v3","updated":"2024-01-20T02:36:12Z","published":"2023-02-16T20:46:36Z","title":"For Generated Text, Is NLI-Neutral Text the Best Text?","summary":" We explore incorporating natural language inference (NLI) into the text\ngenerative pipeline by using a pre-trained NLI model to assess whether a\ngenerated sentence entails, contradicts, or is neutral to the prompt and\npreceding text. First, we show that the NLI task is predictive of generation\nerrors made by GPT-3. We use these results to develop an NLI-informed\ngeneration procedure for GPT-J. Then, we evaluate these generations by\nobtaining human annotations on error types and overall quality. We find that an\nNLI strategy of maximizing entailment improves text generation when the nucleus\nsampling randomness parameter value is high, while one which maximizes\ncontradiction is in fact productive when the parameter value is low. Overall,\nthough, we demonstrate that an NLI strategy of maximizing the neutral class\nprovides the highest quality of generated text (significantly better than the\nvanilla generations), regardless of parameter value.\n","authors":["Michail Mersinias","Kyle Mahowald"],"pdf_url":"https://arxiv.org/pdf/2302.08577v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2401.11248v1","updated":"2024-01-20T15:02:33Z","published":"2024-01-20T15:02:33Z","title":"Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense\n Passage Retrieval","summary":" Masked auto-encoder pre-training has emerged as a prevalent technique for\ninitializing and enhancing dense retrieval systems. It generally utilizes\nadditional Transformer decoder blocks to provide sustainable supervision\nsignals and compress contextual information into dense representations.\nHowever, the underlying reasons for the effectiveness of such a pre-training\ntechnique remain unclear. The usage of additional Transformer-based decoders\nalso incurs significant computational costs. In this study, we aim to shed\nlight on this issue by revealing that masked auto-encoder (MAE) pre-training\nwith enhanced decoding significantly improves the term coverage of input tokens\nin dense representations, compared to vanilla BERT checkpoints. 
Building upon\nthis observation, we propose a modification to the traditional MAE by replacing\nthe decoder of a masked auto-encoder with a completely simplified Bag-of-Word\nprediction task. This modification enables the efficient compression of lexical\nsignals into dense representations through unsupervised pre-training.\nRemarkably, our proposed method achieves state-of-the-art retrieval performance\non several large-scale retrieval benchmarks without requiring any additional\nparameters, which provides a 67% training speed-up compared to standard masked\nauto-encoder pre-training with enhanced decoding.\n","authors":["Guangyuan Ma","Xing Wu","Zijia Lin","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2401.11248v1.pdf","comment":"Working in progress. Our code will be available at\n https://github.com/ma787639046/bowdpr"},{"id":"http://arxiv.org/abs/2401.11246v1","updated":"2024-01-20T14:59:43Z","published":"2024-01-20T14:59:43Z","title":"Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented\n Generation in Niche Domains, Exemplified by Korean Medicine","summary":" We propose a natural language prompt-based retrieval augmented generation\n(Prompt-RAG), a novel approach to enhance the performance of generative large\nlanguage models (LLMs) in niche domains. Conventional RAG methods mostly\nrequire vector embeddings, yet the suitability of generic LLM-based embedding\nrepresentations for specialized domains remains uncertain. To explore and\nexemplify this point, we compared vector embeddings from Korean Medicine (KM)\nand Conventional Medicine (CM) documents, finding that KM document embeddings\ncorrelated more with token overlaps and less with human-assessed document\nrelatedness, in contrast to CM embeddings. Prompt-RAG, distinct from\nconventional RAG models, operates without the need for embedding vectors. Its\nperformance was assessed through a Question-Answering (QA) chatbot application,\nwhere responses were evaluated for relevance, readability, and informativeness.\nThe results showed that Prompt-RAG outperformed existing models, including\nChatGPT and conventional vector embedding-based RAGs, in terms of relevance and\ninformativeness. Despite challenges like content structuring and response\nlatency, the advancements in LLMs are expected to encourage the use of\nPrompt-RAG, making it a promising tool for other domains in need of RAG\nmethods.\n","authors":["Bongsu Kang","Jundong Kim","Tae-Rim Yun","Chang-Eop Kim"],"pdf_url":"https://arxiv.org/pdf/2401.11246v1.pdf","comment":"26 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.16326v2","updated":"2024-01-20T14:33:54Z","published":"2023-05-10T13:40:06Z","title":"Large language models in biomedical natural language processing:\n benchmarks, baselines, and recommendations","summary":" Biomedical literature is growing rapidly, making it challenging to curate and\nextract knowledge manually. Biomedical natural language processing (BioNLP)\ntechniques that can automatically extract information from biomedical\nliterature help alleviate this burden. Recently, large Language Models (LLMs),\nsuch as GPT-3 and GPT-4, have gained significant attention for their impressive\nperformance. However, their effectiveness in BioNLP tasks and impact on method\ndevelopment and downstream users remain understudied. 
This pilot study (1)\nestablishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and\none-shot settings in eight BioNLP datasets across four applications: named\nentity recognition, relation extraction, multi-label document classification,\nand semantic similarity and reasoning, (2) examines the errors produced by the\nLLMs and categorized the errors into three types: missingness, inconsistencies,\nand unwanted artificial content, and (3) provides suggestions for using LLMs in\nBioNLP applications. We make the datasets, baselines, and results publicly\navailable to the community via\nhttps://github.com/qingyu-qc/gpt_bionlp_benchmark.\n","authors":["Qingyu Chen","Jingcheng Du","Yan Hu","Vipina Kuttichi Keloth","Xueqing Peng","Kalpana Raja","Rui Zhang","Zhiyong Lu","Hua Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16326v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11201v1","updated":"2024-01-20T10:28:25Z","published":"2024-01-20T10:28:25Z","title":"Navigating the Thin Line: Examining User Behavior in Search to Detect\n Engagement and Backfire Effects","summary":" Opinionated users often seek information that aligns with their preexisting\nbeliefs while dismissing contradictory evidence due to confirmation bias. This\nconduct hinders their ability to consider alternative stances when searching\nthe web. Despite this, few studies have analyzed how the diversification of\nsearch results on disputed topics influences the search behavior of highly\nopinionated users. To this end, we present a preregistered user study (n = 257)\ninvestigating whether different levels (low and high) of bias metrics and\nsearch results presentation (with or without AI-predicted stances labels) can\naffect the stance diversity consumption and search behavior of opinionated\nusers on three debated topics (i.e., atheism, intellectual property rights, and\nschool uniforms). Our results show that exposing participants to\n(counter-attitudinally) biased search results increases their consumption of\nattitude-opposing content, but we also found that bias was associated with a\ntrend toward overall fewer interactions within the search page. We also found\nthat 19% of users interacted with queries and search pages but did not select\nany search results. When we removed these participants in a post-hoc analysis,\nwe found that stance labels increased the diversity of stances consumed by\nusers, particularly when the search results were biased. Our findings highlight\nthe need for future research to explore distinct search scenario settings to\ngain insight into opinionated users' behavior.\n","authors":["F. M. Cau","N. Tintarev"],"pdf_url":"https://arxiv.org/pdf/2401.11201v1.pdf","comment":"17 pages, 3 figures, ECIR2024 (46th European Conference on\n Information Retrieval - IR4Good track)"},{"id":"http://arxiv.org/abs/2401.11198v1","updated":"2024-01-20T10:25:58Z","published":"2024-01-20T10:25:58Z","title":"A Deep Learning Approach for Selective Relevance Feedback","summary":" Pseudo-relevance feedback (PRF) can enhance average retrieval effectiveness\nover a sufficiently large number of queries. However, PRF often introduces a\ndrift into the original information need, thus hurting the retrieval\neffectiveness of several queries. While a selective application of PRF can\npotentially alleviate this issue, previous approaches have largely relied on\nunsupervised or feature-based learning to determine whether a query should be\nexpanded. 
In contrast, we revisit the problem of selective PRF from a deep\nlearning perspective, presenting a model that is entirely data-driven and\ntrained in an end-to-end manner. The proposed model leverages a\ntransformer-based bi-encoder architecture. Additionally, to further improve\nretrieval effectiveness with this selective PRF approach, we make use of the\nmodel's confidence estimates to combine the information from the original and\nexpanded queries. In our experiments, we apply this selective feedback on a\nnumber of different combinations of ranking and feedback models, and show that\nour proposed approach consistently improves retrieval effectiveness for both\nsparse and dense ranking models, with the feedback models being either sparse,\ndense or generative.\n","authors":["Suchana Datta","Debasis Ganguly","Sean MacAvaney","Derek Greene"],"pdf_url":"https://arxiv.org/pdf/2401.11198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07042v2","updated":"2024-01-20T08:58:56Z","published":"2023-04-14T10:33:56Z","title":"Learning Graph ODE for Continuous-Time Sequential Recommendation","summary":" Sequential recommendation aims at understanding user preference by capturing\nsuccessive behavior correlations, which are usually represented as the item\npurchasing sequences based on their past interactions. Existing efforts\ngenerally predict the next item via modeling the sequential patterns. Despite\neffectiveness, there exist two natural deficiencies: (i) user preference is\ndynamic in nature, and the evolution of collaborative signals is often ignored;\nand (ii) the observed interactions are often irregularly-sampled, while\nexisting methods model item transitions assuming uniform intervals. Thus, how\nto effectively model and predict the underlying dynamics for user preference\nbecomes a critical research problem. To tackle the above challenges, in this\npaper, we focus on continuous-time sequential recommendation and propose a\nprincipled graph ordinary differential equation framework named GDERec.\nTechnically, GDERec is characterized by an autoregressive graph ordinary\ndifferential equation consisting of two components, which are parameterized by\ntwo tailored graph neural networks (GNNs) respectively to capture user\npreference from the perspective of hybrid dynamical systems. The two customized\nGNNs are trained alternately in an autoregressive manner to track the evolution\nof the underlying system from irregular observations, and thus learn effective\nrepresentations of users and items beneficial to the sequential recommendation.\nExtensive experiments on five benchmark datasets demonstrate the superiority of\nour model over various state-of-the-art recommendation methods.\n","authors":["Yifang Qin","Wei Ju","Hongjun Wu","Xiao Luo","Ming Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.07042v2.pdf","comment":"Accepted by EEE Transactions on Knowledge and Data Engineering (TKDE\n 2024)"},{"id":"http://arxiv.org/abs/2401.11145v1","updated":"2024-01-20T06:52:14Z","published":"2024-01-20T06:52:14Z","title":"Document Set Expansion with Positive-Unlabeled Learning: A Density\n Estimation-based Approach","summary":" Document set expansion aims to identify relevant documents from a large\ncollection based on a small set of documents that are on a fine-grained topic.\nPrevious work shows that PU learning is a promising method for this task.\nHowever, some serious issues remain unresolved, i.e. 
typical challenges that PU\nmethods suffer such as unknown class prior and imbalanced data, and the need\nfor transductive experimental settings. In this paper, we propose a novel PU\nlearning framework based on density estimation, called puDE, that can handle\nthe above issues. The advantage of puDE is that it neither constrained to the\nSCAR assumption and nor require any class prior knowledge. We demonstrate the\neffectiveness of the proposed method using a series of real-world datasets and\nconclude that our method is a better alternative for the DSE task.\n","authors":["Haiyang Zhang","Qiuyi Chen","Yuanjie Zou","Yushan Pan","Jia Wang","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2401.11145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.10049v2","updated":"2024-01-20T05:12:52Z","published":"2023-12-02T06:36:14Z","title":"Knowledge Graph Reasoning Based on Attention GCN","summary":" We propose a novel technique to enhance Knowledge Graph Reasoning by\ncombining Graph Convolution Neural Network (GCN) with the Attention Mechanism.\nThis approach utilizes the Attention Mechanism to examine the relationships\nbetween entities and their neighboring nodes, which helps to develop detailed\nfeature vectors for each entity. The GCN uses shared parameters to effectively\nrepresent the characteristics of adjacent entities. We first learn the\nsimilarity of entities for node representation learning. By integrating the\nattributes of the entities and their interactions, this method generates\nextensive implicit feature vectors for each entity, improving performance in\ntasks including entity classification and link prediction, outperforming\ntraditional neural network models. To conclude, this work provides crucial\nmethodological support for a range of applications, such as search engines,\nquestion-answering systems, recommendation systems, and data integration tasks.\n","authors":["Meera Gupta","Ravi Khanna","Divya Choudhary","Nandini Rao"],"pdf_url":"https://arxiv.org/pdf/2312.10049v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11107v1","updated":"2024-01-20T03:55:17Z","published":"2024-01-20T03:55:17Z","title":"Exploiting Duality in Open Information Extraction with Predicate Prompt","summary":" Open information extraction (OpenIE) aims to extract the schema-free triplets\nin the form of (\\emph{subject}, \\emph{predicate}, \\emph{object}) from a given\nsentence. Compared with general information extraction (IE), OpenIE poses more\nchallenges for the IE models, {especially when multiple complicated triplets\nexist in a sentence. To extract these complicated triplets more effectively, in\nthis paper we propose a novel generative OpenIE model, namely \\emph{DualOIE},\nwhich achieves a dual task at the same time as extracting some triplets from\nthe sentence, i.e., converting the triplets into the sentence.} Such dual task\nencourages the model to correctly recognize the structure of the given sentence\nand thus is helpful to extract all potential triplets from the sentence.\nSpecifically, DualOIE extracts the triplets in two steps: 1) first extracting a\nsequence of all potential predicates, 2) then using the predicate sequence as a\nprompt to induce the generation of triplets. Our experiments on two benchmarks\nand our dataset constructed from Meituan demonstrate that DualOIE achieves the\nbest performance among the state-of-the-art baselines. 
Furthermore, the online\nA/B test on Meituan platform shows that 0.93\\% improvement of QV-CTR and 0.56\\%\nimprovement of UV-CTR have been obtained when the triplets extracted by DualOIE\nwere leveraged in Meituan's search system.\n","authors":["Zhen Chen","Jingping Liu","Deqing Yang","Yanghua Xiao","Huimin Xu","Zongyu Wang","Rui Xie","Yunsen Xian"],"pdf_url":"https://arxiv.org/pdf/2401.11107v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.11089v1","updated":"2024-01-20T02:38:21Z","published":"2024-01-20T02:38:21Z","title":"FedRKG: A Privacy-preserving Federated Recommendation Framework via\n Knowledge Graph Enhancement","summary":" Federated Learning (FL) has emerged as a promising approach for preserving\ndata privacy in recommendation systems by training models locally. Recently,\nGraph Neural Networks (GNN) have gained popularity in recommendation tasks due\nto their ability to capture high-order interactions between users and items.\nHowever, privacy concerns prevent the global sharing of the entire user-item\ngraph. To address this limitation, some methods create pseudo-interacted items\nor users in the graph to compensate for missing information for each client.\nUnfortunately, these methods introduce random noise and raise privacy concerns.\nIn this paper, we propose FedRKG, a novel federated recommendation system,\nwhere a global knowledge graph (KG) is constructed and maintained on the server\nusing publicly available item information, enabling higher-order user-item\ninteractions. On the client side, a relation-aware GNN model leverages diverse\nKG relationships. To protect local interaction items and obscure gradients, we\nemploy pseudo-labeling and Local Differential Privacy (LDP). Extensive\nexperiments conducted on three real-world datasets demonstrate the competitive\nperformance of our approach compared to centralized algorithms while ensuring\nprivacy preservation. 
Moreover, FedRKG achieves an average accuracy improvement\nof 4% compared to existing federated learning baselines.\n","authors":["Dezhong Yao","Tongtong Liu","Qi Cao","Hai Jin"],"pdf_url":"https://arxiv.org/pdf/2401.11089v1.pdf","comment":null}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ 
+ MyArxiv
+ + Computation and Language 52 + +
+
+
+ + ☆ CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation + + +
+ Chest X-rays (CXRs) are the most frequently performed imaging test in +clinical practice. Recent advances in the development of vision-language +foundation models (FMs) give rise to the possibility of performing automated +CXR interpretation, which can assist physicians with clinical decision-making +and improve patient outcomes. However, developing FMs that can accurately +interpret CXRs is challenging due to the (1) limited availability of +large-scale vision-language datasets in the medical image domain, (2) lack of +vision and language encoders that can capture the complexities of medical data, +and (3) absence of evaluation frameworks for benchmarking the abilities of FMs +on CXR interpretation. In this work, we address these challenges by first +introducing \emph{CheXinstruct} - a large-scale instruction-tuning dataset +curated from 28 publicly-available datasets. We then present \emph{CheXagent} - +an instruction-tuned FM capable of analyzing and summarizing CXRs. To build +CheXagent, we design a clinical large language model (LLM) for parsing +radiology reports, a vision encoder for representing CXR images, and a network +to bridge the vision and language modalities. Finally, we introduce +\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs +across 8 clinically-relevant CXR interpretation tasks. Extensive quantitative +evaluations and qualitative reviews with five expert radiologists demonstrate +that CheXagent outperforms previously-developed general- and medical-domain FMs +on CheXbench tasks. Furthermore, in an effort to improve model transparency, we +perform a fairness evaluation across factors of sex, race and age to highlight +potential performance disparities. Our project is at +\url{https://stanford-aimi.github.io/chexagent.html}. + +
+
+ comment: 24 pages, 8 figures +
+
+
+
+
+ + ☆ APT: Adaptive Pruning and Tuning Pretrained Language Models for + Efficient Training and Inference + + +
+ Fine-tuning and inference with large language models (LMs) are generally known to be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces training memory by updating a small number of LM parameters but does not improve inference efficiency. Structured pruning improves LM inference efficiency by removing consistent parameter blocks, yet it often increases training memory and time. To improve both training and inference efficiency, we introduce APT, which adaptively prunes and tunes parameters for LMs. At the early stage of fine-tuning, APT dynamically adds salient tuning parameters for fast and accurate convergence while discarding unimportant parameters for efficiency. Compared to baselines, our experiments show that APT maintains up to 98% of task performance when pruning RoBERTa and T5 models with 40% of parameters left, and keeps 86.4% of LLaMA models' performance with 70% of parameters remaining. Furthermore, APT speeds up LM fine-tuning by up to 8x and reduces the training memory footprint of large LMs by up to 70%.
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Text Embedding Inversion Attacks on Multilingual Language Models + + +
+ Representing textual information as real-numbered embeddings has become the norm in NLP. Moreover, with the rise of public interest in large language models (LLMs), Embeddings as a Service (EaaS) has rapidly gained traction as a business model. This is not without outstanding security risks, as previous research has demonstrated that sensitive data can be reconstructed from embeddings, even without knowledge of the underlying model that generated them. However, such work is limited by its sole focus on English, leaving all other languages vulnerable to attacks by malicious actors. As many international and multilingual companies leverage EaaS, there is an urgent need for research into multilingual LLM security. To this end, this work investigates LLM security from the perspective of multilingual embedding inversion. Concretely, we define the problem of black-box multilingual and cross-lingual inversion attacks, with special attention to a cross-domain scenario. Our findings reveal that multilingual models are potentially more vulnerable to inversion attacks than their monolingual counterparts. This stems from the reduced data requirements for achieving comparable inversion performance in settings where the underlying language is not known a priori. To our knowledge, this work is the first to delve into multilinguality within the context of inversion attacks, and our findings highlight the need for further investigation and enhanced defenses in the area of NLP security.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ WARM: On the Benefits of Weight Averaged Reward Models + + +
+ Aligning large language models (LLMs) with human preferences through +reinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit +failures in the reward model (RM) to achieve seemingly high rewards without +meeting the underlying objectives. We identify two primary challenges when +designing RMs to mitigate reward hacking: distribution shifts during the RL +process and inconsistencies in human preferences. As a solution, we propose +Weight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then +averaging them in the weight space. This strategy follows the observation that +fine-tuned weights remain linearly mode connected when sharing the same +pre-training. By averaging weights, WARM improves efficiency compared to the +traditional ensembling of predictions, while improving reliability under +distribution shifts and robustness to preference inconsistencies. Our +experiments on summarization tasks, using best-of-N and RL methods, shows that +WARM improves the overall quality and alignment of LLM predictions; for +example, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy +RL fine-tuned with a single RM. + +
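The averaging step described above can be sketched directly: given several reward models fine-tuned from the same pre-training, WARM-style merging reduces to averaging their weights. The checkpoint file names and the plain state-dict averaging below are illustrative assumptions, not code from the paper.

```python
# Minimal sketch of weight averaging over reward models that share one
# pre-trained initialization; checkpoint paths are hypothetical.
import torch

def average_state_dicts(paths):
    avg = None
    for path in paths:
        state = torch.load(path, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in state.items()}
        else:
            for k, v in state.items():
                avg[k] += v.float()
    return {k: v / len(paths) for k, v in avg.items()}

# Hypothetical reward-model checkpoints fine-tuned with different random seeds.
warm_weights = average_state_dicts(["rm_seed0.pt", "rm_seed1.pt", "rm_seed2.pt"])
```

A single reward model loaded with the averaged weights would then serve as the scorer for best-of-N selection or as the reward signal during RL fine-tuning.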
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Universal Neurons in GPT2 Language Models + + +
+ A basic question within the emerging field of mechanistic interpretability is +the degree to which neural networks learn the same underlying mechanisms. In +other words, are neural mechanisms universal across different models? In this +work, we study the universality of individual neurons across GPT2 models +trained from different initial random seeds, motivated by the hypothesis that +universal neurons are likely to be interpretable. In particular, we compute +pairwise correlations of neuron activations over 100 million tokens for every +neuron pair across five different seeds and find that 1-5\% of neurons are +universal, that is, pairs of neurons which consistently activate on the same +inputs. We then study these universal neurons in detail, finding that they +usually have clear interpretations and taxonomize them into a small number of +neuron families. We conclude by studying patterns in neuron weights to +establish several universal functional roles of neurons in simple circuits: +deactivating attention heads, changing the entropy of the next token +distribution, and predicting the next token to (not) be within a particular +set. + +
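The core measurement, pairwise activation correlations between neurons of independently trained models, can be illustrated with a small synthetic example; the arrays below stand in for real activation matrices collected over a shared token stream, and the shapes are arbitrary choices rather than values from the paper.

```python
# Sketch: for each neuron in model A, find its best Pearson correlation with
# any neuron in model B over the same tokens. Arrays are synthetic stand-ins.
import numpy as np

rng = np.random.default_rng(0)
acts_a = rng.standard_normal((5000, 768))   # (tokens, neurons) from seed-A model
acts_b = rng.standard_normal((5000, 768))   # same tokens, seed-B model

za = (acts_a - acts_a.mean(0)) / acts_a.std(0)
zb = (acts_b - acts_b.mean(0)) / acts_b.std(0)
corr = za.T @ zb / len(za)                  # (neurons_a, neurons_b) correlations
best = corr.max(axis=1)                     # best match for each seed-A neuron
print("share of 'universal' neurons (corr > 0.5):", (best > 0.5).mean())
```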
+
+
+
+
+ + ☆ In-Context Learning for Extreme Multi-Label Classification + + +
+ Multi-label classification problems with thousands of classes are hard to +solve with in-context learning alone, as language models (LMs) might lack prior +knowledge about the precise classes or how to assign them, and it is generally +infeasible to demonstrate every class in a prompt. We propose a general +program, $\texttt{Infer--Retrieve--Rank}$, that defines multi-step interactions +between LMs and retrievers to efficiently tackle such problems. We implement +this program using the $\texttt{DSPy}$ programming model, which specifies +in-context systems in a declarative manner, and use $\texttt{DSPy}$ optimizers +to tune it towards specific datasets by bootstrapping only tens of few-shot +examples. Our primary extreme classification program, optimized separately for +each task, attains state-of-the-art results across three benchmarks (HOUSE, +TECH, TECHWOLF). We apply the same program to a benchmark with vastly different +characteristics and attain competitive performance as well (BioDEX). Unlike +prior work, our proposed solution requires no finetuning, is easily applicable +to new tasks, alleviates prompt engineering, and requires only tens of labeled +examples. Our code is public at https://github.com/KarelDO/xmc.dspy. + +
+
+
+
+
+ + ☆ SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning + Capabilities + + +
+ Understanding and reasoning about spatial relationships is a fundamental +capability for Visual Question Answering (VQA) and robotics. While Vision +Language Models (VLM) have demonstrated remarkable performance in certain VQA +benchmarks, they still lack capabilities in 3D spatial reasoning, such as +recognizing quantitative relationships of physical objects like distances or +size differences. We hypothesize that VLMs' limited spatial reasoning +capability is due to the lack of 3D spatial knowledge in training data and aim +to solve this problem by training VLMs with Internet-scale spatial reasoning +data. To this end, we present a system to facilitate this approach. We first +develop an automatic 3D spatial VQA data generation framework that scales up to +2 billion VQA examples on 10 million real-world images. We then investigate +various factors in the training recipe, including data quality, training +pipeline, and VLM architecture. Our work features the first internet-scale 3D +spatial reasoning dataset in metric space. By training a VLM on such data, we +significantly enhance its ability on both qualitative and quantitative spatial +VQA. Finally, we demonstrate that this VLM unlocks novel downstream +applications in chain-of-thought spatial reasoning and robotics due to its +quantitative estimation capability. Project website: +https://spatial-vlm.github.io/ + +
+
+
+
+
+ + ☆ Anisotropy Is Inherent to Self-Attention in Transformers EACL 2024 + + +
+ The representation degeneration problem is a phenomenon that is widely +observed among self-supervised learning methods based on Transformers. In NLP, +it takes the form of anisotropy, a singular property of hidden representations +which makes them unexpectedly close to each other in terms of angular distance +(cosine-similarity). Some recent works tend to show that anisotropy is a +consequence of optimizing the cross-entropy loss on long-tailed distributions +of tokens. We show in this paper that anisotropy can also be observed +empirically in language models with specific objectives that should not suffer +directly from the same consequences. We also show that the anisotropy problem +extends to Transformers trained on other modalities. Our observations suggest +that anisotropy is actually inherent to Transformers-based models. + +
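Anisotropy of this kind is commonly quantified as the average pairwise cosine similarity between hidden representations; a minimal sketch of that measurement is below, with random vectors standing in for actual model activations (the paper's exact protocol may differ).

```python
# Estimate anisotropy as the mean pairwise cosine similarity of representations.
# Random vectors stand in for hidden states pulled from a Transformer layer.
import numpy as np

rng = np.random.default_rng(0)
hidden = rng.standard_normal((512, 768))                 # (tokens, hidden_size)
unit = hidden / np.linalg.norm(hidden, axis=1, keepdims=True)
cos = unit @ unit.T
anisotropy = cos[~np.eye(len(cos), dtype=bool)].mean()   # exclude self-similarity
print("mean pairwise cosine similarity:", anisotropy)    # ~0 for isotropic vectors
```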
+
+ comment: Proceedings of EACL 2024. Previously presented at ACL-SRW 2023 + (arXiv:2306.07656). arXiv admin note: substantial text overlap with + arXiv:2306.07656 +
+
+
+
+
+ + ☆ The Curious Case of Nonverbal Abstract Reasoning with Multi-Modal Large + Language Models + + +
+ While large language models (LLMs) are still being adopted to new domains and +utilized in novel applications, we are experiencing an influx of the new +generation of foundation models, namely multi-modal large language models +(MLLMs). These models integrate verbal and visual information, opening new +possibilities to demonstrate more complex reasoning abilities at the +intersection of the two modalities. However, despite the revolutionizing +prospect of MLLMs, our understanding of their reasoning abilities is limited. +In this study, we assess the nonverbal abstract reasoning abilities of +open-source and closed-source MLLMs using variations of Raven's Progressive +Matrices. Our experiments expose the difficulty of solving such problems while +showcasing the immense gap between open-source and closed-source models. We +also reveal critical shortcomings with individual visual and textual modules, +subjecting the models to low-performance ceilings. Finally, to improve MLLMs' +performance, we experiment with various methods, such as Chain-of-Thought +prompting, resulting in a significant (up to 100%) boost in performance. + +
+
+ comment: Code and datasets are available at + https://github.com/kahrabian/mllm-nvar +
+
+
+
+
+ + ☆ An Empirical Analysis of In-context Learning Abilities of LLMs for MT + + +
+ In-context learning (ICL) has consistently demonstrated superior performance +over zero-shot performance in large language models (LLMs). However, the +understanding of the dynamics of ICL and the aspects that influence downstream +performance remains limited, especially for natural language generation (NLG) +tasks. This work aims to address this gap by investigating the ICL capabilities +of LLMs and studying the impact of different aspects of the in-context +demonstrations for the task of machine translation (MT). Our preliminary +investigations aim to discern whether in-context learning (ICL) is +predominantly influenced by demonstrations or instructions by applying diverse +perturbations to in-context demonstrations while preserving the task +instruction. We observe varying behavior to perturbed examples across different +model families, notably with BLOOM-7B derivatives being severely influenced by +noise, whereas Llama 2 derivatives not only exhibit robustness but also tend to +show enhancements over the clean baseline when subject to perturbed +demonstrations. This suggests that the robustness of ICL may be governed by +several factors, including the type of noise, perturbation direction (source or +target), the extent of pretraining of the specific model, and fine-tuning for +downstream tasks if applicable. Further investigation is warranted to develop a +comprehensive understanding of these factors in future research. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Unsupervised Learning of Graph from Recipes + + +
+ Cooking recipes are one of the most readily available kinds of procedural +text. They consist of natural language instructions that can be challenging to +interpret. In this paper, we propose a model to identify relevant information +from recipes and generate a graph to represent the sequence of actions in the +recipe. In contrast with other approaches, we use an unsupervised approach. We +iteratively learn the graph structure and the parameters of a $\mathsf{GNN}$ +encoding the texts (text-to-graph) one sequence at a time while providing the +supervision by decoding the graph into text (graph-to-text) and comparing the +generated text to the input. We evaluate the approach by comparing the +identified entities with annotated datasets, comparing the difference between +the input and output texts, and comparing our generated graphs with those +generated by state of the art methods. + +
+
+
+
+
+ + ☆ Revisiting Demonstration Selection Strategies in In-Context Learning + + +
+ Large language models (LLMs) have shown an impressive ability to perform a +wide range of tasks using in-context learning (ICL), where a few examples are +used to describe a task to the model. However, the performance of ICL varies +significantly with the choice of demonstrations, and it is still unclear why +this happens or what factors will influence its choice. In this work, we first +revisit the factors contributing to this variance from both data and model +aspects, and find that the choice of demonstration is both data- and +model-dependent. We further proposed a data- and model-dependent demonstration +selection method, \textbf{TopK + ConE}, based on the assumption that +\textit{the performance of a demonstration positively correlates with its +contribution to the model's understanding of the test samples}, resulting in a +simple and effective recipe for ICL. Empirically, our method yields consistent +improvements in both language understanding and generation tasks with different +model scales. Further analyses confirm that, besides the generality and +stability under different circumstances, our method provides a unified +explanation for the effectiveness of previous methods. Code will be released. + +
+
+
+
+
+ + ☆ West-of-N: Synthetic Preference Generation for Improved Reward Modeling + + +
+ The success of reinforcement learning from human feedback (RLHF) in language +model alignment is strongly dependent on the quality of the underlying reward +model. In this paper, we present a novel approach to improve reward model +quality by generating synthetic preference data, thereby augmenting the +training dataset with on-policy, high-quality preference pairs. Motivated by +the promising results of Best-of-N sampling strategies in language model +training, we extend their application to reward model training. This results in +a self-training strategy to generate preference pairs by selecting the best and +worst candidates in a pool of responses to a given query. Empirically, we find +that this approach improves the performance of any reward model, with an effect +comparable to the addition of a similar quantity of human preference data. This +work opens up new avenues of research for improving RLHF for language model +alignment, by offering synthetic preference generation as a solution to reward +modeling challenges. + +
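The self-training recipe sketched in the abstract, sampling a pool of responses and keeping the best and worst as a synthetic preference pair, is easy to express in code; `generate` and `reward` below are placeholders for a policy model and the current reward model, not APIs from the paper.

```python
# Sketch of West-of-N style synthetic preference generation.
import random

def generate(query: str) -> str:                 # placeholder policy sample
    return f"candidate answer {random.randint(0, 999)} for {query!r}"

def reward(query: str, response: str) -> float:  # placeholder reward model
    return random.random()

def synthetic_pair(query: str, n: int = 8) -> dict:
    pool = [generate(query) for _ in range(n)]
    ranked = sorted(pool, key=lambda r: reward(query, r))
    return {"prompt": query, "chosen": ranked[-1], "rejected": ranked[0]}

print(synthetic_pair("Summarize the article in two sentences."))
```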
+
+
+
+
+ + ☆ Temporal Blind Spots in Large Language Models WSDM'24 + + +
+ Large language models (LLMs) have recently gained significant attention due +to their unparalleled ability to perform various natural language processing +tasks. These models, benefiting from their advanced natural language +understanding capabilities, have demonstrated impressive zero-shot performance. +However, the pre-training data utilized in LLMs is often confined to a specific +corpus, resulting in inherent freshness and temporal scope limitations. +Consequently, this raises concerns regarding the effectiveness of LLMs for +tasks involving temporal intents. In this study, we aim to investigate the +underlying limitations of general-purpose LLMs when deployed for tasks that +require a temporal understanding. We pay particular attention to handling +factual temporal knowledge through three popular temporal QA datasets. +Specifically, we observe low performance on detailed questions about the past +and, surprisingly, for rather new information. In manual and automatic testing, +we find multiple temporal errors and characterize the conditions under which QA +performance deteriorates. Our analysis contributes to understanding LLM +limitations and offers valuable insights into developing future models that can +better cater to the demands of temporally-oriented tasks. The code is +available\footnote{https://github.com/jwallat/temporalblindspots}. + +
+
+ comment: accepted at WSDM'24 +
+
+
+
+
+ + ☆ Cross-lingual Transfer Learning for Javanese Dependency Parsing AACL 2023 + + +
+ While structure learning achieves remarkable performance in high-resource +languages, the situation differs for under-represented languages due to the +scarcity of annotated data. This study focuses on assessing the efficacy of +transfer learning in enhancing dependency parsing for Javanese, a language +spoken by 80 million individuals but characterized by limited representation in +natural language processing. We utilized the Universal Dependencies dataset +consisting of dependency treebanks from more than 100 languages, including +Javanese. We propose two learning strategies to train the model: transfer +learning (TL) and hierarchical transfer learning (HTL). While TL only uses a +source language to pre-train the model, the HTL method uses a source language +and an intermediate language in the learning process. The results show that our +best model uses the HTL method, which improves performance with an increase of +10% for both UAS and LAS evaluations compared to the baseline model. + +
+
+ comment: Accepted at IJCNLP-AACL 2023 SRW +
+
+
+
+
+ + ☆ Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated + Text + + +
+ Detecting text generated by modern large language models is thought to be +hard, as both LLMs and humans can exhibit a wide range of complex behaviors. +However, we find that a score based on contrasting two closely related language +models is highly accurate at separating human-generated and machine-generated +text. Based on this mechanism, we propose a novel LLM detector that only +requires simple calculations using a pair of pre-trained LLMs. The method, +called Binoculars, achieves state-of-the-art accuracy without any training +data. It is capable of spotting machine text from a range of modern LLMs +without any model-specific modifications. We comprehensively evaluate +Binoculars on a number of text sources and in varied situations. Over a wide +range of document types, Binoculars detects over 90% of generated samples from +ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being +trained on any ChatGPT data. + +
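The exact Binoculars score is defined in the paper; the sketch below only illustrates the underlying idea of contrasting two closely related causal LMs on the same text, here as a ratio of the mean token losses they assign. The model choices and the idea of thresholding the ratio are illustrative assumptions.

```python
# Contrast two related causal LMs on the same text (simplified; not the exact
# Binoculars score). Requires torch and transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def mean_token_loss(model, tok, text):
    ids = tok(text, return_tensors="pt")
    with torch.no_grad():
        return model(**ids, labels=ids["input_ids"]).loss.item()

tok = AutoTokenizer.from_pretrained("gpt2")          # GPT-2 family shares a vocab
small = AutoModelForCausalLM.from_pretrained("gpt2")
large = AutoModelForCausalLM.from_pretrained("gpt2-medium")

text = "Large language models raise new questions about text provenance."
score = mean_token_loss(small, tok, text) / mean_token_loss(large, tok, text)
print("contrast score:", score)  # a threshold on such a score would flag machine text
```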
+
+ comment: 20 pages, code available at https://github.com/ahans30/Binoculars +
+
+
+
+
+ + ☆ ALMs: Authorial Language Models for Authorship Attribution + + +
+ In this paper, we introduce an authorship attribution method called Authorial Language Models (ALMs) that identifies the most likely author of a questioned document based on the perplexity of that document calculated for a set of causal language models, each fine-tuned on the writings of one candidate author. We benchmarked ALMs against state-of-the-art systems using the CCAT50 and Blogs50 datasets. We find that ALMs achieves a macro-average accuracy score of 83.6% on Blogs50, outperforming all other methods, and 74.9% on CCAT50, matching the performance of the best method. To assess the performance of ALMs on shorter texts, we also conducted text ablation testing. We found that to reach a macro-average accuracy of 70%, ALMs needs 40 tokens on Blogs50 and 400 tokens on CCAT50, while to reach 60%, ALMs requires 20 tokens on Blogs50 and 70 tokens on CCAT50.
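Operationally, the attribution rule is simple: score the questioned document with each candidate's fine-tuned causal LM and return the author whose model yields the lowest perplexity. The sketch below assumes a placeholder `perplexity` function and hypothetical per-author model paths.

```python
# Sketch of ALM-style attribution by minimum perplexity; the perplexity call is
# a stand-in for evaluating a causal LM fine-tuned on one author's writings.
import math
import random

def perplexity(model_path: str, text: str) -> float:
    return math.exp(random.uniform(2.0, 5.0))        # placeholder score

def attribute(text: str, author_models: dict) -> str:
    return min(author_models, key=lambda a: perplexity(author_models[a], text))

models = {"author_a": "lm_author_a/", "author_b": "lm_author_b/"}  # hypothetical paths
print(attribute("Text of the questioned document ...", models))
```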
+
+
+
+
+ + ☆ Synergizing Machine Learning & Symbolic Methods: A Survey on Hybrid + Approaches to Natural Language Processing + + +
+ The advancement of machine learning and symbolic approaches has underscored their strengths and weaknesses in Natural Language Processing (NLP). While machine learning approaches are powerful at identifying patterns in data, they often fall short in learning commonsense and the factual knowledge required for NLP tasks. Meanwhile, symbolic methods excel at representing knowledge-rich data; however, they struggle to adapt to dynamic data and to generalize the knowledge they encode. Bridging these two paradigms through hybrid approaches alleviates the weaknesses of both while preserving their strengths. Recent studies extol the virtues of this union, showcasing promising results in a wide range of NLP tasks. In this paper, we present an overview of hybrid approaches used for NLP. Specifically, we delve into the state-of-the-art hybrid approaches used for a broad spectrum of NLP tasks requiring natural language understanding, generation, and reasoning. Furthermore, we discuss the existing resources available for hybrid approaches to NLP along with the challenges, offering a roadmap for future directions.
+
+
+
+
+ + ☆ Claim Detection for Automated Fact-checking: A Survey on Monolingual, + Multilingual and Cross-Lingual Research + + +
+ Automated fact-checking has drawn considerable attention over the past few +decades due to the increase in the diffusion of misinformation on online +platforms. This is often carried out as a sequence of tasks comprising (i) the +detection of sentences circulating in online platforms which constitute claims +needing verification, followed by (ii) the verification process of those +claims. This survey focuses on the former, by discussing existing efforts +towards detecting claims needing fact-checking, with a particular focus on +multilingual data and methods. This is a challenging and fertile direction +where existing methods are yet far from matching human performance due to the +profoundly challenging nature of the issue. Especially, the dissemination of +information across multiple social platforms, articulated in multiple languages +and modalities demands more generalized solutions for combating misinformation. +Focusing on multilingual misinformation, we present a comprehensive survey of +existing multilingual claim detection research. We present state-of-the-art +multilingual claim detection research categorized into three key factors of the +problem, verifiability, priority, and similarity. Further, we present a +detailed overview of the existing multilingual datasets along with the +challenges and suggest possible future advancements. + +
+
+
+
+
+ + ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating the performance of LMMs emerges as an increasing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. + CMMMU includes 12k manually collected multimodal questions from college +exams, quizzes, and textbooks, covering six core disciplines: Art & Design, +Business, Science, Health & Medicine, Humanities & Social Science, and Tech & +Engineering, like its companion, MMMU. These questions span 30 subjects and +comprise 39 highly heterogeneous image types, such as charts, diagrams, maps, +tables, music sheets, and chemical structures. + CMMMU focuses on complex perception and reasoning with domain-specific +knowledge in the Chinese context. We evaluate 11 open-source LLMs and one +proprietary GPT-4V(ision). Even GPT-4V only achieves accuracies of 42%, +indicating a large space for improvement. CMMMU will boost the community to +build the next-generation LMMs towards expert artificial intelligence and +promote the democratization of LMMs by providing diverse language contexts. + +
+
+
+
+
+ + ☆ Benchmarking Large Multimodal Models against Common Corruptions + + +
+ This technical report aims to fill a deficiency in the assessment of large +multimodal models (LMMs) by specifically examining the self-consistency of +their outputs when subjected to common corruptions. We investigate the +cross-modal interactions between text, image, and speech, encompassing four +essential generation tasks: text-to-image, image-to-text, text-to-speech, and +speech-to-text. We create a comprehensive benchmark, named MMCBench, that +covers more than 100 popular LMMs (totally over 150 model checkpoints). A +thorough evaluation under common corruptions is critical for practical +deployment and facilitates a better understanding of the reliability of +cutting-edge LMMs. The benchmarking code is available at +https://github.com/sail-sg/MMCBench + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Blinded by Generated Contexts: How Language Models Merge Generated and + Retrieved Contexts for Open-Domain QA? + + +
+ While auxiliary information has become a key to enhance Large Language Models +(LLMs), relatively little is known about how well LLMs merge these contexts, +specifically generated and retrieved. To study this, we formulate a task +specifically designed to identify whether the answers, derived from the +integration of generated and retrieved contexts, are attributed to either +generated or retrieved contexts. To support this task, we develop a methodology +to construct datasets with conflicting contexts, where each question is paired +with both generated and retrieved contexts, yet only one of them contains the +correct answer. Our experiments reveal a significant bias in LLMs towards +generated contexts, as evidenced across state-of-the-art open (Llama2-7b/13b) +and closed (GPT 3.5/4) systems. We further identify two key factors +contributing to this bias: i) Contexts generated by LLMs typically show greater +similarity to the questions, increasing their likelihood of selection; ii) The +segmentation process used in retrieved contexts disrupts their completeness, +thereby hindering their full utilization in LLMs. Our analysis enhances the +understanding of how LLMs merge diverse contexts, offering valuable insights +for advancing current augmentation methods for LLMs. + +
+
+
+
+
+ + ☆ PsySafe: A Comprehensive Framework for Psychological-based Attack, + Defense, and Evaluation of Multi-agent System Safety + + +
+ Multi-agent systems, augmented with Large Language Models (LLMs), demonstrate +significant capabilities for collective intelligence. However, the potential +misuse of this intelligence for malicious purposes presents significant risks. +To date, comprehensive research on the safety issues associated with +multi-agent systems remains limited. From the perspective of agent psychology, +we discover that the dark psychological states of agents can lead to severe +safety issues. To address these issues, we propose a comprehensive framework +grounded in agent psychology. In our framework, we focus on three aspects: +identifying how dark personality traits in agents might lead to risky +behaviors, designing defense strategies to mitigate these risks, and evaluating +the safety of multi-agent systems from both psychological and behavioral +perspectives. Our experiments reveal several intriguing phenomena, such as the +collective dangerous behaviors among agents, agents' propensity for +self-reflection when engaging in dangerous behavior, and the correlation +between agents' psychological assessments and their dangerous behaviors. We +anticipate that our framework and observations will provide valuable insights +for further research into the safety of multi-agent systems. We will make our +data and code publicly accessible at https:/github.com/AI4Good24/PsySafe. + +
+
+
+
+
+ + ☆ Improving Small Language Models' Mathematical Reasoning via Mix Thoughts + Distillation + + +
+ This work addresses the challenge of democratizing advanced Large Language +Models (LLMs) by compressing their mathematical reasoning capabilities into +sub-billion parameter Small Language Models (SLMs) without compromising +performance. We introduce Equation-of-Thought Distillation (EoTD), a novel +technique that encapsulates the reasoning process into equation-based +representations to construct an EoTD dataset for fine-tuning SLMs. +Additionally, we propose the Mix Thoughts Distillation (MTD) framework to +enhance the reasoning performance of SLMs. This involves creating a reasoning +dataset with multiple thought processes and using it for fine-tuning. Our +experimental findings demonstrate that EoTD significantly boosts the reasoning +abilities of SLMs, while MTD enables these models to achieve state-of-the-art +reasoning performance. + +
+
+
+
+
+ + ☆ The Right Model for the Job: An Evaluation of Legal Multi-Label + Classification Baselines + + +
+ Multi-Label Classification (MLC) is a common task in the legal domain, where +more than one label may be assigned to a legal document. A wide range of +methods can be applied, ranging from traditional ML approaches to the latest +Transformer-based architectures. In this work, we perform an evaluation of +different MLC methods using two public legal datasets, POSTURE50K and +EURLEX57K. By varying the amount of training data and the number of labels, we +explore the comparative advantage offered by different approaches in relation +to the dataset properties. Our findings highlight DistilRoBERTa and LegalBERT +as performing consistently well in legal MLC with reasonable computational +demands. T5 also demonstrates comparable performance while offering advantages +as a generative model in the presence of changing label sets. Finally, we show +that the CrossEncoder exhibits potential for notable macro-F1 score +improvements, albeit with increased computational costs. + +
+
+
+
+
+ + ☆ AI for social science and social science of AI: A Survey + + +
+ Recent advancements in artificial intelligence, particularly with the +emergence of large language models (LLMs), have sparked a rethinking of +artificial general intelligence possibilities. The increasing human-like +capabilities of AI are also attracting attention in social science research, +leading to various studies exploring the combination of these two fields. In +this survey, we systematically categorize previous explorations in the +combination of AI and social science into two directions that share common +technical approaches but differ in their research objectives. The first +direction is focused on AI for social science, where AI is utilized as a +powerful tool to enhance various stages of social science research. While the +second direction is the social science of AI, which examines AI agents as +social entities with their human-like cognitive and linguistic capabilities. By +conducting a thorough review, particularly on the substantial progress +facilitated by recent advancements in large language models, this paper +introduces a fresh perspective to reassess the relationship between AI and +social science, provides a cohesive framework that allows researchers to +understand the distinctions and connections between AI for social science and +social science of AI, and also summarized state-of-art experiment simulation +platforms to facilitate research in these two directions. We believe that as AI +technology continues to advance and intelligent agents find increasing +applications in our daily lives, the significance of the combination of AI and +social science will become even more prominent. + +
+
+ comment: Accepted by Information Processing and Management (IP&M) +
+
+
+
+
+ + ☆ SuperCLUE-Math6: Graded Multi-Step Math Reasoning Benchmark for LLMs in + Chinese + + +
+ We introduce SuperCLUE-Math6(SC-Math6), a new benchmark dataset to evaluate +the mathematical reasoning abilities of Chinese language models. SC-Math6 is +designed as an upgraded Chinese version of the GSM8K dataset with enhanced +difficulty, diversity, and application scope. It consists of over 2000 +mathematical word problems requiring multi-step reasoning and providing natural +language solutions. We propose an innovative scheme to quantify the reasoning +capability of large models based on performance over problems with different +reasoning steps. Experiments on 12 representative Chinese models demonstrate a +clear stratification of reasoning levels, with top models like GPT-4 showing +superior performance. SC-Math6 fills the gap in Chinese mathematical reasoning +benchmarks and provides a comprehensive testbed to advance the intelligence of +Chinese language models. + +
+
+ comment: 8 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ Hallucination is Inevitable: An Innate Limitation of Large Language + Models + + +
+ Hallucination has been widely recognized to be a significant drawback for +large language models (LLMs). There have been many works that attempt to reduce +the extent of hallucination. These efforts have mostly been empirical so far, +which cannot answer the fundamental question whether it can be completely +eliminated. In this paper, we formalize the problem and show that it is +impossible to eliminate hallucination in LLMs. Specifically, we define a formal +world where hallucination is defined as inconsistencies between a computable +LLM and a computable ground truth function. By employing results from learning +theory, we show that LLMs cannot learn all of the computable functions and will +therefore always hallucinate. Since the formal world is a part of the real +world which is much more complicated, hallucinations are also inevitable for +real world LLMs. Furthermore, for real world LLMs constrained by provable time +complexity, we describe the hallucination-prone tasks and empirically validate +our claims. Finally, using the formal world framework, we discuss the possible +mechanisms and efficacies of existing hallucination mitigators as well as the +practical implications on the safe deployment of LLMs. + +
+
+
+
+
+ + ☆ SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic + Segmentation + + +
+ Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation +models using training image data with only image-level supervision. Since +precise pixel-level annotations are not accessible, existing methods typically +focus on producing pseudo masks for training segmentation models by refining +CAM-like heatmaps. However, the produced heatmaps may only capture +discriminative image regions of target object categories or the associated +co-occurring backgrounds. To address the issues, we propose a Semantic Prompt +Learning for WSSS (SemPLeS) framework, which learns to effectively prompt the +CLIP space to enhance the semantic alignment between the segmented regions and +the target object categories. More specifically, we propose Contrastive Prompt +Learning and Class-associated Semantic Refinement to learn the prompts that +adequately describe and suppress the image backgrounds associated with each +target object category. In this way, our proposed framework is able to perform +better semantic matching between object regions and the associated text labels, +resulting in desired pseudo masks for training the segmentation model. The +proposed SemPLeS framework achieves SOTA performance on the standard WSSS +benchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the +semantic visualization of our learned prompts. The codes will be released. + +
+
+
+
+
+ + ☆ Speak It Out: Solving Symbol-Related Problems with Symbol-to-Language + Conversion for Language Models + + +
+ Symbols (or, more broadly, non-natural-language textual representations) such as numerical sequences, molecular formulas, and table delimiters are ubiquitous, playing important roles in tasks such as abstract reasoning, chemical property prediction, and table question answering. Despite the impressive natural language comprehension capabilities of large language models (LLMs), their reasoning abilities over symbols remain inadequate, which could be attributed to the difference between symbol representations and general natural language. We propose symbol-to-language (S2L), a tuning-free method that enables large language models to solve symbol-related problems with information expressed in natural language. Specifically, S2L first converts the symbols involved into language-based representations, which can be implemented by prompting LLMs or leveraging external tools; these language-based representations are then integrated into the original problem via direct substitution or concatenation, serving as useful input information for LLMs. We evaluate the S2L method using both API-based (GPT-4, ChatGPT) and open-source (OpenChat) models over eight symbol-related tasks, ranging from symbol-only abstract reasoning to sentiment analysis on social media. Experimental results show that S2L consistently leads to superior performance; for example, employing S2L with GPT-4 yields significant average improvements of +21.9% and +9.5% on subtasks in 1D-ARC and the Dyck language, respectively. Code and data are available at https://github.com/THUNLP-MT/symbol2language.
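As a toy illustration of the conversion-then-substitution idea, the snippet below verbalizes a numeric sequence and concatenates the description to the original problem before it would be sent to an LLM. The real S2L conversions are produced by prompting LLMs or external tools, so this hand-written converter is only an assumption for illustration.

```python
# Toy symbol-to-language conversion for a numeric sequence, concatenated to the
# original problem as extra natural-language input for an LLM.
def describe_sequence(seq):
    parts = [f"position {i} holds {v}" for i, v in enumerate(seq, start=1)]
    return "In words: " + "; ".join(parts) + "."

problem = "Continue the pattern: [2, 4, 8, 16]"
prompt = problem + "\n" + describe_sequence([2, 4, 8, 16])
print(prompt)   # this combined prompt is what would be passed to the model
```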
+
+
+
+
+ + ☆ Keep Decoding Parallel with Effective Knowledge Distillation from + Language Models to End-to-end Speech Recognisers ICASSP 2024 + + +
+ This study presents a novel approach for knowledge distillation (KD) from a +BERT teacher model to an automatic speech recognition (ASR) model using +intermediate layers. To distil the teacher's knowledge, we use an attention +decoder that learns from BERT's token probabilities. Our method shows that +language model (LM) information can be more effectively distilled into an ASR +model using both the intermediate layers and the final layer. By using the +intermediate layers as distillation target, we can more effectively distil LM +knowledge into the lower network layers. Using our method, we achieve better +recognition accuracy than with shallow fusion of an external LM, allowing us to +maintain fast parallel decoding. Experiments on the LibriSpeech dataset +demonstrate the effectiveness of our approach in enhancing greedy decoding with +connectionist temporal classification (CTC). + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Streaming Bilingual End-to-End ASR model using Attention over Multiple + Softmax + + +
+ Even with several advancements in multilingual modeling, it is challenging to +recognize multiple languages using a single neural model, without knowing the +input language and most multilingual models assume the availability of the +input language. In this work, we propose a novel bilingual end-to-end (E2E) +modeling approach, where a single neural model can recognize both languages and +also support switching between the languages, without any language input from +the user. The proposed model has shared encoder and prediction networks, with +language-specific joint networks that are combined via a self-attention +mechanism. As the language-specific posteriors are combined, it produces a +single posterior probability over all the output symbols, enabling a single +beam search decoding and also allowing dynamic switching between the languages. +The proposed approach outperforms the conventional bilingual baseline with +13.3%, 8.23% and 1.3% word error rate relative reduction on Hindi, English and +code-mixed test sets, respectively. + +
+
+ comment: Published in IEEE's Spoken Language Technology (SLT) 2022, 8 pages (6 + + 2 for references), 5 figures +
+
+
+
+
+ + ☆ Revolutionizing Finance with LLMs: An Overview of Applications and + Insights + + +
+ In recent years, Large Language Models (LLMs) like ChatGPT have seen +considerable advancements and have been applied in diverse fields. Built on the +Transformer architecture, these models are trained on extensive datasets, +enabling them to understand and generate human language effectively. In the +financial domain, the deployment of LLMs is gaining momentum. These models are +being utilized for automating financial report generation, forecasting market +trends, analyzing investor sentiment, and offering personalized financial +advice. Leveraging their natural language processing capabilities, LLMs can +distill key insights from vast financial data, aiding institutions in making +informed investment choices and enhancing both operational efficiency and +customer satisfaction. In this study, we provide a comprehensive overview of +the emerging integration of LLMs into various financial tasks. Additionally, we +conducted holistic tests on multiple financial tasks through the combination of +natural language instructions. Our findings show that GPT-4 effectively follow +prompt instructions across various financial tasks. This survey and evaluation +of LLMs in the financial domain aim to deepen the understanding of LLMs' +current role in finance for both financial practitioners and LLM researchers, +identify new research and application prospects, and highlight how these +technologies can be leveraged to solve practical challenges in the finance +industry. + +
+
+
+
+
+ + ♻ ☆ Mind Your Format: Towards Consistent Evaluation of In-Context Learning + Improvements + + +
+ Large language models demonstrate a remarkable capability for learning to +solve new tasks from a few examples. The prompt template, or the way the input +examples are formatted to obtain the prompt, is an important yet often +overlooked aspect of in-context learning. In this work, we conduct a +comprehensive study of the template format's influence on the in-context +learning performance. We evaluate the impact of the prompt template across +models (from 770M to 70B parameters) and 4 standard classification datasets. We +show that a poor choice of the template can reduce the performance of the +strongest models and inference methods to a random guess level. More +importantly, the best templates do not transfer between different setups and +even between models of the same family. Our findings show that the currently +prevalent approach to evaluation, which ignores template selection, may give +misleading results due to different templates in different works. As a first +step towards mitigating this issue, we propose Template Ensembles that +aggregate model predictions across several templates. This simple test-time +augmentation boosts average performance while being robust to the choice of +random set of templates. + +
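The proposed test-time augmentation can be sketched as running the same example through several templates and aggregating the predictions; the templates and the `classify` stand-in below are illustrative, and majority voting is only one possible aggregation.

```python
# Sketch of a template ensemble: format one example with several prompt
# templates and majority-vote over the per-template predictions.
from collections import Counter

TEMPLATES = [
    "Review: {x}\nSentiment:",
    "{x}\nIs this positive or negative?",
    "Text: {x}\nLabel:",
]

def classify(prompt: str) -> str:                 # stand-in for an LLM call
    return "positive" if "love" in prompt.lower() else "negative"

def ensemble_predict(x: str) -> str:
    votes = [classify(t.format(x=x)) for t in TEMPLATES]
    return Counter(votes).most_common(1)[0][0]

print(ensemble_predict("I love this film."))
```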
+
+ comment: 21 pages, 10 figures. Code: + https://github.com/yandex-research/mind-your-format +
+
+
+
+
+ + ♻ ☆ Knowledge Fusion of Large Language Models ICLR 2024 + + +
+ While training large language models (LLMs) from scratch can generate models +with distinct functionalities and strengths, it comes at significant costs and +may result in redundant capabilities. Alternatively, a cost-effective and +compelling approach is to merge existing pre-trained LLMs into a more potent +model. However, due to the varying architectures of these LLMs, directly +blending their weights is impractical. In this paper, we introduce the notion +of knowledge fusion for LLMs, aimed at combining the capabilities of existing +LLMs and transferring them into a single LLM. By leveraging the generative +distributions of source LLMs, we externalize their collective knowledge and +unique strengths, thereby potentially elevating the capabilities of the target +model beyond those of any individual source LLM. We validate our approach using +three popular LLMs with different architectures--Llama-2, MPT, and +OpenLLaMA--across various benchmarks and tasks. Our findings confirm that the +fusion of LLMs can improve the performance of the target model across a range +of capabilities such as reasoning, commonsense, and code generation. Our code, +model weights, and data are public at +\url{https://github.com/fanqiwan/FuseLLM}. + +
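+ A hedged sketch of the general knowledge-fusion recipe described above: turn the source LLMs' generative distributions into a fused target and distill it into the model being trained. The simple weighted average and the shared-vocabulary assumption below are illustrative simplifications; aligning tokenizations across architectures is a separate problem the paper addresses.

    import torch.nn.functional as F

    def fusion_distillation_loss(target_logits, source_logits_list, weights=None):
        # target_logits: (B, T, V) logits from the model being trained.
        # source_logits_list: list of (B, T, V) logits from source LLMs, assumed
        # here to share one vocabulary after alignment.
        if weights is None:
            weights = [1.0 / len(source_logits_list)] * len(source_logits_list)
        fused = sum(w * F.softmax(s, dim=-1)
                    for w, s in zip(weights, source_logits_list))
        log_p = F.log_softmax(target_logits, dim=-1)
        # KL divergence from the fused source distribution to the target model.
        return F.kl_div(log_p, fused, reduction="batchmean")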
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ ICE-Score: Instructing Large Language Models to Evaluate Code EACL 2024 + + +
+ Recent advancements in the field of natural language generation have +facilitated the use of large language models to assess the quality of generated +text. Although these models have shown promising results in tasks such as +machine translation and summarization, their applicability in code intelligence +tasks remains limited without human involvement. The complexity of programming +concepts required for such tasks makes it difficult to develop evaluation +metrics that align with human judgment. Token-matching-based metrics, such as +BLEU, have demonstrated weak correlations with human practitioners in code +intelligence tasks. Moreover, utilizing human-written test suites to evaluate +functional correctness can be challenging in domains with low resources. To +overcome these obstacles, we propose \texttt{ICE-Score}, a new evaluation +metric via instructing large language models (LLMs) for code assessments. Our +metric addresses the limitations of existing approaches by achieving superior +correlations with functional correctness and human preferences, without the +need for test oracles or references. We evaluate the efficacy of our metric on +two different aspects (\textit{human preference} and \textit{execution +success}) and four programming languages. Our results demonstrate that our +metric surpasses state-of-the-art metrics for code generation, delivering high +levels of accuracy and consistency across various programming languages and +tasks. We also make our evaluation metric and datasets available to the +public\footnote{\url{https://github.com/terryyz/ice-score}}, encouraging +further research in evaluating code intelligence tasks. + +
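+ A minimal sketch of the prompting pattern that reference-free, LLM-based code evaluation of this kind relies on: describe the task and the candidate code, ask for a score on a fixed scale for a given aspect, and parse the integer from the reply. The prompt wording, the scale, and the llm callable are placeholders rather than the paper's exact protocol.

    import re

    def llm_code_score(llm, task, code, aspect="functional correctness", lo=0, hi=4):
        # `llm` is any callable mapping a prompt string to a completion string.
        prompt = (
            f"Task description:\n{task}\n\nCandidate code:\n{code}\n\n"
            f"Rate the {aspect} of the code on a scale from {lo} to {hi}. "
            f"Answer with a single integer."
        )
        reply = llm(prompt)
        match = re.search(r"-?\d+", reply)
        if match is None:
            raise ValueError(f"could not parse a score from: {reply!r}")
        return max(lo, min(hi, int(match.group())))   # clamp to the scale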
+
+ comment: Accepted to Findings of EACL 2024 +
+
+
+
+
+ + ♻ ☆ Annotation Sensitivity: Training Data Collection Methods Affect Model + Performance EMNLP 2023 + + +
+ When training data are collected from human annotators, the design of the +annotation instrument, the instructions given to annotators, the +characteristics of the annotators, and their interactions can impact training +data. This study demonstrates that design choices made when creating an +annotation instrument also impact the models trained on the resulting +annotations. We introduce the term annotation sensitivity to refer to the +impact of annotation data collection methods on the annotations themselves and +on downstream model performance and predictions. We collect annotations of hate +speech and offensive language in five experimental conditions of an annotation +instrument, randomly assigning annotators to conditions. We then fine-tune BERT +models on each of the five resulting datasets and evaluate model performance on +a holdout portion of each condition. We find considerable differences between +the conditions for 1) the share of hate speech/offensive language annotations, +2) model performance, 3) model predictions, and 4) model learning curves. Our +results emphasize the crucial role played by the annotation instrument which +has received little attention in the machine learning literature. We call for +additional research into how and why the instrument impacts the annotations to +inform the development of best practices in instrument design. + +
+
+ comment: EMNLP 2023 Findings: + https://aclanthology.org/2023.findings-emnlp.992/ +
+
+
+
+
+ + ♻ ☆ Zero and Few-shot Semantic Parsing with Ambiguous Inputs ICLR 2024 + + +
+ Despite the frequent challenges posed by ambiguity when representing meaning +via natural language, it is often ignored or deliberately removed in tasks +mapping language to formally-designed representations, which generally assume a +one-to-one mapping between linguistic and formal representations. We attempt to +address this shortcoming by introducing AmP, a framework, dataset, and +challenge for translating ambiguous natural language to formal representations +like logic and code. We define templates and generate data for five +well-documented linguistic ambiguities. Using AmP, we investigate how several +few-shot text-to-code systems handle ambiguity, introducing three new metrics. +We find that large pre-trained models perform poorly at capturing the +distribution of possible meanings without deliberate instruction. However, +models are able to capture the distribution well when ambiguity is attested in +their inputs. These results motivate a call for including ambiguity explicitly +in datasets and promote considering the distribution of possible outputs when +evaluating systems. Data and code: https://github.com/esteng/ambiguous_parsing + +
+
+ comment: ICLR 2024 Camera Ready +
+
+
+
+
+ + ♻ ☆ TWIZ-v2: The Wizard of Multimodal Conversational-Stimulus + + +
+ In this report, we describe the vision, challenges, and scientific contributions of the Task Wizard team, TWIZ, in the Alexa Prize TaskBot Challenge 2022. Our vision is to build the TWIZ bot as a helpful, multimodal, knowledgeable, and engaging assistant that can guide users towards the successful completion of complex manual tasks. To achieve this, we focus our efforts on three main research questions: (1) Humanly-Shaped Conversations, by providing information in a knowledgeable way; (2) Multimodal Stimulus, making use of various modalities including voice, images, and videos; and (3) Zero-shot Conversational Flows, to improve the robustness of the interaction to unseen scenarios. TWIZ is an assistant capable of supporting a wide range of tasks, with several innovative features such as creative cooking, video navigation through voice, and the robust TWIZ-LLM, a Large Language Model trained for dialoguing about complex manual tasks. Given the ratings and feedback provided by users, we observed that the TWIZ bot is an effective and robust system, capable of guiding users through tasks while providing several multimodal stimuli.
+
+
+
+
+ + ♻ ☆ Connecting the Dots: What Graph-Based Text Representations Work Best for + Text Classification Using Graph Neural Networks? EMNLP 2023 + + +
+ Given the success of Graph Neural Networks (GNNs) for structure-aware machine +learning, many studies have explored their use for text classification, but +mostly in specific domains with limited data characteristics. Moreover, some +strategies prior to GNNs relied on graph mining and classical machine learning, +making it difficult to assess their effectiveness in modern settings. This work +extensively investigates graph representation methods for text classification, +identifying practical implications and open challenges. We compare different +graph construction schemes using a variety of GNN architectures and setups +across five datasets, encompassing short and long documents as well as +unbalanced scenarios in diverse domains. Two Transformer-based large language +models are also included to complement the study. The results show that i) +although the effectiveness of graphs depends on the textual input features and +domain, simple graph constructions perform better the longer the documents are, +ii) graph representations are especially beneficial for longer documents, +outperforming Transformer-based models, iii) graph methods are particularly +efficient at solving the task. + +
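+ For readers unfamiliar with the graph construction schemes being compared, below is a minimal sketch of one of the simplest: a sliding-window word co-occurrence graph built per document, whose nodes can then be given word-embedding features and passed to a GNN. The window size and weighting are illustrative choices, not the specific schemes evaluated in the paper.

    from collections import defaultdict

    def cooccurrence_graph(tokens, window=3):
        # Undirected co-occurrence graph: nodes are word types, edge weights count
        # how often two words appear within `window` positions of each other.
        edges = defaultdict(int)
        for i, u in enumerate(tokens):
            for v in tokens[i + 1:i + window]:
                if u != v:
                    edges[tuple(sorted((u, v)))] += 1
        return sorted(set(tokens)), dict(edges)

    nodes, edges = cooccurrence_graph("the cat sat on the mat".split())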
+
+ comment: Accepted to Findings of the Association for Computational + Linguistics: EMNLP 2023 (Long Paper). 17 pages, 2 figures, 15 tables. The + Appendix starts on page 12 +
+
+
+
+
+ + ♻ ☆ Who is ChatGPT? Benchmarking LLMs' Psychological Portrayal Using + PsychoBench ICLR 2024 + + +
+ Large Language Models (LLMs) have recently showcased their remarkable +capacities, not only in natural language processing tasks but also across +diverse domains such as clinical medicine, legal consultation, and education. +LLMs become more than mere applications, evolving into assistants capable of +addressing diverse user requests. This narrows the distinction between human +beings and artificial intelligence agents, raising intriguing questions +regarding the potential manifestation of personalities, temperaments, and +emotions within LLMs. In this paper, we propose a framework, PsychoBench, for +evaluating diverse psychological aspects of LLMs. Comprising thirteen scales +commonly used in clinical psychology, PsychoBench further classifies these +scales into four distinct categories: personality traits, interpersonal +relationships, motivational tests, and emotional abilities. Our study examines +five popular models, namely text-davinci-003, gpt-3.5-turbo, gpt-4, LLaMA-2-7b, +and LLaMA-2-13b. Additionally, we employ a jailbreak approach to bypass the +safety alignment protocols and test the intrinsic natures of LLMs. We have made +PsychoBench openly accessible via https://github.com/CUHK-ARISE/PsychoBench. + +
+
+ comment: Accepted for ICLR 2024 Oral Presentation. 15 pages (main text) and 5 + pages (appendix) +
+
+
+
+
+ + ♻ ☆ Noise Contrastive Estimation-based Matching Framework for Low-resource + Security Attack Pattern Recognition EACL 2024 + + +
+ Tactics, Techniques and Procedures (TTPs) represent sophisticated attack +patterns in the cybersecurity domain, described encyclopedically in textual +knowledge bases. Identifying TTPs in cybersecurity writing, often called TTP +mapping, is an important and challenging task. Conventional learning approaches +often target the problem in the classical multi-class or multilabel +classification setting. This setting hinders the learning ability of the model +due to a large number of classes (i.e., TTPs), the inevitable skewness of the +label distribution and the complex hierarchical structure of the label space. +We formulate the problem in a different learning paradigm, where the assignment +of a text to a TTP label is decided by the direct semantic similarity between +the two, thus reducing the complexity of competing solely over the large +labeling space. To that end, we propose a neural matching architecture with an +effective sampling-based learn-to-compare mechanism, facilitating the learning +process of the matching model despite constrained resources. + +
+
+ comment: accepted at EACL 2024, in ARR October 2023 +
+
+
+
+
+ + ♻ ☆ Unifying the Perspectives of NLP and Software Engineering: A Survey on + Language Models for Code + + +
+ In this work we systematically review the recent advancements in code +processing with language models, covering 50+ models, 30+ evaluation tasks, +170+ datasets, and 700+ related works. We break down code processing models +into general language models represented by the GPT family and specialized +models that are specifically pretrained on code, often with tailored +objectives. We discuss the relations and differences between these models, and +highlight the historical transition of code modeling from statistical models +and RNNs to pretrained Transformers and LLMs, which is exactly the same course +that had been taken by NLP. We also discuss code-specific features such as AST, +CFG, and unit tests, along with their application in training code language +models, and identify key challenges and potential future directions in this +domain. We keep the survey open and updated on GitHub at +https://github.com/codefuse-ai/Awesome-Code-LLM. + +
+
+ comment: Repo is available at https://github.com/codefuse-ai/Awesome-Code-LLM. + 8 figures, 10 tables, and 713 references +
+
+
+
+
+ + ♻ ☆ TIM: Teaching Large Language Models to Translate with Comparison AAAI 2024 + + +
+ Open-sourced large language models (LLMs) have demonstrated remarkable +efficacy in various tasks with instruction tuning. However, these models can +sometimes struggle with tasks that require more specialized knowledge such as +translation. One possible reason for such deficiency is that instruction tuning +aims to generate fluent and coherent text that continues from a given +instruction without being constrained by any task-specific requirements. +Moreover, it can be more challenging for tuning smaller LLMs with lower-quality +training data. To address this issue, we propose a novel framework using +examples in comparison to teach LLMs to learn translation. Our approach +involves presenting the model with examples of correct and incorrect +translations and using a preference loss to guide the model's learning. We +evaluate our method on WMT2022 test sets and show that it outperforms existing +methods. Our findings offer a new perspective on fine-tuning LLMs for +translation tasks and provide a promising solution for generating high-quality +translations. Please refer to Github for more details: +https://github.com/lemon0830/TIM. + +
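+ The abstract describes showing the model correct and incorrect translations and applying a preference loss. Below is a minimal sketch of one standard formulation of such a loss, a margin on sequence log-likelihoods; the actual loss and its weighting in TIM may differ.

    import torch.nn.functional as F

    def preference_loss(logp_correct, logp_incorrect, margin=1.0):
        # logp_correct / logp_incorrect: (batch,) sequence log-likelihoods under
        # the model for the preferred and dispreferred translations.
        return F.relu(margin - (logp_correct - logp_incorrect)).mean()

    # Typically combined with the usual token-level cross-entropy on the
    # reference translation, e.g. loss = ce + lambda_pref * preference_loss(...)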
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Bad Actor, Good Advisor: Exploring the Role of Large Language Models in + Fake News Detection AAAI 2024 + + +
+ Detecting fake news requires both a delicate sense of diverse clues and a profound understanding of the real-world background, which remains challenging for detectors based on small language models (SLMs) due to their knowledge and capability limitations. Recent advances in large language models (LLMs) have shown remarkable performance in various tasks, but whether and how LLMs could help with fake news detection remains underexplored. In this paper, we investigate the potential of LLMs in fake news detection. First, we conduct an empirical study and find that a sophisticated LLM such as GPT-3.5 could generally expose fake news and provide desirable multi-perspective rationales but still underperforms the basic SLM, fine-tuned BERT. Our subsequent analysis attributes such a gap to the LLM's inability to select and integrate rationales properly to reach a conclusion. Based on these findings, we propose that current LLMs may not substitute for fine-tuned SLMs in fake news detection but can be a good advisor for SLMs by providing multi-perspective instructive rationales. To instantiate this proposal, we design an adaptive rationale guidance network for fake news detection (ARG), in which SLMs selectively acquire insights on news analysis from the LLMs' rationales. We further derive a rationale-free version of ARG by distillation, namely ARG-D, which serves cost-sensitive scenarios without querying LLMs. Experiments on two real-world datasets demonstrate that ARG and ARG-D outperform three types of baseline methods, including SLM-based, LLM-based, and combinations of small and large language models.
+
+ comment: 16 pages, 5 figures, and 9 tables. To appear at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ All in How You Ask for It: Simple Black-Box Method for Jailbreak Attacks + + +
+ Large Language Models (LLMs) like ChatGPT face `jailbreak' challenges, where safeguards are bypassed to produce ethically harmful prompts. This study proposes a simple black-box method to effectively generate jailbreak prompts, overcoming the high complexity and computational costs associated with existing methods. The proposed technique iteratively rewrites harmful prompts into non-harmful expressions using the target LLM itself, based on the hypothesis that LLMs can directly sample expressions that bypass safeguards. Demonstrated through experiments with ChatGPT (GPT-3.5 and GPT-4) and Gemini-Pro, this method achieved an attack success rate of over 80% within an average of 5 iterations and remained effective despite model updates. The generated jailbreak prompts were naturally worded and concise; moreover, they were difficult to defend against. These results indicate that creating effective jailbreak prompts is simpler than previously considered, suggesting that black-box jailbreak attacks pose a more serious threat.
+
+ comment: 12 pages, 4 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ ETPNav: Evolving Topological Planning for Vision-Language Navigation in + Continuous Environments + + +
+ Vision-language navigation is a task that requires an agent to follow instructions to navigate in environments. It becomes increasingly crucial in the field of embodied AI, with potential applications in autonomous navigation, search and rescue, and human-robot interaction. In this paper, we propose to address a more practical yet challenging counterpart setting - vision-language navigation in continuous environments (VLN-CE). To develop a robust VLN-CE agent, we propose a new navigation framework, ETPNav, which focuses on two critical skills: 1) the capability to abstract environments and generate long-range navigation plans, and 2) the ability to perform obstacle-avoiding control in continuous environments. ETPNav performs online topological mapping of environments by self-organizing predicted waypoints along a traversed path, without prior environmental experience. This allows the agent to break down the navigation procedure into high-level planning and low-level control. Concurrently, ETPNav utilizes a transformer-based cross-modal planner to generate navigation plans based on topological maps and instructions. The plan is then performed through an obstacle-avoiding controller that leverages a trial-and-error heuristic to prevent navigation from getting stuck in obstacles. Experimental results demonstrate the effectiveness of the proposed method. ETPNav yields more than 10% and 20% improvements over prior state-of-the-art on the R2R-CE and RxR-CE datasets, respectively. Our code is available at https://github.com/MarSaKi/ETPNav.
+
+ comment: Project page: https://github.com/MarSaKi/ETPNav +
+
+
+
+
+ + ♻ ☆ A Taxonomy of Foundation Model based Systems through the Lens of + Software Architecture + + +
+ The recent release of large language model (LLM) based chatbots, such as +ChatGPT, has attracted huge interest in foundation models. It is widely +believed that foundation models will serve as the fundamental building blocks +for future AI systems. As foundation models are in their early stages, the +design of foundation model based systems has not yet been systematically +explored. There is limited understanding about the impact of introducing +foundation models in software architecture. Therefore, in this paper, we +propose a taxonomy of foundation model based systems, which classifies and +compares the characteristics of foundation models and design options of +foundation model based systems. Our taxonomy comprises three categories: the +pretraining and adaptation of foundation models, the architecture design of +foundation model based systems, and responsible-AI-by-design. This taxonomy can +serve as concrete guidance for making major architectural design decisions when +designing foundation model based systems and highlights trade-offs arising from +design decisions. + +
+
+
+
+
+ + ♻ ☆ ChatRule: Mining Logical Rules with Large Language Models for Knowledge + Graph Reasoning + + +
+ Logical rules are essential for uncovering the logical connections between +relations, which could improve reasoning performance and provide interpretable +results on knowledge graphs (KGs). Although there have been many efforts to +mine meaningful logical rules over KGs, existing methods suffer from +computationally intensive searches over the rule space and a lack of +scalability for large-scale KGs. Besides, they often ignore the semantics of +relations which is crucial for uncovering logical connections. Recently, large +language models (LLMs) have shown impressive performance in the field of +natural language processing and various applications, owing to their emergent +ability and generalizability. In this paper, we propose a novel framework, +ChatRule, unleashing the power of large language models for mining logical +rules over knowledge graphs. Specifically, the framework is initiated with an +LLM-based rule generator, leveraging both the semantic and structural +information of KGs to prompt LLMs to generate logical rules. To refine the +generated rules, a rule ranking module estimates the rule quality by +incorporating facts from existing KGs. Last, the ranked rules can be used to +conduct reasoning over KGs. ChatRule is evaluated on four large-scale KGs, +w.r.t. different rule quality metrics and downstream tasks, showing the +effectiveness and scalability of our method. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Empirical Study of Named Entity Recognition Performance Using + Distribution-aware Word Embedding + + +
+ With the fast development of deep learning techniques, Named Entity Recognition (NER) is becoming more and more important in information extraction tasks. The greatest difficulty the NER task faces is maintaining detectability even when the types of named entities and documents are unfamiliar. Recognizing that specificity information may capture potential meanings of a word and generate semantically related features for word embedding, we develop a distribution-aware word embedding and implement three different methods to make use of the distribution information in an NER framework. The results show that NER performance improves when word specificity is incorporated into existing NER methods.
+
+ comment: Want to correct +
+
+
+
+
+ + ♻ ☆ Using Twitter Data to Understand Public Perceptions of Approved versus + Off-label Use for COVID-19-related Medications + + +
+ Understanding public discourse on emergency use of unproven therapeutics is +crucial for monitoring safe use and combating misinformation. We developed a +natural language processing-based pipeline to comprehend public perceptions of +and stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter +over time. This retrospective study included 609,189 US-based tweets from +January 29, 2020, to November 30, 2021, about four drugs that garnered +significant public attention during the COVID-19 pandemic: (1) +Hydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2) +Molnupiravir and Remdesivir, FDA-approved treatments for eligible patients. +Time-trend analysis was employed to understand popularity trends and related +events. Content and demographic analyses were conducted to explore potential +rationales behind people's stances on each drug. Time-trend analysis indicated +that Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir +and Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and +Ivermectin discussions were highly politicized, related to conspiracy theories, +hearsay, and celebrity influences. The distribution of stances between the two +major US political parties was significantly different (P < .001); Republicans +were more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than +Democrats. People with healthcare backgrounds tended to oppose +Hydroxychloroquine (7%) more than the general population, while the general +population was more likely to support Ivermectin (14%). Our study found that +social media users have varying perceptions and stances on off-label versus +FDA-authorized drug use at different stages of COVID-19. This indicates that +health systems, regulatory agencies, and policymakers should design tailored +strategies to monitor and reduce misinformation to promote safe drug use. + +
+
+ comment: Full paper published in JAMIA +
+
+
+
+
+ + ♻ ☆ Streamlining Social Media Information Extraction for Public Health + Research with Deep Learning + + +
+ Objective: Social media-based public health research is crucial for epidemic surveillance, but most studies identify relevant corpora with keyword matching. This study develops a system to streamline the process of curating colloquial medical dictionaries. We demonstrate the pipeline by curating a UMLS-colloquial symptom dictionary from COVID-19-related tweets as proof of concept. Methods: COVID-19-related tweets from February 1, 2020, to April 30, 2022 were used. The pipeline includes three modules: a named entity recognition module to detect symptoms in tweets; an entity normalization module to aggregate detected entities; and a mapping module that iteratively maps entities to Unified Medical Language System concepts. A random sample of 500 entities was drawn from the final dictionary for accuracy validation. Additionally, we conducted a symptom frequency distribution analysis to compare our dictionary to a pre-defined lexicon from previous research. Results: We identified 498,480 unique symptom entity expressions from the tweets. Pre-processing reduced this number to 18,226. The final dictionary contains 38,175 unique expressions of symptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). The symptom distribution analysis found that our dictionary detects more symptoms and is effective at identifying psychiatric disorders like anxiety and depression, which are often missed by pre-defined lexicons. Conclusion: This study advances public health research by implementing a novel, systematic pipeline for curating symptom lexicons from social media data. The final lexicon's high accuracy, validated by medical professionals, underscores the potential of this methodology to reliably interpret and categorize vast amounts of unstructured social media data into actionable medical insights across diverse linguistic and regional landscapes.
+
+ comment: Updated full paper. Abstract presented at IEEE ICHI 2023 and AMIA + Annual Symposium 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 104 + +
+
+
+ + ☆ Exploring Simple Open-Vocabulary Semantic Segmentation + + +
+ Open-vocabulary semantic segmentation models aim to accurately assign a semantic label to each pixel in an image from a set of arbitrary open-vocabulary texts. In order to learn such pixel-level alignment, current approaches typically rely on a combination of (i) an image-level VL model (e.g. CLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this paper, we introduce S-Seg, a novel model that can achieve surprisingly strong performance without depending on any of the above elements. S-Seg leverages pseudo-masks and language to train a MaskFormer, and can be easily trained from publicly available image-text datasets. Contrary to prior works, our model directly trains for pixel-level features and language alignment. Once trained, S-Seg generalizes well to multiple testing datasets without requiring fine-tuning. In addition, S-Seg has the extra benefits of scalability with data and consistent improvement when augmented with self-training. We believe that our simple yet effective approach will serve as a solid baseline for future research.
+
+ comment: Code is available at: https://github.com/zlai0/S-Seg +
+
+
+
+
+ + ☆ Less Could Be Better: Parameter-efficient Fine-tuning Advances Medical + Vision Foundation Models + + +
+ Parameter-efficient fine-tuning (PEFT) that was initially developed for +exploiting pre-trained large language models has recently emerged as an +effective approach to perform transfer learning on computer vision tasks. +However, the effectiveness of PEFT on medical vision foundation models is still +unclear and remains to be explored. As a proof of concept, we conducted a +detailed empirical study on applying PEFT to chest radiography foundation +models. Specifically, we delved into LoRA, a representative PEFT method, and +compared it against full-parameter fine-tuning (FFT) on two self-supervised +radiography foundation models across three well-established chest radiograph +datasets. Our results showed that LoRA outperformed FFT in 13 out of 18 +transfer learning tasks by at most 2.9% using fewer than 1% tunable parameters. +Combining LoRA with foundation models, we set up new state-of-the-art on a +range of data-efficient learning tasks, such as an AUROC score of 80.6% using +1% labeled data on NIH ChestX-ray14. We hope this study can evoke more +attention from the community in the use of PEFT for transfer learning on +medical imaging tasks. Code and models are available at +https://github.com/RL4M/MED-PEFT. + +
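+ For context on the PEFT method compared against full fine-tuning above, below is a minimal sketch of the standard LoRA idea: freeze a pretrained linear layer and learn only a low-rank update. The rank, scaling, and initialization are common illustrative defaults, not the study's exact configuration.

    import torch
    import torch.nn as nn

    class LoRALinear(nn.Module):
        # Frozen pretrained weight plus a trainable low-rank update (B @ A).
        def __init__(self, linear: nn.Linear, rank=8, alpha=16):
            super().__init__()
            self.base = linear
            for p in self.base.parameters():
                p.requires_grad = False                       # keep W frozen
            self.A = nn.Parameter(torch.randn(rank, linear.in_features) * 0.01)
            self.B = nn.Parameter(torch.zeros(linear.out_features, rank))
            self.scale = alpha / rank

        def forward(self, x):
            return self.base(x) + self.scale * ((x @ self.A.T) @ self.B.T)

+ Only A and B receive gradients, typically well under 1% of the model's parameters, which is the regime the reported results refer to.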
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Connecting the Dots: Leveraging Spatio-Temporal Graph Neural Networks + for Accurate Bangla Sign Language Recognition + + +
+ Recent advances in Deep Learning and Computer Vision have been successfully +leveraged to serve marginalized communities in various contexts. One such area +is Sign Language - a primary means of communication for the deaf community. +However, so far, the bulk of research efforts and investments have gone into +American Sign Language, and research activity into low-resource sign languages +- especially Bangla Sign Language - has lagged significantly. In this research +paper, we present a new word-level Bangla Sign Language dataset - BdSL40 - +consisting of 611 videos over 40 words, along with two different approaches: +one with a 3D Convolutional Neural Network model and another with a novel Graph +Neural Network approach for the classification of BdSL40 dataset. This is the +first study on word-level BdSL recognition, and the dataset was transcribed +from Indian Sign Language (ISL) using the Bangla Sign Language Dictionary +(1997). The proposed GNN model achieved an F1 score of 89%. The study +highlights the significant lexical and semantic similarity between BdSL, West +Bengal Sign Language, and ISL, and the lack of word-level datasets for BdSL in +the literature. We release the dataset and source code to stimulate further +research. + +
+
+
+
+
+ + ☆ CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation + + +
+ Chest X-rays (CXRs) are the most frequently performed imaging test in +clinical practice. Recent advances in the development of vision-language +foundation models (FMs) give rise to the possibility of performing automated +CXR interpretation, which can assist physicians with clinical decision-making +and improve patient outcomes. However, developing FMs that can accurately +interpret CXRs is challenging due to the (1) limited availability of +large-scale vision-language datasets in the medical image domain, (2) lack of +vision and language encoders that can capture the complexities of medical data, +and (3) absence of evaluation frameworks for benchmarking the abilities of FMs +on CXR interpretation. In this work, we address these challenges by first +introducing \emph{CheXinstruct} - a large-scale instruction-tuning dataset +curated from 28 publicly-available datasets. We then present \emph{CheXagent} - +an instruction-tuned FM capable of analyzing and summarizing CXRs. To build +CheXagent, we design a clinical large language model (LLM) for parsing +radiology reports, a vision encoder for representing CXR images, and a network +to bridge the vision and language modalities. Finally, we introduce +\emph{CheXbench} - a novel benchmark designed to systematically evaluate FMs +across 8 clinically-relevant CXR interpretation tasks. Extensive quantitative +evaluations and qualitative reviews with five expert radiologists demonstrate +that CheXagent outperforms previously-developed general- and medical-domain FMs +on CheXbench tasks. Furthermore, in an effort to improve model transparency, we +perform a fairness evaluation across factors of sex, race and age to highlight +potential performance disparities. Our project is at +\url{https://stanford-aimi.github.io/chexagent.html}. + +
+
+ comment: 24 pages, 8 figures +
+
+
+
+
+ + ☆ OK-Robot: What Really Matters in Integrating Open-Knowledge Models for + Robotics + + +
+ Remarkable progress has been made in recent years in the fields of vision, language, and robotics. We now have vision models capable of recognizing objects based on language queries, navigation systems that can effectively control mobile systems, and grasping models that can handle a wide range of objects. Despite these advancements, general-purpose applications of robotics still lag behind, even though they rely on these fundamental capabilities of recognition, navigation, and grasping. In this paper, we adopt a systems-first approach to develop a new Open Knowledge-based robotics framework called OK-Robot. By combining Vision-Language Models (VLMs) for object detection, navigation primitives for movement, and grasping primitives for object manipulation, OK-Robot offers an integrated solution for pick-and-drop operations without requiring any training. To evaluate its performance, we run OK-Robot in 10 real-world home environments. The results demonstrate that OK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks, representing a new state-of-the-art in Open Vocabulary Mobile Manipulation (OVMM) with nearly 1.8x the performance of prior work. In cleaner, uncluttered environments, OK-Robot's performance increases to 82%. However, the most important insight gained from OK-Robot is the critical role of nuanced details when combining Open Knowledge systems like VLMs with robotic modules. Videos of our experiments are available on our website: https://ok-robot.github.io
+
+
+
+
+ + ☆ LONEStar: The Lunar Flashlight Optical Navigation Experiment + + +
+ This paper documents the results from the highly successful Lunar flashlight +Optical Navigation Experiment with a Star tracker (LONEStar). Launched in +December 2022, Lunar Flashlight (LF) was a NASA-funded technology demonstration +mission. After a propulsion system anomaly prevented capture in lunar orbit, LF +was ejected from the Earth-Moon system and into heliocentric space. NASA +subsequently transferred ownership of LF to Georgia Tech to conduct an unfunded +extended mission to demonstrate further advanced technology objectives, +including LONEStar. From August-December 2023, the LONEStar team performed +on-orbit calibration of the optical instrument and a number of different OPNAV +experiments. This campaign included the processing of nearly 400 images of star +fields, Earth and Moon, and four other planets (Mercury, Mars, Jupiter, and +Saturn). LONEStar provided the first on-orbit demonstrations of heliocentric +navigation using only optical observations of planets. Of special note is the +successful in-flight demonstration of (1) instantaneous triangulation with +simultaneous sightings of two planets with the LOST algorithm and (2) dynamic +triangulation with sequential sightings of multiple planets. + +
+
+
+
+
+ + ☆ Broiler-Net: A Deep Convolutional Framework for Broiler Behavior + Analysis in Poultry Houses + + +
+ Detecting anomalies in poultry houses is crucial for maintaining optimal +chicken health conditions, minimizing economic losses and bolstering +profitability. This paper presents a novel real-time framework for analyzing +chicken behavior in cage-free poultry houses to detect abnormal behaviors. +Specifically, two significant abnormalities, namely inactive broiler and +huddling behavior, are investigated in this study. The proposed framework +comprises three key steps: (1) chicken detection utilizing a state-of-the-art +deep learning model, (2) tracking individual chickens across consecutive frames +with a fast tracker module, and (3) detecting abnormal behaviors within the +video stream. Experimental studies are conducted to evaluate the efficacy of +the proposed algorithm in accurately assessing chicken behavior. The results +illustrate that our framework provides a precise and efficient solution for +real-time anomaly detection, facilitating timely interventions to maintain +chicken health and enhance overall productivity on poultry farms. Github: +https://github.com/TaherehZarratEhsan/Chicken-Behavior-Analysis + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Single-View 3D Human Digitalization with Large Reconstruction Models + + +
+ In this paper, we introduce Human-LRM, a single-stage feed-forward Large +Reconstruction Model designed to predict human Neural Radiance Fields (NeRF) +from a single image. Our approach demonstrates remarkable adaptability in +training using extensive datasets containing 3D scans and multi-view capture. +Furthermore, to enhance the model's applicability for in-the-wild scenarios +especially with occlusions, we propose a novel strategy that distills +multi-view reconstruction into single-view via a conditional triplane diffusion +model. This generative extension addresses the inherent variations in human +body shapes when observed from a single view, and makes it possible to +reconstruct the full body human from an occluded image. Through extensive +experiments, we show that Human-LRM surpasses previous methods by a significant +margin on several benchmarks. + +
+
+
+
+
+ + ☆ SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning + Capabilities + + +
+ Understanding and reasoning about spatial relationships is a fundamental +capability for Visual Question Answering (VQA) and robotics. While Vision +Language Models (VLM) have demonstrated remarkable performance in certain VQA +benchmarks, they still lack capabilities in 3D spatial reasoning, such as +recognizing quantitative relationships of physical objects like distances or +size differences. We hypothesize that VLMs' limited spatial reasoning +capability is due to the lack of 3D spatial knowledge in training data and aim +to solve this problem by training VLMs with Internet-scale spatial reasoning +data. To this end, we present a system to facilitate this approach. We first +develop an automatic 3D spatial VQA data generation framework that scales up to +2 billion VQA examples on 10 million real-world images. We then investigate +various factors in the training recipe, including data quality, training +pipeline, and VLM architecture. Our work features the first internet-scale 3D +spatial reasoning dataset in metric space. By training a VLM on such data, we +significantly enhance its ability on both qualitative and quantitative spatial +VQA. Finally, we demonstrate that this VLM unlocks novel downstream +applications in chain-of-thought spatial reasoning and robotics due to its +quantitative estimation capability. Project website: +https://spatial-vlm.github.io/ + +
+
+
+
+
+ + ☆ Semi-supervised segmentation of land cover images using nonlinear + canonical correlation analysis with multiple features and t-SNE + + +
+ Image segmentation is a clustering task whereby each pixel is assigned a cluster label. Remote sensing data usually consist of multiple bands of spectral images containing semantically meaningful land cover subregions, co-registered with other source data such as LIDAR (LIght Detection And Ranging) data, where available. This suggests that, in order to account for spatial correlation between pixels, the feature vector associated with each pixel may be a vectorized tensor representing the multiple bands and a local patch, as appropriate. Similarly, multiple types of texture features based on a pixel's local patch are beneficial for encoding local statistical information and spatial variations. Labelling a large amount of ground truth pixel-wise and then training a supervised model, however, is sometimes impractical. In this work, a new semi-supervised segmentation approach is proposed that requires labelling only a small quantity of pixels. Initially, an image data matrix is created over all pixels in a high-dimensional feature space. Then, t-SNE projects the high-dimensional data onto a 3D embedding. Using radial basis functions centred on the labelled data samples as input features, paired with the output class labels, a modified canonical correlation analysis algorithm, referred to as RBF-CCA, is introduced which learns the associated projection matrix from the small labelled data set. The associated canonical variates, obtained for the full image, are then clustered with the k-means algorithm. The proposed semi-supervised RBF-CCA algorithm has been implemented on several remotely sensed multispectral images, demonstrating excellent segmentation results.
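+ A hedged sketch of the labelled-pixel stage described above, assuming the pixels have already been embedded (e.g. by t-SNE): build RBF features centred on the labelled samples, fit CCA against one-hot labels, project all pixels, and cluster with k-means. Scikit-learn's standard CCA stands in for the paper's modified RBF-CCA, and the gamma and component counts are illustrative.

    import numpy as np
    from sklearn.cross_decomposition import CCA
    from sklearn.cluster import KMeans

    def rbf_features(X, centres, gamma=1.0):
        # One RBF feature per labelled centre.
        d2 = ((X[:, None, :] - centres[None, :, :]) ** 2).sum(-1)
        return np.exp(-gamma * d2)

    def semi_supervised_segment(X_all, X_lab, y_lab, n_classes, n_components=3):
        # X_all: (N, d) embedded pixels; X_lab, y_lab: small labelled subset.
        Y = np.eye(n_classes)[y_lab]                       # one-hot labels
        cca = CCA(n_components=min(n_components, n_classes))
        cca.fit(rbf_features(X_lab, X_lab), Y)
        Z_all = cca.transform(rbf_features(X_all, X_lab))  # canonical variates
        return KMeans(n_clusters=n_classes, n_init=10).fit_predict(Z_all)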
+
+
+
+
+ + ☆ Automated facial recognition system using deep learning for pain + assessment in adults with cerebral palsy + + +
+ Background: Pain assessment in individuals with neurological conditions, +especially those with limited self-report ability and altered facial +expressions, presents challenges. Existing measures, relying on direct +observation by caregivers, lack sensitivity and specificity. In cerebral palsy, +pain is a common comorbidity and a reliable evaluation protocol is crucial. +Thus, having an automatic system that recognizes facial expressions could be of +enormous help when diagnosing pain in this type of patient. + Objectives: 1) to build a dataset of facial pain expressions in individuals +with cerebral palsy, and 2) to develop an automated facial recognition system +based on deep learning for pain assessment addressed to this population. + Methods: Ten neural networks were trained on three pain image databases, +including the UNBC-McMaster Shoulder Pain Expression Archive Database, the +Multimodal Intensity Pain Dataset, and the Delaware Pain Database. +Additionally, a curated dataset (CPPAIN) was created, consisting of 109 +preprocessed facial pain expression images from individuals with cerebral +palsy, categorized by two physiotherapists using the Facial Action Coding +System observational scale. + Results: InceptionV3 exhibited promising performance on the CP-PAIN dataset, +achieving an accuracy of 62.67% and an F1 score of 61.12%. Explainable +artificial intelligence techniques revealed consistent essential features for +pain identification across models. + Conclusion: This study demonstrates the potential of deep learning models for +robust pain detection in populations with neurological conditions and +communication disabilities. The creation of a larger dataset specific to +cerebral palsy would further enhance model accuracy, offering a valuable tool +for discerning subtle and idiosyncratic pain expressions. The insights gained +could extend to other complex neurological conditions. + +
+
+
+
+
+ + ☆ VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear + Responses in VR Stand-up Interactive Games + + +
+ Understanding and recognizing emotions are important and challenging issues +in the metaverse era. Understanding, identifying, and predicting fear, which is +one of the fundamental human emotions, in virtual reality (VR) environments +plays an essential role in immersive game development, scene development, and +next-generation virtual human-computer interaction applications. In this +article, we used VR horror games as a medium to analyze fear emotions by +collecting multi-modal data (posture, audio, and physiological signals) from 23 +players. We used an LSTM-based model to predict fear with accuracies of 65.31% +and 90.47% under 6-level classification (no fear and five different levels of +fear) and 2-level classification (no fear and fear), respectively. We +constructed a multi-modal natural behavior dataset of immersive human fear +responses (VRMN-bD) and compared it with existing relevant advanced datasets. +The results show that our dataset has fewer limitations in terms of collection +method, data scale and audience scope. We are unique and advanced in targeting +multi-modal datasets of fear and behavior in VR stand-up interactive +environments. Moreover, we discussed the implications of this work for +communities and applications. The dataset and pre-trained model are available +at https://github.com/KindOPSTAR/VRMN-bD. + +
+
+ comment: Accepted to IEEE VR 2024 +
+
+
+
+
+ + ☆ Out-of-Distribution Detection & Applications With Ablated Learned + Temperature Energy + + +
+ As deep neural networks become adopted in high-stakes domains, it is crucial +to be able to identify when inference inputs are Out-of-Distribution (OOD) so +that users can be alerted of likely drops in performance and calibration +despite high confidence. Among many others, existing methods use the following +two scores to do so without training on any apriori OOD examples: a learned +temperature and an energy score. In this paper we introduce Ablated Learned +Temperature Energy (or "AbeT" for short), a method which combines these prior +methods in novel ways with effective modifications. Due to these contributions, +AbeT lowers the False Positive Rate at $95\%$ True Positive Rate (FPR@95) by +$35.39\%$ in classification (averaged across all ID and OOD datasets measured) +compared to state of the art without training networks in multiple stages or +requiring hyperparameters or test-time backward passes. We additionally provide +empirical insights as to how our model learns to distinguish between +In-Distribution (ID) and OOD samples while only being explicitly trained on ID +samples via exposure to misclassified ID examples at training time. Lastly, we +show the efficacy of our method in identifying predicted bounding boxes and +pixels corresponding to OOD objects in object detection and semantic +segmentation, respectively - with an AUROC increase of $5.15\%$ in object +detection and both a decrease in FPR@95 of $41.48\%$ and an increase in AUPRC +of $34.20\%$ on average in semantic segmentation compared to previous state of +the art. + +
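+ The two ingredients named above, an energy score and a (possibly learned) temperature, can be illustrated with the standard energy-based OOD score below. How AbeT ablates and combines them is the paper's contribution and is not reproduced here; the thresholding convention is the usual one.

    import torch

    def energy_score(logits, temperature=1.0):
        # E(x) = -T * logsumexp(logits / T); in-distribution inputs tend to get
        # lower (more negative) energy, OOD inputs higher energy.
        return -temperature * torch.logsumexp(logits / temperature, dim=-1)

    def flag_ood(logits, threshold, temperature=1.0):
        # Threshold is usually chosen on validation data, e.g. at 95% TPR for ID.
        return energy_score(logits, temperature) > threshold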
+
+
+
+
+ + ☆ DeepCERES: A Deep learning method for cerebellar lobule segmentation + using ultra-high resolution multimodal MRI + + +
+ This paper introduces a novel multimodal and high-resolution human brain +cerebellum lobule segmentation method. Unlike current tools that operate at +standard resolution ($1 \text{ mm}^{3}$) or using mono-modal data, the proposed +method improves cerebellum lobule segmentation through the use of a multimodal +and ultra-high resolution ($0.125 \text{ mm}^{3}$) training dataset. To develop +the method, first, a database of semi-automatically labelled cerebellum lobules +was created to train the proposed method with ultra-high resolution T1 and T2 +MR images. Then, an ensemble of deep networks has been designed and developed, +allowing the proposed method to excel in the complex cerebellum lobule +segmentation task, improving precision while being memory efficient. Notably, +our approach deviates from the traditional U-Net model by exploring alternative +architectures. We have also integrated deep learning with classical machine +learning methods incorporating a priori knowledge from multi-atlas +segmentation, which improved precision and robustness. Finally, a new online +pipeline, named DeepCERES, has been developed to make available the proposed +method to the scientific community requiring as input only a single T1 MR image +at standard resolution. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ CloSe: A 3D Clothing Segmentation Dataset and Model + + +
+ 3D clothing modeling and datasets play a crucial role in the entertainment, animation, and digital fashion industries. Existing work often lacks detailed semantic understanding or uses synthetic datasets, lacking realism and personalization. To address this, we first introduce CloSe-D: a novel large-scale dataset containing 3D clothing segmentation of 3167 scans, covering a range of 18 distinct clothing classes. Additionally, we propose CloSe-Net, the first learning-based 3D clothing segmentation model for fine-grained segmentation from colored point clouds. CloSe-Net uses local point features, body-clothing correlation, and a garment-class and point features-based attention module, improving performance over baselines and prior work. The proposed attention module enables our model to learn appearance- and geometry-dependent clothing priors from data. We further validate the efficacy of our approach by successfully segmenting publicly available datasets of people in clothing. We also introduce CloSe-T, a 3D interactive tool for refining segmentation labels. Combining the tool with CloSe-Net in a continual learning setup demonstrates improved generalization on real-world data. Dataset, model, and tool can be found at https://virtualhumans.mpi-inf.mpg.de/close3dv24/.
+
+
+
+
+ + ☆ HomeRobot Open Vocabulary Mobile Manipulation Challenge 2023 Participant + Report (Team KuzHum) + + +
+ We report improvements to the NeurIPS 2023 HomeRobot: Open Vocabulary Mobile Manipulation (OVMM) Challenge reinforcement learning baseline. More specifically, we propose a more accurate semantic segmentation module, along with a better place-skill policy and a high-level heuristic, which together outperform the baseline by 2.4% in overall success rate (a sevenfold improvement) and 8.2% in partial success rate (a 1.75-fold improvement) on the Test Standard split of the challenge dataset. With the aforementioned enhancements incorporated, our agent scored 3rd place in the challenge on both the simulation and real-world stages.
+
+
+
+
+ + ☆ Look, Listen and Recognise: Character-Aware Audio-Visual Subtitling ICASSP 2024 + + +
+ The goal of this paper is automatic character-aware subtitle generation. +Given a video and a minimal amount of metadata, we propose an audio-visual +method that generates a full transcript of the dialogue, with precise speech +timestamps, and the character speaking identified. The key idea is to first use +audio-visual cues to select a set of high-precision audio exemplars for each +character, and then use these exemplars to classify all speech segments by +speaker identity. Notably, the method does not require face detection or +tracking. We evaluate the method over a variety of TV sitcoms, including +Seinfeld, Fraiser and Scrubs. We envision this system being useful for the +automatic generation of subtitles to improve the accessibility of the vast +amount of videos available on modern streaming services. Project page : +\url{https://www.robots.ox.ac.uk/~vgg/research/look-listen-recognise/} + +
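+ A minimal sketch of the second stage described above: once high-precision exemplar speech embeddings have been collected for each character, every remaining speech segment can be assigned to the closest character, here by cosine similarity to per-character mean embeddings. The embedding model and exemplar selection are outside this sketch, and nearest-mean matching is an illustrative choice rather than the paper's exact classifier.

    import numpy as np

    def assign_speakers(segment_embs, exemplars):
        # segment_embs: (N, d) embeddings of speech segments to label.
        # exemplars: dict mapping character name -> (k, d) exemplar embeddings.
        names = list(exemplars)
        protos = np.stack([exemplars[n].mean(axis=0) for n in names])
        protos /= np.linalg.norm(protos, axis=1, keepdims=True)
        segs = segment_embs / np.linalg.norm(segment_embs, axis=1, keepdims=True)
        sims = segs @ protos.T                      # cosine similarity (N, chars)
        return [names[i] for i in sims.argmax(axis=1)]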
+
+ comment: Accepted for publication in ICASSP 2024 +
+
+
+
+
+ + ☆ Momentum-SAM: Sharpness Aware Minimization without Computational + Overhead + + +
+ The recently proposed optimization algorithm for deep neural networks, Sharpness Aware Minimization (SAM), suggests perturbing parameters before the gradient calculation by a gradient ascent step to guide the optimization into parameter space regions of flat loss. While significant generalization improvements, and thus reduced overfitting, have been demonstrated, the computational cost is doubled due to the additionally needed gradient calculation, making SAM infeasible when computational capacity is limited. Motivated by Nesterov Accelerated Gradient (NAG), we propose Momentum-SAM (MSAM), which perturbs parameters in the direction of the accumulated momentum vector to achieve low sharpness without significant computational overhead or memory demands over SGD or Adam. We evaluate MSAM in detail and reveal insights on separable mechanisms of NAG, SAM and MSAM regarding training optimization and generalization. Code is available at https://github.com/MarlonBecker/MSAM.
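+ A hedged sketch of the idea as stated in the abstract: perturb the parameters along the optimizer's accumulated momentum before the (single) gradient computation, then undo the perturbation and take the normal step. The sign convention, normalisation, and rho value below are assumptions for illustration and may differ from the paper's exact update; see the released code for the real rule.

    import torch

    def msam_step(model, optimizer, loss_fn, inputs, targets, rho=0.3):
        # 1) Perturb parameters along the (normalised) accumulated momentum.
        perturbations = []
        for p in model.parameters():
            buf = optimizer.state[p].get("momentum_buffer")
            e = torch.zeros_like(p) if buf is None else rho * buf / (buf.norm() + 1e-12)
            p.data.add_(e)
            perturbations.append(e)
        # 2) Single forward/backward pass at the perturbed point.
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        # 3) Undo the perturbation and apply the usual optimizer step.
        for p, e in zip(model.parameters(), perturbations):
            p.data.sub_(e)
        optimizer.step()
        return loss.item()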
+
+
+
+
+ + ☆ Stereo-Matching Knowledge Distilled Monocular Depth Estimation Filtered + by Multiple Disparity Consistency ICASSP 2024 + + +
+ In stereo-matching knowledge distillation methods of the self-supervised +monocular depth estimation, the stereo-matching network's knowledge is +distilled into a monocular depth network through pseudo-depth maps. In these +methods, the learning-based stereo-confidence network is generally utilized to +identify errors in the pseudo-depth maps to prevent transferring the errors. +However, the learning-based stereo-confidence networks should be trained with +ground truth (GT), which is not feasible in a self-supervised setting. In this +paper, we propose a method to identify and filter errors in the pseudo-depth +map using multiple disparity maps by checking their consistency without the +need for GT and a training process. Experimental results show that the proposed +method outperforms the previous methods and works well on various +configurations by filtering out erroneous areas where the stereo-matching is +vulnerable, especially such as textureless regions, occlusion boundaries, and +reflective surfaces. + +
+
+ comment: ICASSP 2024. The first two authors contributed equally
+
+
+
+
+ + ☆ Robustness to distribution shifts of compressed networks for edge + devices + + +
+ It is necessary to develop efficient DNNs deployed on edge devices with +limited computation resources. However, the compressed networks often execute +new tasks in the target domain, which is different from the source domain where +the original network is trained. It is important to investigate the robustness +of compressed networks in two types of data distribution shifts: domain shifts +and adversarial perturbations. In this study, we discover that compressed +models are less robust to distribution shifts than their original networks. +Interestingly, larger networks are more vulnerable to losing robustness than +smaller ones, even when they are compressed to a similar size as the smaller +networks. Furthermore, compact networks obtained by knowledge distillation are +much more robust to distribution shifts than pruned networks. Finally, +post-training quantization is a reliable method for achieving significant +robustness to distribution shifts, and it outperforms both pruned and distilled +models in terms of robustness. + +
+
+
+
+
+ + ☆ Modeling Stereo-Confidence Out of the End-to-End Stereo-Matching Network + via Disparity Plane Sweep AAAI 2024 + + +
+ We propose a novel stereo-confidence measure that can be computed externally to various stereo-matching networks, offering an alternative input modality to the cost volume for learning-based approaches, especially in safety-critical systems. Grounded in the foundational concepts of the disparity definition and the disparity plane sweep, the proposed stereo-confidence method is built upon the idea that any shift applied to a stereo-image pair should be reflected as a corresponding shift in the disparity map. Based on this idea, the proposed stereo-confidence method can be summarized in three steps. 1) Using the disparity plane sweep, multiple disparity maps are obtained and treated as a 3-D volume (the predicted disparity volume), analogous to how the cost volume is constructed. 2) One of these disparity maps serves as an anchor, allowing us to define a desirable (or ideal) disparity profile at every spatial point. 3) By comparing the desirable and predicted disparity profiles, we can quantify the level of matching ambiguity between the left and right images for confidence measurement. Extensive experimental results using various stereo-matching networks and datasets demonstrate that the proposed stereo-confidence method not only shows competitive performance on its own but also yields consistent performance improvements when it is used as an input modality for learning-based stereo-confidence methods.
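+ A minimal sketch of the consistency check implied by steps 1-3 above: if the pair is shifted by k pixels, the predicted disparity at a reliable pixel should move by k as well, so the deviation of the predicted profile from that ideal line can be turned into a confidence map. The aggregation (mean absolute deviation mapped through an exponential) is an illustrative choice, not the paper's exact measure.

    import numpy as np

    def plane_sweep_confidence(disparity_stack, shifts, anchor_idx=0, sigma=1.0):
        # disparity_stack: (K, H, W) disparity maps predicted after shifting the
        # stereo pair by shifts[k] pixels; shifts[anchor_idx] is the anchor run.
        anchor = disparity_stack[anchor_idx]
        offsets = np.asarray(shifts, dtype=float) - shifts[anchor_idx]
        ideal = anchor[None] + offsets[:, None, None]              # ideal profiles
        deviation = np.abs(disparity_stack - ideal).mean(axis=0)   # (H, W)
        return np.exp(-deviation / sigma)                          # high = consistent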
+
+ comment: AAAI 2024. The first two authors contributed equally +
+
+
+
+
+ + ☆ Scaling Face Interaction Graph Networks to Real World Scenes + + +
+ Accurately simulating real world object dynamics is essential for various +applications such as robotics, engineering, graphics, and design. To better +capture complex real dynamics such as contact and friction, learned simulators +based on graph networks have recently shown great promise. However, applying +these learned simulators to real scenes comes with two major challenges: first, +scaling learned simulators to handle the complexity of real world scenes which +can involve hundreds of objects each with complicated 3D shapes, and second, +handling inputs from perception rather than 3D state information. Here we +introduce a method which substantially reduces the memory required to run +graph-based learned simulators. Based on this memory-efficient simulation +model, we then present a perceptual interface in the form of editable NeRFs +which can convert real-world scenes into a structured representation that can +be processed by graph network simulator. We show that our method uses +substantially less memory than previous graph-based simulators while retaining +their accuracy, and that the simulators learned in synthetic environments can +be applied to real world scenes captured from multiple camera angles. This +paves the way for expanding the application of learned simulators to settings +where only perceptual information is available at inference time. + +
+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ Observation-Guided Meteorological Field Downscaling at Station Scale: A + Benchmark and a New Method + + +
+ Downscaling (DS) of meteorological variables involves obtaining +high-resolution states from low-resolution meteorological fields and is an +important task in weather forecasting. Previous methods based on deep learning +treat downscaling as a super-resolution task in computer vision and utilize +high-resolution gridded meteorological fields as supervision to improve +resolution at specific grid scales. However, this approach has struggled to +align with the continuous distribution characteristics of meteorological +fields, leading to an inherent systematic bias between the downscaled results +and the actual observations at meteorological stations. In this paper, we +extend meteorological downscaling to arbitrary scattered station scales, +establish a brand new benchmark and dataset, and retrieve meteorological states +at any given station location from a coarse-resolution meteorological field. +Inspired by data assimilation techniques, we integrate observational data into +the downscaling process, providing multi-scale observational priors. Building +on this foundation, we propose a new downscaling model based on hypernetwork +architecture, namely HyperDS, which efficiently integrates different +observational information into the model training, achieving continuous scale +modeling of the meteorological field. Through extensive experiments, our +proposed method outperforms other specially designed baseline models on +multiple surface variables. Notably, the mean squared error (MSE) for wind +speed and surface pressure improved by 67% and 19.5% compared to other methods. +We will release the dataset and code subsequently. + +
+
+
+
+
+ + ☆ Feature Denoising Diffusion Model for Blind Image Quality Assessment + + +
+ Blind Image Quality Assessment (BIQA) aims to evaluate image quality in line +with human perception, without reference benchmarks. Currently, deep learning +BIQA methods typically depend on using features from high-level tasks for +transfer learning. However, the inherent differences between BIQA and these +high-level tasks inevitably introduce noise into the quality-aware features. In +this paper, we take an initial step towards exploring the diffusion model for +feature denoising in BIQA, namely Perceptual Feature Diffusion for IQA +(PFD-IQA), which aims to remove noise from quality-aware features. +Specifically, (i) We propose a Perceptual Prior Discovery and Aggregation +module to establish two auxiliary tasks to discover potential low-level +features in images that are used to aggregate perceptual text conditions for +the diffusion model. (ii) We propose a Perceptual Prior-based Feature +Refinement strategy, which matches noisy features to predefined denoising +trajectories and then performs exact feature denoising based on text +conditions. Extensive experiments on eight standard BIQA datasets demonstrate +superior performance over state-of-the-art BIQA methods, i.e., achieving +PLCC values of 0.935 (vs. 0.905 on KADID) and 0.922 (vs. 0.894 on LIVEC). + +
+
+
+
+
+ + ☆ CMMMU: A Chinese Massive Multi-discipline Multimodal Understanding + Benchmark + + +
+ As the capabilities of large multimodal models (LMMs) continue to advance, +evaluating their performance has become an increasingly pressing need. Additionally, +there is an even larger gap in evaluating the advanced knowledge and reasoning +abilities of LMMs in non-English contexts such as Chinese. We introduce CMMMU, +a new Chinese Massive Multi-discipline Multimodal Understanding benchmark +designed to evaluate LMMs on tasks demanding college-level subject knowledge +and deliberate reasoning in a Chinese context. CMMMU is inspired by and +strictly follows the annotation and analysis pattern of MMMU. + CMMMU includes 12k manually collected multimodal questions from college +exams, quizzes, and textbooks, covering six core disciplines: Art & Design, +Business, Science, Health & Medicine, Humanities & Social Science, and Tech & +Engineering, like its companion, MMMU. These questions span 30 subjects and +comprise 39 highly heterogeneous image types, such as charts, diagrams, maps, +tables, music sheets, and chemical structures. + CMMMU focuses on complex perception and reasoning with domain-specific +knowledge in the Chinese context. We evaluate 11 open-source LMMs and the +proprietary GPT-4V(ision). Even GPT-4V achieves only 42% accuracy, +indicating a large space for improvement. CMMMU will help the community +build next-generation LMMs towards expert artificial intelligence and +promote the democratization of LMMs by providing diverse language contexts. + +
+
+
+
+
+ + ☆ Benchmarking Large Multimodal Models against Common Corruptions + + +
+ This technical report aims to fill a deficiency in the assessment of large +multimodal models (LMMs) by specifically examining the self-consistency of +their outputs when subjected to common corruptions. We investigate the +cross-modal interactions between text, image, and speech, encompassing four +essential generation tasks: text-to-image, image-to-text, text-to-speech, and +speech-to-text. We create a comprehensive benchmark, named MMCBench, that +covers more than 100 popular LMMs (over 150 model checkpoints in total). A +thorough evaluation under common corruptions is critical for practical +deployment and facilitates a better understanding of the reliability of +cutting-edge LMMs. The benchmarking code is available at +https://github.com/sail-sg/MMCBench + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ A Saliency Enhanced Feature Fusion based multiscale RGB-D Salient Object + Detection Network ICASSP 2024 + + +
+ Multiscale convolutional neural networks (CNNs) have demonstrated remarkable +capabilities in solving various vision problems. However, fusing features of +different scales always results in large model sizes, impeding the application +of multiscale CNNs in RGB-D saliency detection. In this paper, we propose a +customized feature fusion module, called Saliency Enhanced Feature Fusion +(SEFF), for RGB-D saliency detection. SEFF utilizes saliency maps of the +neighboring scales to enhance the necessary features for fusing, resulting in +more representative fused features. Our multiscale RGB-D saliency detector uses +SEFF and processes images at three different scales. SEFF is used to fuse the +features of RGB and depth images, as well as the features of decoders at +different scales. Extensive experiments on five benchmark datasets have +demonstrated the superiority of our method over ten SOTA saliency detectors. + +
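+ As a rough illustration of the fusion idea, the PyTorch sketch below re-weights RGB and
+ depth features with a saliency map predicted at a neighbouring scale before fusing them.
+ The layer choices (sigmoid gating, a single 3x3 convolution) are assumptions, not the
+ paper's exact SEFF design.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class SaliencyEnhancedFusion(nn.Module):
+    """Sketch of a SEFF-style block for RGB-D feature fusion."""
+
+    def __init__(self, c_rgb, c_depth, c_out):
+        super().__init__()
+        self.fuse = nn.Sequential(
+            nn.Conv2d(c_rgb + c_depth, c_out, 3, padding=1),
+            nn.BatchNorm2d(c_out),
+            nn.ReLU(inplace=True),
+        )
+
+    def forward(self, f_rgb, f_depth, saliency_neighbor):
+        # Resize the neighbouring-scale saliency map to the current resolution.
+        s = torch.sigmoid(F.interpolate(saliency_neighbor, size=f_rgb.shape[-2:],
+                                        mode='bilinear', align_corners=False))
+        # Enhance both modalities with the saliency prior, then fuse.
+        enhanced = torch.cat([f_rgb + f_rgb * s, f_depth + f_depth * s], dim=1)
+        return self.fuse(enhanced)
+```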
+
+ comment: Accepted by 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Large receptive field strategy and important feature extraction strategy + in 3D object detection + + +
+ The enhancement of 3D object detection is pivotal for precise environmental +perception and improved task execution capabilities in autonomous driving. +LiDAR point clouds, offering accurate depth information, serve as a crucial +source of information for this purpose. Our study focuses on key challenges in 3D target +detection. To tackle the challenge of expanding the receptive field of a 3D +convolutional kernel, we introduce the Dynamic Feature Fusion Module (DFFM). +This module achieves adaptive expansion of the 3D convolutional kernel's +receptive field, balancing the expansion with acceptable computational loads. +This innovation reduces operations, expands the receptive field, and allows the +model to dynamically adjust to different object requirements. Simultaneously, +we identify redundant information in 3D features. The Feature +Selection Module (FSM) quantitatively evaluates and eliminates unimportant +features, achieving the separation of output box fitting and feature +extraction. This innovation enables the detector to focus on critical features, +resulting in model compression, reduced computational burden, and minimized +candidate frame interference. Extensive experiments confirm that both DFFM and +FSM not only improve upon current benchmarks, particularly in small target +detection, but also accelerate network performance. Importantly, these modules +exhibit effective complementarity. + +
+
+
+
+
+ + ☆ A Training-Free Defense Framework for Robust Learned Image Compression + + +
+ We study the robustness of learned image compression models against +adversarial attacks and present a training-free defense technique based on +simple image transform functions. Recent learned image compression models are +vulnerable to adversarial attacks that result in poor compression rate, low +reconstruction quality, or weird artifacts. To address the limitations, we +propose a simple but effective two-way compression algorithm with random input +transforms, which is conveniently applicable to existing image compression +models. Unlike the na\"ive approaches, our approach preserves the original +rate-distortion performance of the models on clean images. Moreover, the +proposed algorithm requires no additional training or modification of existing +models, making it more practical. We demonstrate the effectiveness of the +proposed techniques through extensive experiments under multiple compression +models, evaluation metrics, and attack scenarios. + +
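+ The following sketch shows the general flavour of such a training-free defense: a random,
+ invertible resize applied before an off-the-shelf learned codec and undone after decoding,
+ which disrupts pixel-aligned adversarial perturbations. The specific transforms and the
+ paper's "two-way" selection step are not reproduced here; `codec` is a placeholder for any
+ compression model that returns a reconstruction.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def defended_compress(image, codec, scale_range=(0.9, 1.1)):
+    """Hedged sketch of compression behind a random input transform."""
+    _, _, h, w = image.shape
+    s = torch.empty(1).uniform_(*scale_range).item()    # random scale per call
+    resized = F.interpolate(image, scale_factor=s, mode='bilinear',
+                            align_corners=False)
+    decoded = codec(resized)                             # compress + reconstruct
+    # Undo the transform so the output is aligned with the original input.
+    return F.interpolate(decoded, size=(h, w), mode='bilinear',
+                         align_corners=False)
+```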
+
+ comment: 10 pages and 14 figures +
+
+
+
+
+ + ☆ Evaluating the Feasibility of Standard Facial Expression Recognition in + Individuals with Moderate to Severe Intellectual Disabilities + + +
+ Recent research has underscored the increasing preference of users for +human-like interactions with machines. Consequently, facial expression +recognition has gained significance as a means of endowing social robots with +the capacity to discern the emotional states of users. In this investigation, +we assess the suitability of deep learning approaches, known for their +remarkable performance in this domain, for recognizing facial expressions in +individuals with intellectual disabilities, which has not yet been studied in +the literature, to the best of our knowledge. To address this objective, we +train a set of twelve distinct convolutional neural networks using different +approaches, including an ensemble of datasets without individuals with +intellectual disabilities and a dataset featuring such individuals. Our +examination of the outcomes achieved by the various models under distinct +training conditions, coupled with a comprehensive analysis of critical facial +regions during expression recognition facilitated by explainable artificial +intelligence techniques, revealed significant distinctions in facial +expressions between individuals with and without intellectual disabilities, as +well as among individuals with intellectual disabilities. Remarkably, our +findings demonstrate the feasibility of facial expression recognition within +this population through tailored user-specific training methodologies, which +enable the models to effectively address the unique expressions of each user. + +
+
+
+
+
+ + ☆ Detect-Order-Construct: A Tree Construction based Approach for + Hierarchical Document Structure Analysis + + +
+ Document structure analysis (aka document layout analysis) is crucial for +understanding the physical layout and logical structure of documents, with +applications in information retrieval, document summarization, knowledge +extraction, etc. In this paper, we concentrate on Hierarchical Document +Structure Analysis (HDSA) to explore hierarchical relationships within +structured documents created using authoring software employing hierarchical +schemas, such as LaTeX, Microsoft Word, and HTML. To comprehensively analyze +hierarchical document structures, we propose a tree construction based approach +that addresses multiple subtasks concurrently, including page object detection +(Detect), reading order prediction of identified objects (Order), and the +construction of intended hierarchical structure (Construct). We present an +effective end-to-end solution based on this framework to demonstrate its +performance. To assess our approach, we develop a comprehensive benchmark +called Comp-HRDoc, which evaluates the above subtasks simultaneously. Our +end-to-end system achieves state-of-the-art performance on two large-scale +document layout analysis datasets (PubLayNet and DocLayNet), a high-quality +hierarchical document structure reconstruction dataset (HRDoc), and our +Comp-HRDoc benchmark. The Comp-HRDoc benchmark will be released to facilitate +further research in this field. + +
+
+ comment: Submitted to Pattern Recognition +
+
+
+
+
+ + ☆ LKFormer: Large Kernel Transformer for Infrared Image Super-Resolution + + +
+ Given the broad application of infrared technology across diverse fields, +there is an increasing emphasis on investigating super-resolution techniques +for infrared images within the realm of deep learning. Despite the impressive +results of current Transformer-based methods in image super-resolution tasks, +their reliance on the self-attentive mechanism intrinsic to the Transformer +architecture results in images being treated as one-dimensional sequences, +thereby neglecting their inherent two-dimensional structure. Moreover, infrared +images exhibit a uniform pixel distribution and a limited gradient range, +posing challenges for the model to capture effective feature information. +Consequently, we suggest a potent Transformer model, termed Large Kernel +Transformer (LKFormer), to address this issue. Specifically, we have designed a +Large Kernel Residual Depth-wise Convolutional Attention (LKRDA) module with +linear complexity. This mainly employs depth-wise convolution with large +kernels to execute non-local feature modeling, thereby substituting the +standard self-attentive layer. Additionally, we have devised a novel +feed-forward network structure called Gated-Pixel Feed-Forward Network (GPFN) +to augment the LKFormer's capacity to manage the information flow within the +network. Comprehensive experimental results reveal that our method surpasses +the most advanced techniques available, using fewer parameters and yielding +considerably superior performance. + +
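+ A minimal PyTorch sketch of the core ingredient described above: a depth-wise convolution
+ with a large kernel that models long-range context at linear cost and gates the input
+ features in place of self-attention. The kernel size and the gating form are assumptions,
+ not the exact LKRDA design.
+
+```python
+import torch
+import torch.nn as nn
+
+class LargeKernelDepthwiseAttention(nn.Module):
+    """Sketch of a large-kernel, depth-wise attention block."""
+
+    def __init__(self, channels, kernel_size=11):
+        super().__init__()
+        self.dw = nn.Conv2d(channels, channels, kernel_size,
+                            padding=kernel_size // 2, groups=channels)
+        self.pw = nn.Conv2d(channels, channels, 1)
+
+    def forward(self, x):
+        attn = self.pw(self.dw(x))                 # large receptive field, linear cost
+        return x + x * torch.sigmoid(attn)         # residual, gated re-weighting
+```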
+
+
+
+
+ + ☆ MOSformer: Momentum encoder-based inter-slice fusion transformer for + medical image segmentation + + +
+ Medical image segmentation plays an important role in various clinical +applications. Deep learning has emerged as the predominant solution for +automated segmentation of volumetric medical images. 2.5D-based segmentation +models bridge the computational efficiency of 2D-based models and the spatial +perception capabilities of 3D-based models. However, prevailing 2.5D-based +models often treat each slice equally, failing to effectively learn and exploit +inter-slice information, resulting in suboptimal segmentation performance. In +this paper, a novel Momentum encoder-based inter-slice fusion transformer +(MOSformer) is proposed to overcome this issue by leveraging inter-slice +information at multi-scale feature maps extracted by different encoders. +Specifically, dual encoders are employed to enhance feature distinguishability +among different slices. One of the encoders is moving-averaged to maintain the +consistency of slice representations. Moreover, an IF-Swin transformer module +is developed to fuse inter-slice multi-scale features. The MOSformer is +evaluated on three benchmark datasets (Synapse, ACDC, and AMOS), establishing a +new state-of-the-art with DSC of 85.63%, 92.19%, and 85.43%, respectively. +These promising results indicate its competitiveness in medical image +segmentation. Codes and models of MOSformer will be made publicly available +upon acceptance. + +
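+ The moving-averaged encoder mentioned above is typically maintained with an exponential
+ moving average (EMA) of the online encoder's weights, as in MoCo-style training. A short
+ sketch of that update follows (the momentum value is an assumption):
+
+```python
+import copy
+import torch
+
+@torch.no_grad()
+def momentum_update(encoder, momentum_encoder, m=0.999):
+    """EMA update: the momentum encoder slowly tracks the online encoder."""
+    for p, p_m in zip(encoder.parameters(), momentum_encoder.parameters()):
+        p_m.data.mul_(m).add_(p.data, alpha=1.0 - m)
+
+# usage sketch:
+# momentum_encoder = copy.deepcopy(encoder)
+# for p in momentum_encoder.parameters():
+#     p.requires_grad_(False)
+# ... after every optimizer step:
+# momentum_update(encoder, momentum_encoder)
+```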
+
+ comment: Under Review +
+
+
+
+
+ + ☆ SignVTCL: Multi-Modal Continuous Sign Language Recognition Enhanced by + Visual-Textual Contrastive Learning + + +
+ Sign language recognition (SLR) plays a vital role in facilitating +communication for the hearing-impaired community. SLR is a weakly supervised +task where entire videos are annotated with glosses, making it challenging to +identify the corresponding gloss within a video segment. Recent studies +indicate that the main bottleneck in SLR is the insufficient training caused by +the limited availability of large-scale datasets. To address this challenge, we +present SignVTCL, a multi-modal continuous sign language recognition framework +enhanced by visual-textual contrastive learning, which leverages the full +potential of multi-modal data and the generalization ability of language models. +SignVTCL integrates multi-modal data (video, keypoints, and optical flow) +simultaneously to train a unified visual backbone, thereby yielding more robust +visual representations. Furthermore, SignVTCL contains a visual-textual +alignment approach incorporating gloss-level and sentence-level alignment to +ensure precise correspondence between visual features and glosses at the level +of individual glosses and sentences. Experimental results conducted on three +datasets, Phoenix-2014, Phoenix-2014T, and CSL-Daily, demonstrate that SignVTCL +achieves state-of-the-art results compared with previous methods. + +
+
+
+
+
+ + ☆ Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field + Crop Yield Prediction + + +
+ Accurate crop yield prediction is of utmost importance for informed +decision-making in agriculture, aiding farmers, and industry stakeholders. +However, this task is complex and depends on multiple factors, such as +environmental conditions, soil properties, and management practices. Combining +heterogeneous data views poses a fusion challenge, like identifying the +view-specific contribution to the predictive task. We present a novel +multi-view learning approach to predict crop yield for different crops +(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our +multi-view input data includes multi-spectral optical images from Sentinel-2 +satellites and weather data as dynamic features during the crop growing season, +complemented by static features like soil properties and topographic +information. To effectively fuse the data, we introduce a Multi-view Gated +Fusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU) +module. The view-encoders handle the heterogeneity of data sources with varying +temporal resolutions by learning a view-specific representation. These +representations are adaptively fused via a weighted sum. The fusion weights are +computed for each sample by the GU using a concatenation of the +view-representations. The MVGF model is trained at sub-field level with 10 m +resolution pixels. Our evaluations show that the MVGF outperforms conventional +models on the same task, achieving the best results by incorporating all the +data sources, unlike the usual fusion results in the literature. For Argentina, +the MVGF model achieves an R2 value of 0.68 at sub-field yield prediction, +while at field level evaluation (comparing field averages), it reaches around +0.80 across different countries. The GU module learned different weights based +on the country and crop-type, aligning with the variable significance of each +data source to the prediction task. + +
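+ The gated fusion described above can be sketched as follows: per-sample weights are
+ predicted from the concatenated view representations and used in a weighted sum. The
+ softmax gating and the dimensions are assumptions about the MVGF design, not its exact
+ implementation.
+
+```python
+import torch
+import torch.nn as nn
+
+class GatedViewFusion(nn.Module):
+    """Sketch of a Gated Unit that adaptively fuses view representations."""
+
+    def __init__(self, dim, num_views):
+        super().__init__()
+        self.gate = nn.Linear(dim * num_views, num_views)
+
+    def forward(self, views):                      # list of (B, dim) tensors
+        stacked = torch.stack(views, dim=1)        # (B, V, dim)
+        weights = torch.softmax(self.gate(torch.cat(views, dim=-1)), dim=-1)
+        return (weights.unsqueeze(-1) * stacked).sum(dim=1)   # (B, dim)
+```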
+
+
+
+
+ + ☆ Unveiling the Human-like Similarities of Automatic Facial Expression + Recognition: An Empirical Exploration through Explainable AI + + +
+ Facial expression recognition is vital for human behavior analysis, and deep +learning has enabled models that can outperform humans. However, it is unclear +how closely they mimic human processing. This study aims to explore the +similarity between deep neural networks and human perception by comparing +twelve different networks, including both general object classifiers and +FER-specific models. We employ an innovative global explainable AI method to +generate heatmaps, revealing crucial facial regions for the twelve networks +trained on six facial expressions. We assess these results both quantitatively +and qualitatively, comparing them to ground-truth masks based on Friesen and +Ekman's description, as well as to one another. We use Intersection over Union (IoU) and +normalized correlation coefficients for comparisons. We generate 72 heatmaps to +highlight critical regions for each expression and architecture. Qualitatively, +models with pre-trained weights show more similarity in heatmaps compared to +those without pre-training. Specifically, eye and nose areas influence certain +facial expressions, while the mouth is consistently important across all models +and expressions. Quantitatively, we find low average IoU values (avg. 0.2702) +across all expressions and architectures. The best-performing architecture +averages 0.3269, while the worst-performing one averages 0.2066. Dendrograms, +built with the normalized correlation coefficient, reveal two main clusters for +most expressions: models with pre-training and models without pre-training. +Findings suggest limited alignment between human and AI facial expression +recognition, with network architectures influencing the similarity, as similar +architectures prioritize similar facial regions. + +
+
+
+
+
+ + ☆ A Fair Evaluation of Various Deep Learning-Based Document Image + Binarization Approaches + + +
+ Binarization of document images is an important pre-processing step in the +field of document analysis. Traditional image binarization techniques usually +rely on histograms or local statistics to identify a valid threshold to +differentiate between different aspects of the image. Deep learning techniques +are able to generate binarized versions of the images by learning +context-dependent features that are less error-prone to degradation typically +occurring in document images. In recent years, many deep learning-based methods +have been developed for document binarization. But which one to choose? There +have been no studies that compare these methods rigorously. Therefore, this +work focuses on the evaluation of different deep learning-based methods under +the same evaluation protocol. We evaluate them on different Document Image +Binarization Contest (DIBCO) datasets and obtain very heterogeneous results. We +show that the DE-GAN model was able to perform better compared to other models +when evaluated on the DIBCO2013 dataset while DP-LinkNet performed best on the +DIBCO2017 dataset. The 2-StageGAN performed best on the DIBCO2018 dataset while +SauvolaNet outperformed the others on the DIBCO2019 challenge. Finally, we make +the code, all models and evaluation publicly available +(https://github.com/RichSu95/Document_Binarization_Collection) to ensure +reproducibility and simplify future binarization evaluations. + +
+
+ comment: DAS 2022 +
+
+
+
+
+ + ☆ Rethinking Centered Kernel Alignment in Knowledge Distillation + + +
+ Knowledge distillation has emerged as a highly effective method for bridging +the representation discrepancy between large-scale models and lightweight +models. Prevalent approaches involve leveraging appropriate metrics to minimize +the divergence or distance between the knowledge extracted from the teacher +model and the knowledge learned by the student model. Centered Kernel Alignment +(CKA) is widely used to measure representation similarity and has been applied +in several knowledge distillation methods. However, these methods are complex +and fail to uncover the essence of CKA, thus not answering the question of how +to use CKA to achieve simple and effective distillation properly. This paper +first provides a theoretical perspective to illustrate the effectiveness of +CKA, which decouples CKA into the upper bound of Maximum Mean Discrepancy (MMD) +and a constant term. Drawing from this, we propose a novel Relation-Centered +Kernel Alignment (RCKA) framework, which practically establishes a connection +between CKA and MMD. Furthermore, we dynamically customize the application of +CKA based on the characteristics of each task, with less computational cost +yet performance comparable to previous methods. Extensive experiments +on CIFAR-100, ImageNet-1k, and MS-COCO demonstrate that our method achieves +state-of-the-art performance on almost all teacher-student pairs for image +classification and object detection, validating the effectiveness of our +approach. + +
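+ For reference, the standard linear CKA between two sets of representations (the quantity
+ the paper builds on, not the proposed RCKA itself) can be computed as below.
+
+```python
+import numpy as np
+
+def linear_cka(x, y):
+    """Linear Centered Kernel Alignment between (n_samples, dim) matrices."""
+    x = x - x.mean(axis=0, keepdims=True)          # center the features
+    y = y - y.mean(axis=0, keepdims=True)
+    hsic = np.linalg.norm(y.T @ x, 'fro') ** 2     # cross-covariance energy
+    norm_x = np.linalg.norm(x.T @ x, 'fro')
+    norm_y = np.linalg.norm(y.T @ y, 'fro')
+    return hsic / (norm_x * norm_y)
+```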
+
+
+
+
+ + ☆ Symbrain: A large-scale dataset of MRI images for neonatal brain + symmetry analysis + + +
+ This paper presents an annotated dataset of brain MRI images designed to +advance the field of brain symmetry study. Magnetic resonance imaging (MRI) has +gained interest in analyzing brain symmetry in neonatal infants, and challenges +remain due to the vast size differences between fetal and adult brains. +Classification methods for brain structural MRI use scales and visual cues to +assess hemisphere symmetry, which can help diagnose neonatal patients by +comparing hemispheres and anatomical regions of interest in the brain. Using +the Developing Human Connectome Project dataset, this work presents a dataset +comprising cerebral images extracted as slices across selected portions of +interest for clinical evaluation. All the extracted images are annotated with +the brain's midline. From the assumption that a decrease in symmetry is directly related to +possible clinical pathologies, the dataset can contribute to a more precise +diagnosis because it can be used to train deep learning models for anomaly detection in +neonatal cerebral MRI from postnatal infant scans. Such models learn to identify and classify anomalies by +identifying potential asymmetrical patterns in medical MRI images. Furthermore, +this dataset can contribute to the research and development of methods using +the relative symmetry of the two brain hemispheres for crucial diagnosis and +treatment planning. + +
+
+ comment: 7 pages, 2 figures, Dataset Paper, Medical AI +
+
+
+
+
+ + ☆ Local Agnostic Video Explanations: a Study on the Applicability of + Removal-Based Explanations to Video + + +
+ Explainable artificial intelligence techniques are becoming increasingly +important with the rise of deep learning applications in various domains. These +techniques aim to provide a better understanding of complex "black box" models +and enhance user trust while maintaining high learning performance. While many +studies have focused on explaining deep learning models in computer vision for +image input, video explanations remain relatively unexplored due to the +temporal dimension's complexity. In this paper, we present a unified framework +for local agnostic explanations in the video domain. Our contributions include: +(1) Extending a fine-grained explanation framework tailored for computer vision +data, (2) Adapting six existing explanation techniques to work on video data by +incorporating temporal information and enabling local explanations, and (3) +Conducting an evaluation and comparison of the adapted explanation methods +using different models and datasets. We discuss the possibilities and choices +involved in the removal-based explanation process for visual data. The +adaptation of six explanation methods for video is explained, with comparisons +to existing approaches. We evaluate the performance of the methods using +automated metrics and user-based evaluation, showing that 3D RISE, 3D LIME, and +3D Kernel SHAP outperform other methods. By decomposing the explanation process +into manageable steps, we facilitate the study of each choice's impact and +allow for further refinement of explanation methods to suit specific datasets +and models. + +
+
+
+
+
+ + ☆ SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic + Segmentation + + +
+ Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation +models using training image data with only image-level supervision. Since +precise pixel-level annotations are not accessible, existing methods typically +focus on producing pseudo masks for training segmentation models by refining +CAM-like heatmaps. However, the produced heatmaps may only capture +discriminative image regions of target object categories or the associated +co-occurring backgrounds. To address the issues, we propose a Semantic Prompt +Learning for WSSS (SemPLeS) framework, which learns to effectively prompt the +CLIP space to enhance the semantic alignment between the segmented regions and +the target object categories. More specifically, we propose Contrastive Prompt +Learning and Class-associated Semantic Refinement to learn the prompts that +adequately describe and suppress the image backgrounds associated with each +target object category. In this way, our proposed framework is able to perform +better semantic matching between object regions and the associated text labels, +resulting in desired pseudo masks for training the segmentation model. The +proposed SemPLeS framework achieves SOTA performance on the standard WSSS +benchmarks, PASCAL VOC and MS COCO, and demonstrated interpretability with the +semantic visualization of our learned prompts. The codes will be released. + +
+
+
+
+
+ + ☆ Deep Learning for Computer Vision based Activity Recognition and Fall + Detection of the Elderly: a Systematic Review + + +
+ As the percentage of elderly people in developed countries increases +worldwide, the healthcare of this collective is a worrying matter, especially +if it includes the preservation of their autonomy. In this direction, many +studies are being published on Ambient Assisted Living (AAL) systems, which +help to reduce the preoccupations raised by the independent living of the +elderly. In this study, a systematic review of the literature is presented on +fall detection and Human Activity Recognition (HAR) for the elderly, as the two +main tasks to solve to guarantee the safety of elderly people living alone. To +address the current tendency to perform these two tasks, the review focuses on +the use of Deep Learning (DL) based approaches on computer vision data. In +addition, different collections of data like DL models, datasets or hardware +(e.g. depth or thermal cameras) are gathered from the reviewed studies and +provided for reference in future studies. Strengths and weaknesses of existing +approaches are also discussed and, based on them, our recommendations for +future works are provided. + +
+
+
+
+
+ + ☆ Full-Body Motion Reconstruction with Sparse Sensing from Graph + Perspective + + +
+ Estimating 3D full-body pose from sparse sensor data is a pivotal technique +employed for the reconstruction of realistic human motions in Augmented Reality +and Virtual Reality. However, translating sparse sensor signals into +comprehensive human motion remains a challenge since the sparsely distributed +sensors in common VR systems fail to capture the motion of the full human body. In +this paper, we use a well-designed Body Pose Graph (BPG) to represent the human +body and recast the challenge as a missing-node prediction problem on the graph. +Then, we propose a novel full-body motion reconstruction framework based +on BPG. To establish BPG, nodes are initially endowed with features extracted +from sparse sensor signals. Features from identifiable joint nodes across +diverse sensors are amalgamated and processed from both temporal and spatial +perspectives. Temporal dynamics are captured using the Temporal Pyramid +Structure, while spatial relations in joint movements inform the spatial +attributes. The resultant features serve as the foundational elements of the +BPG nodes. To further refine the BPG, node features are updated through a graph +neural network that incorporates edges reflecting varying joint relations. Our +method's effectiveness is evidenced by the attained state-of-the-art +performance, particularly in lower body motion, outperforming other baseline +methods. Additionally, an ablation study validates the efficacy of each module +in our proposed framework. + +
+
+
+
+
+ + ☆ Collaborative Position Reasoning Network for Referring Image + Segmentation + + +
+ Given an image and a natural language expression as input, the goal of +referring image segmentation is to segment the foreground masks of the entities +referred to by the expression. Existing methods mainly focus on interactive +learning between vision and language to enhance the multi-modal representations +for global context reasoning. However, predicting directly in pixel-level space +can lead to collapsed positioning and poor segmentation results. Its main +challenge lies in how to explicitly model entity localization, especially for +non-salient entities. In this paper, we tackle this problem by executing a +Collaborative Position Reasoning Network (CPRN) via the proposed novel +Row-and-Column interactive (RoCo) and Guided Holistic interactive (Holi) +modules. Specifically, RoCo aggregates the visual features into row- and +column-wise features corresponding to the two directional axes, respectively. It offers +a fine-grained matching behavior that perceives the associations between the +linguistic features and two decoupled visual features to perform position +reasoning over a hierarchical space. Holi integrates features of the two +modalities by a cross-modal attention mechanism, which suppresses the +irrelevant redundancy under the guidance of positioning information from RoCo. +Thus, with the incorporation of RoCo and Holi modules, CPRN captures the visual +details of position reasoning so that the model can achieve more accurate +segmentation. To our knowledge, this is the first work that explicitly focuses +on position reasoning modeling. We also validate the proposed method on three +evaluation datasets. It consistently outperforms existing state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Concealed Object Segmentation with Hierarchical Coherence Modeling + + +
+ Concealed object segmentation (COS) is a challenging task that involves +localizing and segmenting those concealed objects that are visually blended +with their surrounding environments. Despite achieving remarkable success, +existing COS segmenters still struggle to achieve complete segmentation results +in extremely concealed scenarios. In this paper, we propose a Hierarchical +Coherence Modeling (HCM) segmenter for COS, aiming to address this incomplete +segmentation limitation. In specific, HCM promotes feature coherence by +leveraging the intra-stage coherence and cross-stage coherence modules, +exploring feature correlations at both the single-stage and contextual levels. +Additionally, we introduce the reversible re-calibration decoder to detect +previously undetected parts in low-confidence regions, resulting in further +enhancing segmentation performance. Extensive experiments conducted on three +COS tasks, including camouflaged object detection, polyp image segmentation, +and transparent object detection, demonstrate the promising results achieved by +the proposed HCM segmenter. + +
+
+ comment: Accepted to CICAI 2023. 13 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Boosting Multi-view Stereo with Late Cost Aggregation + + +
+ Pairwise matching cost aggregation is a crucial step for modern +learning-based Multi-view Stereo (MVS). Prior works adopt an early aggregation +scheme, which adds up pairwise costs into an intermediate cost. However, our +analysis shows that this process can degrade informative pairwise matchings, thereby +blocking the depth network from fully utilizing the original geometric matching +cues. To address this challenge, we present a late aggregation approach that +allows for aggregating pairwise costs throughout the network feed-forward +process, achieving accurate estimations with only minor changes to the plain +CasMVSNet. Instead of building an intermediate cost by weighted sum, late +aggregation preserves all pairwise costs along a distinct view channel. This +enables the succeeding depth network to fully utilize the crucial geometric +cues without loss of cost fidelity. Grounded in the new aggregation scheme, we +propose further techniques addressing view order dependence inside the +preserved cost, handling flexible testing views, and improving the depth +filtering process. Despite its technical simplicity, our method improves +significantly upon the baseline cascade-based approach, achieving results comparable +to state-of-the-art methods with favorable computational overhead. + +
+
+ comment: Code and models are available at https://github.com/Wuuu3511/LAMVSNET +
+
+
+
+
+ + ☆ Multi-level Cross-modal Alignment for Image Clustering + + +
+ Recently, the cross-modal pretraining model has been employed to produce +meaningful pseudo-labels to supervise the training of an image clustering +model. However, numerous erroneous alignments in a cross-modal pre-training +model could produce poor-quality pseudo-labels and degrade clustering +performance. To solve the aforementioned issue, we propose a novel +\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in +a cross-modal pretraining model for downstream tasks, by building a smaller but +better semantic space and aligning the images and texts in three levels, i.e., +instance-level, prototype-level, and semantic-level. Theoretical results show +that our proposed method converges, and suggests effective means to reduce the +expected clustering risk of our method. Experimental results on five benchmark +datasets clearly show the superiority of our new method. + +
+
+
+
+
+ + ☆ EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models ICLR 2024 + + +
+ Diffusion models have recently received increasing research attention for +their remarkable transfer abilities in semantic segmentation tasks. However, +generating fine-grained segmentation masks with diffusion models often requires +additional training on annotated datasets, leaving it unclear to what extent +pre-trained diffusion models alone understand the semantic relations of their +generated images. To address this question, we leverage the semantic knowledge +extracted from Stable Diffusion (SD) and aim to develop an image segmentor +capable of generating fine-grained segmentation maps without any additional +training. The primary difficulty stems from the fact that semantically +meaningful feature maps typically exist only in the spatially lower-dimensional +layers, which poses a challenge in directly extracting pixel-level semantic +relations from these feature maps. To overcome this issue, our framework +identifies semantic correspondences between image pixels and spatial locations +of low-dimensional feature maps by exploiting SD's generation process and +utilizes them for constructing image-resolution segmentation maps. In extensive +experiments, the produced segmentation maps are demonstrated to be well +delineated and capture detailed parts of the images, indicating the existence +of highly accurate pixel-level semantic knowledge in diffusion models. + +
+
+ comment: ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/ +
+
+
+
+
+ + ☆ MetaSeg: Content-Aware Meta-Net for Omni-Supervised Semantic + Segmentation + + +
+ Noisy labels, inevitably existing in pseudo segmentation labels generated +from weak object-level annotations, severely hamper model optimization for +semantic segmentation. Previous works often rely on massive hand-crafted losses +and carefully-tuned hyper-parameters to resist noise, suffering from poor +generalization capability and high model complexity. Inspired by recent +advances in meta learning, we argue that rather than struggling to tolerate +noise hidden behind clean labels passively, a more feasible solution would be +to find out the noisy regions actively, so as to simply ignore them during +model optimization. With this in mind, this work presents a novel meta learning +based semantic segmentation method, MetaSeg, that comprises a primary +content-aware meta-net (CAM-Net) to serve as a noise indicator for an arbitrary +segmentation model counterpart. Specifically, CAM-Net learns to generate +pixel-wise weights to suppress noisy regions with incorrect pseudo labels while +highlighting clean ones by exploiting hybrid strengthened features from image +content, providing straightforward and reliable guidance for optimizing the +segmentation model. Moreover, to break the barrier of time-consuming training +when applying meta learning to common large segmentation models, we further +present a new decoupled training strategy that optimizes different model layers +in a divide-and-conquer manner. Extensive experiments on object, medical, +remote sensing and human segmentation show that our method achieves superior +performance, approaching that of fully supervised settings, which paves a new +promising way for omni-supervised semantic segmentation. + +
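+ The core mechanism, pixel-wise down-weighting of suspected noisy regions, can be sketched
+ as a weighted segmentation loss; how the weights themselves are produced by the meta-net
+ is not shown here, and the form below is an assumption.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def weighted_segmentation_loss(logits, pseudo_labels, pixel_weights):
+    """Cross-entropy where per-pixel weights from a meta-net suppress noisy labels."""
+    ce = F.cross_entropy(logits, pseudo_labels, reduction='none')   # (B, H, W)
+    return (pixel_weights * ce).sum() / pixel_weights.sum().clamp_min(1e-6)
+```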
+
+
+
+
+ + ☆ Colorectal Polyp Segmentation in the Deep Learning Era: A Comprehensive + Survey + + +
+ Colorectal polyp segmentation (CPS), an essential problem in medical image +analysis, has garnered growing research attention. Recently, deep +learning-based models have largely surpassed traditional methods in the field of +CPS, and more and more deep CPS methods have emerged, bringing CPS into the +deep learning era. To help researchers quickly grasp the main techniques, +datasets, evaluation metrics, challenges, and trends of deep CPS, this paper +presents a systematic and comprehensive review of deep-learning-based CPS +methods from 2014 to 2023, a total of 115 technical papers. In particular, we +first provide a comprehensive review of the current deep CPS with a novel +taxonomy, including network architectures, level of supervision, and learning +paradigm. More specifically, network architectures include eight subcategories, +the level of supervision comprises six subcategories, and the learning paradigm +encompasses 12 subcategories, totaling 26 subcategories. Then, we provide a +comprehensive analysis of the characteristics of each dataset, including the +number of datasets, annotation types, image resolution, polyp size, contrast +values, and polyp location. Following that, we summarize CPS's commonly used +evaluation metrics and conduct a detailed analysis of 40 deep SOTA models, +including out-of-distribution generalization and attribute-based performance +analysis. Finally, we discuss deep learning-based CPS methods' main +challenges and opportunities. + +
+
+ comment: 21 pages, 8 figures +
+
+
+
+
+ + ☆ Detecting Out-of-Distribution Samples via Conditional Distribution + Entropy with Optimal Transport + + +
+ When deploying a trained machine learning model in the real world, it will +inevitably receive inputs from out-of-distribution (OOD) sources. For +instance, in continual learning settings, it is common to encounter OOD samples +due to the non-stationarity of a domain. More generally, when we have access to +a set of test inputs, the existing rich line of OOD detection solutions, +especially the recent promise of distance-based methods, falls short in +effectively utilizing the distribution information from training samples and +test inputs. In this paper, we argue that empirical probability distributions +that incorporate geometric information from both training samples and test +inputs can be highly beneficial for OOD detection when test +inputs are available. To address this, we propose to model OOD detection as a +discrete optimal transport problem. Within the framework of optimal transport, +we propose a novel score function known as the conditional distribution +entropy to quantify the uncertainty of a test input being an OOD sample. Our +proposal inherits the merits of certain distance-based methods while +eliminating the reliance on distribution assumptions, a priori knowledge, and +specific training mechanisms. Extensive experiments conducted on benchmark +datasets demonstrate that our method outperforms its competitors in OOD +detection. + +
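+ In the spirit of the score described above, the sketch below computes an entropic optimal
+ transport plan between test and training features and scores each test input by the
+ entropy of its conditional (row) distribution; a spread-out row suggests an OOD input.
+ This is an illustrative reading of the idea, not the authors' implementation.
+
+```python
+import numpy as np
+
+def sinkhorn_plan(cost, reg=0.1, n_iter=200):
+    """Entropic OT plan between uniform marginals (plain Sinkhorn iterations)."""
+    n, m = cost.shape
+    k = np.exp(-cost / reg)
+    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
+    v = np.ones(m)
+    for _ in range(n_iter):
+        u = a / (k @ v)
+        v = b / (k.T @ u)
+    return u[:, None] * k * v[None, :]
+
+def conditional_entropy_scores(test_feats, train_feats, reg=0.1):
+    """Higher entropy of a test input's conditional distribution suggests OOD."""
+    cost = np.linalg.norm(test_feats[:, None, :] - train_feats[None, :, :], axis=-1)
+    plan = sinkhorn_plan(cost, reg)
+    cond = plan / plan.sum(axis=1, keepdims=True)          # conditional distributions
+    return -(cond * np.log(cond + 1e-12)).sum(axis=1)      # entropy per test input
+```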
+
+
+
+
+ + ☆ Augmenting Prototype Network with TransMix for Few-shot Hyperspectral + Image Classification + + +
+ Few-shot hyperspectral image classification aims to identify the class of +each pixel in the images with only a few labeled pixels. To +obtain the spatial-spectral joint features of each pixel, fixed-size +patches centered around each pixel are often used for classification. However, +observing the classification results of existing methods, we found that +boundary patches, which correspond to pixels located at the boundaries +of objects in the hyperspectral images, are hard to classify. These +boundary patches are mixed with multi-class spectral information. Inspired by +this, we propose to augment the prototype network with TransMix for few-shot +hyperspectral image classification (APNT). While taking the prototype network +as the backbone, it adopts a transformer as the feature extractor to learn +pixel-to-pixel relations and pay different attention to different pixels. At +the same time, instead of directly using the patches which are cut from the +hyperspectral images for training, it randomly mixes up two patches to imitate +the boundary patches and uses the synthetic patches to train the model, with +the aim of enlarging the number of hard training samples and enhancing their +diversity. Following the data augmentation technique TransMix, the +attention returned by the transformer is also used to mix up the labels of two +patches to generate better labels for synthetic patches. Compared with existing +methods, the proposed method demonstrates state-of-the-art performance and +better robustness for few-shot hyperspectral image classification in our +experiments. + +
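+ A rough sketch of the mixing step described above: two patches are spliced together and
+ the label of the synthetic patch is re-weighted by the attention mass falling on each
+ source region, following the TransMix idea. The left/right splice and the attention
+ normalisation are assumptions for illustration only.
+
+```python
+import torch
+
+def transmix_patches(patch_a, patch_b, label_a, label_b, attn, lam=0.5):
+    """Mix two patches and derive an attention-weighted soft label.
+
+    patch_*: (C, H, W) tensors, label_*: one-hot float tensors,
+    attn: (H, W) attention map produced by the transformer for the mixed patch.
+    """
+    h, w = patch_a.shape[-2:]
+    cut = int(w * lam)
+    mixed = patch_a.clone()
+    mixed[..., :, cut:] = patch_b[..., :, cut:]        # imitate a boundary patch
+
+    mask_b = torch.zeros(h, w, dtype=torch.bool)
+    mask_b[:, cut:] = True
+    attn = attn / attn.sum()
+    w_b = attn[mask_b].sum()                           # attention mass on patch_b pixels
+    mixed_label = (1 - w_b) * label_a + w_b * label_b  # attention-weighted label
+    return mixed, mixed_label
+```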
+
+
+
+
+ + ☆ SFC: Shared Feature Calibration in Weakly Supervised Semantic + Segmentation + + +
+ Image-level weakly supervised semantic segmentation has received increasing +attention due to its low annotation cost. Existing methods mainly rely on Class +Activation Mapping (CAM) to obtain pseudo-labels for training semantic +segmentation models. In this work, we are the first to demonstrate that +long-tailed distribution in training data can cause the CAM calculated through +classifier weights over-activated for head classes and under-activated for tail +classes due to the shared features among head- and tail- classes. This degrades +pseudo-label quality and further influences final semantic segmentation +performance. To address this issue, we propose a Shared Feature Calibration +(SFC) method for CAM generation. Specifically, we leverage the class prototypes +that carry positive shared features and propose a Multi-Scaled +Distribution-Weighted (MSDW) consistency loss for narrowing the gap between the +CAMs generated through classifier weights and class prototypes during training. +The MSDW loss counterbalances over-activation and under-activation by +calibrating the shared features in head-/tail-class classifier weights. +Experimental results show that our SFC significantly improves CAM boundaries +and achieves new state-of-the-art performances. The project is available at +https://github.com/Barrett-python/SFC. + +
+
+
+
+
+ + ☆ MsSVT++: Mixed-scale Sparse Voxel Transformer with Center Voting for 3D + Object Detection + + +
+ Accurate 3D object detection in large-scale outdoor scenes, characterized by +considerable variations in object scales, necessitates features rich in both +long-range and fine-grained information. While recent detectors have utilized +window-based transformers to model long-range dependencies, they tend to +overlook fine-grained details. To bridge this gap, we propose MsSVT++, an +innovative Mixed-scale Sparse Voxel Transformer that simultaneously captures +both types of information through a divide-and-conquer approach. This approach +involves explicitly dividing attention heads into multiple groups, each +responsible for attending to information within a specific range. The outputs +of these groups are subsequently merged to obtain final mixed-scale features. +To mitigate the computational complexity associated with applying a +window-based transformer in 3D voxel space, we introduce a novel Chessboard +Sampling strategy and implement voxel sampling and gathering operations +sparsely using a hash map. Moreover, an important challenge stems from the +observation that non-empty voxels are primarily located on the surface of +objects, which impedes the accurate estimation of bounding boxes. To overcome +this challenge, we introduce a Center Voting module that integrates newly voted +voxels enriched with mixed-scale contextual information towards the centers of +the objects, thereby improving precise object localization. Extensive +experiments demonstrate that our single-stage detector, built upon the +foundation of MsSVT++, consistently delivers exceptional performance across +diverse datasets. + +
+
+
+
+
+ + ☆ Medical Image Debiasing by Learning Adaptive Agreement from a Biased + Council + + +
+ Deep learning could be prone to learning shortcuts raised by dataset bias and +result in inaccurate, unreliable, and unfair models, which impedes its adoption +in real-world clinical applications. Despite its significance, there is a +dearth of research in the medical image classification domain to address +dataset bias. Furthermore, the bias labels are often agnostic, as identifying +biases can be laborious and depend on post-hoc interpretation. This paper +proposes learning Adaptive Agreement from a Biased Council (Ada-ABC), a +debiasing framework that does not rely on explicit bias labels to tackle +dataset bias in medical images. Ada-ABC develops a biased council consisting of +multiple classifiers optimized with generalized cross entropy loss to learn the +dataset bias. A debiasing model is then simultaneously trained under the +guidance of the biased council. Specifically, the debiasing model is required +to learn adaptive agreement with the biased council by agreeing on the +correctly predicted samples and disagreeing on the wrongly predicted samples by +the biased council. In this way, the debiasing model could learn the target +attribute on the samples without spurious correlations while also avoiding +ignoring the rich information in samples with spurious correlations. We +theoretically demonstrated that the debiasing model could learn the target +features when the biased model successfully captures dataset bias. Moreover, to +our best knowledge, we constructed the first medical debiasing benchmark from +four datasets containing seven different bias scenarios. Our extensive +experiments practically showed that our proposed Ada-ABC outperformed +competitive approaches, verifying its effectiveness in mitigating dataset bias +for medical image classification. The codes and organized benchmark datasets +will be made publicly available. + +
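+ The generalized cross entropy loss used to train the biased council is a known loss from
+ the noisy-label literature; a short reference implementation is given below (the value of
+ q is the commonly used default, assumed here).
+
+```python
+import torch
+import torch.nn.functional as F
+
+def generalized_cross_entropy(logits, targets, q=0.7):
+    """GCE loss L_q = (1 - p_y^q) / q, which biases learning towards easy patterns."""
+    probs = F.softmax(logits, dim=-1)
+    p_y = probs.gather(1, targets.unsqueeze(1)).squeeze(1).clamp_min(1e-7)
+    return ((1.0 - p_y.pow(q)) / q).mean()
+```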
+
+ comment: 10 pages, 5 figures, 3 tables. Code and benchmark will be released + via https://github.com/LLYXC/Ada-ABC/tree/main +
+
+
+
+
+ + ☆ HG3-NeRF: Hierarchical Geometric, Semantic, and Photometric Guided + Neural Radiance Fields for Sparse View Inputs + + +
+ Neural Radiance Fields (NeRF) have garnered considerable attention as a +paradigm for novel view synthesis by learning scene representations from +discrete observations. Nevertheless, NeRF exhibit pronounced performance +degradation when confronted with sparse view inputs, consequently curtailing +its further applicability. In this work, we introduce Hierarchical Geometric, +Semantic, and Photometric Guided NeRF (HG3-NeRF), a novel methodology that can +address the aforementioned limitation and enhance consistency of geometry, +semantic content, and appearance across different views. We propose +Hierarchical Geometric Guidance (HGG) to incorporate the attachment of +Structure from Motion (SfM), namely sparse depth prior, into the scene +representations. Different from direct depth supervision, HGG samples volume +points from local-to-global geometric regions, mitigating the misalignment +caused by inherent bias in the depth prior. Furthermore, we draw inspiration +from notable variations in semantic consistency observed across images of +different resolutions and propose Hierarchical Semantic Guidance (HSG) to learn +the coarse-to-fine semantic content, which corresponds to the coarse-to-fine +scene representations. Experimental results demonstrate that HG3-NeRF can +outperform other state-of-the-art methods on different standard benchmarks and +achieve high-fidelity synthesis results for sparse view inputs. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Mastering Text-to-Image Diffusion: Recaptioning, Planning, and + Generating with Multimodal LLMs + + +
+ Diffusion models have exhibited exceptional performance in text-to-image +generation and editing. However, existing methods often face challenges when +handling complex text prompts that involve multiple objects with multiple +attributes and relationships. In this paper, we propose a new +training-free text-to-image generation/editing framework, namely Recaption, +Plan and Generate (RPG), harnessing the powerful chain-of-thought reasoning +ability of multimodal LLMs to enhance the compositionality of text-to-image +diffusion models. Our approach employs the MLLM as a global planner to +decompose the process of generating complex images into multiple simpler +generation tasks within subregions. We propose complementary regional diffusion +to enable region-wise compositional generation. Furthermore, we integrate +text-guided image generation and editing within the proposed RPG in a +closed-loop fashion, thereby enhancing generalization ability. Extensive +experiments demonstrate that our RPG outperforms state-of-the-art text-to-image +diffusion models, including DALL-E 3 and SDXL, particularly in multi-category +object composition and text-image semantic alignment. Notably, our RPG +framework exhibits wide compatibility with various MLLM architectures (e.g., +MiniGPT-4) and diffusion backbones (e.g., ControlNet). Our code is available +at: https://github.com/YangLing0818/RPG-DiffusionMaster + +
+
+ comment: Project: https://github.com/YangLing0818/RPG-DiffusionMaster +
+
+
+
+
+ + ☆ EK-Net:Real-time Scene Text Detection with Expand Kernel Distance + + +
+ Recently, scene text detection has received significant attention due to its +wide range of applications. However, accurate detection in complex scenes with multiple +scales, orientations, and curvatures remains a challenge. Numerous detection +methods adopt the Vatti clipping (VC) algorithm for multiple-instance training +to address the issue of arbitrary-shaped text. Yet we identify a bias introduced by +these approaches, which we call the "shrinked kernel". Specifically, it +refers to a decrease in accuracy resulting from an output that overly favors +the text kernel. In this paper, we propose a new approach named Expand Kernel +Network (EK-Net) with expand kernel distance to compensate for this +deficiency, which includes a three-stage regression to complete instance +detection. Moreover, EK-Net not only realizes the precise positioning of +arbitrary-shaped text, but also achieves a trade-off between performance and +speed. Evaluation results demonstrate that EK-Net achieves state-of-the-art or +competitive performance compared to other advanced methods, e.g., F-measure of +85.72% at 35.42 FPS on ICDAR 2015, F-measure of 85.75% at 40.13 FPS on CTW1500. + +
+
+ comment: 2024 IEEE International Conference on Acoustics, Speech and Signal + Processing +
+
+
+
+
+ + ☆ TIM: An Efficient Temporal Interaction Module for Spiking Transformer + + +
+ Spiking Neural Networks (SNNs), as the third generation of neural networks, +have gained prominence for their biological plausibility and computational +efficiency, especially in processing diverse datasets. The integration of +attention mechanisms, inspired by advancements in neural network architectures, +has led to the development of Spiking Transformers. These have shown promise in +enhancing SNNs' capabilities, particularly in the realms of both static and +neuromorphic datasets. Despite their progress, a discernible gap exists in +these systems, specifically in the Spiking Self Attention (SSA) mechanism's +effectiveness in leveraging the temporal processing potential of SNNs. To +address this, we introduce the Temporal Interaction Module (TIM), a novel, +convolution-based enhancement designed to augment the temporal data processing +abilities within SNN architectures. TIM's integration into existing SNN +frameworks is seamless and efficient, requiring minimal additional parameters +while significantly boosting their temporal information handling capabilities. +Through rigorous experimentation, TIM has demonstrated its effectiveness in +exploiting temporal information, leading to state-of-the-art performance across +various neuromorphic datasets. + +
+
+ comment: 10 pages, 6 figures
+</p>
+
+
+
+
+ + ☆ Memory-Efficient Prompt Tuning for Incremental Histopathology + Classification AAAI 2024 + + +
+ Recent studies have made remarkable progress in histopathology
+classification. Building on these successes, contemporary works propose to
+further upgrade the model towards a more generalizable and robust direction by
+incrementally learning from sequentially delivered domains. Unlike previous
+parameter-isolation-based approaches that usually demand massive computation
+resources during model updating, we present a memory-efficient prompt tuning
+framework to cultivate the model's generalization potential at an economical
+memory cost. For each incoming domain, we reuse the existing parameters of the
+initial classification model and attach lightweight trainable prompts to it
+for customized tuning. Considering the domain heterogeneity, we perform
+decoupled prompt tuning, where we adopt a domain-specific prompt for each
+domain to independently investigate its distinctive characteristics, and one
+domain-invariant prompt shared across all domains to continually explore the
+common content embedding over time. All domain-specific prompts are appended
+to the prompt bank and isolated from further changes to prevent forgetting the
+distinctive features of early-seen domains, while the domain-invariant prompt
+is passed on and iteratively evolves through style-augmented prompt refining
+to improve model generalization capability over time. Specifically, we
+construct a graph with the existing prompts and build a style-augmented graph
+attention network to guide the domain-invariant prompt in exploring the
+overlapped latent embedding among all delivered domains for more
+domain-generic representations. We have extensively evaluated our framework on
+two histopathology tasks, i.e., breast cancer metastasis classification and
+epithelium-stroma tissue classification, where our approach yielded superior
+performance and memory efficiency over competing methods.
+
+</p>
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ MVSFormer++: Revealing the Devil in Transformer's Details for Multi-View + Stereo ICLR2024 + + +
+ Recent advancements in learning-based Multi-View Stereo (MVS) methods have +prominently featured transformer-based models with attention mechanisms. +However, existing approaches have not thoroughly investigated the profound +influence of transformers on different MVS modules, resulting in limited depth +estimation capabilities. In this paper, we introduce MVSFormer++, a method that +prudently maximizes the inherent characteristics of attention to enhance +various components of the MVS pipeline. Formally, our approach involves +infusing cross-view information into the pre-trained DINOv2 model to facilitate +MVS learning. Furthermore, we employ different attention mechanisms for the +feature encoder and cost volume regularization, focusing on feature and spatial +aggregations respectively. Additionally, we uncover that some design details +would substantially impact the performance of transformer modules in MVS, +including normalized 3D positional encoding, adaptive attention scaling, and +the position of layer normalization. Comprehensive experiments on DTU, +Tanks-and-Temples, BlendedMVS, and ETH3D validate the effectiveness of the +proposed method. Notably, MVSFormer++ achieves state-of-the-art performance on +the challenging DTU and Tanks-and-Temples benchmarks. + +
+
+ comment: Accepted to ICLR2024 +
+
+
+
+
+ + ☆ RTA-Former: Reverse Transformer Attention for Polyp Segmentation + + +
+ Polyp segmentation is a key aspect of colorectal cancer prevention, enabling
+early detection and guiding subsequent treatments. Intelligent diagnostic
+tools, including deep learning solutions, are widely explored to streamline
+and potentially automate this process. However, even with many powerful
+network architectures, producing accurate edge segmentation remains a problem.
+In this paper, we introduce a novel network, namely RTA-Former, that employs a
+transformer model as the encoder backbone and innovatively adapts Reverse
+Attention (RA) with a transformer stage in the decoder for enhanced edge
+segmentation. Experimental results show that RTA-Former achieves
+state-of-the-art (SOTA) performance on five polyp segmentation datasets. The
+strong capability of RTA-Former holds promise for improving the accuracy of
+Transformer-based polyp segmentation, potentially leading to better clinical
+decisions and patient outcomes. Our code will be publicly available on GitHub.
+
+</p>
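+
+ Reverse Attention, as used in polyp segmentation decoders (e.g., PraNet),
+erases the confident foreground of a coarse prediction so later stages focus
+on uncertain boundary regions. The sketch below shows that generic mechanism,
+not the RTA-Former implementation.
+
+```python
+# Generic reverse-attention sketch (PraNet-style), shown only to illustrate
+# the mechanism the abstract refers to; not the RTA-Former implementation.
+import torch
+import torch.nn.functional as F
+
+def reverse_attention(feat, coarse_logits):
+    """feat: (B, C, H, W) decoder features; coarse_logits: (B, 1, h, w)."""
+    coarse = F.interpolate(coarse_logits, size=feat.shape[-2:],
+                           mode="bilinear", align_corners=False)
+    rev = 1.0 - torch.sigmoid(coarse)   # high weight off the confident region
+    return feat * rev                   # push the decoder toward edges
+
+feat = torch.rand(2, 64, 44, 44)
+coarse = torch.randn(2, 1, 11, 11)
+print(reverse_attention(feat, coarse).shape)  # torch.Size([2, 64, 44, 44])
+```
+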
+
+
+
+
+ + ☆ ActionHub: A Large-scale Action Video Description Dataset for Zero-shot + Action Recognition + + +
+ Zero-shot action recognition (ZSAR) aims to learn an alignment model between +videos and class descriptions of seen actions that is transferable to unseen +actions. The text queries (class descriptions) used in existing ZSAR works, +however, are often short action names that fail to capture the rich semantics +in the videos, leading to misalignment. With the intuition that video content +descriptions (e.g., video captions) can provide rich contextual information of +visual concepts in videos, we propose to utilize human annotated video +descriptions to enrich the semantics of the class descriptions of each action. +However, all existing action video description datasets are limited in terms of +the number of actions, the semantics of video descriptions, etc. To this end, +we collect a large-scale action video descriptions dataset named ActionHub, +which covers a total of 1,211 common actions and provides 3.6 million action +video descriptions. With the proposed ActionHub dataset, we further propose a +novel Cross-modality and Cross-action Modeling (CoCo) framework for ZSAR, which +consists of a Dual Cross-modality Alignment module and a Cross-action +Invariance Mining module. Specifically, the Dual Cross-modality Alignment +module utilizes both action labels and video descriptions from ActionHub to +obtain rich class semantic features for feature alignment. The Cross-action +Invariance Mining module exploits a cycle-reconstruction process between the +class semantic feature spaces of seen actions and unseen actions, aiming to +guide the model to learn cross-action invariant representations. Extensive +experimental results demonstrate that our CoCo framework significantly +outperforms the state-of-the-art on three popular ZSAR benchmarks (i.e., +Kinetics-ZSAR, UCF101 and HMDB51) under two different learning protocols in +ZSAR. We will release our code, models, and the proposed ActionHub dataset. + +
+
+
+
+
+ + ☆ OnDev-LCT: On-Device Lightweight Convolutional Transformers towards + federated learning + + +
+ Federated learning (FL) has emerged as a promising approach to +collaboratively train machine learning models across multiple edge devices +while preserving privacy. The success of FL hinges on the efficiency of +participating models and their ability to handle the unique challenges of +distributed learning. While several variants of Vision Transformer (ViT) have +shown great potential as alternatives to modern convolutional neural networks +(CNNs) for centralized training, the unprecedented size and higher +computational demands hinder their deployment on resource-constrained edge +devices, challenging their widespread application in FL. Since client devices +in FL typically have limited computing resources and communication bandwidth, +models intended for such devices must strike a balance between model size, +computational efficiency, and the ability to adapt to the diverse and non-IID +data distributions encountered in FL. To address these challenges, we propose +OnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks +with limited training data and resources. Our models incorporate image-specific +inductive biases through the LCT tokenizer by leveraging efficient depthwise +separable convolutions in residual linear bottleneck blocks to extract local +features, while the multi-head self-attention (MHSA) mechanism in the LCT +encoder implicitly facilitates capturing global representations of images. +Extensive experiments on benchmark image datasets indicate that our models +outperform existing lightweight vision models while having fewer parameters and +lower computational demands, making them suitable for FL scenarios with data +heterogeneity and communication bottlenecks. + +
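+
+ The residual linear bottleneck with depthwise separable convolutions
+mentioned for the LCT tokenizer follows the familiar MobileNetV2 pattern. A
+minimal sketch of such a block is given below; channel sizes and the expansion
+factor are assumptions, not the paper's configuration.
+
+```python
+# Illustrative residual linear bottleneck with a depthwise separable
+# convolution (MobileNetV2-style), the building block the abstract describes
+# for the LCT tokenizer; layer sizes here are assumptions, not the paper's.
+import torch
+import torch.nn as nn
+
+class LinearBottleneck(nn.Module):
+    def __init__(self, channels, expansion=4):
+        super().__init__()
+        hidden = channels * expansion
+        self.block = nn.Sequential(
+            nn.Conv2d(channels, hidden, 1, bias=False),   # expand
+            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
+            nn.Conv2d(hidden, hidden, 3, padding=1,
+                      groups=hidden, bias=False),         # depthwise
+            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
+            nn.Conv2d(hidden, channels, 1, bias=False),   # linear projection
+            nn.BatchNorm2d(channels),
+        )
+
+    def forward(self, x):
+        return x + self.block(x)    # residual connection
+
+print(LinearBottleneck(32)(torch.rand(1, 32, 56, 56)).shape)
+```
+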
+
+ comment: Published in Neural Networks +
+
+
+
+
+ + ☆ PointGL: A Simple Global-Local Framework for Efficient Point Cloud + Analysis + + +
+ Efficient analysis of point clouds holds paramount significance in real-world +3D applications. Currently, prevailing point-based models adhere to the +PointNet++ methodology, which involves embedding and abstracting point features +within a sequence of spatially overlapping local point sets, resulting in +noticeable computational redundancy. Drawing inspiration from the streamlined +paradigm of pixel embedding followed by regional pooling in Convolutional +Neural Networks (CNNs), we introduce a novel, uncomplicated yet potent +architecture known as PointGL, crafted to facilitate efficient point cloud +analysis. PointGL employs a hierarchical process of feature acquisition through +two recursive steps. First, the Global Point Embedding leverages +straightforward residual Multilayer Perceptrons (MLPs) to effectuate feature +embedding for each individual point. Second, the novel Local Graph Pooling +technique characterizes point-to-point relationships and abstracts regional +representations through succinct local graphs. The harmonious fusion of +one-time point embedding and parameter-free graph pooling contributes to +PointGL's defining attributes of minimized model complexity and heightened +efficiency. Our PointGL attains state-of-the-art accuracy on the ScanObjectNN +dataset while exhibiting a runtime that is more than 5 times faster and +utilizing only approximately 4% of the FLOPs and 30% of the parameters compared +to the recent PointMLP model. The code for PointGL is available at +https://github.com/Roywangj/PointGL. + +
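+
+ The two steps named in the abstract, per-point MLP embedding followed by
+parameter-free local graph pooling, can be pictured with the toy sketch below.
+The kNN grouping and edge max-pooling used here are assumptions made for
+illustration rather than PointGL's exact design.
+
+```python
+# Toy sketch of "per-point MLP embedding + parameter-free local graph pooling"
+# as described in the abstract; the kNN grouping and pooling choices here are
+# assumptions for illustration, not PointGL's exact design.
+import torch
+import torch.nn as nn
+
+def local_graph_pool(xyz, feats, k=8):
+    """xyz: (B, N, 3) coordinates, feats: (B, N, D); returns (B, N, D)."""
+    dist = torch.cdist(xyz, xyz)                      # (B, N, N)
+    idx = dist.topk(k, largest=False).indices         # k nearest neighbours
+    B, N, D = feats.shape
+    nbrs = torch.gather(feats.unsqueeze(1).expand(B, N, N, D), 2,
+                        idx.unsqueeze(-1).expand(B, N, k, D))
+    return (nbrs - feats.unsqueeze(2)).max(dim=2).values + feats  # edge max-pool
+
+embed = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 64))
+xyz = torch.rand(2, 1024, 3)
+print(local_graph_pool(xyz, embed(xyz)).shape)        # torch.Size([2, 1024, 64])
+```
+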
+
+
+
+
+ + ☆ M2-CLIP: A Multimodal, Multi-task Adapting Framework for Video Action + Recognition + + +
+ Recently, the rise of large-scale vision-language pretrained models like
+CLIP, coupled with the technology of Parameter-Efficient FineTuning (PEFT),
+has attracted substantial attention in video action recognition. Nevertheless,
+prevailing approaches tend to prioritize strong supervised performance at the
+expense of compromising the models' generalization capabilities during
+transfer. In this paper, we introduce a novel Multimodal, Multi-task CLIP
+adapting framework named M2-CLIP to address these challenges, preserving both
+high supervised performance and robust transferability. Firstly, to enhance
+the individual modality architectures, we introduce multimodal adapters to
+both the visual and text branches. Specifically, we design a novel visual
+TED-Adapter that performs global Temporal Enhancement and local temporal
+Difference modeling to improve the temporal representation capabilities of the
+visual encoder. Moreover, we adopt text encoder adapters to strengthen the
+learning of semantic label information. Secondly, we design a multi-task
+decoder with a rich set of supervisory signals to adeptly satisfy the need for
+strong supervised performance and generalization within a multimodal
+framework. Experimental results validate the efficacy of our approach,
+demonstrating exceptional performance in supervised learning while maintaining
+strong generalization in zero-shot scenarios.
+
+</p>
+
+
+
+
+ + ☆ Friends Across Time: Multi-Scale Action Segmentation Transformer for + Surgical Phase Recognition + + +
+ Automatic surgical phase recognition is a core technology for modern +operating rooms and online surgical video assessment platforms. Current +state-of-the-art methods use both spatial and temporal information to tackle +the surgical phase recognition task. Building on this idea, we propose the +Multi-Scale Action Segmentation Transformer (MS-AST) for offline surgical phase +recognition and the Multi-Scale Action Segmentation Causal Transformer +(MS-ASCT) for online surgical phase recognition. We use ResNet50 or +EfficientNetV2-M for spatial feature extraction. Our MS-AST and MS-ASCT can +model temporal information at different scales with multi-scale temporal +self-attention and multi-scale temporal cross-attention, which enhances the +capture of temporal relationships between frames and segments. We demonstrate +that our method can achieve 95.26% and 96.15% accuracy on the Cholec80 dataset +for online and offline surgical phase recognition, respectively, which achieves +new state-of-the-art results. Our method can also achieve state-of-the-art +results on non-medical datasets in the video action segmentation domain. + +
+
+
+
+
+ + ☆ Zoom-shot: Fast and Efficient Unsupervised Zero-Shot Transfer of CLIP to + Vision Encoders with Multimodal Loss + + +
+ The fusion of vision and language has brought about a transformative shift in +computer vision through the emergence of Vision-Language Models (VLMs). +However, the resource-intensive nature of existing VLMs poses a significant +challenge. We need an accessible method for developing the next generation of +VLMs. To address this issue, we propose Zoom-shot, a novel method for +transferring the zero-shot capabilities of CLIP to any pre-trained vision +encoder. We do this by exploiting the multimodal information (i.e. text and +image) present in the CLIP latent space through the use of specifically +designed multimodal loss functions. These loss functions are (1) +cycle-consistency loss and (2) our novel prompt-guided knowledge distillation +loss (PG-KD). PG-KD combines the concept of knowledge distillation with CLIP's +zero-shot classification, to capture the interactions between text and image +features. With our multimodal losses, we train a $\textbf{linear mapping}$ +between the CLIP latent space and the latent space of a pre-trained vision +encoder, for only a $\textbf{single epoch}$. Furthermore, Zoom-shot is entirely +unsupervised and is trained using $\textbf{unpaired}$ data. We test the +zero-shot capabilities of a range of vision encoders augmented as new VLMs, on +coarse and fine-grained classification datasets, outperforming the previous +state-of-the-art in this problem domain. In our ablations, we find Zoom-shot +allows for a trade-off between data and compute during training; and our +state-of-the-art results can be obtained by reducing training from 20% to 1% of +the ImageNet training data with 20 epochs. All code and models are available on +GitHub. + +
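+
+ The core recipe, a linear mapping between a pre-trained vision encoder's
+latent space and the CLIP latent space trained with a cycle-consistency term,
+can be sketched as follows. Feature dimensions, the optimizer, and the loss
+form are assumptions; the paper's prompt-guided knowledge distillation loss is
+omitted here.
+
+```python
+# Minimal sketch of learning a linear map between a pre-trained vision
+# encoder's latent space and the CLIP latent space with a cycle-consistency
+# loss, the core idea in the abstract. Dimensions and losses are assumptions.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+enc_dim, clip_dim = 1280, 512
+to_clip = nn.Linear(enc_dim, clip_dim)   # maps encoder features -> CLIP space
+to_enc = nn.Linear(clip_dim, enc_dim)    # inverse map for the cycle term
+opt = torch.optim.Adam(list(to_clip.parameters()) + list(to_enc.parameters()),
+                       lr=1e-3)
+
+def step(enc_feats, clip_feats):         # unpaired batches of features
+    f2c = to_clip(enc_feats)
+    c2f = to_enc(clip_feats)
+    cycle = (F.mse_loss(to_enc(f2c), enc_feats) +
+             F.mse_loss(to_clip(c2f), clip_feats))
+    opt.zero_grad(); cycle.backward(); opt.step()
+    return cycle.item()
+
+print(step(torch.rand(8, enc_dim), torch.rand(8, clip_dim)))
+```
+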
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Beyond Task Performance: Evaluating and Reducing the Flaws of Large + Multimodal Models with In-Context Learning ICLR 2024 + + +
+ Following the success of Large Language Models (LLMs), Large Multimodal
+Models (LMMs), such as the Flamingo model and its subsequent competitors, have
+started to emerge as natural steps towards generalist agents. However,
+interacting with recent LMMs reveals major limitations that are hardly
+captured by the current evaluation benchmarks. Indeed, task performances
+(e.g., VQA accuracy) alone do not provide enough clues to understand their
+real capabilities, their limitations, or the extent to which such models are
+aligned with human expectations. To refine our understanding of those flaws,
+we deviate from the current evaluation paradigm, and (1) evaluate 10 recent
+open-source LMMs from 3B up to 80B parameter scale, on 5 different axes:
+hallucinations, abstention, compositionality, explainability and instruction
+following. Our evaluation on these axes reveals major flaws in LMMs. While the
+current go-to solution to align these models is based on training, such as
+instruction tuning or RLHF, we rather (2) explore training-free in-context
+learning (ICL) as a solution, and study how it affects these limitations.
+Based on our ICL study, (3) we push ICL further and propose new multimodal ICL
+variants such as Multitask-ICL, Chain-of-Hindsight-ICL, and
+Self-Correcting-ICL. Our findings are as follows. (1) Despite their success,
+LMMs have flaws that remain unsolved with scaling alone. (2) The effect of ICL
+on LMMs' flaws is nuanced; despite its effectiveness for improving
+explainability and answer abstention, ICL only slightly improves instruction
+following, does not improve compositional abilities, and actually even
+amplifies hallucinations. (3) The proposed ICL variants are promising as
+post-hoc approaches to efficiently tackle some of those flaws. The code is
+available here: https://github.com/mshukor/EvALign-ICL.
+
+</p>
+
+ comment: ICLR 2024. Project Page: https://evalign-icl.github.io/ +
+
+
+
+
+ + ♻ ☆ Interpreting CLIP's Image Representation via Text-Based Decomposition + + +
+ We investigate the CLIP image encoder by analyzing how individual model +components affect the final representation. We decompose the image +representation as a sum across individual image patches, model layers, and +attention heads, and use CLIP's text representation to interpret the summands. +Interpreting the attention heads, we characterize each head's role by +automatically finding text representations that span its output space, which +reveals property-specific roles for many heads (e.g. location or shape). Next, +interpreting the image patches, we uncover an emergent spatial localization +within CLIP. Finally, we use this understanding to remove spurious features +from CLIP and to create a strong zero-shot image segmenter. Our results +indicate that a scalable understanding of transformer models is attainable and +can be used to repair and improve models. + +
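+
+ The decomposition idea, writing the image representation as a sum of
+per-head contributions and interpreting each head by projecting onto text
+embeddings, is sketched below with placeholder tensors; hooking into the real
+CLIP model and the actual text-span discovery procedure are not shown.
+
+```python
+# Conceptual sketch of interpreting additive per-head contributions by
+# projecting them onto CLIP text embeddings, mirroring the abstract's idea.
+# The tensors below are placeholders, not hooks into the real CLIP model.
+import torch
+import torch.nn.functional as F
+
+layers, heads, dim = 24, 16, 768
+head_contrib = torch.randn(layers, heads, dim)       # per-head contributions
+text_emb = F.normalize(torch.randn(5, dim), dim=-1)  # e.g. "colour", "shape", ...
+
+# Summing the contributions recovers the (placeholder) image representation.
+image_rep = head_contrib.sum(dim=(0, 1))
+
+# Score each head by how strongly its output aligns with each text direction.
+scores = F.normalize(head_contrib, dim=-1) @ text_emb.T   # (layers, heads, 5)
+flat = scores[..., 0].argmax().item()
+layer, head = divmod(flat, heads)
+print(f"head most aligned with text concept 0: layer {layer}, head {head}")
+```
+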
+
+ comment: Project page and code: + https://yossigandelsman.github.io/clip_decomposition/ +
+
+
+
+
+ + ♻ ☆ Benchmarking the Robustness of Image Watermarks + + +
+ This paper investigates the weaknesses of image watermarking techniques. We
+present WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel
+benchmark for assessing watermark robustness, overcoming the limitations of
+current evaluation methods. WAVES integrates detection and identification
+tasks, and establishes a standardized evaluation protocol comprising a diverse
+range of stress tests. The attacks in WAVES range from traditional image
+distortions to advanced and novel variations of diffusive and adversarial
+attacks. Our evaluation examines two pivotal dimensions: the degree of image
+quality degradation and the efficacy of watermark detection after attacks. We
+develop a series of Performance vs. Quality 2D plots, varying over several
+prominent image similarity metrics, which are then aggregated in a
+heuristically novel manner to paint an overall picture of watermark robustness
+and attack potency. Our comprehensive evaluation reveals previously undetected
+vulnerabilities of several modern watermarking algorithms. We envision WAVES
+as a toolkit for the future development of robust watermarking systems. The
+project is available at https://wavesbench.github.io/
+
+</p>
+
+
+
+
+ + ♻ ☆ Joint Hierarchical Priors and Adaptive Spatial Resolution for Efficient + Neural Image Compression + + +
+ Recently, the performance of neural image compression (NIC) has steadily
+improved thanks to a recent line of work, reaching or outperforming
+state-of-the-art conventional codecs. Despite significant progress, current
+NIC methods still rely on ConvNet-based entropy coding, which is limited in
+modeling long-range dependencies due to its local connectivity and an
+increasing number of architectural biases and priors, resulting in complex
+underperforming models with high decoding latency. Motivated by the efficiency
+investigation of the Transformer-based transform coding framework, namely
+SwinT-ChARM, we propose to enhance the latter, first, with a more
+straightforward yet effective Transformer-based channel-wise auto-regressive
+prior model, resulting in an absolute image compression transformer (ICT).
+Through the proposed ICT, we can capture both global and local contexts from
+the latent representations and better parameterize the distribution of the
+quantized latents. Further, we leverage a learnable scaling module with a
+sandwich ConvNeXt-based pre-/post-processor to accurately extract more compact
+latent codes while reconstructing higher-quality images. Extensive
+experimental results on benchmark datasets showed that the proposed framework
+significantly improves the trade-off between coding efficiency and decoder
+complexity over the versatile video coding (VVC) reference encoder (VTM-18.0)
+and the neural codec SwinT-ChARM. Moreover, we provide model scaling studies
+to verify the computational efficiency of our approach and conduct several
+objective and subjective analyses to bring to the fore the performance gap
+between the adaptive image compression transformer (AICT) and the neural codec
+SwinT-ChARM.
+
+</p>
+
+
+
+
+ + ♻ ☆ DFU: scale-robust diffusion model for zero-shot super-resolution image + generation + + +
+ Diffusion generative models have achieved remarkable success in generating
+images with a fixed resolution. However, existing models have limited ability
+to generalize to different resolutions when training data at those resolutions
+are not available. Leveraging techniques from operator learning, we present a
+novel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the
+score operator by combining both spatial and spectral information at multiple
+resolutions. Comparisons of DFU to baselines demonstrate its scalability: 1)
+simultaneously training on multiple resolutions improves FID over training at
+any single fixed resolution; 2) DFU generalizes beyond its training
+resolutions, allowing for coherent, high-fidelity generation at higher
+resolutions with the same model, i.e. zero-shot super-resolution image
+generation; 3) we propose a fine-tuning strategy to further enhance the
+zero-shot super-resolution image generation capability of our model, leading
+to an FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which
+no other method can come close to achieving.
+
+</p>
+
+
+
+
+ + ♻ ☆ UniLVSeg: Unified Left Ventricular Segmentation with Sparsely Annotated + Echocardiogram Videos through Self-Supervised Temporal Masking and Weakly + Supervised Training + + +
+ Echocardiography has become an indispensable clinical imaging modality for +general heart health assessment. From calculating biomarkers such as ejection +fraction to the probability of a patient's heart failure, accurate segmentation +of the heart and its structures allows doctors to plan and execute treatments +with greater precision and accuracy. However, achieving accurate and robust +left ventricle segmentation is time-consuming and challenging due to different +reasons. This work introduces a novel approach for consistent left ventricular +(LV) segmentation from sparsely annotated echocardiogram videos. We achieve +this through (1) self-supervised learning (SSL) using temporal masking followed +by (2) weakly supervised training. We investigate two different segmentation +approaches: 3D segmentation and a novel 2D superimage (SI). We demonstrate how +our proposed method outperforms the state-of-the-art solutions by achieving a +93.32% (95%CI 93.21-93.43%) dice score on a large-scale dataset +(EchoNet-Dynamic) while being more efficient. To show the effectiveness of our +approach, we provide extensive ablation studies, including pre-training +settings and various deep learning backbones. Additionally, we discuss how our +proposed methodology achieves high data utility by incorporating unlabeled +frames in the training process. To help support the AI in medicine community, +the complete solution with the source code will be made publicly available upon +acceptance. + +
+
+
+
+
+ + ♻ ☆ The Effect of Intrinsic Dataset Properties on Generalization: Unraveling + Learning Differences Between Natural and Medical Images ICLR 2024 + + +
+ This paper investigates discrepancies in how neural networks learn from +different imaging domains, which are commonly overlooked when adopting computer +vision techniques from the domain of natural images to other specialized +domains such as medical images. Recent works have found that the generalization +error of a trained network typically increases with the intrinsic dimension +($d_{data}$) of its training set. Yet, the steepness of this relationship +varies significantly between medical (radiological) and natural imaging +domains, with no existing theoretical explanation. We address this gap in +knowledge by establishing and empirically validating a generalization scaling +law with respect to $d_{data}$, and propose that the substantial scaling +discrepancy between the two considered domains may be at least partially +attributed to the higher intrinsic "label sharpness" ($K_F$) of medical imaging +datasets, a metric which we propose. Next, we demonstrate an additional benefit +of measuring the label sharpness of a training set: it is negatively correlated +with the trained model's adversarial robustness, which notably leads to models +for medical images having a substantially higher vulnerability to adversarial +attack. Finally, we extend our $d_{data}$ formalism to the related metric of +learned representation intrinsic dimension ($d_{repr}$), derive a +generalization scaling law with respect to $d_{repr}$, and show that $d_{data}$ +serves as an upper bound for $d_{repr}$. Our theoretical results are supported +by thorough experiments with six models and eleven natural and medical imaging +datasets over a range of training set sizes. Our findings offer insights into +the influence of intrinsic dataset properties on generalization, representation +learning, and robustness in deep neural networks. + +
+
+ comment: ICLR 2024. Code: + https://github.com/mazurowski-lab/intrinsic-properties +
+
+
+
+
+ + ♻ ☆ IPR-NeRF: Ownership Verification meets Neural Radiance Field + + +
+ Neural Radiance Field (NeRF) models have gained significant attention in the
+computer vision community in recent years, delivering state-of-the-art visual
+quality and impressive demonstrations. Since then, technopreneurs have sought
+to turn NeRF models into profitable businesses, which makes these models
+attractive targets for plagiarists who may illegally copy, re-distribute, or
+misuse them. This paper proposes a comprehensive intellectual property (IP)
+protection framework for the NeRF model in both black-box and white-box
+settings, namely IPR-NeRF. In the black-box setting, a diffusion-based
+solution is introduced to embed and extract the watermark via a two-stage
+optimization process. In the white-box setting, a designated digital signature
+is embedded into the weights of the NeRF model by adopting the sign loss
+objective. Our extensive experiments demonstrate that not only does our
+approach maintain the fidelity (i.e., the rendering quality) of IPR-NeRF
+models, but it is also robust against both ambiguity and removal attacks
+compared to prior art.
+
+</p>
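+
+ The white-box scheme embeds a designated signature through a sign loss on
+selected weights. A generic sign-loss sketch in the style of earlier white-box
+watermarking work is given below; the margin and the choice of weights are
+assumptions, not IPR-NeRF's settings.
+
+```python
+# Generic sign-loss sketch for white-box weight watermarking (DeepIPR-style),
+# shown to illustrate the mechanism the abstract names; not IPR-NeRF's code.
+import torch
+
+def sign_loss(weights, signature, margin=0.1):
+    """weights: 1-D tensor of selected parameters; signature: +/-1 tensor."""
+    # Penalise weights whose sign disagrees with (or is too close to) the
+    # designated bit, so the signature survives training.
+    return torch.relu(margin - signature * weights).mean()
+
+w = torch.randn(64, requires_grad=True)
+sig = torch.randint(0, 2, (64,)).float() * 2 - 1    # random +/-1 signature
+loss = sign_loss(w, sig)
+loss.backward()
+print(loss.item())
+```
+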
+
+ comment: Errors in the result tabulation for the state-of-the-art method
+might mislead readers
+</p>
+
+
+
+
+ + ♻ ☆ MixRT: Mixed Neural Representations For Real-Time NeRF Rendering 3DV'24 + + +
+ Neural Radiance Field (NeRF) has emerged as a leading technique for novel +view synthesis, owing to its impressive photorealistic reconstruction and +rendering capability. Nevertheless, achieving real-time NeRF rendering in +large-scale scenes has presented challenges, often leading to the adoption of +either intricate baked mesh representations with a substantial number of +triangles or resource-intensive ray marching in baked representations. We +challenge these conventions, observing that high-quality geometry, represented +by meshes with substantial triangles, is not necessary for achieving +photorealistic rendering quality. Consequently, we propose MixRT, a novel NeRF +representation that includes a low-quality mesh, a view-dependent displacement +map, and a compressed NeRF model. This design effectively harnesses the +capabilities of existing graphics hardware, thus enabling real-time NeRF +rendering on edge devices. Leveraging a highly-optimized WebGL-based rendering +framework, our proposed MixRT attains real-time rendering speeds on edge +devices (over 30 FPS at a resolution of 1280 x 720 on a MacBook M1 Pro laptop), +better rendering quality (0.2 PSNR higher in indoor scenes of the Unbounded-360 +datasets), and a smaller storage size (less than 80% compared to +state-of-the-art methods). + +
+
+ comment: Accepted by 3DV'24. Project Page: https://licj15.github.io/MixRT/ +
+
+
+
+
+ + ♻ ☆ Forging Tokens for Improved Storage-efficient Training + + +
+ Recent advancements in Deep Neural Network (DNN) models have significantly +improved performance across computer vision tasks. However, achieving highly +generalizable and high-performing vision models requires extensive datasets, +leading to large storage requirements. This storage challenge poses a critical +bottleneck for scaling up vision models. Motivated by the success of discrete +representations, SeiT proposes to use Vector-Quantized (VQ) feature vectors +(i.e., tokens) as network inputs for vision classification. However, applying +traditional data augmentations to tokens faces challenges due to input domain +shift. To address this issue, we introduce TokenAdapt and ColorAdapt, simple +yet effective token-based augmentation strategies. TokenAdapt realigns token +embedding space for compatibility with spatial augmentations, preserving the +model's efficiency without requiring fine-tuning. Additionally, ColorAdapt +addresses color-based augmentations for tokens inspired by Adaptive Instance +Normalization (AdaIN). We evaluate our approach across various scenarios, +including storage-efficient ImageNet-1k classification, fine-grained +classification, robustness benchmarks, and ADE-20k semantic segmentation. +Experimental results demonstrate consistent performance improvement in diverse +experiments. Code is available at https://github.com/naver-ai/tokenadapt. + +
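+
+ ColorAdapt is described as inspired by Adaptive Instance Normalization
+(AdaIN). The sketch below shows plain AdaIN applied to token features, i.e.,
+re-normalising a batch of tokens to the channel statistics of a reference; it
+is illustrative and not the TokenAdapt/ColorAdapt implementation.
+
+```python
+# Small AdaIN-style sketch (the mechanism ColorAdapt is said to be inspired
+# by): re-normalise token features to the channel statistics of a reference.
+# Purely illustrative; not the TokenAdapt/ColorAdapt code.
+import torch
+
+def adain(content, style, eps=1e-5):
+    """content, style: (B, N, D) token features; match mean/std per channel."""
+    c_mu, c_std = content.mean(1, keepdim=True), content.std(1, keepdim=True) + eps
+    s_mu, s_std = style.mean(1, keepdim=True), style.std(1, keepdim=True) + eps
+    return (content - c_mu) / c_std * s_std + s_mu
+
+tokens = torch.rand(2, 196, 256)
+reference = torch.rand(2, 196, 256)
+print(adain(tokens, reference).shape)     # torch.Size([2, 196, 256])
+```
+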
+
+ comment: First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ CapST: An Enhanced and Lightweight Model Attribution Approach for + Synthetic Videos + + +
+ Deepfake videos, generated through AI faceswapping techniques, have garnered
+considerable attention due to their potential for powerful impersonation
+attacks. While existing research primarily focuses on binary classification to
+discern between real and fake videos, determining the specific generation
+model of a fake video is crucial for forensic investigation. Addressing this
+gap, this paper investigates the model attribution problem of Deepfake videos
+using a recently proposed dataset, Deepfakes from Different Models (DFDM),
+derived from various Autoencoder models. The dataset comprises 6,450 Deepfake
+videos generated by five distinct models with variations in encoder, decoder,
+intermediate layer, input resolution, and compression ratio. This study
+formulates Deepfake model attribution as a multiclass classification task,
+proposing a segment of VGG19, known for its effectiveness in image-related
+tasks, as a feature extraction backbone, integrated with a Capsule Network and
+a Spatio-Temporal attention mechanism. The Capsule module captures intricate
+hierarchies among features for robust identification of deepfake attributes.
+Additionally, the video-level fusion technique leverages temporal attention
+mechanisms to handle concatenated feature vectors, capitalizing on inherent
+temporal dependencies in deepfake videos. By aggregating insights across
+frames, our model gains a comprehensive understanding of video content,
+resulting in more precise predictions. Experimental results on the deepfake
+benchmark dataset (DFDM) demonstrate the efficacy of our proposed method,
+achieving up to a 4% improvement in accurately categorizing deepfake videos
+compared to baseline models while demanding fewer computational resources.
+
+</p>
+
+ comment: Rejected from journal; several more experiments will have to be
+conducted
+</p>
+
+
+
+
+ + ♻ ☆ A Generalized Multi-Modal Fusion Detection Framework + + +
+ LiDAR point clouds have become the most common data source in autonomous +driving. However, due to the sparsity of point clouds, accurate and reliable +detection cannot be achieved in specific scenarios. Because of their +complementarity with point clouds, images are getting increasing attention. +Although with some success, existing fusion methods either perform hard fusion +or do not fuse in a direct manner. In this paper, we propose a generic 3D +detection framework called MMFusion, using multi-modal features. The framework +aims to achieve accurate fusion between LiDAR and images to improve 3D +detection in complex scenes. Our framework consists of two separate streams: +the LiDAR stream and the camera stream, which can be compatible with any +single-modal feature extraction network. The Voxel Local Perception Module in +the LiDAR stream enhances local feature representation, and then the +Multi-modal Feature Fusion Module selectively combines feature output from +different streams to achieve better fusion. Extensive experiments have shown +that our framework not only outperforms existing benchmarks but also improves +their detection, especially for detecting cyclists and pedestrians on KITTI +benchmarks, with strong robustness and generalization capabilities. Hopefully, +our work will stimulate more research into multi-modal fusion for autonomous +driving tasks. + +
+
+
+
+
+ + ♻ ☆ Panoptic Scene Graph Generation with Semantics-Prototype Learning AAAI 2024 + + +
+ Panoptic Scene Graph Generation (PSG) parses objects and predicts their
+relationships (predicates) to connect human language and visual scenes.
+However, different language preferences of annotators and semantic overlaps
+between predicates lead to biased predicate annotations in the dataset, i.e.,
+different predicates for the same object pairs. Biased predicate annotations
+make PSG models struggle to construct a clear decision plane among predicates,
+which greatly hinders the real-world application of PSG models. To address the
+intrinsic bias above, we propose a novel framework named ADTrans to adaptively
+transfer biased predicate annotations to informative and unified ones. To
+ensure consistency and accuracy during the transfer process, we propose to
+measure the invariance of representations in each predicate class, and learn
+unbiased prototypes of predicates with different intensities. Meanwhile, we
+continuously measure the distribution changes between each representation and
+its prototype, and constantly screen potentially biased data. Finally, with
+the unbiased predicate-prototype representation embedding space, biased
+annotations are easily identified. Experiments show that ADTrans significantly
+improves the performance of benchmark models, achieving a new state-of-the-art
+performance, and shows great generalization and effectiveness on multiple
+datasets.
+
+</p>
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Physics-guided Noise Neural Proxy for Practical Low-light Raw Image + Denoising + + +
+ Recently, the mainstream practice for training low-light raw image denoising +methods has shifted towards employing synthetic data. Noise modeling, which +focuses on characterizing the noise distribution of real-world sensors, +profoundly influences the effectiveness and practicality of synthetic data. +Currently, physics-based noise modeling struggles to characterize the entire +real noise distribution, while learning-based noise modeling impractically +depends on paired real data. In this paper, we propose a novel strategy: +learning the noise model from dark frames instead of paired real data, to break +down the data dependency. Based on this strategy, we introduce an efficient +physics-guided noise neural proxy (PNNP) to approximate the real-world sensor +noise model. Specifically, we integrate physical priors into neural proxies and +introduce three efficient techniques: physics-guided noise decoupling (PND), +physics-guided proxy model (PPM), and differentiable distribution loss (DDL). +PND decouples the dark frame into different components and handles different +levels of noise flexibly, which reduces the complexity of noise modeling. PPM +incorporates physical priors to constrain the generated noise, which promotes +the accuracy of noise modeling. DDL provides explicit and reliable supervision +for noise distribution, which promotes the precision of noise modeling. PNNP +exhibits powerful potential in characterizing the real noise distribution. +Extensive experiments on public datasets demonstrate superior performance in +practical low-light raw image denoising. The code will be available at +\url{https://github.com/fenghansen/PNNP}. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Digital Fingerprinting of Microstructures + + +
+ Finding efficient means of fingerprinting microstructural information is a +critical step towards harnessing data-centric machine learning approaches. A +statistical framework is systematically developed for compressed +characterisation of a population of images, which includes some classical +computer vision methods as special cases. The focus is on materials +microstructure. The ultimate purpose is to rapidly fingerprint sample images in +the context of various high-throughput design/make/test scenarios. This +includes, but is not limited to, quantification of the disparity between +microstructures for quality control, classifying microstructures, predicting +materials properties from image data and identifying potential processing +routes to engineer new materials with specific properties. Here, we consider +microstructure classification and utilise the resulting features over a range +of related machine learning tasks, namely supervised, semi-supervised, and +unsupervised learning. + The approach is applied to two distinct datasets to illustrate various +aspects and some recommendations are made based on the findings. In particular, +methods that leverage transfer learning with convolutional neural networks +(CNNs), pretrained on the ImageNet dataset, are generally shown to outperform +other methods. Additionally, dimensionality reduction of these CNN-based +fingerprints is shown to have negligible impact on classification accuracy for +the supervised learning approaches considered. In situations where there is a +large dataset with only a handful of images labelled, graph-based label +propagation to unlabelled data is shown to be favourable over discarding +unlabelled data and performing supervised learning. In particular, label +propagation by Poisson learning is shown to be highly effective at low label +rates. + +
+
+
+
+
+ + ♻ ☆ Rethinking Unsupervised Domain Adaptation for Semantic Segmentation + + +
+ Unsupervised domain adaptation (UDA) adapts a model trained on one domain +(called source) to a novel domain (called target) using only unlabeled data. +Due to its high annotation cost, researchers have developed many UDA methods +for semantic segmentation, which assume no labeled sample is available in the +target domain. We question the practicality of this assumption for two reasons. +First, after training a model with a UDA method, we must somehow verify the +model before deployment. Second, UDA methods have at least a few +hyper-parameters that need to be determined. The surest solution to these is to +evaluate the model using validation data, i.e., a certain amount of labeled +target-domain samples. This question about the basic assumption of UDA leads us +to rethink UDA from a data-centric point of view. Specifically, we assume we +have access to a minimum level of labeled data. Then, we ask how much is +necessary to find good hyper-parameters of existing UDA methods. We then +consider what if we use the same data for supervised training of the same +model, e.g., finetuning. We conducted experiments to answer these questions +with popular scenarios, {GTA5, SYNTHIA}$\rightarrow$Cityscapes. We found that +i) choosing good hyper-parameters needs only a few labeled images for some UDA +methods whereas a lot more for others; and ii) simple finetuning works +surprisingly well; it outperforms many UDA methods if only several dozens of +labeled images are available. + +
+
+ comment: Under review in Pattern Recognition Letters +
+
+
+
+
+ + ♻ ☆ Compressed 3D Gaussian Splatting for Accelerated Novel View Synthesis + + +
+ Recently, high-fidelity scene reconstruction with an optimized 3D Gaussian +splat representation has been introduced for novel view synthesis from sparse +image sets. Making such representations suitable for applications like network +streaming and rendering on low-power devices requires significantly reduced +memory consumption as well as improved rendering efficiency. We propose a +compressed 3D Gaussian splat representation that utilizes sensitivity-aware +vector clustering with quantization-aware training to compress directional +colors and Gaussian parameters. The learned codebooks have low bitrates and +achieve a compression rate of up to $31\times$ on real-world scenes with only +minimal degradation of visual quality. We demonstrate that the compressed splat +representation can be efficiently rendered with hardware rasterization on +lightweight GPUs at up to $4\times$ higher framerates than reported via an +optimized GPU compute pipeline. Extensive experiments across multiple datasets +demonstrate the robustness and rendering speed of the proposed approach. + +
+
+
+
+
+ + ♻ ☆ 2D-3D Interlaced Transformer for Point Cloud Segmentation with + Scene-Level Supervision ICCV 2023 + + +
+ We present a Multimodal Interlaced Transformer (MIT) that jointly considers +2D and 3D data for weakly supervised point cloud segmentation. Research studies +have shown that 2D and 3D features are complementary for point cloud +segmentation. However, existing methods require extra 2D annotations to achieve +2D-3D information fusion. Considering the high annotation cost of point clouds, +effective 2D and 3D feature fusion based on weakly supervised learning is in +great demand. To this end, we propose a transformer model with two encoders and +one decoder for weakly supervised point cloud segmentation using only +scene-level class tags. Specifically, the two encoders compute the +self-attended features for 3D point clouds and 2D multi-view images, +respectively. The decoder implements interlaced 2D-3D cross-attention and +carries out implicit 2D and 3D feature fusion. We alternately switch the roles +of queries and key-value pairs in the decoder layers. It turns out that the 2D +and 3D features are iteratively enriched by each other. Experiments show that +it performs favorably against existing weakly supervised point cloud +segmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The +project page will be available at https://jimmy15923.github.io/mit_web/. + +
+
+ comment: ICCV 2023 (main + supp). Website: + https://jimmy15923.github.io/mit_web/ +
+
+
+
+
+ + ♻ ☆ Continual Learning in Medical Image Analysis: A Comprehensive Review of + Recent Advancements and Future Prospects + + +
+ Medical imaging analysis has witnessed remarkable advancements even +surpassing human-level performance in recent years, driven by the rapid +development of advanced deep-learning algorithms. However, when the inference +dataset slightly differs from what the model has seen during one-time training, +the model performance is greatly compromised. The situation requires restarting +the training process using both the old and the new data which is +computationally costly, does not align with the human learning process, and +imposes storage constraints and privacy concerns. Alternatively, continual +learning has emerged as a crucial approach for developing unified and +sustainable deep models to deal with new classes, tasks, and the drifting +nature of data in non-stationary environments for various application areas. +Continual learning techniques enable models to adapt and accumulate knowledge +over time, which is essential for maintaining performance on evolving datasets +and novel tasks. This systematic review paper provides a comprehensive overview +of the state-of-the-art in continual learning techniques applied to medical +imaging analysis. We present an extensive survey of existing research, covering +topics including catastrophic forgetting, data drifts, stability, and +plasticity requirements. Further, an in-depth discussion of key components of a +continual learning framework such as continual learning scenarios, techniques, +evaluation schemes, and metrics is provided. Continual learning techniques +encompass various categories, including rehearsal, regularization, +architectural, and hybrid strategies. We assess the popularity and +applicability of continual learning categories in various medical sub-fields +like radiology and histopathology... + +
+
+
+
+
+ + ♻ ☆ Modality-missing RGBT Tracking via Invertible Prompt Learning and A + High-quality Data Simulation Method + + +
+ Current RGBT tracking research mainly focuses on modality-complete
+scenarios, overlooking the modality-missing challenge in real-world scenes. In
+this work, we comprehensively investigate the impact of the modality-missing
+challenge in RGBT tracking and propose a novel invertible prompt learning
+approach, which integrates content-preserving prompts into a well-trained
+tracking model to adapt to various modality-missing scenarios, for
+modality-missing RGBT tracking. In particular, given one modality-missing
+scenario, we propose to utilize the available modality to generate the prompt
+of the missing modality to adapt the RGBT tracking model. However, the
+cross-modality gap between available and missing modalities usually causes
+semantic distortion and information loss in prompt generation. To handle this
+issue, we propose the invertible prompt learning scheme by incorporating the
+full reconstruction of the input available modality from the prompt in the
+prompt generation model. Considering that no modality-missing RGBT tracking
+dataset exists and many modality-missing scenarios are difficult to capture,
+we design a high-quality data simulation method based on hierarchical
+combination schemes to generate real-world modality-missing data. Extensive
+experiments on three modality-missing datasets show that our method achieves
+significant performance improvements compared with state-of-the-art methods.
+We will release the code and simulation dataset.
+
+</p>
+
+
+
+
+ + ♻ ☆ Towards the Detection of Diffusion Model Deepfakes + + +
+ In the course of the past few years, diffusion models (DMs) have reached an +unprecedented level of visual quality. However, relatively little attention has +been paid to the detection of DM-generated images, which is critical to prevent +adverse impacts on our society. In contrast, generative adversarial networks +(GANs), have been extensively studied from a forensic perspective. In this +work, we therefore take the natural next step to evaluate whether previous +methods can be used to detect images generated by DMs. Our experiments yield +two key findings: (1) state-of-the-art GAN detectors are unable to reliably +distinguish real from DM-generated images, but (2) re-training them on +DM-generated images allows for almost perfect detection, which remarkably even +generalizes to GANs. Together with a feature space analysis, our results lead +to the hypothesis that DMs produce fewer detectable artifacts and are thus more +difficult to detect compared to GANs. One possible reason for this is the +absence of grid-like frequency artifacts in DM-generated images, which are a +known weakness of GANs. However, we make the interesting observation that +diffusion models tend to underestimate high frequencies, which we attribute to +the learning objective. + +
+
+ comment: Accepted at VISAPP 2024. This is the extended version with additional + experiments and supplemental material. Code and data: + https://github.com/jonasricker/diffusion-model-deepfake-detection +
+
+
+
+
+ + ♻ ☆ Diffusion Model is Secretly a Training-free Open Vocabulary Semantic + Segmenter + + +
+ Pre-trained text-image discriminative models, such as CLIP, have been
+explored for open-vocabulary semantic segmentation with unsatisfactory results
+due to the loss of crucial localization information and awareness of object
+shapes. Recently, there has been a growing interest in expanding the
+application of generative models from generation tasks to semantic
+segmentation. These approaches utilize generative models either for generating
+annotated data or extracting features to facilitate semantic segmentation.
+This typically involves generating a considerable amount of synthetic data or
+requiring additional mask annotations. To this end, we uncover the potential
+of generative text-to-image diffusion models (e.g., Stable Diffusion) as
+highly efficient open-vocabulary semantic segmenters, and introduce a novel
+training-free approach named DiffSegmenter. The insight is that to generate
+realistic objects that are semantically faithful to the input text, both the
+complete object shapes and the corresponding semantics are implicitly learned
+by diffusion models. We discover that the object shapes are characterized by
+the self-attention maps while the semantics are indicated through the
+cross-attention maps produced by the denoising U-Net, forming the basis of our
+segmentation results. Additionally, we carefully design effective textual
+prompts and a category filtering mechanism to further enhance the segmentation
+results. Extensive experiments on three benchmark datasets show that the
+proposed DiffSegmenter achieves impressive results for open-vocabulary
+semantic segmentation.
+
+</p>
+
+
+
+
+ + ♻ ☆ LISA++: An Improved Baseline for Reasoning Segmentation with Large + Language Model + + +
+ While LISA effectively bridges the gap between segmentation and large +language models to enable reasoning segmentation, it poses certain limitations: +unable to distinguish different instances of the target region, and constrained +by the pre-defined textual response formats. In this work, we introduce LISA++, +an update to the existing LISA model, focusing on improving core +functionalities while keeping the base architecture intact. The main +enhancements in LISA++ include: \textbf{1) Enhanced Segmentation}: The instance +segmentation ability has been added, providing a more detailed scene analysis +along with the existing multi-region semantic segmentation. \textbf{2) More +Natural Conversation}: Improved capability for multi-turn dialogue, with the +ability to incorporate segmentation results directly into text responses, i.e., +Segmentation in Dialogue (SiD). These improvements are achieved by curating the +existing samples of generic segmentation datasets, aimed specifically at +enhancing the segmentation and conversational skills without structural change +and additional data sources. Comparative analysis with the original LISA model +shows significant advancements in these areas, positioning LISA++ as a notable +upgrade in visual understanding and interaction. LISA++'s adaptability and +improved features highlight the versatility of the mask-as-embedding paradigm +proposed by LISA, and the potential as a foundational model for diverse +applications. + +
+
+ comment: Typo fixed +
+
+
+
+
+ + ♻ ☆ SMILEtrack: SiMIlarity LEarning for Occlusion-Aware Multiple Object + Tracking AAAI2024 + + +
+ Despite recent progress in Multiple Object Tracking (MOT), several obstacles +such as occlusions, similar objects, and complex scenes remain an open +challenge. Meanwhile, a systematic study of the cost-performance tradeoff for +the popular tracking-by-detection paradigm is still lacking. This paper +introduces SMILEtrack, an innovative object tracker that effectively addresses +these challenges by integrating an efficient object detector with a Siamese +network-based Similarity Learning Module (SLM). The technical contributions of +SMILETrack are twofold. First, we propose an SLM that calculates the appearance +similarity between two objects, overcoming the limitations of feature +descriptors in Separate Detection and Embedding (SDE) models. The SLM +incorporates a Patch Self-Attention (PSA) block inspired by the vision +Transformer, which generates reliable features for accurate similarity +matching. Second, we develop a Similarity Matching Cascade (SMC) module with a +novel GATE function for robust object matching across consecutive video frames, +further enhancing MOT performance. Together, these innovations help SMILETrack +achieve an improved trade-off between the cost ({\em e.g.}, running speed) and +performance (e.g., tracking accuracy) over several existing state-of-the-art +benchmarks, including the popular BYTETrack method. SMILETrack outperforms +BYTETrack by 0.4-0.8 MOTA and 2.1-2.2 HOTA points on MOT17 and MOT20 datasets. +Code is available at https://github.com/pingyang1117/SMILEtrack_Official + +
+
+ comment: Our paper was accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ From 2D Images to 3D Model:Weakly Supervised Multi-View Face + Reconstruction with Deep Fusion + + +
+ While weakly supervised multi-view face reconstruction (MVR) is garnering +increased attention, one critical issue still remains open: how to effectively +fuse multiple image information to reconstruct high-precision 3D models. In +this regard, we propose a novel model called Deep Fusion MVR (DF-MVR) to +reconstruct high-precision 3D facial shapes from multi-view images. +Specifically, we introduce MulEn-Unet, a multi-view encoding to single decoding +framework with skip connections and attention. This design allows for the +extraction, integration, and compensation of deep features with attention from +multi-view images. Furthermore, we adopt the involution kernel to enrich deep +fusion features with channel features. In addition, we develop the face parse +network to learn, identify, and emphasize the critical common face area within +multi-view images. Experiments on Pixel-Face and Bosphorus datasets indicate +the superiority of our model. Without 3D annotation, DF-MVR achieves 5.2% and +3.0% RMSE improvement over the existing weakly supervised MVRs respectively on +Pixel-Face and Bosphorus dataset. Code will be available publicly at +https://github.com/weiguangzhao/DF_MVR. + +
+
+
+
+
+ + ♻ ☆ ETPNav: Evolving Topological Planning for Vision-Language Navigation in + Continuous Environments + + +
+ Vision-language navigation is a task that requires an agent to follow +instructions to navigate in environments. It becomes increasingly crucial in +the field of embodied AI, with potential applications in autonomous navigation, +search and rescue, and human-robot interaction. In this paper, we propose to +address a more practical yet challenging counterpart setting - vision-language +navigation in continuous environments (VLN-CE). To develop a robust VLN-CE +agent, we propose a new navigation framework, ETPNav, which focuses on two +critical skills: 1) the capability to abstract environments and generate +long-range navigation plans, and 2) the ability of obstacle-avoiding control in +continuous environments. ETPNav performs online topological mapping of +environments by self-organizing predicted waypoints along a traversed path, +without prior environmental experience. It privileges the agent to break down +the navigation procedure into high-level planning and low-level control. +Concurrently, ETPNav utilizes a transformer-based cross-modal planner to +generate navigation plans based on topological maps and instructions. The plan +is then performed through an obstacle-avoiding controller that leverages a +trial-and-error heuristic to prevent navigation from getting stuck in +obstacles. Experimental results demonstrate the effectiveness of the proposed +method. ETPNav yields more than 10% and 20% improvements over prior +state-of-the-art on R2R-CE and RxR-CE datasets, respectively. Our code is +available at https://github.com/MarSaKi/ETPNav. + +
+
+ comment: Project page: https://github.com/MarSaKi/ETPNav +
+
+
+
+
+ + ♻ ☆ Ultrasound Image Segmentation of Thyroid Nodule via Latent Semantic + Feature Co-Registration + + +
+ Segmentation of nodules in thyroid ultrasound imaging plays a crucial role
+in the detection and treatment of thyroid cancer. However, owing to the
+diversity of scanner vendors and imaging protocols in different hospitals,
+automatic segmentation models, which have already demonstrated expert-level
+accuracy in the field of medical image segmentation, suffer reduced accuracy
+due to weak generalization performance when applied in clinically realistic
+environments. To address this issue, the present paper proposes ASTN, a
+framework for thyroid nodule segmentation achieved through a new type of
+co-registration network. By extracting latent semantic information from the
+atlas and target images and utilizing in-depth features to accomplish the
+co-registration of nodules in thyroid ultrasound images, this framework can
+ensure the integrity of the anatomical structure and reduce the impact on
+segmentation of overall image differences caused by different devices. In
+addition, this paper also provides an atlas selection algorithm to mitigate
+the difficulty of co-registration. Evaluation results collected from datasets
+of different devices show that, thanks to the proposed method, model
+generalization is greatly improved while a high level of segmentation accuracy
+is maintained.
+
+</p>
+
+
+
+
+ + ♻ ☆ Semi-supervised Semantic Segmentation using Redesigned Self-Training for + White Blood Cell + + +
+ Artificial Intelligence (AI) in healthcare, especially in white blood cell +cancer diagnosis, is hindered by two primary challenges: the lack of +large-scale labeled datasets for white blood cell (WBC) segmentation and +outdated segmentation methods. To address the first challenge, a +semi-supervised learning framework is needed to annotate +the large dataset efficiently. In this work, we address this issue by proposing a novel +self-training pipeline with the incorporation of FixMatch. We discover that by +incorporating FixMatch in the self-training pipeline, the performance improves +in the majority of cases. Our best results were achieved with +the self-training scheme with consistency on the DeepLab-V3 architecture with +ResNet-50, reaching 90.69%, 87.37%, and 76.49% on the Zheng 1, Zheng 2, and LISC +datasets, respectively. +
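As background for the FixMatch component mentioned above, here is a minimal PyTorch-style sketch of the standard FixMatch unsupervised loss adapted to per-pixel predictions. It illustrates the general pseudo-labeling idea only; it is not the paper's exact redesigned self-training pipeline, and the model interface and the 0.95 threshold are assumptions.

```python
import torch
import torch.nn.functional as F

def fixmatch_unsupervised_loss(model, weak_img, strong_img, threshold=0.95):
    # Pseudo-label confident pixels from the weakly augmented view...
    with torch.no_grad():
        probs = F.softmax(model(weak_img), dim=1)        # (B, C, H, W)
        conf, pseudo_label = probs.max(dim=1)            # (B, H, W)
        mask = (conf >= threshold).float()
    # ...and train the strongly augmented view to match them.
    loss = F.cross_entropy(model(strong_img), pseudo_label, reduction="none")
    return (loss * mask).mean()
```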
+
+
+
+
+ + ♻ ☆ LanguageBind: Extending Video-Language Pretraining to N-modality by + Language-based Semantic Alignment ICLR 2024 + + +
+ Video-language (VL) pretraining has achieved remarkable improvements in +multiple downstream tasks. However, the current VL pretraining framework is +hard to extend to multiple modalities (N modalities, N>=3) beyond vision and +language. We thus propose LanguageBind, taking language as the bind across +different modalities because the language modality is well-explored and +contains rich semantics. Specifically, we freeze the language encoder acquired +by VL pretraining, then train encoders for other modalities with contrastive +learning. As a result, all modalities are mapped to a shared feature space, +implementing multi-modal semantic alignment. While LanguageBind ensures that we +can extend VL modalities to N modalities, we also need a high-quality dataset +with alignment data pairs centered on language. We thus propose VIDAL-10M, a dataset with +Video, Infrared, Depth, Audio, and their corresponding Language. In VIDAL-10M, all videos are from short video platforms with +complete semantics rather than truncated segments from long videos, and all the +video, depth, infrared, and audio modalities are aligned to their textual +descriptions. LanguageBind has achieved superior performance on a wide range of +15 benchmarks covering video, audio, depth, and infrared. Moreover, multiple +experiments have provided evidence for the effectiveness of LanguageBind in +achieving indirect alignment and complementarity among diverse modalities. Code +address: https://github.com/PKU-YuanGroup/LanguageBind +
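A rough sketch of the frozen-language contrastive alignment described above, written as a standard symmetric InfoNCE loss; the batch construction, projection details, and temperature value are illustrative assumptions rather than details from the paper.

```python
import torch
import torch.nn.functional as F

def bind_to_language_loss(modality_emb, text_emb, temperature=0.07):
    """modality_emb: (B, D) output of the trainable modality encoder.
    text_emb: (B, D) output of the frozen language encoder for paired captions."""
    m = F.normalize(modality_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1).detach()   # language tower stays frozen
    logits = m @ t.T / temperature               # (B, B) similarity matrix
    targets = torch.arange(m.size(0), device=m.device)
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.T, targets)) / 2
```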
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Recurrent Generic Contour-based Instance Segmentation with Progressive + Learning + + +
+ Contour-based instance segmentation has been actively studied, thanks to its +flexibility and elegance in processing visual objects within complex +backgrounds. In this work, we propose a novel deep network architecture, i.e., +PolySnake, for generic contour-based instance segmentation. Motivated by the +classic Snake algorithm, the proposed PolySnake achieves superior and robust +segmentation performance with an iterative and progressive contour refinement +strategy. Technically, PolySnake introduces a recurrent update operator to +estimate the object contour iteratively. It maintains a single estimate of the +contour that is progressively deformed toward the object boundary. At each +iteration, PolySnake builds a semantic-rich representation for the current +contour and feeds it to the recurrent operator for further contour adjustment. +Through the iterative refinements, the contour progressively converges to a +stable state that tightly encloses the object instance. Beyond the scope of +general instance segmentation, extensive experiments are conducted to validate +the effectiveness and generalizability of our PolySnake in two additional +specific task scenarios, including scene text detection and lane detection. The +results demonstrate that the proposed PolySnake outperforms the existing +advanced methods on multiple prevalent benchmarks across the three +tasks. The codes and pre-trained models are available at +https://github.com/fh2019ustc/PolySnake +
+
+
+
+
+ + ♻ ☆ The Art of Camouflage: Few-shot Learning for Animal Detection and + Segmentation + + +
+ Camouflaged object detection and segmentation is a new and challenging +research topic in computer vision. A serious issue is the lack of data on +camouflaged objects, such as camouflaged animals in natural scenes. In this +paper, we address the problem of few-shot learning for camouflaged object +detection and segmentation. To this end, we first collect a new dataset, +CAMO-FS, as a benchmark. We then propose a novel method to efficiently +detect and segment camouflaged objects in images. In particular, we +introduce an instance triplet loss and an instance memory storage. +Extensive experiments demonstrate that our proposed method achieves +state-of-the-art performance on the newly collected dataset. +
+
+ comment: Under-review Journal +
+
+
+
+
+ + ♻ ☆ Modulate Your Spectrum in Self-Supervised Learning ICLR 2024 + + +
+ Whitening loss offers a theoretical guarantee against feature collapse in +self-supervised learning (SSL) with joint embedding architectures. Typically, +it involves a hard whitening approach, transforming the embedding and applying +loss to the whitened output. In this work, we introduce Spectral Transformation +(ST), a framework to modulate the spectrum of embedding and to seek for +functions beyond whitening that can avoid dimensional collapse. We show that +whitening is a special instance of ST by definition, and our empirical +investigations unveil other ST instances capable of preventing collapse. +Additionally, we propose a novel ST instance named IterNorm with trace loss +(INTL). Theoretical analysis confirms INTL's efficacy in preventing collapse +and modulating the spectrum of embedding toward equal-eigenvalues during +optimization. Our experiments on ImageNet classification and COCO object +detection demonstrate INTL's potential in learning superior representations. +The code is available at https://github.com/winci-ai/INTL. + +
+
+ comment: Accepted at ICLR 2024. The code is available at + https://github.com/winci-ai/intl +
+
+
+
+
+ + ♻ ☆ Motion-Zero: Zero-Shot Moving Object Control Framework for + Diffusion-Based Video Generation + + +
+ Recent large-scale pre-trained diffusion models have demonstrated a powerful +generative ability to produce high-quality videos from detailed text +descriptions. However, exerting control over the motion of objects in videos +generated by any video diffusion model is a challenging problem. In this paper, +we propose a novel zero-shot moving object trajectory control framework, +Motion-Zero, to enable a bounding-box-trajectories-controlled text-to-video +diffusion model. To this end, an initial noise prior module is designed to +provide a position-based prior to improve the stability of the appearance of +the moving object and the accuracy of position. In addition, based on the +attention map of the U-net, spatial constraints are directly applied to the +denoising process of diffusion models, which further ensures the positional and +spatial consistency of moving objects during the inference. Furthermore, +temporal consistency is guaranteed with a proposed shift temporal attention +mechanism. Our method can be flexibly applied to various state-of-the-art video +diffusion models without any training process. Extensive experiments +demonstrate our proposed method can control the motion trajectories of objects +and generate high-quality videos. + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Computational Pathology: A Survey Review and The Way Forward + + +
+ Computational Pathology (CPath) is an interdisciplinary science that advances +the development of computational approaches to analyze and model medical +histopathology images. The main objective of CPath is to develop +infrastructure and workflows for digital diagnostics as an assistive CAD system +for clinical pathology, facilitating transformational changes in the diagnosis +and treatment of cancer that are mainly addressed by CPath tools. With +ever-growing developments in deep learning and computer vision algorithms, and +the ease of data flow from digital pathology, CPath is currently witnessing +a paradigm shift. Despite the sheer volume of engineering and scientific work +being introduced for cancer image analysis, there is still a considerable gap +in adopting and integrating these algorithms into clinical practice. This raises +a significant question regarding the direction and trends being undertaken +in CPath. In this article we provide a comprehensive review of more than 800 +papers to address the challenges faced, from problem design all the way to +application and implementation viewpoints. We have catalogued each paper into a +model-card by examining the key works and challenges faced, to lay out the +current landscape in CPath. We hope this helps the community to locate relevant +works and facilitates understanding of the field's future directions. In a +nutshell, we view CPath development as a cycle of stages which must +be cohesively linked together to address the challenges associated +with such a multidisciplinary science. We overview this cycle from the different +perspectives of data-centric, model-centric, and application-centric problems. +We finally sketch remaining challenges and provide directions for future +technical developments and clinical integration of CPath +(https://github.com/AtlasAnalyticsLab/CPath_Survey). +
+
+ comment: Accepted in Elsevier Journal of Pathology Informatics (JPI) 2024 +
+
+
+
+
+ + ♻ ☆ Look, Remember and Reason: Grounded reasoning in videos with language + models ICLR 2024 + + +
+ Multi-modal language models (LM) have recently shown promising performance in +high-level reasoning tasks on videos. However, existing methods still fall +short in tasks like causal or compositional spatiotemporal reasoning over +actions, in which model predictions need to be grounded in fine-grained +low-level details, such as object motions and object interactions. In this +work, we propose training an LM end-to-end on low-level surrogate tasks, +including object detection, re-identification, and tracking, to endow the model +with the required low-level visual capabilities. We show that a two-stream +video encoder with spatiotemporal attention is effective at capturing the +required static and motion-based cues in the video. By leveraging the LM's +ability to perform the low-level surrogate tasks, we can cast reasoning in +videos as the three-step process of Look, Remember, Reason wherein visual +information is extracted using low-level visual skills step-by-step and then +integrated to arrive at a final answer. We demonstrate the effectiveness of our +framework on diverse visual reasoning tasks from the ACRE, CATER, +Something-Else and STAR datasets. Our approach is trainable end-to-end and +surpasses state-of-the-art task-specific methods across these tasks by a large +margin. + +
+
+ comment: To appear at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Implicit Neural Image Stitching + + +
+ Existing frameworks for image stitching often produce visually reasonable +results. However, they suffer from blurry artifacts and disparities in +illumination, depth level, etc. Although recent learning-based stitching methods +reduce such disparities, they sacrifice image +quality and fail to capture high-frequency details in stitched images. To +address the problem, we propose a novel approach, implicit Neural Image +Stitching (NIS), which extends arbitrary-scale super-resolution. Our method +estimates Fourier coefficients of images for quality-enhancing warps. Then, the +suggested model blends color mismatches and misalignment in the latent space +and decodes the features into RGB values of stitched images. Our experiments +show that our approach resolves the low-definition +output of previous deep image stitching while offering favorably fast +image enhancement. Our source code is available at +https://github.com/minshu-kim/NIS. +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ Revisiting Document-Level Relation Extraction with Context-Guided Link + Prediction AAAI 2024 + + +
+ Document-level relation extraction (DocRE) poses the challenge of identifying +relationships between entities within a document as opposed to the traditional +RE setting where a single sentence is input. Existing approaches rely on +logical reasoning or contextual cues from entities. This paper reframes +document-level RE as link prediction over a knowledge graph with distinct +benefits: 1) Our approach combines entity context with document-derived logical +reasoning, enhancing link prediction quality. 2) Predicted links between +entities offer interpretability, elucidating employed reasoning. We evaluate +our approach on three benchmark datasets: DocRED, ReDocRED, and DWIE. The +results indicate that our proposed method outperforms the state-of-the-art +models and suggests that incorporating context-based link prediction techniques +can enhance the performance of document-level relation extraction models. + +
+
+ comment: Accepted in AAAI 2024 +
+
+
+
+
+ + ☆ Knowledge Navigation: Inferring the Interlocking Map of Knowledge from + Research Trajectories + + +
+ "If I have seen further, it is by standing on the shoulders of giants," Isaac +Newton's renowned statement hints that new knowledge builds upon existing +foundations, which means there exists an interdependent relationship between +knowledge, which, yet uncovered, is implied in the historical development of +scientific systems for hundreds of years. By leveraging natural language +processing techniques, this study introduces an innovative embedding scheme +designed to infer the "knowledge interlocking map." This map, derived from the +research trajectories of millions of scholars, reveals the intricate +connections among knowledge. We validate that the inferred map effectively +delineates disciplinary boundaries and captures the intricate relationships +between diverse concepts. The utility of the interlocking map is showcased +through multiple applications. Firstly, we demonstrated the multi-step analogy +inferences within the knowledge space and the functional connectivity between +concepts in different disciplines. Secondly, we trace the evolution of +knowledge across domains, observing trends such as shifts from "Theoretical" to +"Applied" or "Chemistry" to "Biomedical" along predefined functional +directions. Lastly, by analyzing the high-dimensional knowledge network +structure, we found that knowledge connects each other with shorter global +pathways, and the interdisciplinary knowledge plays a critical role in +accessibility of the global knowledge network. Our framework offers a novel +approach to mining knowledge inheritance pathways in extensive scientific +literature, which is of great significance for understanding scientific +development patterns, tailoring scientific learning trajectories, and +accelerating scientific progress. + +
+
+ comment: 28 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ Domain-Aware Cross-Attention for Cross-domain Recommendation + + +
+ Cross-domain recommendation (CDR) is an important method to improve +recommender system performance, especially when observations in target domains +are sparse. However, most existing cross-domain recommendation methods fail to fully +utilize the target domain's special features and are hard to generalize to +new domains. Their networks are also complex and not suitable for rapid +industrial deployment. Our method introduces a two-step domain-aware +cross-attention, extracting transferable features of the source domain at +different granularities, which allows the efficient expression of both domain and +user interests. In addition, we simplify the training process, and our model +can be easily deployed on new domains. We conduct experiments on both public +datasets and industrial datasets, and the experimental results demonstrate the +effectiveness of our method. We have also deployed the model in an online +advertising system and observed significant improvements in both +Click-Through Rate (CTR) and effective cost per mille (ECPM). +
+
+ comment: 6 pages, 1 figure +
+
+
+
+
+ + ☆ Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal + Contrastive EHR Modelling with Hierarchical Regularisation EACL 2024 + + +
+ Predicting next visit diagnosis using Electronic Health Records (EHR) is an +essential task in healthcare, critical for devising proactive future plans for +both healthcare providers and patients. Nonetheless, many preceding studies +have not sufficiently addressed the heterogeneous and hierarchical +characteristics inherent in EHR data, inevitably leading to sub-optimal +performance. To this end, we propose NECHO, a novel medical code-centric +multimodal contrastive EHR learning framework with hierarchical regularisation. +First, we integrate multifaceted information encompassing medical codes, +demographics, and clinical notes using a tailored network design and a pair of +bimodal contrastive losses, all of which pivot around a medical code +representation. We also regularise modality-specific encoders using parent-level +information in the medical ontology to learn the hierarchical structure of EHR +data. A series of experiments on MIMIC-III data demonstrates the effectiveness of +our approach. +
+
+ comment: Accepted to EACL 2024 (The 18th Conference of the European Chapter of + the Association for Computational Linguistics) +
+
+
+
+
+ + ♻ ☆ Hierarchical Locality Sensitive Hashing for Structured Data: A Survey + + +
+ Data similarity (or distance) computation is a fundamental research topic +which fosters a variety of similarity-based machine learning and data mining +applications. In big data analytics, it is impractical to compute the exact +similarity of data instances due to high computational cost. To this end, the +Locality Sensitive Hashing (LSH) technique has been proposed to provide +accurate estimators for various similarity measures between sets or vectors in +an efficient manner without the learning process. Structured data (e.g., +sequences, trees and graphs), which are composed of elements and relations +between the elements, are commonly seen in the real world, but the traditional +LSH algorithms cannot preserve the structure information represented as +relations between elements. In order to conquer the issue, researchers have +been devoted to the family of the hierarchical LSH algorithms. In this paper, +we explore the present progress of the research into hierarchical LSH from the +following perspectives: 1) Data structures, where we review various +hierarchical LSH algorithms for three typical data structures and uncover their +inherent connections; 2) Applications, where we review the hierarchical LSH +algorithms in multiple application scenarios; 3) Challenges, where we discuss +some potential challenges as future directions. + +
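As background for readers new to LSH, here is a minimal MinHash estimator for the Jaccard similarity of flat sets, the non-hierarchical building block that the surveyed hierarchical variants extend to sequences, trees, and graphs; the salted hashing scheme below is a simple illustrative choice, not taken from the survey.

```python
import random

def minhash_signature(items, num_hashes=128, seed=0):
    # One "hash function" per salt; keep the minimum hash over the set's elements.
    rng = random.Random(seed)
    salts = [rng.getrandbits(32) for _ in range(num_hashes)]
    return [min(hash((salt, x)) for x in items) for salt in salts]

def estimated_jaccard(sig_a, sig_b):
    # The fraction of matching positions estimates the Jaccard similarity.
    return sum(a == b for a, b in zip(sig_a, sig_b)) / len(sig_a)

# estimated_jaccard(minhash_signature({"a", "b", "c"}), minhash_signature({"b", "c", "d"}))
# approaches the true Jaccard similarity 2/4 = 0.5 as num_hashes grows.
```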
+
+
+
+
+ + ♻ ☆ Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale + Localization + + +
+ Advancements in nanotechnology and material science are paving the way toward +nanoscale devices that combine sensing, computing, data and energy storage, and +wireless communication. In precision medicine, these nanodevices show promise +for disease diagnostics, treatment, and monitoring from within the patients' +bloodstreams. Assigning the location of a sensed biological event with the +event itself, which is the main proposition of flow-guided in-body nanoscale +localization, would be immensely beneficial from the perspective of precision +medicine. The nanoscale nature of the nanodevices and the challenging +environment that the bloodstream represents, result in current flow-guided +localization approaches being constrained in their communication and +energy-related capabilities. The communication and energy constraints of the +nanodevices result in different features of raw data for flow-guided +localization, in turn affecting its performance. An analytical modeling of the +effects of imperfect communication and constrained energy causing intermittent +operation of the nanodevices on the raw data produced by the nanodevices would +be beneficial. Hence, we propose an analytical model of raw data for +flow-guided localization, where the raw data is modeled as a function of +communication and energy-related capabilities of the nanodevice. We evaluate +the model by comparing its output with the one obtained through the utilization +of a simulator for objective evaluation of flow-guided localization, featuring +comparably higher level of realism. Our results across a number of scenarios +and heterogeneous performance metrics indicate high similarity between the +model and simulator-generated raw datasets. + +
+
+ comment: 6 pages, 7 figures, 4 tables, 16 references +
+
+
+
+
+ + ♻ ☆ Medication Recommendation via Domain Knowledge Informed Deep Learning + + +
+ Medication recommendation is a fundamental yet crucial branch of healthcare, +which provides opportunities to support clinical physicians with more accurate +medication prescriptions for patients with complex health conditions. Learning +from electronic health records (EHR) to recommend medications is the most +common way in previous studies. However, most of them neglect incorporating +domain knowledge according to the clinical manifestations in the EHR of the +patient. To address these issues, we propose a novel \textbf{D}omain +\textbf{K}nowledge \textbf{I}nformed \textbf{Net}work (DKINet) to integrate +domain knowledge with observable clinical manifestations of the patient, which +is the first dynamic domain knowledge informed framework toward medication +recommendation. In particular, we first design a knowledge-driven encoder to +capture the domain information and then develop a data-driven encoder to +integrate domain knowledge into the observable EHR. To endow the model with the +capability of temporal decision, we design an explicit medication encoder for +learning the longitudinal dependence of the patient. Extensive experiments on +three publicly available datasets verify the superiority of our method. The +code will be public upon acceptance. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ A greedy approach for increased vehicle utilization in ridesharing + networks + + +
+ In recent years, ridesharing platforms have become a prominent mode of +transportation for the residents of urban areas. Route recommendation is a +fundamental problem for these platforms and is vital to their sustenance. Prior +work in this direction has recommended routes with higher passenger +demand. Despite this work, statistics suggest that these +services cause increased greenhouse emissions compared to private vehicles as +drivers roam around in search of riders. This analysis provides finer detail +on how ridesharing systems function and reveals that, despite +their boom, they have not utilized vehicle capacity efficiently. We +propose to overcome the above limitations by recommending routes that fetch +multiple passengers simultaneously, which results in increased vehicle +utilization and thereby decreases the effect of these systems on the +environment. As route recommendation is NP-hard, we propose a k-hop-based +sliding window approximation algorithm that reduces the search space from the +entire road network to a window. We further demonstrate that maximizing +expected demand is submodular, so greedy algorithms can be used to optimize our +objective function within a window. We evaluate our proposed model on +real-world datasets, and experimental results demonstrate its superior +performance. +
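The submodularity observation above is what licenses a simple greedy routine inside each window. A generic sketch of that greedy step follows; the segment representation, the expected-demand oracle, and the budget are illustrative placeholders, not the paper's exact algorithm.

```python
def greedy_route(candidate_segments, expected_demand, budget):
    """Greedily grow a route to maximize a monotone submodular expected-demand
    objective restricted to the current k-hop window."""
    route = set()
    while len(route) < budget:
        best_seg, best_gain = None, 0.0
        for seg in candidate_segments:
            if seg in route:
                continue
            gain = expected_demand(route | {seg}) - expected_demand(route)
            if gain > best_gain:
                best_seg, best_gain = seg, gain
        if best_seg is None:   # no segment adds positive marginal demand
            break
        route.add(best_seg)
    return route
```

For monotone submodular objectives, this greedy rule carries the classic (1 - 1/e) approximation guarantee.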
+
+
+
+
+ + ♻ ☆ Streamlining Social Media Information Extraction for Public Health + Research with Deep Learning + + +
+ Objective: Social media-based public health research is crucial for epidemic +surveillance, but most studies identify relevant corpora with keyword matching. +This study develops a system to streamline the process of curating colloquial +medical dictionaries. We demonstrate the pipeline by curating a UMLS-colloquial +symptom dictionary from COVID-19-related tweets as proof of concept. Methods: +COVID-19-related tweets from February 1, 2020, to April 30, 2022, were used. The +pipeline includes three modules: a named entity recognition module to detect +symptoms in tweets; an entity normalization module to aggregate detected +entities; and a mapping module that iteratively maps entities to Unified +Medical Language System concepts. A random sample of 500 entities was drawn from +the final dictionary for accuracy validation. Additionally, we conducted a +symptom frequency distribution analysis to compare our dictionary to a +pre-defined lexicon from previous research. Results: We identified 498,480 +unique symptom entity expressions from the tweets. Pre-processing reduced the +number to 18,226. The final dictionary contains 38,175 unique expressions of +symptoms that can be mapped to 966 UMLS concepts (accuracy = 95%). Symptom +distribution analysis found that our dictionary detects more symptoms and is +effective at identifying psychiatric disorders like anxiety and depression, +often missed by pre-defined lexicons. Conclusion: This study advances public +health research by implementing a novel, systematic pipeline for curating +symptom lexicons from social media data. The final lexicon's high accuracy, +validated by medical professionals, underscores the potential of this +methodology to reliably interpret and categorize vast amounts of unstructured +social media data into actionable medical insights across diverse linguistic +and regional landscapes. +
+
+ comment: Updated full paper. Abstract presented at IEEE ICHI 2023 and AMIA + Annual Symposium 2023 +
+
+
+
+
+
+
+
+ + Machine Learning 137 + +
+
+
+ + ☆ Exploring Simple Open-Vocabulary Semantic Segmentation + + +
+ Open-vocabulary semantic segmentation models aim to accurately assign a +semantic label to each pixel in an image from a set of arbitrary +open-vocabulary texts. In order to learn such pixel-level alignment, current +approaches typically rely on a combination of (i) an image-level VL model (e.g., +CLIP), (ii) ground truth masks, and (iii) custom grouping encoders. In this +paper, we introduce S-Seg, a novel model that can achieve surprisingly strong +performance without depending on any of the above elements. S-Seg leverages +pseudo-masks and language to train a MaskFormer, and can be easily trained from +publicly available image-text datasets. Contrary to prior works, our model +directly trains for pixel-level features and language alignment. Once trained, +S-Seg generalizes well to multiple testing datasets without requiring +fine-tuning. In addition, S-Seg has the extra benefits of scalability with data +and consistent improvement when augmented with self-training. We believe that +our simple yet effective approach will serve as a solid baseline for future +research. +
+
+ comment: Code is available at: https://github.com/zlai0/S-Seg +
+
+
+
+
+ + ☆ Mitigating Covariate Shift in Misspecified Regression with Applications + to Reinforcement Learning + + +
+ A pervasive phenomenon in machine learning applications is distribution +shift, where training and deployment conditions for a machine learning model +differ. As distribution shift typically results in a degradation in +performance, much attention has been devoted to algorithmic interventions that +mitigate these detrimental effects. In this paper, we study the effect of +distribution shift in the presence of model misspecification, specifically +focusing on $L_{\infty}$-misspecified regression and adversarial covariate +shift, where the regression target remains fixed while the covariate +distribution changes arbitrarily. We show that empirical risk minimization, or +standard least squares regression, can result in undesirable misspecification +amplification where the error due to misspecification is amplified by the +density ratio between the training and testing distributions. As our main +result, we develop a new algorithm -- inspired by robust optimization +techniques -- that avoids this undesirable behavior, resulting in no +misspecification amplification while still obtaining optimal statistical rates. +As applications, we use this regression procedure to obtain new guarantees in +offline and online reinforcement learning with misspecification and establish +new separations between previously studied structural conditions and notions of +coverage. + +
+
+
+
+
+ + ☆ Rate-Distortion-Perception Tradeoff Based on the + Conditional-Distribution Perception Measure + + +
+ We study the rate-distortion-perception (RDP) tradeoff for a memoryless +source model in the asymptotic limit of large block-lengths. Our perception +measure is based on a divergence between the distributions of the source and +reconstruction sequences conditioned on the encoder output, which was first +proposed in [1], [2]. We consider the case when there is no shared randomness +between the encoder and the decoder. For the case of discrete memoryless +sources we derive a single-letter characterization of the RDP function, thus +settling a problem that remains open for the marginal metric introduced in Blau +and Michaeli [3] (with no shared randomness). Our achievability scheme is based +on lossy source coding with a posterior reference map proposed in [4]. For the +case of continuous valued sources under squared error distortion measure and +squared quadratic Wasserstein perception measure we also derive a single-letter +characterization and show that a noise-adding mechanism at the decoder suffices +to achieve the optimal representation. For the case of zero perception loss, we +show that our characterization interestingly coincides with the results for the +marginal metric derived in [5], [6] and again demonstrate that zero perception +loss can be achieved with a $3$-dB penalty in the minimum distortion. Finally +we specialize our results to the case of Gaussian sources. We derive the RDP +function for vector Gaussian sources and propose a waterfilling type solution. +We also partially characterize the RDP function for a mixture of vector +Gaussians. + +
+
+
+
+
+ + ☆ Retrieval-Guided Reinforcement Learning for Boolean Circuit Minimization ICLR 2024 + + +
+ Logic synthesis, a pivotal stage in chip design, entails optimizing chip +specifications encoded in hardware description languages like Verilog into +highly efficient implementations using Boolean logic gates. The process +involves a sequential application of logic minimization heuristics (a "synthesis +recipe"), with their arrangement significantly impacting crucial metrics such +as area and delay. Addressing the challenge posed by the broad spectrum of +design complexities - from variations of past designs (e.g., adders and +multipliers) to entirely novel configurations (e.g., innovative processor +instructions) - requires a nuanced synthesis recipe guided by human expertise +and intuition. This study conducts a thorough examination of learning and +search techniques for logic synthesis, unearthing a surprising revelation: +pre-trained agents, when confronted with entirely novel designs, may veer off +course, detrimentally affecting the search trajectory. We present ABC-RL, which uses a +meticulously tuned $\alpha$ parameter to adjust recommendations from +pre-trained agents during the search process. Computed from similarity +scores obtained through nearest-neighbor retrieval from the training dataset, this parameter lets ABC-RL +yield superior synthesis recipes tailored to a wide array of hardware +designs. Our findings showcase substantial enhancements in the +quality-of-result (QoR) of synthesized circuits, with improvements of up to +24.8% compared to state-of-the-art techniques. Furthermore, ABC-RL achieves up to a +9x reduction in runtime (iso-QoR) when compared to current +state-of-the-art methodologies. +
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ☆ OK-Robot: What Really Matters in Integrating Open-Knowledge Models for + Robotics + + +
+ Remarkable progress has been made in recent years in the fields of vision, +language, and robotics. We now have vision models capable of recognizing +objects based on language queries, navigation systems that can effectively +control mobile systems, and grasping models that can handle a wide range of +objects. Despite these advancements, general-purpose applications of robotics +still lag behind, even though they rely on these fundamental capabilities of +recognition, navigation, and grasping. In this paper, we adopt a systems-first +approach to develop a new Open Knowledge-based robotics framework called +OK-Robot. By combining Vision-Language Models (VLMs) for object detection, +navigation primitives for movement, and grasping primitives for object +manipulation, OK-Robot offers an integrated solution for pick-and-drop +operations without requiring any training. To evaluate its performance, we run +OK-Robot in 10 real-world home environments. The results demonstrate that +OK-Robot achieves a 58.5% success rate in open-ended pick-and-drop tasks, +representing a new state-of-the-art in Open Vocabulary Mobile Manipulation +(OVMM) with nearly 1.8x the performance of prior work. In cleaner, uncluttered +environments, OK-Robot's performance increases to 82%. However, the most +important insight gained from OK-Robot is the critical role of nuanced details +when combining Open Knowledge systems like VLMs with robotic modules. Videos of +our experiments are available on our website: https://ok-robot.github.io +
+
+
+
+
+ + ☆ APT: Adaptive Pruning and Tuning Pretrained Language Models for + Efficient Training and Inference + + +
+ Fine-tuning and inference with large language models (LMs) are generally known +to be expensive. Parameter-efficient fine-tuning over pretrained LMs reduces +training memory by updating a small number of LM parameters but does not +improve inference efficiency. Structured pruning improves LM inference +efficiency by removing consistent parameter blocks, yet often increases +training memory and time. To improve both training and inference efficiency, we +introduce APT, which adaptively prunes and tunes parameters for LMs. At the +early stage of fine-tuning, APT dynamically adds salient tuning parameters for +fast and accurate convergence while discarding unimportant parameters for +efficiency. Compared to baselines, our experiments show that APT maintains up +to 98% of task performance when pruning RoBERTa and T5 models with 40% of parameters +left, while keeping 86.4% of LLaMA models' performance with 70% of parameters +remaining. Furthermore, APT speeds up LM fine-tuning by up to 8x and reduces +the training memory footprint of large LMs by up to 70%. +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ WARM: On the Benefits of Weight Averaged Reward Models + + +
+ Aligning large language models (LLMs) with human preferences through +reinforcement learning (RLHF) can lead to reward hacking, where LLMs exploit +failures in the reward model (RM) to achieve seemingly high rewards without +meeting the underlying objectives. We identify two primary challenges when +designing RMs to mitigate reward hacking: distribution shifts during the RL +process and inconsistencies in human preferences. As a solution, we propose +Weight Averaged Reward Models (WARM), first fine-tuning multiple RMs, then +averaging them in the weight space. This strategy follows the observation that +fine-tuned weights remain linearly mode connected when sharing the same +pre-training. By averaging weights, WARM improves efficiency compared to the +traditional ensembling of predictions, while improving reliability under +distribution shifts and robustness to preference inconsistencies. Our +experiments on summarization tasks, using best-of-N and RL methods, show that +WARM improves the overall quality and alignment of LLM predictions; for +example, a policy RL fine-tuned with WARM has a 79.4% win rate against a policy +RL fine-tuned with a single RM. +
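The weight-space averaging step described above is simple to express. A minimal PyTorch sketch follows, assuming the reward models share an architecture and were fine-tuned from the same pre-trained checkpoint; the file names and loading details are illustrative, not from the paper.

```python
import torch

def average_reward_models(state_dicts):
    # Element-wise average of parameters across M fine-tuned reward models.
    avg = {}
    for name in state_dicts[0]:
        avg[name] = torch.stack([sd[name].float() for sd in state_dicts]).mean(dim=0)
    return avg

# Illustrative usage (hypothetical checkpoints):
# state_dicts = [torch.load(f"rm_seed{i}.pt") for i in range(3)]
# reward_model.load_state_dict(average_reward_models(state_dicts))
```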
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ Universal Neurons in GPT2 Language Models + + +
+ A basic question within the emerging field of mechanistic interpretability is +the degree to which neural networks learn the same underlying mechanisms. In +other words, are neural mechanisms universal across different models? In this +work, we study the universality of individual neurons across GPT2 models +trained from different initial random seeds, motivated by the hypothesis that +universal neurons are likely to be interpretable. In particular, we compute +pairwise correlations of neuron activations over 100 million tokens for every +neuron pair across five different seeds and find that 1-5\% of neurons are +universal, that is, pairs of neurons which consistently activate on the same +inputs. We then study these universal neurons in detail, finding that they +usually have clear interpretations and taxonomize them into a small number of +neuron families. We conclude by studying patterns in neuron weights to +establish several universal functional roles of neurons in simple circuits: +deactivating attention heads, changing the entropy of the next token +distribution, and predicting the next token to (not) be within a particular +set. + +
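A sketch of the cross-seed activation-correlation computation described above, using NumPy; activation extraction is assumed to have been done separately, and the 0.5 threshold in the usage note is an illustrative choice rather than the paper's.

```python
import numpy as np

def best_cross_seed_correlation(acts_a, acts_b):
    """acts_a, acts_b: (num_tokens, num_neurons) activations of two models
    trained from different seeds, recorded on the same token stream.
    Returns each model-A neuron's highest Pearson correlation with any
    model-B neuron."""
    a = (acts_a - acts_a.mean(0)) / (acts_a.std(0) + 1e-8)
    b = (acts_b - acts_b.mean(0)) / (acts_b.std(0) + 1e-8)
    corr = (a.T @ b) / a.shape[0]      # (neurons_a, neurons_b) correlation matrix
    return corr.max(axis=1)

# Neurons whose best match exceeds, say, 0.5 across all seed pairs would be
# flagged as candidates for universality.
```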
+
+
+
+
+ + ☆ DITTO: Diffusion Inference-Time T-Optimization for Music Generation + + +
+ We propose Diffusion Inference-Time T-Optimization (DITTO), a general-purpose +framework for controlling pre-trained text-to-music diffusion models at +inference time by optimizing initial noise latents. Our method can be used to +optimize through any differentiable feature matching loss to achieve a target +(stylized) output and leverages gradient checkpointing for memory efficiency. +We demonstrate a surprisingly wide range of applications for music generation +including inpainting, outpainting, and looping as well as intensity, melody, +and musical structure control - all without ever fine-tuning the underlying +model. When we compare our approach against related training, guidance, and +optimization-based methods, we find DITTO achieves state-of-the-art performance +on nearly all tasks, including outperforming comparable approaches on +controllability, audio quality, and computational efficiency, thus opening the +door for high-quality, flexible, training-free control of diffusion models. +Sound examples can be found at https://DITTO-Music.github.io/web/. +
+
+
+
+
+ + ☆ SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning + Capabilities + + +
+ Understanding and reasoning about spatial relationships is a fundamental +capability for Visual Question Answering (VQA) and robotics. While Vision +Language Models (VLM) have demonstrated remarkable performance in certain VQA +benchmarks, they still lack capabilities in 3D spatial reasoning, such as +recognizing quantitative relationships of physical objects like distances or +size differences. We hypothesize that VLMs' limited spatial reasoning +capability is due to the lack of 3D spatial knowledge in training data and aim +to solve this problem by training VLMs with Internet-scale spatial reasoning +data. To this end, we present a system to facilitate this approach. We first +develop an automatic 3D spatial VQA data generation framework that scales up to +2 billion VQA examples on 10 million real-world images. We then investigate +various factors in the training recipe, including data quality, training +pipeline, and VLM architecture. Our work features the first internet-scale 3D +spatial reasoning dataset in metric space. By training a VLM on such data, we +significantly enhance its ability on both qualitative and quantitative spatial +VQA. Finally, we demonstrate that this VLM unlocks novel downstream +applications in chain-of-thought spatial reasoning and robotics due to its +quantitative estimation capability. Project website: +https://spatial-vlm.github.io/ + +
+
+
+
+
+ + ☆ Personalized Over-the-Air Federated Learning with Personalized + Reconfigurable Intelligent Surfaces ICASSP 2024 + + +
+ Over-the-air federated learning (OTA-FL) provides bandwidth-efficient +learning by leveraging the inherent superposition property of wireless +channels. Personalized federated learning balances performance for users with +diverse datasets, addressing real-life data heterogeneity. We propose the first +personalized OTA-FL scheme through multi-task learning, assisted by personal +reconfigurable intelligent surfaces (RIS) for each user. We take a cross-layer +approach that optimizes communication and computation resources for global and +personalized tasks in time-varying channels with imperfect channel state +information, using multi-task learning for non-i.i.d data. Our PROAR-PFed +algorithm adaptively designs power, local iterations, and RIS configurations. +We present convergence analysis for non-convex objectives and demonstrate that +PROAR-PFed outperforms state-of-the-art on the Fashion-MNIST dataset. + +
+
+ comment: Copyright 2024 IEEE. Published in ICASSP 2024, 14-19 April, Seoul, + Korea. Personal use of this material is permitted. However, permission to + reprint/republish this material for advertising or promotional purposes or + for creating new collective works for resale or redistribution to servers or + lists, or to reuse any copyrighted component of this work in other works, + must be obtained from the IEEE +
+
+
+
+
+ + ☆ VRMN-bD: A Multi-modal Natural Behavior Dataset of Immersive Human Fear + Responses in VR Stand-up Interactive Games + + +
+ Understanding and recognizing emotions are important and challenging issues +in the metaverse era. Understanding, identifying, and predicting fear, which is +one of the fundamental human emotions, in virtual reality (VR) environments +plays an essential role in immersive game development, scene development, and +next-generation virtual human-computer interaction applications. In this +article, we used VR horror games as a medium to analyze fear emotions by +collecting multi-modal data (posture, audio, and physiological signals) from 23 +players. We used an LSTM-based model to predict fear with accuracies of 65.31% +and 90.47% under 6-level classification (no fear and five different levels of +fear) and 2-level classification (no fear and fear), respectively. We +constructed a multi-modal natural behavior dataset of immersive human fear +responses (VRMN-bD) and compared it with existing relevant advanced datasets. +The results show that our dataset has fewer limitations in terms of collection +method, data scale and audience scope. We are unique and advanced in targeting +multi-modal datasets of fear and behavior in VR stand-up interactive +environments. Moreover, we discussed the implications of this work for +communities and applications. The dataset and pre-trained model are available +at https://github.com/KindOPSTAR/VRMN-bD. + +
+
+ comment: Accepted to IEEE VR 2024 +
+
+
+
+
+ + ☆ Evaluation of QCNN-LSTM for Disability Forecasting in Multiple Sclerosis + Using Sequential Multisequence MRI + + +
+ Introduction Quantum Convolutional Neural Network (QCNN)-Long Short-Term +Memory (LSTM) models were studied to provide sequential relationships for each +timepoint in MRIs of patients with Multiple Sclerosis (MS). In this pilot +study, we compared three QCNN-LSTM models for binary classification of MS +disability benchmarked against classical neural network architectures. Our +hypothesis is that quantum models will provide competitive performance. Methods +Matrix Product State (MPS), reverse Multistate Entanglement Renormalization +Ansatz (MERA), and Tree-Tensor Network (TTN) circuits were paired with LSTM +layer to process near-annual MRI data of patients diagnosed with MS. These were +benchmarked against a Visual Geometry Group (VGG)-LSTM and a Video Vision +Transformer (ViViT). Predicted logits were measured against ground truth labels +of each patient's Extended Disability Severity Score (EDSS) using binary +cross-entropy loss. Training/validation/holdout testing was partitioned using +5-fold cross validation with a total split of 60:20:20. Levene's test of +variance was used to measure statistical difference and Student's t-test for +paired model differences in mean. Results The MPS-LSTM, reverse MERA-LSTM, and +TTN-LSTM had holdout testing ROC-AUC of 0.70, 0.77, and 0.81, respectively +(p-value 0.915). VGG16-LSTM and ViViT performed similarly with ROC-AUC of 0.73 +and 0.77, respectively (p-value 0.631). Overall variance and mean were not +statistically significant (p-value 0.713), however, time to train was +significantly faster for the QCNN-LSTMs (39.4 sec per fold vs. 224 and 218, +respectively, p-value <0.001). Conclusion QCNN-LSTM models perform +competitively to their classical counterparts with greater efficiency in train +time. Clinically, these can add value in terms of efficiency to time-dependent +deep learning prediction of disease progression based upon medical imaging. + +
+
+
+
+
+ + ☆ NeuroSynt: A Neuro-symbolic Portfolio Solver for Reactive Synthesis + + +
+ We introduce NeuroSynt, a neuro-symbolic portfolio solver framework for +reactive synthesis. At the core of the solver lies a seamless integration of +neural and symbolic approaches to solving the reactive synthesis problem. To +ensure soundness, the neural engine is coupled with model checkers verifying +the predictions of the underlying neural models. The open-source implementation +of NeuroSynt provides an integration framework for reactive synthesis in which +new neural and state-of-the-art symbolic approaches can be seamlessly +integrated. Extensive experiments demonstrate its efficacy in handling +challenging specifications, enhancing the state-of-the-art reactive synthesis +solvers, with NeuroSynt contributing novel solves in the current SYNTCOMP +benchmarks. + +
+
+
+
+
+ + ☆ Out-of-Distribution Detection & Applications With Ablated Learned + Temperature Energy + + +
+ As deep neural networks become adopted in high-stakes domains, it is crucial +to be able to identify when inference inputs are Out-of-Distribution (OOD) so +that users can be alerted to likely drops in performance and calibration +despite high confidence. Among many others, existing methods use the following +two scores to do so without training on any a priori OOD examples: a learned +temperature and an energy score. In this paper we introduce Ablated Learned +Temperature Energy (or "AbeT" for short), a method which combines these prior +methods in novel ways with effective modifications. Due to these contributions, +AbeT lowers the False Positive Rate at $95\%$ True Positive Rate (FPR@95) by +$35.39\%$ in classification (averaged across all ID and OOD datasets measured) +compared to state of the art without training networks in multiple stages or +requiring hyperparameters or test-time backward passes. We additionally provide +empirical insights as to how our model learns to distinguish between +In-Distribution (ID) and OOD samples while only being explicitly trained on ID +samples via exposure to misclassified ID examples at training time. Lastly, we +show the efficacy of our method in identifying predicted bounding boxes and +pixels corresponding to OOD objects in object detection and semantic +segmentation, respectively - with an AUROC increase of $5.15\%$ in object +detection and both a decrease in FPR@95 of $41.48\%$ and an increase in AUPRC +of $34.20\%$ on average in semantic segmentation compared to previous state of +the art. +
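For orientation, the classic energy-based OOD score that this line of work builds on can be written in a few lines. The sketch below uses a fixed temperature, whereas AbeT's contribution involves a learned (and then ablated) temperature and further modifications not shown here.

```python
import torch

def energy_ood_score(logits, temperature=1.0):
    # E(x) = -T * logsumexp(f(x) / T); larger (less negative) values are
    # conventionally treated as more likely Out-of-Distribution.
    return -temperature * torch.logsumexp(logits / temperature, dim=-1)
```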
+
+
+
+
+ + ☆ Extracting Formulae in Many-Valued Logic from Deep Neural Networks + + +
+ We propose a new perspective on deep ReLU networks, namely as circuit +counterparts of Lukasiewicz infinite-valued logic -- a many-valued (MV) +generalization of Boolean logic. An algorithm for extracting formulae in MV +logic from deep ReLU networks is presented. As the algorithm applies to +networks with general, in particular also real-valued, weights, it can be used +to extract logical formulae from deep ReLU networks trained on data. + +
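To make the ReLU-Lukasiewicz correspondence concrete, the standard many-valued connectives on truth values in [0, 1] can be written exactly with ReLU units. These identities are textbook MV-logic material, not the paper's extraction algorithm, but they are what makes a translation between deep ReLU networks and MV-logic formulae plausible.

```python
def relu(z):
    return max(0.0, z)

def mv_neg(x):            # Lukasiewicz negation: 1 - x
    return 1.0 - x

def mv_strong_and(x, y):  # strong conjunction: max(0, x + y - 1)
    return relu(x + y - 1.0)

def mv_strong_or(x, y):   # strong disjunction: min(1, x + y)
    return 1.0 - relu(1.0 - x - y)

# mv_strong_or(0.4, 0.3) and mv_strong_and(0.9, 0.8) both evaluate to 0.7
# (up to float rounding), matching the Lukasiewicz truth functions.
```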
+
+
+
+
+ + ☆ On-Time Delivery in Crowdshipping Systems: An Agent-Based Approach Using + Streaming Data + + +
+ In parcel delivery, the "last mile" from the parcel hub to the customer is +costly, especially for time-sensitive delivery tasks that have to be completed +within hours after arrival. Recently, crowdshipping has attracted increased +attention as a new alternative to traditional delivery modes. In crowdshipping, +private citizens ("the crowd") perform short detours in their daily lives to +contribute to parcel delivery in exchange for small incentives. However, +achieving desirable crowd behavior is challenging as the crowd is highly +dynamic and consists of autonomous, self-interested individuals. Leveraging +crowdshipping for time-sensitive deliveries remains an open challenge. In this +paper, we present an agent-based approach to on-time parcel delivery with +crowds. Our system performs data stream processing on the couriers' smartphone +sensor data to predict delivery delays. Whenever a delay is predicted, the +system attempts to forge an agreement for transferring the parcel from the +current deliverer to a more promising courier nearby. Our experiments show that +through accurate delay predictions and purposeful task transfers many delays +can be prevented that would occur without our approach. + +
+
+
+
+
+ + ☆ LearnedWMP: Workload Memory Prediction Using Distribution of Query + Templates + + +
+ In a modern DBMS, working memory is frequently the limiting factor when +processing in-memory analytic query operations such as joins, sorting, and +aggregation. Existing resource estimation approaches for a DBMS estimate the +resource consumption of a query by computing an estimate of each individual +database operator in the query execution plan. Such an approach is slow and +error-prone as it relies upon simplifying assumptions, such as uniformity and +independence of the underlying data. Additionally, the existing approach +focuses on individual queries separately and does not factor in other queries +in the workload that may be executed concurrently. In this research, we are +interested in query performance optimization under concurrent execution of a +batch of queries (a workload). Specifically, we focus on predicting the memory +demand for a workload rather than providing separate estimates for each query +within it. We introduce the problem of workload memory prediction and formalize +it as a distribution regression problem. We propose Learned Workload Memory +Prediction (LearnedWMP) to improve and simplify estimating the working memory +demands of workloads. Through a comprehensive experimental evaluation, we show +that LearnedWMP reduces the memory estimation error of the +state-of-the-practice method by up to 47.6%. Compared to an alternative +single-query model, during training and inferencing, the LearnedWMP model and +its variants were 3x to 10x faster. Moreover, LearnedWMP-based models were at +least 50% smaller in most cases. Overall, the results demonstrate the +advantages of the LearnedWMP approach and its potential for a broader impact on +query performance optimization. + +
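A toy illustration of the workload-level idea described above: represent a batch of queries by its histogram over known query templates and regress memory demand on that histogram. The featurization details and the choice of regressor here are assumptions for illustration, not the paper's model.

```python
from collections import Counter
from sklearn.ensemble import GradientBoostingRegressor

def workload_histogram(workload_templates, all_templates):
    # workload_templates: list of template ids, one per query in the batch.
    counts = Counter(workload_templates)
    return [counts.get(t, 0) for t in all_templates]

# Illustrative training (data loading and template extraction assumed done):
# X = [workload_histogram(w, all_templates) for w in train_workloads]
# y = observed_peak_memory                      # one value per workload
# reg = GradientBoostingRegressor().fit(X, y)
# est = reg.predict([workload_histogram(new_workload, all_templates)])
```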
+
+
+
+
+ + ☆ West-of-N: Synthetic Preference Generation for Improved Reward Modeling + + +
+ The success of reinforcement learning from human feedback (RLHF) in language +model alignment is strongly dependent on the quality of the underlying reward +model. In this paper, we present a novel approach to improve reward model +quality by generating synthetic preference data, thereby augmenting the +training dataset with on-policy, high-quality preference pairs. Motivated by +the promising results of Best-of-N sampling strategies in language model +training, we extend their application to reward model training. This results in +a self-training strategy to generate preference pairs by selecting the best and +worst candidates in a pool of responses to a given query. Empirically, we find +that this approach improves the performance of any reward model, with an effect +comparable to the addition of a similar quantity of human preference data. This +work opens up new avenues of research for improving RLHF for language model +alignment, by offering synthetic preference generation as a solution to reward +modeling challenges. + +
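The core Best-of-N / Worst-of-N pairing described above reduces to a few lines once a response sampler and the current reward model are available; the interfaces below are assumed for illustration only.

```python
def west_of_n_pair(query, sample_response, reward_fn, n=8):
    """Generate one synthetic preference pair for `query`.
    sample_response(query) -> a candidate response (assumed policy interface).
    reward_fn(query, response) -> scalar score from the current reward model."""
    candidates = [sample_response(query) for _ in range(n)]
    ranked = sorted(candidates, key=lambda r: reward_fn(query, r))
    chosen, rejected = ranked[-1], ranked[0]   # best-of-N vs. worst-of-N
    return chosen, rejected
```

Pairs produced this way would then be added to the preference data for a further round of reward-model training, which is the self-training loop the abstract describes.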
+
+
+
+
+ + ☆ Collaborative Reinforcement Learning Based Unmanned Aerial Vehicle (UAV) + Trajectory Design for 3D UAV Tracking + + +
+ In this paper, the problem of using one active unmanned aerial vehicle (UAV) +and four passive UAVs to localize a 3D target UAV in real time is investigated. +In the considered model, each passive UAV receives reflection signals from the +target UAV, which are initially transmitted by the active UAV. The received +reflection signals allow each passive UAV to estimate the signal transmission +distance which will be transmitted to a base station (BS) for the estimation of +the position of the target UAV. Due to the movement of the target UAV, each +active/passive UAV must optimize its trajectory to continuously localize the +target UAV. Meanwhile, since the accuracy of the distance estimation depends on +the signal-to-noise ratio of the transmission signals, the active UAV must +optimize its transmit power. This problem is formulated as an optimization +problem whose goal is to jointly optimize the transmit power of the active UAV +and trajectories of both active and passive UAVs so as to maximize the target +UAV positioning accuracy. To solve this problem, a Z function decomposition +based reinforcement learning (ZD-RL) method is proposed. Compared to value +function decomposition based RL (VD-RL), the proposed method can find the +probability distribution of the sum of future rewards to accurately estimate +the expected value of the sum of future rewards thus finding better transmit +power of the active UAV and trajectories for both active and passive UAVs and +improving target UAV positioning accuracy. Simulation results show that the +proposed ZD-RL method can reduce the positioning errors by up to 39.4% and +64.6%, compared to VD-RL and independent deep RL methods, respectively. + +
+
+
+
+
+ + ☆ Spotting LLMs With Binoculars: Zero-Shot Detection of Machine-Generated + Text + + +
+ Detecting text generated by modern large language models is thought to be +hard, as both LLMs and humans can exhibit a wide range of complex behaviors. +However, we find that a score based on contrasting two closely related language +models is highly accurate at separating human-generated and machine-generated +text. Based on this mechanism, we propose a novel LLM detector that only +requires simple calculations using a pair of pre-trained LLMs. The method, +called Binoculars, achieves state-of-the-art accuracy without any training +data. It is capable of spotting machine text from a range of modern LLMs +without any model-specific modifications. We comprehensively evaluate +Binoculars on a number of text sources and in varied situations. Over a wide +range of document types, Binoculars detects over 90% of generated samples from +ChatGPT (and other LLMs) at a false positive rate of 0.01%, despite not being +trained on any ChatGPT data. + +
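A rough sketch of the two-model contrast idea, assuming Hugging Face-style causal LMs. It illustrates the flavor of a perplexity-to-cross-perplexity ratio and is not the paper's exact Binoculars formula, model pairing, or threshold.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def contrast_score(text, tokenizer, observer_lm, performer_lm):
    ids = tokenizer(text, return_tensors="pt").input_ids
    logits_obs = observer_lm(ids).logits[:, :-1]     # predictions for tokens 1..T
    logits_perf = performer_lm(ids).logits[:, :-1]
    targets = ids[:, 1:]

    # Log-perplexity of the observer model on the observed text.
    log_ppl = F.cross_entropy(logits_obs.transpose(1, 2), targets)

    # Cross-entropy of observer predictions against the performer's distribution.
    p_perf = F.softmax(logits_perf, dim=-1)
    x_ent = -(p_perf * F.log_softmax(logits_obs, dim=-1)).sum(-1).mean()

    return (log_ppl / x_ent).item()   # lower values suggest machine-generated text
```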
+
+ comment: 20 pages, code available at https://github.com/ahans30/Binoculars +
+
+
+
+
+ + ☆ Beyond TreeSHAP: Efficient Computation of Any-Order Shapley Interactions + for Tree Ensembles + + +
+ While shallow decision trees may be interpretable, larger ensemble models +like gradient-boosted trees, which often set the state of the art in machine +learning problems involving tabular data, still remain black box models. As a +remedy, the Shapley value (SV) is a well-known concept in explainable +artificial intelligence (XAI) research for quantifying additive feature +attributions of predictions. The model-specific TreeSHAP methodology solves the +exponential complexity for retrieving exact SVs from tree-based models. +Expanding beyond individual feature attribution, Shapley interactions reveal +the impact of intricate feature interactions of any order. In this work, we +present TreeSHAP-IQ, an efficient method to compute any-order additive Shapley +interactions for predictions of tree-based models. TreeSHAP-IQ is supported by +a mathematical framework that exploits polynomial arithmetic to compute the +interaction scores in a single recursive traversal of the tree, akin to Linear +TreeSHAP. We apply TreeSHAP-IQ on state-of-the-art tree ensembles and explore +interactions on well-established benchmark datasets. + +
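For readers unfamiliar with the underlying attribution concept, the Shapley value can be computed exactly by enumerating coalitions when the number of features is tiny. The brute-force sketch below only makes the definition concrete; TreeSHAP-style methods exist precisely to avoid this exponential cost on tree ensembles, and the payoff function interface is an assumption.

```python
from itertools import combinations
from math import factorial

def shapley_values(value_fn, features):
    """Exact Shapley values via coalition enumeration (exponential in len(features)).
    value_fn(subset) -> model payoff when only `subset` of features is present."""
    n = len(features)
    phi = {f: 0.0 for f in features}
    for f in features:
        others = [g for g in features if g != f]
        for k in range(n):
            for coalition in combinations(others, k):
                weight = factorial(k) * factorial(n - k - 1) / factorial(n)
                s = set(coalition)
                phi[f] += weight * (value_fn(s | {f}) - value_fn(s))
    return phi
```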
+
+
+
+
+ + ☆ Resource-constrained stereo singing voice cancellation + + +
+ We study the problem of stereo singing voice cancellation, a subtask of music +source separation, whose goal is to estimate an instrumental background from a +stereo mix. We explore how to achieve performance similar to large +state-of-the-art source separation networks starting from a small, efficient +model for real-time speech separation. Such a model is useful when memory and +compute are limited and singing voice processing has to run with limited +look-ahead. In practice, this is realised by adapting an existing mono model to +handle stereo input. Improvements in quality are obtained by tuning model +parameters and expanding the training set. Moreover, we highlight the benefits +a stereo model brings by introducing a new metric which detects attenuation +inconsistencies between channels. Our approach is evaluated using objective +offline metrics and a large-scale MUSHRA trial, confirming the effectiveness of +our techniques in stringent listening tests. + +
+
+
+
+
+ + ☆ The Dimension Strikes Back with Gradients: Generalization of Gradient + Methods in Stochastic Convex Optimization + + +
+ We study the generalization performance of gradient methods in the +fundamental stochastic convex optimization setting, focusing on its dimension +dependence. First, for full-batch gradient descent (GD) we give a construction +of a learning problem in dimension $d=O(n^2)$, where the canonical version of +GD (tuned for optimal performance of the empirical risk) trained with $n$ +training examples converges, with constant probability, to an approximate +empirical risk minimizer with $\Omega(1)$ population excess risk. Our bound +translates to a lower bound of $\Omega (\sqrt{d})$ on the number of training +examples required for standard GD to reach a non-trivial test error, answering +an open question raised by Feldman (2016) and Amir, Koren, and Livni (2021b) +and showing that a non-trivial dimension dependence is unavoidable. +Furthermore, for standard one-pass stochastic gradient descent (SGD), we show +that an application of the same construction technique provides a similar +$\Omega(\sqrt{d})$ lower bound for the sample complexity of SGD to reach a +non-trivial empirical error, despite achieving optimal test performance. This +again provides an exponential improvement in the dimension dependence compared +to previous work (Koren, Livni, Mansour, and Sherman, 2022), resolving an open +question left therein. + +
+
+
+
+
+ + ☆ NEUROSEC: FPGA-Based Neuromorphic Audio Security + + +
+ Neuromorphic systems, inspired by the complexity and functionality of the
+human brain, have attracted both academic and industrial attention due to their
+unparalleled potential across a wide range of applications. While their
+capabilities herald innovation, it is imperative to underscore that these
+computational paradigms, analogous to their traditional counterparts, are not
+impervious to security threats. Although the exploration of neuromorphic
+methodologies for image and video processing has been rigorously pursued, the
+realm of neuromorphic audio processing remains in its early stages. Our results
+highlight the robustness and precision of our FPGA-based neuromorphic system.
+Specifically, our system showcases a commendable balance between desired signal
+and background noise, efficient spike rate encoding, and unparalleled
+resilience against adversarial attacks such as FGSM and PGD. A standout feature
+of our framework is its detection rate of 94%, which, when compared to other
+methodologies, underscores its greater capability in identifying and mitigating
+threats at a commendable signal-to-noise ratio of 5.39 dB. Furthermore,
+neuromorphic computing and hardware security serve many sensor domains in
+mission-critical and privacy-preserving applications. + +

+
+ comment: Audio processing, FPGA, Hardware Security, Neuromorphic Computing +
+
+
+
+
+ + ☆ Fourier Transporter: Bi-Equivariant Robotic Manipulation in 3D + + +
+ Many complex robotic manipulation tasks can be decomposed into a sequence of
+pick and place actions. Training a robotic agent to learn this sequence over
+many different starting conditions typically requires many iterations or
+demonstrations, especially in 3D environments. In this work, we propose Fourier
+Transporter, which leverages the two-fold
+$\mathrm{SE}(d)\times\mathrm{SE}(d)$ symmetry in the pick-place problem to
+achieve much higher sample efficiency. Fourier Transporter is an open-loop
+behavior cloning method trained using expert demonstrations to predict
+pick-place actions on new environments. The method is constrained to
+incorporate the symmetries of the pick and place actions independently. It
+utilizes a fiber space Fourier transformation that allows for memory-efficient
+construction. We test our proposed network on the RLBench benchmark and achieve
+state-of-the-art results across various tasks. + +

+
+
+
+
+ + ☆ Momentum-SAM: Sharpness Aware Minimization without Computational + Overhead + + +
+ Sharpness Aware Minimization (SAM), a recently proposed optimization
+algorithm for deep neural networks, suggests perturbing parameters before the
+gradient calculation by a gradient ascent step to guide the optimization into
+parameter space regions of flat loss. While significant generalization
+improvements, and thus a reduction of overfitting, could be demonstrated, the
+computational cost is doubled due to the additionally needed gradient
+calculation, making SAM infeasible when computational capacity is limited.
+Motivated by Nesterov Accelerated Gradient (NAG), we propose Momentum-SAM
+(MSAM), which perturbs parameters in the direction of the accumulated momentum
+vector to achieve low sharpness without significant computational overhead or
+memory demands over SGD or Adam. We evaluate MSAM in detail and reveal insights
+on separable mechanisms of NAG, SAM and MSAM regarding training optimization
+and generalization. Code is available at
+https://github.com/MarlonBecker/MSAM. + +

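A minimal PyTorch sketch of the idea as stated in the abstract: perturb the weights along the accumulated momentum direction, evaluate the gradient there, undo the perturbation, then take the usual step. The perturbation radius `rho`, the sign convention, and the optimizer-state access are assumptions for illustration; the official code at the linked repository is authoritative.

```python
# Illustrative MSAM-style step (assumes torch.optim.SGD with momentum); the sign of the
# perturbation and the value of rho are assumptions, not the authors' exact recipe.
import torch

def msam_step(model, loss_fn, inputs, targets, optimizer, rho=0.3):
    params = [p for p in model.parameters() if p.requires_grad]
    momenta = [optimizer.state[p].get("momentum_buffer", torch.zeros_like(p)) for p in params]
    norm = torch.norm(torch.stack([m.norm() for m in momenta])) + 1e-12
    with torch.no_grad():                       # perturb along the momentum direction
        for p, m in zip(params, momenta):
            p.add_(m, alpha=rho / norm)
    loss = loss_fn(model(inputs), targets)      # gradient is evaluated at the perturbed point
    optimizer.zero_grad()
    loss.backward()
    with torch.no_grad():                       # undo the perturbation before stepping
        for p, m in zip(params, momenta):
            p.sub_(m, alpha=rho / norm)
    optimizer.step()
    return loss.item()
```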
+
+
+
+
+ + ☆ Multimodal Visual-Tactile Representation Learning through + Self-Supervised Contrastive Pre-Training + + +
+ The rapidly evolving field of robotics necessitates methods that can
+facilitate the fusion of multiple modalities. Specifically, when it comes to
+interacting with tangible objects, effectively combining visual and tactile
+sensory data is key to understanding and navigating the complex dynamics of the
+physical world, enabling a more nuanced and adaptable response to changing
+environments. Nevertheless, much of the earlier work in merging these two
+sensory modalities has relied on supervised methods utilizing datasets labeled
+by humans. This paper introduces MViTac, a novel methodology that leverages
+contrastive learning to integrate vision and touch sensations in a
+self-supervised fashion. By drawing on both sensory inputs, MViTac employs
+intra- and inter-modality losses for learning representations, resulting in
+enhanced material property classification and more adept grasping prediction.
+Through a series of experiments, we showcase the effectiveness of our method
+and its superiority over existing state-of-the-art self-supervised and
+supervised techniques. In evaluating our methodology, we focus on two distinct
+tasks: material classification and grasping success prediction. Our results
+indicate that MViTac facilitates the development of improved modality encoders,
+yielding more robust representations as evidenced by linear probing
+assessments. + +

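As a sketch of what intra- and inter-modality contrastive objectives can look like, the snippet below applies a standard InfoNCE loss to paired vision and touch embeddings; the encoder architectures, augmentations, temperature, and loss weights are placeholders rather than the MViTac configuration.

```python
# InfoNCE-style intra/inter-modality losses on paired embeddings (illustrative weights).
import torch
import torch.nn.functional as F

def info_nce(z_a, z_b, temperature=0.07):
    z_a, z_b = F.normalize(z_a, dim=-1), F.normalize(z_b, dim=-1)
    logits = z_a @ z_b.t() / temperature          # (N, N) similarities
    targets = torch.arange(z_a.size(0))           # matched pairs sit on the diagonal
    return F.cross_entropy(logits, targets)

def multimodal_loss(vision, tactile, vision_aug, tactile_aug):
    inter = 0.5 * (info_nce(vision, tactile) + info_nce(tactile, vision))
    intra = 0.5 * (info_nce(vision, vision_aug) + info_nce(tactile, tactile_aug))
    return inter + intra

v, t = torch.randn(32, 128), torch.randn(32, 128)
loss = multimodal_loss(v, t, v + 0.01 * torch.randn_like(v), t + 0.01 * torch.randn_like(t))
```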
+
+
+
+
+ + ☆ Robustness to distribution shifts of compressed networks for edge + devices + + +
+ Efficient DNNs need to be developed for deployment on edge devices with
+limited computational resources. However, the compressed networks often execute
+new tasks in a target domain that differs from the source domain on which the
+original network was trained. It is therefore important to investigate the
+robustness of compressed networks under two types of data distribution shifts:
+domain shifts and adversarial perturbations. In this study, we discover that
+compressed models are less robust to distribution shifts than their original
+networks. Interestingly, larger networks are more vulnerable to losing
+robustness than smaller ones, even when they are compressed to a similar size
+as the smaller networks. Furthermore, compact networks obtained by knowledge
+distillation are much more robust to distribution shifts than pruned networks.
+Finally, post-training quantization is a reliable method for achieving
+significant robustness to distribution shifts, and it outperforms both pruned
+and distilled models in terms of robustness. + +

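For concreteness, one of the compression routes discussed above, post-training quantization, can be sketched in a few lines of PyTorch; the toy model and layer choices are illustrative only.

```python
# Post-training dynamic quantization of a small network (toy example).
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 10)).eval()
quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

x = torch.randn(1, 128)
print(model(x).shape, quantized(x).shape)   # same interface, int8 Linear weights
```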
+
+
+
+
+ + ☆ TurboSVM-FL: Boosting Federated Learning through SVM Aggregation for + Lazy Clients + + +
+ Federated learning is a distributed collaborative machine learning paradigm
+that has gained strong momentum in recent years. In federated learning, a
+central server periodically coordinates models with clients and aggregates the
+models trained locally by clients without requiring access to local data.
+Despite its potential, the implementation of federated learning continues to
+encounter several challenges, predominantly the slow convergence that is
+largely due to data heterogeneity. The slow convergence becomes particularly
+problematic in cross-device federated learning scenarios where clients may be
+strongly limited by computing power and storage space, and hence counteracting
+methods that induce additional computation or memory cost on the client side,
+such as auxiliary objective terms and larger training iterations, can be
+impractical. In this paper, we propose a novel federated aggregation strategy,
+TurboSVM-FL, that poses no additional computation burden on the client side and
+can significantly accelerate convergence for federated classification tasks,
+especially when clients are "lazy" and train their models for only a few epochs
+before the next global aggregation. TurboSVM-FL extensively utilizes support
+vector machines to conduct selective aggregation and max-margin spread-out
+regularization on class embeddings. We evaluate TurboSVM-FL on multiple
+datasets including FEMNIST, CelebA, and Shakespeare using user-independent
+validation with non-iid data distribution. Our results show that TurboSVM-FL
+can significantly outperform existing popular algorithms on convergence rate
+and reduce communication rounds while delivering better test metrics including
+accuracy, F1 score, and MCC. + +

+
+
+
+
+ + ☆ Tensor-view Topological Graph Neural Network AISTATS 2024 + + +
+ Graph classification is an important learning task for graph-structured data.
+Graph neural networks (GNNs) have recently gained growing attention in graph
+learning and have shown significant improvements on many important graph
+problems. Despite their state-of-the-art performance, existing GNNs only use
+local information from a very limited neighborhood around each node, suffering
+from loss of multi-modal information and the overhead of excessive computation.
+To address these issues, we propose a novel Tensor-view Topological Graph
+Neural Network (TTG-NN), a class of simple yet effective topological deep
+learning models built upon persistent homology, graph convolution, and tensor
+operations. This new method incorporates tensor learning to simultaneously
+capture Tensor-view Topological (TT), as well as Tensor-view Graph (TG),
+structural information on both local and global levels. Computationally, to
+fully exploit graph topology and structure, we propose two flexible TT and TG
+representation learning modules that disentangle feature tensor aggregation and
+transformation and learn to preserve multi-modal structure with less
+computation. Theoretically, we derive high probability bounds on both the
+out-of-sample and in-sample mean squared approximation errors for our proposed
+Tensor Transformation Layer (TTL). Real data experiments show that the proposed
+TTG-NN outperforms 20 state-of-the-art methods on various graph benchmarks. + +

+
+ comment: Accepted at AISTATS 2024 +
+
+
+
+
+ + ☆ NLCG-Net: A Model-Based Zero-Shot Learning Framework for Undersampled + Quantitative MRI Reconstruction + + +
+ Typical quantitative MRI (qMRI) methods estimate parameter maps after image
+reconstruction, which is prone to biases and error propagation. We propose a
+Nonlinear Conjugate Gradient (NLCG) optimizer for model-based T2/T1 estimation,
+which incorporates U-Net regularization trained in a scan-specific manner. This
+end-to-end method directly estimates qMRI maps from undersampled k-space data
+using mono-exponential signal modeling with zero-shot scan-specific neural
+network regularization to enable high-fidelity T1 and T2 mapping. T2 and T1
+mapping results demonstrate the ability of the proposed NLCG-Net to improve
+estimation quality compared to subspace reconstruction at high accelerations. + +

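To make the signal model concrete, a toy mono-exponential T2 fit with a nonlinear conjugate gradient solver is sketched below; the echo times and noise level are illustrative, and this omits the undersampled k-space data fidelity and the scan-specific U-Net regularization of NLCG-Net.

```python
# Toy mono-exponential fit S(TE) = S0 * exp(-TE / T2) with SciPy's nonlinear CG solver.
import numpy as np
from scipy.optimize import minimize

TE = np.array([10.0, 30.0, 50.0, 70.0, 90.0])      # echo times in ms (illustrative)
rng = np.random.default_rng(0)
signal = 1.0 * np.exp(-TE / 60.0) + 0.01 * rng.standard_normal(TE.size)

def loss(params):
    S0, T2 = params
    return np.sum((S0 * np.exp(-TE / T2) - signal) ** 2)

fit = minimize(loss, x0=[0.5, 40.0], method="CG")   # nonlinear conjugate gradient
print(fit.x)                                        # roughly [1.0, 60.0]
```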
+
+ comment: 8 pages, 5 figures, submitted to International Society for Magnetic + Resonance in Medicine 2024 +
+
+
+
+
+ + ☆ HgbNet: predicting hemoglobin level/anemia degree from EHR data + + +
+ Anemia is a prevalent medical condition that typically requires invasive +blood tests for diagnosis and monitoring. Electronic health records (EHRs) have +emerged as valuable data sources for numerous medical studies. EHR-based +hemoglobin level/anemia degree prediction is non-invasive and rapid but still +faces some challenges due to the fact that EHR data is typically an irregular +multivariate time series containing a significant number of missing values and +irregular time intervals. To address these issues, we introduce HgbNet, a +machine learning-based prediction model that emulates clinicians' +decision-making processes for hemoglobin level/anemia degree prediction. The +model incorporates a NanDense layer with a missing indicator to handle missing +values and employs attention mechanisms to account for both local irregularity +and global irregularity. We evaluate the proposed method using two real-world +datasets across two use cases. In our first use case, we predict hemoglobin +level/anemia degree at moment T+1 by utilizing records from moments prior to +T+1. In our second use case, we integrate all historical records with +additional selected test results at moment T+1 to predict hemoglobin +level/anemia degree at the same moment, T+1. HgbNet outperforms the best +baseline results across all datasets and use cases. These findings demonstrate +the feasibility of estimating hemoglobin levels and anemia degree from EHR +data, positioning HgbNet as an effective non-invasive anemia diagnosis solution +that could potentially enhance the quality of life for millions of affected +individuals worldwide. To our knowledge, HgbNet is the first machine learning +model leveraging EHR data for hemoglobin level/anemia degree prediction. + +
+
+
+
+
+ + ☆ Integrating Statistical Significance and Discriminative Power in Pattern + Discovery + + +
+ Pattern discovery plays a central role in both descriptive and predictive
+tasks across multiple domains. Actionable patterns must meet rigorous
+statistical significance criteria and, in the presence of target variables,
+further uphold discriminative power. Our work addresses the underexplored area
+of guiding pattern discovery by integrating statistical significance and
+discriminative power criteria into state-of-the-art algorithms while preserving
+pattern quality. We also address how pattern quality thresholds, imposed by
+some algorithms, can be rectified to accommodate these additional criteria. To
+test the proposed methodology, we select the triclustering task as the guiding
+pattern discovery case and extend well-known greedy and multi-objective
+optimization triclustering algorithms, $\delta$-Trimax and TriGen, that use
+various pattern quality criteria, such as Mean Squared Residual (MSR), Least
+Squared Lines (LSL), and Multi Slope Measure (MSL). Results from three case
+studies show the role of the proposed methodology in discovering patterns with
+pronounced improvements in discriminative power and statistical significance
+without quality deterioration, highlighting its importance for guiding the
+search in a supervised manner. Although the proposed methodology is motivated
+by multivariate time series data, it can be straightforwardly extended to
+pattern discovery tasks involving multivariate, N-way (N>3), transactional, and
+sequential data structures.
+ Availability: The code is freely available at
+https://github.com/JupitersMight/MOF_Triclustering under the MIT license. + +

+
+
+
+
+ + ☆ Expert-Driven Monitoring of Operational ML Models + + +
+ We propose Expert Monitoring, an approach that leverages domain expertise to +enhance the detection and mitigation of concept drift in machine learning (ML) +models. Our approach supports practitioners by consolidating domain expertise +related to concept drift-inducing events, making this expertise accessible to +on-call personnel, and enabling automatic adaptability with expert oversight. + +
+
+
+
+
+ + ☆ Scaling Face Interaction Graph Networks to Real World Scenes + + +
+ Accurately simulating real world object dynamics is essential for various
+applications such as robotics, engineering, graphics, and design. To better
+capture complex real dynamics such as contact and friction, learned simulators
+based on graph networks have recently shown great promise. However, applying
+these learned simulators to real scenes comes with two major challenges: first,
+scaling learned simulators to handle the complexity of real world scenes, which
+can involve hundreds of objects each with complicated 3D shapes, and second,
+handling inputs from perception rather than 3D state information. Here we
+introduce a method which substantially reduces the memory required to run
+graph-based learned simulators. Based on this memory-efficient simulation
+model, we then present a perceptual interface in the form of editable NeRFs
+which can convert real-world scenes into a structured representation that can
+be processed by the graph network simulator. We show that our method uses
+substantially less memory than previous graph-based simulators while retaining
+their accuracy, and that the simulators learned in synthetic environments can
+be applied to real world scenes captured from multiple camera angles. This
+paves the way for expanding the application of learned simulators to settings
+where only perceptual information is available at inference time. + +

+
+ comment: 16 pages, 12 figures +
+
+
+
+
+ + ☆ Cross-Validation Conformal Risk Control + + +
+ Conformal risk control (CRC) is a recently proposed technique that applies
+post-hoc to a conventional point predictor to provide calibration guarantees.
+Generalizing conformal prediction (CP), with CRC, calibration is ensured for a
+set predictor that is extracted from the point predictor to control a risk
+function such as the probability of miscoverage or the false negative rate. The
+original CRC requires the available data set to be split between training and
+validation data sets. This can be problematic when data availability is
+limited, resulting in inefficient set predictors. In this paper, a novel CRC
+method is introduced that is based on cross-validation, rather than on a
+held-out validation set as in the original CRC. The proposed cross-validation
+CRC (CV-CRC) extends a version of the jackknife-minmax from CP to CRC, allowing
+for the control of a broader range of risk functions. CV-CRC is proved to offer
+theoretical guarantees on the average risk of the set predictor. Furthermore,
+numerical experiments show that CV-CRC can reduce the average set size with
+respect to CRC when the available data are limited. + +

+
+
+
+
+ + ☆ Bridging Evolutionary Algorithms and Reinforcement Learning: A + Comprehensive Survey + + +
+ Evolutionary Reinforcement Learning (ERL), which integrates Evolutionary +Algorithms (EAs) and Reinforcement Learning (RL) for optimization, has +demonstrated remarkable performance advancements. By fusing the strengths of +both approaches, ERL has emerged as a promising research direction. This survey +offers a comprehensive overview of the diverse research branches in ERL. +Specifically, we systematically summarize recent advancements in relevant +algorithms and identify three primary research directions: EA-assisted +optimization of RL, RL-assisted optimization of EA, and synergistic +optimization of EA and RL. Following that, we conduct an in-depth analysis of +each research direction, organizing multiple research branches. We elucidate +the problems that each branch aims to tackle and how the integration of EA and +RL addresses these challenges. In conclusion, we discuss potential challenges +and prospective future research directions across various research directions. + +
+
+
+
+
+ + ☆ RUMBoost: Gradient Boosted Random Utility Models + + +
+ This paper introduces the RUMBoost model, a novel discrete choice modelling +approach that combines the interpretability and behavioural robustness of +Random Utility Models (RUMs) with the generalisation and predictive ability of +deep learning methods. We obtain the full functional form of non-linear utility +specifications by replacing each linear parameter in the utility functions of a +RUM with an ensemble of gradient boosted regression trees. This enables +piece-wise constant utility values to be imputed for all alternatives directly +from the data for any possible combination of input variables. We introduce +additional constraints on the ensembles to ensure three crucial features of the +utility specifications: (i) dependency of the utilities of each alternative on +only the attributes of that alternative, (ii) monotonicity of marginal +utilities, and (iii) an intrinsically interpretable functional form, where the +exact response of the model is known throughout the entire input space. +Furthermore, we introduce an optimisation-based smoothing technique that +replaces the piece-wise constant utility values of alternative attributes with +monotonic piece-wise cubic splines to identify non-linear parameters with +defined gradient. We demonstrate the potential of the RUMBoost model compared +to various ML and Random Utility benchmark models for revealed preference mode +choice data from London. The results highlight the great predictive performance +and the direct interpretability of our proposed approach. Furthermore, the +smoothed attribute utility functions allow for the calculation of various +behavioural indicators and marginal utilities. Finally, we demonstrate the +flexibility of our methodology by showing how the RUMBoost model can be +extended to complex model specifications, including attribute interactions, +correlation within alternative error terms and heterogeneity within the +population. + +
+
+
+
+
+ + ☆ Benchmarking Large Multimodal Models against Common Corruptions + + +
+ This technical report aims to fill a deficiency in the assessment of large
+multimodal models (LMMs) by specifically examining the self-consistency of
+their outputs when subjected to common corruptions. We investigate the
+cross-modal interactions between text, image, and speech, encompassing four
+essential generation tasks: text-to-image, image-to-text, text-to-speech, and
+speech-to-text. We create a comprehensive benchmark, named MMCBench, that
+covers more than 100 popular LMMs (over 150 model checkpoints in total). A
+thorough evaluation under common corruptions is critical for practical
+deployment and facilitates a better understanding of the reliability of
+cutting-edge LMMs. The benchmarking code is available at
+https://github.com/sail-sg/MMCBench + +

+
+ comment: Technical report +
+
+
+
+
+ + ☆ Low-Tubal-Rank Tensor Recovery via Factorized Gradient Descent + + +
+ This paper considers the problem of recovering a tensor with an underlying
+low-tubal-rank structure from a small number of corrupted linear measurements.
+Traditional approaches tackling such a problem require the computation of the
+tensor Singular Value Decomposition (t-SVD), which is a computationally
+intensive process, rendering them impractical for dealing with large-scale
+tensors. Aiming to address this challenge, we propose an efficient and
+effective low-tubal-rank tensor recovery method based on a factorization
+procedure akin to the Burer-Monteiro (BM) method. Precisely, our fundamental
+approach involves decomposing a large tensor into two smaller factor tensors,
+followed by solving the problem through factorized gradient descent (FGD). This
+strategy eliminates the need for t-SVD computation, thereby reducing
+computational costs and storage requirements. We provide rigorous theoretical
+analysis to ensure the convergence of FGD under both noise-free and noisy
+situations. Additionally, it is worth noting that our method does not require
+the precise estimation of the tensor tubal-rank. Even in cases where the
+tubal-rank is slightly overestimated, our approach continues to demonstrate
+robust performance. A series of experiments have been carried out to
+demonstrate that, compared to other popular methods, our approach exhibits
+superior performance in multiple scenarios, in terms of faster computational
+speed and smaller convergence error. + +

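The factorization idea can be illustrated with a toy matrix analogue (the paper works with the tensor t-product, but the template is the same): parameterize the unknown by two small factors and run plain gradient descent on the measurement residual, with no SVD anywhere. Problem sizes, step size, and initialization below are illustrative, not the paper's settings.

```python
# Toy Burer-Monteiro-style factorized gradient descent for low-rank recovery (matrix case).
import numpy as np

rng = np.random.default_rng(0)
n, r, m = 30, 3, 600
X_true = (rng.standard_normal((n, r)) / np.sqrt(n)) @ (rng.standard_normal((r, n)) / np.sqrt(n))
A = rng.standard_normal((m, n * n)) / np.sqrt(m)     # random linear measurement operator
y = A @ X_true.ravel()

L_f = 0.1 * rng.standard_normal((n, r))              # factor variables, X ~ L_f @ R_f.T
R_f = 0.1 * rng.standard_normal((n, r))
lr = 0.1
for _ in range(3000):
    G = (A.T @ (A @ (L_f @ R_f.T).ravel() - y)).reshape(n, n)   # gradient w.r.t. full matrix
    L_f, R_f = L_f - lr * G @ R_f, R_f - lr * G.T @ L_f
print(np.linalg.norm(L_f @ R_f.T - X_true) / np.linalg.norm(X_true))  # small relative error
```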
+
+ comment: 13 pages, 4 figures +
+
+
+
+
+ + ☆ The Bigger the Better? Rethinking the Effective Model Scale in Long-term + Time Series Forecasting + + +
+ Long-term time series forecasting (LTSF) represents a critical frontier in
+time series analysis, distinguished by its focus on extensive input sequences,
+in contrast to the constrained lengths typical of traditional approaches. While
+longer sequences inherently convey richer information, potentially enhancing
+predictive precision, prevailing techniques often respond by escalating model
+complexity. These intricate models can inflate into millions of parameters,
+incorporating parameter-intensive elements like positional encodings,
+feed-forward networks and self-attention mechanisms. This complexity, however,
+leads to prohibitive model scale, particularly given the semantic simplicity of
+time series data. Motivated by the pursuit of parsimony, our research employs
+conditional correlation and auto-correlation as investigative tools, revealing
+significant redundancies within the input data. Leveraging these insights, we
+introduce the HDformer, a lightweight Transformer variant enhanced with
+hierarchical decomposition. This novel architecture not only inverts the
+prevailing trend toward model expansion but also accomplishes precise
+forecasting with drastically fewer computations and parameters. Remarkably,
+HDformer outperforms existing state-of-the-art LTSF models while requiring over
+99\% fewer parameters. Through this work, we advocate a paradigm shift in LTSF,
+emphasizing the importance of tailoring the model to the inherent dynamics of
+time series data, a timely reminder that in the realm of LTSF, bigger is not
+invariably better. + +

+
+
+
+
+ + ☆ Multimodal Deep Learning of Word-of-Mouth Text and Demographics to + Predict Customer Rating: Handling Consumer Heterogeneity in Marketing + + +
+ In the marketing field, understanding consumer heterogeneity, that is, the
+internal or psychological differences among consumers that cannot be captured
+by behavioral logs, has long been a critical challenge. However, many consumers
+today post their evaluations of specific products on online platforms, and
+these reviews can be a valuable source of such unobservable differences among
+consumers. Several previous studies have shown the validity of analyses based
+on the text modality; on the other hand, such analyses may not achieve
+sufficient predictive accuracy from text alone, as they do not include
+information readily available from cross-sectional data, such as consumer
+profile data. In addition, recent advances in machine learning techniques, such
+as large language models (LLMs) and multimodal learning, have made it possible
+to handle various kinds of data simultaneously, including textual data and
+traditional cross-sectional data, so that joint representations can be
+effectively obtained from multiple modalities. Therefore, this study constructs
+a product evaluation model that takes consumer heterogeneity into account
+through multimodal learning of online product reviews and consumer profile
+information. We also compare multiple models using different modalities or
+hyper-parameters to demonstrate the robustness of multimodal learning in
+marketing analysis. + +

+
+
+
+
+ + ☆ A Review of Physics-Informed Machine Learning Methods with Applications + to Condition Monitoring and Anomaly Detection + + +
+ This study presents a comprehensive overview of physics-informed machine
+learning (PIML) techniques in the context of condition monitoring. The central
+concept driving PIML is the incorporation of known physical laws and
+constraints into machine learning algorithms, enabling them to learn from
+available data while remaining consistent with physical principles. Through
+fusing domain knowledge with data-driven learning, PIML methods offer enhanced
+accuracy and interpretability in comparison to purely data-driven approaches.
+In this comprehensive survey, detailed examinations are performed with regard
+to the methodology by which known physical principles are integrated within
+machine learning frameworks, as well as their suitability for specific tasks
+within condition monitoring. Incorporation of physical knowledge into the ML
+model may be realized in a variety of ways, with each having its unique
+advantages and drawbacks. The distinct advantages and limitations of each
+methodology for the integration of physics within data-driven models are
+detailed, considering factors such as computational efficiency, model
+interpretability, and generalizability to different systems in condition
+monitoring and fault detection. Several case studies and works of literature
+utilizing this emerging concept are presented to demonstrate the efficacy of
+PIML in condition monitoring applications. The literature reviewed demonstrates
+the versatility and potential of PIML in condition monitoring. Novel PIML
+methods offer an innovative solution for addressing the complexities of
+condition monitoring and associated challenges. This comprehensive survey helps
+form the foundation for future work in the field. As the technology continues
+to advance, PIML is expected to play a crucial role in enhancing maintenance
+strategies, system reliability, and overall operational efficiency in
+engineering systems. + +

+
+ comment: Paper has been submitted for review to the journal Expert Systems + with Applications (December 31, 2023). 90 pages, 22 figures, 9 tables +
+
+
+
+
+ + ☆ Self-Labeling the Job Shop Scheduling Problem + + +
+ In this work, we propose a Self-Supervised training strategy specifically +designed for combinatorial problems. One of the main obstacles in applying +supervised paradigms to such problems is the requirement of expensive target +solutions as ground-truth, often produced with costly exact solvers. Inspired +by Semi- and Self-Supervised learning, we show that it is possible to easily +train generative models by sampling multiple solutions and using the best one +according to the problem objective as a pseudo-label. In this way, we +iteratively improve the model generation capability by relying only on its +self-supervision, completely removing the need for optimality information. We +prove the effectiveness of this Self-Labeling strategy on the Job Shop +Scheduling (JSP), a complex combinatorial problem that is receiving much +attention from the Reinforcement Learning community. We propose a generative +model based on the well-known Pointer Network and train it with our strategy. +Experiments on two popular benchmarks demonstrate the potential of this +approach as the resulting models outperform constructive heuristics and current +state-of-the-art Reinforcement Learning proposals. + +
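In pseudocode terms, the self-labeling loop described above is quite compact; the sketch below assumes a PyTorch-style policy exposing `sample` and `supervised_loss`, which are placeholder names rather than the authors' Pointer Network interface.

```python
# Generic self-labeling training step: sample candidates, keep the best under the
# problem objective (e.g. makespan for the JSP), and use it as a pseudo-label.
def self_labeling_step(policy, instance, objective, optimizer, n_samples=16):
    candidates = [policy.sample(instance) for _ in range(n_samples)]
    pseudo_label = min(candidates, key=objective)           # best sampled solution
    loss = policy.supervised_loss(instance, pseudo_label)   # teacher-force on the pseudo-label
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return objective(pseudo_label)
```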
+
+
+
+
+ + ☆ Adaptive Fusion of Multi-view Remote Sensing data for Optimal Sub-field + Crop Yield Prediction + + +
+ Accurate crop yield prediction is of utmost importance for informed +decision-making in agriculture, aiding farmers, and industry stakeholders. +However, this task is complex and depends on multiple factors, such as +environmental conditions, soil properties, and management practices. Combining +heterogeneous data views poses a fusion challenge, like identifying the +view-specific contribution to the predictive task. We present a novel +multi-view learning approach to predict crop yield for different crops +(soybean, wheat, rapeseed) and regions (Argentina, Uruguay, and Germany). Our +multi-view input data includes multi-spectral optical images from Sentinel-2 +satellites and weather data as dynamic features during the crop growing season, +complemented by static features like soil properties and topographic +information. To effectively fuse the data, we introduce a Multi-view Gated +Fusion (MVGF) model, comprising dedicated view-encoders and a Gated Unit (GU) +module. The view-encoders handle the heterogeneity of data sources with varying +temporal resolutions by learning a view-specific representation. These +representations are adaptively fused via a weighted sum. The fusion weights are +computed for each sample by the GU using a concatenation of the +view-representations. The MVGF model is trained at sub-field level with 10 m +resolution pixels. Our evaluations show that the MVGF outperforms conventional +models on the same task, achieving the best results by incorporating all the +data sources, unlike the usual fusion results in the literature. For Argentina, +the MVGF model achieves an R2 value of 0.68 at sub-field yield prediction, +while at field level evaluation (comparing field averages), it reaches around +0.80 across different countries. The GU module learned different weights based +on the country and crop-type, aligning with the variable significance of each +data source to the prediction task. + +
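The gated fusion described above (per-view encoders, a gating unit over the concatenated representations, and a weighted sum) can be sketched as follows; dimensions, the regression head, and the module layout are illustrative, not the exact MVGF architecture.

```python
# Sketch of a gated multi-view fusion head with per-sample fusion weights.
import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    def __init__(self, n_views, dim):
        super().__init__()
        self.gate = nn.Sequential(nn.Linear(n_views * dim, n_views), nn.Softmax(dim=-1))
        self.head = nn.Linear(dim, 1)                 # e.g. sub-field yield regression

    def forward(self, views):                         # list of (batch, dim) view encodings
        stacked = torch.stack(views, dim=1)           # (batch, n_views, dim)
        weights = self.gate(torch.cat(views, dim=-1)).unsqueeze(-1)
        fused = (weights * stacked).sum(dim=1)        # adaptive weighted sum over views
        return self.head(fused)

fusion = GatedFusion(n_views=3, dim=64)
print(fusion([torch.randn(8, 64) for _ in range(3)]).shape)   # torch.Size([8, 1])
```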
+
+
+
+
+ + ☆ Learning to Approximate Adaptive Kernel Convolution on Graphs AAAI 2024 + + +
+ Various Graph Neural Networks (GNNs) have been successful in analyzing data
+in non-Euclidean spaces; however, they have limitations such as oversmoothing,
+i.e., information becomes excessively averaged as the number of hidden layers
+increases. The issue stems from the intrinsic formulation of conventional graph
+convolution, where the nodal features are aggregated from a direct neighborhood
+per layer across all nodes in the graph. As setting a different number of
+hidden layers per node is infeasible, recent works leverage a diffusion kernel
+to redefine the graph structure and incorporate information from farther nodes.
+Unfortunately, such approaches suffer from heavy diagonalization of a graph
+Laplacian or learning a large transform matrix. In this regard, we propose a
+diffusion learning framework, where the range of feature aggregation is
+controlled by the scale of a diffusion kernel. For efficient computation, we
+derive closed-form derivatives of approximations of the graph convolution with
+respect to the scale, so that the node-wise range can be adaptively learned.
+With a downstream classifier, the entire framework is made trainable in an
+end-to-end manner. Our model is tested on various standard datasets for
+node-wise classification, achieving state-of-the-art performance, and it is
+also validated on a real-world brain network dataset for graph classification
+to demonstrate its practicality for Alzheimer's classification. + +

+
+ comment: 15 pages, Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Privacy-Preserving Data Fusion for Traffic State Estimation: A Vertical + Federated Learning Approach + + +
+ This paper proposes a privacy-preserving data fusion method for traffic state +estimation (TSE). Unlike existing works that assume all data sources to be +accessible by a single trusted party, we explicitly address data privacy +concerns that arise in the collaboration and data sharing between multiple data +owners, such as municipal authorities (MAs) and mobility providers (MPs). To +this end, we propose a novel vertical federated learning (FL) approach, FedTSE, +that enables multiple data owners to collaboratively train and apply a TSE +model without having to exchange their private data. To enhance the +applicability of the proposed FedTSE in common TSE scenarios with limited +availability of ground-truth data, we further propose a privacy-preserving +physics-informed FL approach, i.e., FedTSE-PI, that integrates traffic models +into FL. Real-world data validation shows that the proposed methods can protect +privacy while yielding similar accuracy to the oracle method without privacy +considerations. + +
+
+
+
+
+ + ☆ Sparse discovery of differential equations based on multi-fidelity + Gaussian process + + +
+ Sparse identification of differential equations aims to compute the analytic +expressions from the observed data explicitly. However, there exist two primary +challenges. Firstly, it exhibits sensitivity to the noise in the observed data, +particularly for the derivatives computations. Secondly, existing literature +predominantly concentrates on single-fidelity (SF) data, which imposes +limitations on its applicability due to the computational cost. In this paper, +we present two novel approaches to address these problems from the view of +uncertainty quantification. We construct a surrogate model employing the +Gaussian process regression (GPR) to mitigate the effect of noise in the +observed data, quantify its uncertainty, and ultimately recover the equations +accurately. Subsequently, we exploit the multi-fidelity Gaussian processes +(MFGP) to address scenarios involving multi-fidelity (MF), sparse, and noisy +observed data. We demonstrate the robustness and effectiveness of our +methodologies through several numerical experiments. + +
+
+
+
+
+ + ☆ Hallucination is Inevitable: An Innate Limitation of Large Language + Models + + +
+ Hallucination has been widely recognized to be a significant drawback for +large language models (LLMs). There have been many works that attempt to reduce +the extent of hallucination. These efforts have mostly been empirical so far, +which cannot answer the fundamental question whether it can be completely +eliminated. In this paper, we formalize the problem and show that it is +impossible to eliminate hallucination in LLMs. Specifically, we define a formal +world where hallucination is defined as inconsistencies between a computable +LLM and a computable ground truth function. By employing results from learning +theory, we show that LLMs cannot learn all of the computable functions and will +therefore always hallucinate. Since the formal world is a part of the real +world which is much more complicated, hallucinations are also inevitable for +real world LLMs. Furthermore, for real world LLMs constrained by provable time +complexity, we describe the hallucination-prone tasks and empirically validate +our claims. Finally, using the formal world framework, we discuss the possible +mechanisms and efficacies of existing hallucination mitigators as well as the +practical implications on the safe deployment of LLMs. + +
+
+
+
+
+ + ☆ Generalization and Informativeness of Conformal Prediction + + +
+ The safe integration of machine learning modules in decision-making processes +hinges on their ability to quantify uncertainty. A popular technique to achieve +this goal is conformal prediction (CP), which transforms an arbitrary base +predictor into a set predictor with coverage guarantees. While CP certifies the +predicted set to contain the target quantity with a user-defined tolerance, it +does not provide control over the average size of the predicted sets, i.e., +over the informativeness of the prediction. In this work, a theoretical +connection is established between the generalization properties of the base +predictor and the informativeness of the resulting CP prediction sets. To this +end, an upper bound is derived on the expected size of the CP set predictor +that builds on generalization error bounds for the base predictor. The derived +upper bound provides insights into the dependence of the average size of the CP +set predictor on the amount of calibration data, the target reliability, and +the generalization performance of the base predictor. The theoretical insights +are validated using simple numerical regression and classification tasks. + +
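To make "coverage" versus "average set size" concrete, a standard split conformal interval for regression is sketched below; it is only the textbook construction the paper builds on, not the paper's bound, and the data are synthetic.

```python
# Split conformal regression intervals: the calibration quantile of |y - f(x)| sets the width.
import numpy as np

def conformal_intervals(cal_residuals, test_preds, alpha=0.1):
    n = len(cal_residuals)
    level = np.ceil((n + 1) * (1 - alpha)) / n               # finite-sample correction
    q = np.quantile(cal_residuals, level, method="higher")
    return test_preds - q, test_preds + q                    # width 2q = informativeness

rng = np.random.default_rng(0)
cal_residuals = np.abs(rng.normal(0.0, 1.0, 500))            # |y - f(x)| on calibration data
lo, hi = conformal_intervals(cal_residuals, rng.normal(0.0, 1.0, 10))
print(float(hi[0] - lo[0]))                                   # interval width ~ 2 * 90% quantile
```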
+
+
+
+
+ + ☆ Knowledge Distillation on Spatial-Temporal Graph Convolutional Network + for Traffic Prediction + + +
+ Efficient real-time traffic prediction is crucial for reducing transportation
+time. To predict traffic conditions, we employ a spatio-temporal graph neural
+network (ST-GNN) to model our real-time traffic data as temporal graphs.
+Despite its capabilities, it often encounters challenges in delivering
+efficient real-time predictions for real-world traffic data. Recognizing the
+significance of timely prediction due to the dynamic nature of real-time data,
+we employ knowledge distillation (KD) as a solution to enhance the execution
+time of ST-GNNs for traffic prediction. In this paper, we introduce a cost
+function designed to train a network with fewer parameters (the student) using
+distilled data from a complex network (the teacher) while maintaining its
+accuracy close to that of the teacher. We use knowledge distillation,
+incorporating spatial-temporal correlations from the teacher network, to enable
+the student to learn the complex patterns perceived by the teacher. However, a
+challenge arises in determining the student network architecture in a
+principled way rather than choosing it arbitrarily. To address this challenge,
+we propose an algorithm that utilizes the cost function to calculate pruning
+scores, addressing the search for a small network architecture, and jointly
+fine-tunes the network resulting from each pruning stage using KD. Ultimately,
+we evaluate our proposed ideas on two real-world datasets, PeMSD7 and PeMSD8.
+The results indicate that our method can maintain the student's accuracy close
+to that of the teacher, even when retaining only $3\%$ of the network
+parameters. + +

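The distillation objective for a regression-style traffic predictor can be sketched as below; the weighting `alpha` and the plain MSE terms are illustrative, while the paper's cost function additionally encodes spatial-temporal correlations and drives the pruning scores.

```python
# Simple response-based distillation loss for a student traffic predictor (illustrative).
import torch.nn.functional as F

def kd_loss(student_pred, teacher_pred, targets, alpha=0.7):
    distill = F.mse_loss(student_pred, teacher_pred)   # match the teacher's predictions
    task = F.mse_loss(student_pred, targets)           # match the ground-truth traffic data
    return alpha * distill + (1 - alpha) * task
```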
+
+
+
+
+ + ☆ Safe and Generalized end-to-end Autonomous Driving System with + Reinforcement Learning and Demonstrations + + +
+ An intelligent driving system should be capable of dynamically formulating
+appropriate driving strategies based on the current environment and vehicle
+status, while ensuring the security and reliability of the system. However,
+existing methods based on reinforcement learning and imitation learning suffer
+from low safety, poor generalization, and inefficient sampling. Additionally,
+they cannot accurately predict future driving trajectories, and accurate
+prediction of future driving trajectories is a precondition for making optimal
+decisions. To solve these problems, in this paper, we introduce a Safe and
+Generalized end-to-end Autonomous Driving System (SGADS) for complex and
+various scenarios. Our SGADS incorporates variational inference with
+normalizing flows, enabling the intelligent vehicle to accurately predict
+future driving trajectories. Moreover, we propose the formulation of robust
+safety constraints. Furthermore, we combine reinforcement learning with
+demonstrations to augment the agent's search process. The experimental results
+demonstrate that our SGADS can significantly improve safety performance,
+exhibit strong generalization, and enhance the training efficiency of
+intelligent vehicles in complex urban scenarios compared to existing methods. + +

+
+
+
+
+ + ☆ SemPLeS: Semantic Prompt Learning for Weakly-Supervised Semantic + Segmentation + + +
+ Weakly-Supervised Semantic Segmentation (WSSS) aims to train segmentation
+models using training image data with only image-level supervision. Since
+precise pixel-level annotations are not accessible, existing methods typically
+focus on producing pseudo masks for training segmentation models by refining
+CAM-like heatmaps. However, the produced heatmaps may only capture
+discriminative image regions of target object categories or the associated
+co-occurring backgrounds. To address these issues, we propose a Semantic Prompt
+Learning for WSSS (SemPLeS) framework, which learns to effectively prompt the
+CLIP space to enhance the semantic alignment between the segmented regions and
+the target object categories. More specifically, we propose Contrastive Prompt
+Learning and Class-associated Semantic Refinement to learn the prompts that
+adequately describe and suppress the image backgrounds associated with each
+target object category. In this way, our proposed framework is able to perform
+better semantic matching between object regions and the associated text labels,
+resulting in the desired pseudo masks for training the segmentation model. The
+proposed SemPLeS framework achieves SOTA performance on the standard WSSS
+benchmarks, PASCAL VOC and MS COCO, and demonstrates interpretability through
+the semantic visualization of our learned prompts. The code will be released. + +

+
+
+
+
+ + ☆ LightDiC: A Simple yet Effective Approach for Large-scale Digraph + Representation Learning + + +
+ Most existing graph neural networks (GNNs) are limited to undirected graphs, +whose restricted scope of the captured relational information hinders their +expressive capabilities and deployments in real-world scenarios. Compared with +undirected graphs, directed graphs (digraphs) fit the demand for modeling more +complex topological systems by capturing more intricate relationships between +nodes, such as formulating transportation and financial networks. While some +directed GNNs have been introduced, their inspiration mainly comes from deep +learning architectures, which lead to redundant complexity and computation, +making them inapplicable to large-scale databases. To address these issues, we +propose LightDiC, a scalable variant of the digraph convolution based on the +magnetic Laplacian. Since topology-related computations are conducted solely +during offline pre-processing, LightDiC achieves exceptional scalability, +enabling downstream predictions to be trained separately without incurring +recursive computational costs. Theoretical analysis shows that LightDiC +utilizes directed information to achieve message passing based on the complex +field, which corresponds to the proximal gradient descent process of the +Dirichlet energy optimization function from the perspective of digraph signal +denoising, ensuring its expressiveness. Experimental results demonstrate that +LightDiC performs comparably well or even outperforms other SOTA methods in +various downstream tasks, with fewer learnable parameters and higher training +efficiency. Notably, LightDiC is the first DiGNN to provide satisfactory +results in the most representative large-scale database (ogbn-papers100M). + +
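The magnetic Laplacian that such digraph convolutions build on can be written down in a few lines; the charge parameter q and the lack of normalization below are modelling choices for illustration and may differ from the paper's exact operator.

```python
# Magnetic Laplacian of a digraph: Hermitian, with edge direction encoded in the phase.
import numpy as np

def magnetic_laplacian(A, q=0.25):
    A_s = 0.5 * (A + A.T)                    # symmetrized weights
    theta = 2.0 * np.pi * q * (A - A.T)      # direction-dependent phase
    H = A_s * np.exp(1j * theta)             # Hermitian "magnetic" adjacency
    return np.diag(A_s.sum(axis=1)) - H

A = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0]], dtype=float)   # directed 3-cycle
L = magnetic_laplacian(A)
print(np.allclose(L, L.conj().T))            # True: real eigenvalues, usable for spectral filters
```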
+
+ comment: Under Review +
+
+
+
+
+ + ☆ ADA-GNN: Atom-Distance-Angle Graph Neural Network for Crystal Material + Property Prediction + + +
+ Property prediction is a fundamental task in crystal material research. To
+model atoms and structures, graph representations are widely used, and graph
+learning-based methods have achieved significant progress. Bond angles and bond
+distances are two key pieces of structural information that greatly influence
+crystal properties. However, most of the existing works only consider bond
+distances and overlook bond angles. The main challenge lies in the time cost of
+handling bond angles, which leads to a significant increase in inference time.
+To solve this issue, we first propose a crystal structure modeling approach
+based on a dual-scale neighbor partitioning mechanism, which uses a larger
+cutoff for edge neighbors and a smaller cutoff for angle neighbors. Then, we
+propose a novel Atom-Distance-Angle Graph Neural Network (ADA-GNN) for property
+prediction tasks, which can process node information and structural information
+separately. The accuracy of predictions and the inference time are improved
+with the dual-scale modeling and the specially designed architecture of
+ADA-GNN. The experimental results validate that our approach achieves
+state-of-the-art results on two large-scale material benchmark datasets for
+property prediction tasks. + +

+
+
+
+
+ + ☆ Towards Effective and General Graph Unlearning via Mutual Evolution AAAI 2024 + + +
+ With the rapid advancement of AI applications, the growing needs for data
+privacy and model robustness have highlighted the importance of machine
+unlearning, especially in thriving graph-based scenarios. However, most
+existing graph unlearning strategies primarily rely on well-designed
+architectures or manual processes, rendering them less user-friendly and posing
+challenges in terms of deployment efficiency. Furthermore, striking a balance
+between unlearning performance and framework generalization is also a pivotal
+concern. To address the above issues, we propose \underline{\textbf{M}}utual
+\underline{\textbf{E}}volution \underline{\textbf{G}}raph
+\underline{\textbf{U}}nlearning (MEGU), a new mutual evolution paradigm that
+simultaneously evolves the predictive and unlearning capacities of graph
+unlearning. By incorporating the aforementioned two components, MEGU ensures
+complementary optimization in a unified training framework that aligns with the
+prediction and unlearning requirements. Extensive experiments on 9 graph
+benchmark datasets demonstrate the superior performance of MEGU in addressing
+unlearning requirements at the feature, node, and edge levels. Specifically,
+MEGU achieves average performance improvements of 2.7\%, 2.5\%, and 3.2\%
+across these three levels of unlearning tasks when compared to state-of-the-art
+baselines. Furthermore, MEGU exhibits satisfactory training efficiency,
+reducing time and space overhead by an average of 159.8x and 9.6x,
+respectively, in comparison to retraining GNN from scratch. + +

+
+ comment: Accepted by AAAI 2024 Oral +
+
+
+
+
+ + ☆ FedGTA: Topology-aware Averaging for Federated Graph Learning VLDB 2024 + + +
+ Federated Graph Learning (FGL) is a distributed machine learning paradigm
+that enables collaborative training on large-scale subgraphs across multiple
+local systems. Existing FGL studies fall into two categories: (i) FGL
+Optimization, which improves multi-client training in existing machine learning
+models; (ii) FGL Model, which enhances performance with complex local models
+and multi-client interactions. However, most FGL optimization strategies are
+designed specifically for the computer vision domain and ignore graph
+structure, resulting in unsatisfactory performance and slow convergence.
+Meanwhile, the complex local model architectures in FGL Model studies lack
+scalability for handling large-scale subgraphs and have deployment limitations.
+To address these issues, we propose Federated Graph Topology-aware Aggregation
+(FedGTA), a personalized optimization strategy that optimizes through
+topology-aware local smoothing confidence and mixed neighbor features. During
+experiments, we deploy FedGTA on 12 multi-scale real-world datasets with the
+Louvain and Metis split. This allows us to evaluate the performance and
+robustness of FedGTA across a range of scenarios. Extensive experiments
+demonstrate that FedGTA achieves state-of-the-art performance while exhibiting
+high scalability and efficiency. The experiments include ogbn-papers100M, the
+most representative large-scale graph database, so that we can verify the
+applicability of our method to large-scale graph learning. To the best of our
+knowledge, our study is the first to bridge large-scale graph learning with FGL
+using this optimization strategy, contributing to the development of efficient
+and scalable FGL methods. + +

+
+ comment: Accepted by VLDB 2024 +
+
+
+
+
+ + ☆ AdaFGL: A New Paradigm for Federated Node Classification with Topology + Heterogeneity ICDE 2024 + + +
+ Recently, Federated Graph Learning (FGL) has attracted significant attention +as a distributed framework based on graph neural networks, primarily due to its +capability to break data silos. Existing FGL studies employ community split on +the homophilous global graph by default to simulate federated semi-supervised +node classification settings. Such a strategy assumes the consistency of +topology between the multi-client subgraphs and the global graph, where +connected nodes are highly likely to possess similar feature distributions and +the same label. However, in real-world implementations, the varying +perspectives of local data engineering result in various subgraph topologies, +posing unique heterogeneity challenges in FGL. Unlike the well-known label +Non-independent identical distribution (Non-iid) problems in federated +learning, FGL heterogeneity essentially reveals the topological divergence +among multiple clients, namely homophily or heterophily. To simulate and handle +this unique challenge, we introduce the concept of structure Non-iid split and +then present a new paradigm called \underline{Ada}ptive \underline{F}ederated +\underline{G}raph \underline{L}earning (AdaFGL), a decoupled two-step +personalized approach. To begin with, AdaFGL employs standard multi-client +federated collaborative training to acquire the federated knowledge extractor +by aggregating uploaded models in the final round at the server. Then, each +client conducts personalized training based on the local subgraph and the +federated knowledge extractor. Extensive experiments on the 12 graph benchmark +datasets validate the superior performance of AdaFGL over state-of-the-art +baselines. Specifically, in terms of test accuracy, our proposed AdaFGL +outperforms baselines by significant margins of 3.24\% and 5.57\% on community +split and structure Non-iid split, respectively. + +
+
+ comment: Accepted by ICDE 2024 +
+
+
+
+
+ + ☆ GI-PIP: Do We Require Impractical Auxiliary Dataset for Gradient + Inversion Attacks? ICASSP 2024 + + +
+ Deep gradient inversion attacks pose a serious threat to Federated Learning
+(FL) by accurately recovering private data from shared gradients. However, the
+state-of-the-art methods rely heavily on impractical assumptions of access to
+excessive auxiliary data, which violates the basic data partitioning principle
+of FL. In this paper, a novel method, Gradient Inversion Attack using Practical
+Image Prior (GI-PIP), is proposed under a revised threat model. GI-PIP exploits
+anomaly detection models to capture the underlying distribution from less data,
+whereas GAN-based methods consume significantly more data to synthesize images.
+The extracted distribution is then leveraged to regulate the attack process as
+an Anomaly Score loss. Experimental results show that GI-PIP achieves a 16.12
+dB PSNR recovery using only 3.8\% of the ImageNet data, while GAN-based methods
+necessitate over 70\%. Moreover, GI-PIP exhibits superior distribution
+generalization compared to GAN-based methods. Our approach significantly
+alleviates the auxiliary data requirement in both amount and distribution for
+gradient inversion attacks, hence posing a more substantial threat to
+real-world FL. + +

+
+ comment: 5pages, 5 figures, accepted to ICASSP 2024, not published yet +
+
+
+
+
+ + ☆ Multi-level Cross-modal Alignment for Image Clustering + + +
+ Recently, cross-modal pretraining models have been employed to produce
+meaningful pseudo-labels to supervise the training of an image clustering
+model. However, numerous erroneous alignments in a cross-modal pre-training
+model could produce poor-quality pseudo-labels and degrade clustering
+performance. To solve the aforementioned issue, we propose a novel
+\textbf{Multi-level Cross-modal Alignment} method to improve the alignments in
+a cross-modal pretraining model for downstream tasks, by building a smaller but
+better semantic space and aligning the images and texts at three levels, i.e.,
+instance-level, prototype-level, and semantic-level. Theoretical results show
+that our proposed method converges, and suggest effective means to reduce the
+expected clustering risk of our method. Experimental results on five benchmark
+datasets clearly show the superiority of our new method. + +

+
+
+
+
+ + ☆ EmerDiff: Emerging Pixel-level Semantic Knowledge in Diffusion Models ICLR 2024 + + +
+ Diffusion models have recently received increasing research attention for +their remarkable transfer abilities in semantic segmentation tasks. However, +generating fine-grained segmentation masks with diffusion models often requires +additional training on annotated datasets, leaving it unclear to what extent +pre-trained diffusion models alone understand the semantic relations of their +generated images. To address this question, we leverage the semantic knowledge +extracted from Stable Diffusion (SD) and aim to develop an image segmentor +capable of generating fine-grained segmentation maps without any additional +training. The primary difficulty stems from the fact that semantically +meaningful feature maps typically exist only in the spatially lower-dimensional +layers, which poses a challenge in directly extracting pixel-level semantic +relations from these feature maps. To overcome this issue, our framework +identifies semantic correspondences between image pixels and spatial locations +of low-dimensional feature maps by exploiting SD's generation process and +utilizes them for constructing image-resolution segmentation maps. In extensive +experiments, the produced segmentation maps are demonstrated to be well +delineated and capture detailed parts of the images, indicating the existence +of highly accurate pixel-level semantic knowledge in diffusion models. + +
+
+ comment: ICLR 2024. Project page: https://kmcode1.github.io/Projects/EmerDiff/ +
+
+
+
+
+ + ☆ Attention on Personalized Clinical Decision Support System: Federated + Learning Approach + + +
+ Health management has become a primary problem as new kinds of diseases and complex symptoms are introduced to a rapidly growing modern society. Building a better and smarter healthcare infrastructure is one of the ultimate goals of a smart city. To the best of our knowledge, neural network models are already employed to assist healthcare professionals in achieving this goal. Typically, training a neural network requires a rich amount of data, but the heterogeneous and vulnerable properties of clinical data introduce a challenge for the traditional centralized network. Moreover, adding new inputs to a medical database requires re-training an existing model from scratch. To tackle these challenges, we propose a deep learning-based clinical decision support system trained and managed under a federated learning paradigm. We focus on a novel strategy to guarantee the safety of patient privacy and overcome the risk of cyberattacks while enabling large-scale clinical data mining. As a result, we can leverage rich clinical data for training each local neural network without the need to exchange the confidential data of patients. Moreover, we implement the proposed scheme as a sequence-to-sequence model architecture integrating the attention mechanism. Thus, our objective is to provide a personalized clinical decision support system with evolvable characteristics that can deliver accurate solutions and assist healthcare professionals in medical diagnosis.
+
+ comment: Published in IEEE BigComp 2021 +
+
+
+
+
+ + ☆ Fast and Scalable Network Slicing by Integrating Deep Learning with + Lagrangian Methods + + +
+ Network slicing is a key technique in 5G and beyond for efficiently supporting diverse services. Many network slicing solutions rely on deep learning to manage complex and high-dimensional resource allocation problems. However, deep learning models suffer from limited generalization and adaptability to dynamic slicing configurations. In this paper, we propose a novel framework that integrates constrained optimization methods and deep learning models, resulting in strong generalization and superior approximation capability. Based on the proposed framework, we design a new neural-assisted algorithm to allocate radio resources to slices to maximize the network utility under inter-slice resource constraints. The algorithm exhibits high scalability, accommodating varying numbers of slices and slice configurations with ease. We implement the proposed solution in a system-level network simulator and evaluate its performance extensively by comparing it to state-of-the-art solutions, including deep reinforcement learning approaches. The numerical results show that our solution obtains near-optimal quality-of-service satisfaction and promising generalization performance under different network slicing scenarios.
+
+ comment: 6 pages, 5 figures, IEEE Global Communications Conference 2023 +
+
+
+
+
+ + ☆ Detecting Out-of-Distribution Samples via Conditional Distribution + Entropy with Optimal Transport + + +
+ When deploying a trained machine learning model in the real world, it will inevitably receive inputs from out-of-distribution (OOD) sources. For instance, in continual learning settings, it is common to encounter OOD samples due to the non-stationarity of a domain. More generally, when we have access to a set of test inputs, the existing rich line of OOD detection solutions, especially the recent promise of distance-based methods, falls short in effectively utilizing the distribution information from training samples and test inputs. In this paper, we argue that empirical probability distributions that incorporate geometric information from both training samples and test inputs can be highly beneficial for OOD detection when a set of test inputs is available. To address this, we propose to model OOD detection as a discrete optimal transport problem. Within the framework of optimal transport, we propose a novel score function known as the \emph{conditional distribution entropy} to quantify the uncertainty of a test input being an OOD sample. Our proposal inherits the merits of certain distance-based methods while eliminating the reliance on distribution assumptions, a priori knowledge, and specific training mechanisms. Extensive experiments conducted on benchmark datasets demonstrate that our method outperforms its competitors in OOD detection.
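The general idea of scoring a test point by the entropy of its conditional transport distribution can be illustrated with a small NumPy sketch using entropic (Sinkhorn) optimal transport between test and training features. The squared-Euclidean cost, the regularization `eps`, and the uniform marginals are illustrative choices and may differ from the paper's exact formulation.

```python
import numpy as np

def conditional_entropy_ood_scores(train_feats, test_feats, eps=0.1, iters=200):
    """Score test points by the entropy of their conditional transport distribution.

    Entropic OT couples test features with training features; a test point whose
    transport mass is spread thinly over the training set (high row entropy) is
    treated as more likely OOD. Cost, eps and uniform marginals are illustrative.
    """
    C = ((test_feats[:, None, :] - train_feats[None, :, :]) ** 2).sum(-1)
    C = C / C.mean()                            # scale costs for numerical stability
    K = np.exp(-C / eps)                        # Gibbs kernel
    a = np.full(len(test_feats), 1.0 / len(test_feats))
    b = np.full(len(train_feats), 1.0 / len(train_feats))
    u, v = np.ones_like(a), np.ones_like(b)
    for _ in range(iters):                      # Sinkhorn iterations
        u = a / (K @ v)
        v = b / (K.T @ u)
    P = u[:, None] * K * v[None, :]             # transport plan
    cond = P / P.sum(axis=1, keepdims=True)     # conditional distribution per test point
    return -(cond * np.log(cond + 1e-12)).sum(axis=1)
```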
+
+
+
+
+ + ☆ Graph Condensation: A Survey + + +
+ The burgeoning volume of graph data poses significant challenges in storage, transmission, and particularly the training of graph neural networks (GNNs). To address these challenges, graph condensation (GC) has emerged as an innovative solution. GC focuses on synthesizing a compact yet highly representative graph, on which GNNs can achieve performance comparable to that obtained by training on the large original graph. The notable efficacy of GC and its broad prospects have garnered significant attention and spurred extensive research. This survey paper provides an up-to-date and systematic overview of GC, organizing existing research into four categories aligned with critical GC evaluation criteria: effectiveness, generalization, fairness, and efficiency. To facilitate an in-depth and comprehensive understanding of GC, we examine various methods under each category and thoroughly discuss two essential components within GC: optimization strategies and condensed graph generation. Additionally, we introduce the applications of GC in a variety of fields, and highlight the present challenges and novel insights in GC, promoting advancements in future research.
+
+
+
+
+ + ☆ Admission Prediction in Undergraduate Applications: an Interpretable + Deep Learning Approach + + +
+ This article addresses the challenge of validating the admission committee's decisions for undergraduate admissions. In recent years, the traditional review process has struggled to handle the overwhelmingly large amount of applicant data. Moreover, this traditional assessment often leads to human bias, which might result in discrimination among applicants. Although classical machine learning-based approaches exist that aim to verify the quantitative assessment made by the application reviewers, these methods lack scalability and suffer from performance issues when dealing with large volumes of data. In this context, we propose deep learning-based classifiers, namely Feed-Forward and Input Convex neural networks, which overcome the challenges faced by the existing methods. Furthermore, we give additional insights into our model by incorporating an interpretability module, namely LIME. Our training and test datasets comprise applicants' data with a wide range of variables and information. Our models outperform the best-performing traditional machine learning-based approach by a considerable margin of 3.03\% in accuracy. Additionally, we show the sensitivity of different features and their relative impacts on the overall admission decision using the LIME technique.
+
+ comment: This paper has been accepted for Transdisciplinary AI 2023 conference +
+
+
+
+
+ + ☆ Parametric Matrix Models + + +
+ We present a general class of machine learning algorithms called parametric +matrix models. Parametric matrix models are based on matrix equations, and the +design is motivated by the efficiency of reduced basis methods for +approximating solutions of parametric equations. The dependent variables can be +defined implicitly or explicitly, and the equations may use algebraic, +differential, or integral relations. Parametric matrix models can be trained +with empirical data only, and no high-fidelity model calculations are needed. +While originally designed for scientific computing, parametric matrix models +are universal function approximators that can be applied to general machine +learning problems. After introducing the underlying theory, we apply parametric +matrix models to a series of different challenges that show their performance +for a wide range of problems. For all the challenges tested here, parametric +matrix models produce accurate results within a computational framework that +allows for parameter extrapolation and interpretability. + +
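As a toy illustration of learning through a matrix equation, the sketch below parameterizes a symmetric matrix that depends linearly on the input and reads the prediction off its lowest eigenvalue; the linear dependence, matrix size, and eigenvalue readout are illustrative assumptions, not the paper's construction.

```python
import torch

class ToyParametricMatrixModel(torch.nn.Module):
    """Toy parametric matrix model: M(x) = A + x * B, output = lowest eigenvalue.

    A and B are trainable symmetric matrices; the matrix (eigenvalue) equation
    defines the input-output map. Illustrative guess at the general idea only.
    """
    def __init__(self, dim=8):
        super().__init__()
        self.A = torch.nn.Parameter(torch.randn(dim, dim) * 0.1)
        self.B = torch.nn.Parameter(torch.randn(dim, dim) * 0.1)

    def forward(self, x):                      # x: (batch,) of scalar inputs
        A = 0.5 * (self.A + self.A.T)          # symmetrize for real eigenvalues
        B = 0.5 * (self.B + self.B.T)
        M = A + x[:, None, None] * B           # (batch, dim, dim)
        eigvals = torch.linalg.eigvalsh(M)     # differentiable eigenvalues
        return eigvals[:, 0]                   # lowest eigenvalue as the prediction

# Fit to empirical data with plain gradient descent (synthetic target).
model = ToyParametricMatrixModel()
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
x = torch.linspace(-1, 1, 64)
y = torch.sin(3 * x)
for _ in range(500):
    opt.zero_grad()
    loss = ((model(x) - y) ** 2).mean()
    loss.backward()
    opt.step()
```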
+
+
+
+
+ + ☆ TIM: An Efficient Temporal Interaction Module for Spiking Transformer + + +
+ Spiking Neural Networks (SNNs), as the third generation of neural networks, +have gained prominence for their biological plausibility and computational +efficiency, especially in processing diverse datasets. The integration of +attention mechanisms, inspired by advancements in neural network architectures, +has led to the development of Spiking Transformers. These have shown promise in +enhancing SNNs' capabilities, particularly in the realms of both static and +neuromorphic datasets. Despite their progress, a discernible gap exists in +these systems, specifically in the Spiking Self Attention (SSA) mechanism's +effectiveness in leveraging the temporal processing potential of SNNs. To +address this, we introduce the Temporal Interaction Module (TIM), a novel, +convolution-based enhancement designed to augment the temporal data processing +abilities within SNN architectures. TIM's integration into existing SNN +frameworks is seamless and efficient, requiring minimal additional parameters +while significantly boosting their temporal information handling capabilities. +Through rigorous experimentation, TIM has demonstrated its effectiveness in +exploiting temporal information, leading to state-of-the-art performance across +various neuromorphic datasets. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Simulating Nighttime Visible Satellite Imagery of Tropical Cyclones + Using Conditional Generative Adversarial Networks + + +
+ Visible (VIS) imagery of satellites has various important applications in +meteorology, including monitoring Tropical Cyclones (TCs). However, it is +unavailable at night because of the lack of sunlight. This study presents a +Conditional Generative Adversarial Networks (CGAN) model that generates highly +accurate nighttime visible reflectance using infrared (IR) bands and sunlight +direction parameters as input. The model was trained and validated using target +area observations of the Advanced Himawari Imager (AHI) in the daytime. This +study also presents the first nighttime model validation using the Day/Night +Band (DNB) of the Visible/Infrared Imager Radiometer Suite (VIIRS). The daytime +statistical results of the Structural Similarity Index Measure (SSIM), Peak +Signal-to-Noise Ratio (PSNR), Root Mean Square Error (RMSE), Correlation +Coefficient (CC), and Bias are 0.885, 28.3, 0.0428, 0.984, and -0.0016 +respectively, completely surpassing the model performance of previous studies. +The nighttime statistical results of SSIM, PSNR, RMSE, and CC are 0.821, 24.4, +0.0643, and 0.969 respectively, which are slightly negatively impacted by the +parallax between satellites. We performed full-disk model validation which +proves our model could also be readily applied in the tropical ocean without +TCs in the northern hemisphere. This model contributes to the nighttime +monitoring of meteorological phenomena by providing accurate AI-generated +visible imagery with adjustable virtual sunlight directions. + +
+
+
+
+
+ + ☆ RTA-Former: Reverse Transformer Attention for Polyp Segmentation + + +
+ Polyp segmentation is a key aspect of colorectal cancer prevention, enabling early detection and guiding subsequent treatments. Intelligent diagnostic tools, including deep learning solutions, are widely explored to streamline and potentially automate this process. However, even with many powerful network architectures, producing accurate edge segmentation remains a challenge. In this paper, we introduce a novel network, namely RTA-Former, that employs a transformer model as the encoder backbone and innovatively adapts Reverse Attention (RA) with a transformer stage in the decoder for enhanced edge segmentation. The results of the experiments illustrate that RTA-Former achieves state-of-the-art (SOTA) performance on five polyp segmentation datasets. The strong capability of RTA-Former holds promise in improving the accuracy of Transformer-based polyp segmentation, potentially leading to better clinical decisions and patient outcomes. Our code will be publicly available on GitHub.
+
+
+
+
+ + ☆ An Improved Grey Wolf Optimization Algorithm for Heart Disease + Prediction + + +
+ This paper presents a unique solution to challenges in medical image processing by incorporating an adaptive curve grey wolf optimization (ACGWO) algorithm into neural network backpropagation. Neural networks show potential in medical data but suffer from issues like overfitting and lack of interpretability due to imbalanced and scarce data. Traditional Grey Wolf Optimization (GWO) also has its drawbacks, such as a lack of population diversity and premature convergence. This paper addresses these problems by introducing an adaptive algorithm, enhancing the standard GWO with a sigmoid function. The proposed algorithm is extensively compared to four leading algorithms on six well-known test functions and outperforms them. Moreover, by utilizing the ACGWO, we increase the robustness and generalization of the neural network, resulting in more interpretable predictions. Applied to the publicly accessible Cleveland Heart Disease dataset, our technique surpasses ten other methods, achieving 86.8% accuracy, indicating its potential for efficient heart disease prediction in the clinical setting.
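For reference, the standard GWO loop is compact enough to sketch; below, the control parameter `a` is decayed along a sigmoid-shaped curve instead of linearly, as one plausible reading of an "adaptive curve" enhancement. The schedule, its steepness `k`, and the hyperparameters are assumptions, not the ACGWO authors' exact formula.

```python
import numpy as np

def acgwo(obj, dim, n_wolves=20, iters=200, lb=-5.0, ub=5.0, k=10.0):
    """Grey Wolf Optimization with a sigmoid-shaped decay of the parameter `a`.

    Standard GWO decays `a` linearly from 2 to 0; the sigmoid schedule here
    (controlled by `k`) is an assumed stand-in for the paper's adaptive curve.
    """
    rng = np.random.default_rng(0)
    X = rng.uniform(lb, ub, size=(n_wolves, dim))
    for t in range(iters):
        fitness = np.array([obj(x) for x in X])
        order = np.argsort(fitness)
        alpha, beta, delta = X[order[0]], X[order[1]], X[order[2]]
        a = 2.0 / (1.0 + np.exp(k * (t / iters - 0.5)))   # sigmoid decay (assumption)
        new_X = np.empty_like(X)
        for i in range(n_wolves):
            cand = []
            for leader in (alpha, beta, delta):           # follow the three best wolves
                r1, r2 = rng.random(dim), rng.random(dim)
                A, C = 2 * a * r1 - a, 2 * r2
                D = np.abs(C * leader - X[i])
                cand.append(leader - A * D)
            new_X[i] = np.clip(np.mean(cand, axis=0), lb, ub)
        X = new_X
    fitness = np.array([obj(x) for x in X])
    return X[np.argmin(fitness)]

best = acgwo(lambda x: np.sum(x ** 2), dim=5)   # sphere test function
```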
+
+
+
+
+ + ☆ INCPrompt: Task-Aware incremental Prompting for Rehearsal-Free + Class-incremental Learning ICASSP 2024 + + +
+ This paper introduces INCPrompt, an innovative continual learning solution +that effectively addresses catastrophic forgetting. INCPrompt's key innovation +lies in its use of adaptive key-learner and task-aware prompts that capture +task-relevant information. This unique combination encapsulates general +knowledge across tasks and encodes task-specific knowledge. Our comprehensive +evaluation across multiple continual learning benchmarks demonstrates +INCPrompt's superiority over existing algorithms, showing its effectiveness in +mitigating catastrophic forgetting while maintaining high performance. These +results highlight the significant impact of task-aware incremental prompting on +continual learning performance. + +
+
+ comment: Accepted by the 49th IEEE International Conference on Acoustics, + Speech, and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ P2DT: Mitigating Forgetting in task-incremental Learning with + progressive prompt Decision Transformer ICASSP 2024 + + +
+ Catastrophic forgetting poses a substantial challenge for managing +intelligent agents controlled by a large model, causing performance degradation +when these agents face new tasks. In our work, we propose a novel solution - +the Progressive Prompt Decision Transformer (P2DT). This method enhances a +transformer-based model by dynamically appending decision tokens during new +task training, thus fostering task-specific policies. Our approach mitigates +forgetting in continual and offline reinforcement learning scenarios. Moreover, +P2DT leverages trajectories collected via traditional reinforcement learning +from all tasks and generates new task-specific tokens during training, thereby +retaining knowledge from previous studies. Preliminary results demonstrate that +our model effectively alleviates catastrophic forgetting and scales well with +increasing task environments. + +
+
+ comment: Accepted by the 49th IEEE International Conference on Acoustics, + Speech, and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Accelerating Approximate Thompson Sampling with Underdamped Langevin + Monte Carlo AISTATS 2024 + + +
+ Approximate Thompson sampling with Langevin Monte Carlo broadens its reach +from Gaussian posterior sampling to encompass more general smooth posteriors. +However, it still encounters scalability issues in high-dimensional problems +when demanding high accuracy. To address this, we propose an approximate +Thompson sampling strategy, utilizing underdamped Langevin Monte Carlo, where +the latter is the go-to workhorse for simulations of high-dimensional +posteriors. Based on the standard smoothness and log-concavity conditions, we +study the accelerated posterior concentration and sampling using a specific +potential function. This design improves the sample complexity for realizing +logarithmic regrets from $\mathcal{\tilde O}(d)$ to $\mathcal{\tilde +O}(\sqrt{d})$. The scalability and robustness of our algorithm are also +empirically validated through synthetic experiments in high-dimensional bandit +problems. + +
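A rough sketch of the mechanism: draw an approximate posterior sample by simulating underdamped Langevin dynamics, then act greedily under that sample, as in Thompson sampling for a linear bandit. The simple Euler discretization, the quadratic potential, and all step sizes below are illustrative assumptions rather than the paper's integrator or potential design.

```python
import numpy as np

def underdamped_lmc_sample(grad_U, dim, steps=500, h=0.01, gamma=2.0, rng=None):
    """Draw one approximate sample from exp(-U) via underdamped Langevin dynamics.

    Simple Euler discretization of the position/velocity SDE; the paper's
    integrator and step-size schedule may differ.
    """
    rng = rng or np.random.default_rng()
    x, v = np.zeros(dim), np.zeros(dim)
    for _ in range(steps):
        noise = rng.standard_normal(dim)
        v = v - h * (gamma * v + grad_U(x)) + np.sqrt(2 * gamma * h) * noise
        x = x + h * v
    return x

def thompson_step(A, b, arms, sigma2=1.0, rng=None):
    """Approximate Thompson sampling for a Gaussian linear bandit (illustrative).

    A = X^T X + reg * I and b = X^T y accumulate past observations; the posterior
    over weights is proportional to exp(-U) with U the (scaled) ridge loss.
    """
    grad_U = lambda th: (A @ th - b) / sigma2
    theta = underdamped_lmc_sample(grad_U, dim=len(b), rng=rng)
    return int(np.argmax(arms @ theta))        # play the greedy arm under the sample
```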
+
+ comment: 50 pages, 1 figure, to appear in AISTATS 2024 +
+
+
+
+
+ + ☆ Zero-Space Cost Fault Tolerance for Transformer-based Language Models on + ReRAM + + +
+ Resistive Random Access Memory (ReRAM) has emerged as a promising platform +for deep neural networks (DNNs) due to its support for parallel in-situ +matrix-vector multiplication. However, hardware failures, such as +stuck-at-fault defects, can result in significant prediction errors during +model inference. While additional crossbars can be used to address these +failures, they come with storage overhead and are not efficient in terms of +space, energy, and cost. In this paper, we propose a fault protection mechanism +that incurs zero space cost. Our approach includes: 1) differentiable structure +pruning of rows and columns to reduce model redundancy, 2) weight duplication +and voting for robust output, and 3) embedding duplicated most significant bits +(MSBs) into the model weight. We evaluate our method on nine tasks of the GLUE +benchmark with the BERT model, and experimental results prove its +effectiveness. + +
+
+
+
+
+ + ☆ Differentiable Tree Search in Latent State Space + + +
+ In decision-making problems with limited training data, policy functions +approximated using deep neural networks often exhibit suboptimal performance. +An alternative approach involves learning a world model from the limited data +and determining actions through online search. However, the performance is +adversely affected by compounding errors arising from inaccuracies in the +learnt world model. While methods like TreeQN have attempted to address these +inaccuracies by incorporating algorithmic structural biases into their +architectures, the biases they introduce are often weak and insufficient for +complex decision-making tasks. In this work, we introduce Differentiable Tree +Search (DTS), a novel neural network architecture that significantly +strengthens the inductive bias by embedding the algorithmic structure of a +best-first online search algorithm. DTS employs a learnt world model to conduct +a fully differentiable online search in latent state space. The world model is +jointly optimised with the search algorithm, enabling the learning of a robust +world model and mitigating the effect of model inaccuracies. We address +potential Q-function discontinuities arising from naive incorporation of +best-first search by adopting a stochastic tree expansion policy, formulating +search tree expansion as a decision-making task, and introducing an effective +variance reduction technique for the gradient computation. We evaluate DTS in +an offline-RL setting with a limited training data scenario on Procgen games +and grid navigation task, and demonstrate that DTS outperforms popular +model-free and model-based baselines. + +
+
+
+
+
+ + ☆ OnDev-LCT: On-Device Lightweight Convolutional Transformers towards + federated learning + + +
+ Federated learning (FL) has emerged as a promising approach to +collaboratively train machine learning models across multiple edge devices +while preserving privacy. The success of FL hinges on the efficiency of +participating models and their ability to handle the unique challenges of +distributed learning. While several variants of Vision Transformer (ViT) have +shown great potential as alternatives to modern convolutional neural networks +(CNNs) for centralized training, the unprecedented size and higher +computational demands hinder their deployment on resource-constrained edge +devices, challenging their widespread application in FL. Since client devices +in FL typically have limited computing resources and communication bandwidth, +models intended for such devices must strike a balance between model size, +computational efficiency, and the ability to adapt to the diverse and non-IID +data distributions encountered in FL. To address these challenges, we propose +OnDev-LCT: Lightweight Convolutional Transformers for On-Device vision tasks +with limited training data and resources. Our models incorporate image-specific +inductive biases through the LCT tokenizer by leveraging efficient depthwise +separable convolutions in residual linear bottleneck blocks to extract local +features, while the multi-head self-attention (MHSA) mechanism in the LCT +encoder implicitly facilitates capturing global representations of images. +Extensive experiments on benchmark image datasets indicate that our models +outperform existing lightweight vision models while having fewer parameters and +lower computational demands, making them suitable for FL scenarios with data +heterogeneity and communication bottlenecks. + +
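The building block named above, a residual linear bottleneck with depthwise separable convolutions, can be sketched in a few lines of PyTorch. The expansion factor, activations, and exact layout are generic inverted-bottleneck choices and may differ from the OnDev-LCT release.

```python
import torch.nn as nn

class LinearBottleneckDS(nn.Module):
    """Residual linear bottleneck block with a depthwise separable convolution.

    Expand -> depthwise conv -> linear projection, with a skip connection when
    the spatial size is preserved. Sizes and activations are illustrative.
    """
    def __init__(self, channels, expansion=4, stride=1):
        super().__init__()
        hidden = channels * expansion
        self.block = nn.Sequential(
            nn.Conv2d(channels, hidden, kernel_size=1, bias=False),   # expand
            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
            nn.Conv2d(hidden, hidden, kernel_size=3, stride=stride,
                      padding=1, groups=hidden, bias=False),          # depthwise
            nn.BatchNorm2d(hidden), nn.ReLU6(inplace=True),
            nn.Conv2d(hidden, channels, kernel_size=1, bias=False),   # project (linear)
            nn.BatchNorm2d(channels),
        )
        self.use_residual = stride == 1

    def forward(self, x):
        out = self.block(x)
        return x + out if self.use_residual else out
```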
+
+ comment: Published in Neural Networks +
+
+
+
+
+ + ☆ Next Visit Diagnosis Prediction via Medical Code-Centric Multimodal + Contrastive EHR Modelling with Hierarchical Regularisation EACL 2024 + + +
+ Predicting next visit diagnosis using Electronic Health Records (EHR) is an +essential task in healthcare, critical for devising proactive future plans for +both healthcare providers and patients. Nonetheless, many preceding studies +have not sufficiently addressed the heterogeneous and hierarchical +characteristics inherent in EHR data, inevitably leading to sub-optimal +performance. To this end, we propose NECHO, a novel medical code-centric +multimodal contrastive EHR learning framework with hierarchical regularisation. +First, we integrate multifaceted information encompassing medical codes, +demographics, and clinical notes using a tailored network design and a pair of +bimodal contrastive losses, all of which pivot around a medical code +representation. We also regularise modality-specific encoders using a parental +level information in medical ontology to learn hierarchical structure of EHR +data. A series of experiments on MIMIC-III data demonstrates effectiveness of +our approach. + +
+
+ comment: Accepted to EACL 2024 (The 18th Conference of the European Chapter of + the Association for Computational Linguistics) +
+
+
+
+
+ + ☆ LW-FedSSL: Resource-efficient Layer-wise Federated Self-supervised + Learning + + +
+ Many recent studies integrate federated learning (FL) with self-supervised +learning (SSL) to take advantage of raw training data distributed across edge +devices. However, edge devices often struggle with high computation and +communication costs imposed by SSL and FL algorithms. To tackle this hindrance, +we propose LW-FedSSL, a layer-wise federated self-supervised learning approach +that allows edge devices to incrementally train one layer of the model at a +time. LW-FedSSL comprises server-side calibration and representation alignment +mechanisms to maintain comparable performance with end-to-end FedSSL while +significantly lowering clients' resource requirements. The server-side +calibration mechanism takes advantage of the resource-rich server in an FL +environment to assist in global model training. Meanwhile, the representation +alignment mechanism encourages closeness between representations of FL local +models and those of the global model. Our experiments show that LW-FedSSL has a +$3.3 \times$ lower memory requirement and a $3.2 \times$ cheaper communication +cost than its end-to-end counterpart. We also explore a progressive training +strategy called Prog-FedSSL that outperforms end-to-end training with a similar +memory requirement and a $1.8 \times$ cheaper communication cost. + +
+
+
+
+
+ + ☆ Nonparametric Estimation via Variance-Reduced Sketching + + +
+ Nonparametric models are of great interest in various scientific and +engineering disciplines. Classical kernel methods, while numerically robust and +statistically sound in low-dimensional settings, become inadequate in +higher-dimensional settings due to the curse of dimensionality. In this paper, +we introduce a new framework called Variance-Reduced Sketching (VRS), +specifically designed to estimate density functions and nonparametric +regression functions in higher dimensions with a reduced curse of +dimensionality. Our framework conceptualizes multivariable functions as +infinite-size matrices, and facilitates a new sketching technique motivated by +numerical linear algebra literature to reduce the variance in estimation +problems. We demonstrate the robust numerical performance of VRS through a +series of simulated experiments and real-world data applications. Notably, VRS +shows remarkable improvement over existing neural network estimators and +classical kernel methods in numerous density estimation and nonparametric +regression models. Additionally, we offer theoretical justifications for VRS to +support its ability to deliver nonparametric estimation with a reduced curse of +dimensionality. + +
+
+ comment: 64 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Large Language Models Should Ask Clarifying Questions to Increase + Confidence in Generated Code + + +
+ Large language models (LLMs) have significantly improved the ability to +perform tasks in the field of code generation. However, there is still a gap +between LLMs being capable coders and being top-tier software engineers. Based +on the observation that toplevel software engineers often ask clarifying +questions to reduce ambiguity in both requirements and coding solutions, I +argue that the same should be applied to LLMs for code generation tasks. By +asking probing questions in various topics before generating the final code, +the challenges of programming with LLMs, such as unclear intent specification, +lack of computational thinking, and undesired code quality, may be alleviated. +This, in turn, increases confidence in the generated code. In this work, I +explore how to leverage better communication skills to achieve greater +confidence in generated code. I propose a communication-centered process that +uses an LLM-generated communicator to identify issues with high ambiguity or +low confidence in problem descriptions and generated code. I then ask +clarifying questions to obtain responses from users for refining the code. + +
+
+ comment: 6 pages, 2 figures, 1 table. Accepted and presented at the 7th Annual + Symposium on Machine Programming (MAPS 2023 Workshop, see + https://mapsworkshop.github.io/). Reference: "Wu, Jie JW. Large Language + Models Should Ask Clarifying Questions to Increase Confidence in Generated + Code. The 7th Annual Symposium on Machine Programming (MAPS 23), December 3, + 2023, San Francisco, CA, USA" +
+
+
+
+
+ + ♻ ☆ DiarizationLM: Speaker Diarization Post-Processing with Large Language + Models + + +
+ In this paper, we introduce DiarizationLM, a framework to leverage large +language models (LLM) to post-process the outputs from a speaker diarization +system. Various goals can be achieved with the proposed framework, such as +improving the readability of the diarized transcript, or reducing the word +diarization error rate (WDER). In this framework, the outputs of the automatic +speech recognition (ASR) and speaker diarization systems are represented as a +compact textual format, which is included in the prompt to an optionally +finetuned LLM. The outputs of the LLM can be used as the refined diarization +results with the desired enhancement. As a post-processing step, this framework +can be easily applied to any off-the-shelf ASR and speaker diarization systems +without retraining existing components. Our experiments show that a finetuned +PaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone +conversation dataset, and rel. 44.9% on the Callhome English dataset. + +
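A small sketch of the input side of such a pipeline: word-level ASR output and per-word speaker labels are merged into a compact speaker-tagged transcript that is placed in the LLM prompt. The `<spk:...>` format and the prompt wording below are illustrative placeholders, not the format or template used by DiarizationLM.

```python
def to_compact_transcript(words, speakers):
    """Merge word-level ASR output and per-word speaker labels into a compact
    speaker-tagged transcript (illustrative format, not the paper's exact one)."""
    segments, current = [], None
    for word, spk in zip(words, speakers):
        if spk != current:
            segments.append((spk, [word]))     # start a new speaker segment
            current = spk
        else:
            segments[-1][1].append(word)       # extend the current segment
    return " ".join(f"<spk:{spk}> {' '.join(ws)}" for spk, ws in segments)

words = ["hello", "how", "are", "you", "good", "thanks"]
speakers = [1, 1, 1, 1, 2, 2]
prompt = ("Correct the speaker tags in the transcript below.\n"
          + to_compact_transcript(words, speakers))
# The (optionally finetuned) LLM's completion would then be parsed back into
# refined word-level speaker labels.
```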
+
+
+
+
+ + ♻ ☆ Personality Trait Inference Via Mobile Phone Sensors: A Machine Learning + Approach + + +
+ This study provides evidence that personality can be reliably predicted from +activity data collected through mobile phone sensors. Employing a set of well +informed indicators calculable from accelerometer records and movement +patterns, we were able to predict users' personality up to a 0.78 F1 score on a +two class problem. Given the fast growing number of data collected from mobile +phones, our novel personality indicators open the door to exciting avenues for +future research in social sciences. Our results reveal distinct behavioral +patterns that proved to be differentially predictive of big five personality +traits. They potentially enable cost effective, questionnaire free +investigation of personality related questions at an unprecedented scale. We +show how a combination of rich behavioral data obtained with smartphone sensing +and the use of machine learning techniques can help to advance personality +research and can inform both practitioners and researchers about the different +behavioral patterns of personality. These findings have practical implications +for organizations harnessing mobile sensor data for personality assessment, +guiding the refinement of more precise and efficient prediction models in the +future. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Towards Size-Independent Generalization Bounds for Deep Operator Nets + + +
+ In recent times machine learning methods have made significant advances in +becoming a useful tool for analyzing physical systems. A particularly active +area in this theme has been "physics-informed machine learning" which focuses +on using neural nets for numerically solving differential equations. In this +work, we aim to advance the theory of measuring out-of-sample error while +training DeepONets -- which is among the most versatile ways to solve PDE +systems in one-shot. + Firstly, for a class of DeepONets, we prove a bound on their Rademacher +complexity which does not explicitly scale with the width of the nets involved. +Secondly, we use this to show how the Huber loss can be chosen so that for +these DeepONet classes generalization error bounds can be obtained that have no +explicit dependence on the size of the nets. We note that our theoretical +results apply to any PDE being targeted to be solved by DeepONets. + +
+
+ comment: 27 pages, 5 figures; Added theorem on generalization error indicating + benefits of training DeepONets on the Huber loss and corresponding + experiments +
+
+
+
+
+ + ♻ ☆ Benchmarking the Robustness of Image Watermarks + + +
+ This paper investigates the weaknesses of image watermarking techniques. We +present WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel +benchmark for assessing watermark robustness, overcoming the limitations of +current evaluation methods.WAVES integrates detection and identification tasks, +and establishes a standardized evaluation protocol comprised of a diverse range +of stress tests. The attacks in WAVES range from traditional image distortions +to advanced and novel variations of diffusive, and adversarial attacks. Our +evaluation examines two pivotal dimensions: the degree of image quality +degradation and the efficacy of watermark detection after attacks. We develop a +series of Performance vs. Quality 2D plots, varying over several prominent +image similarity metrics, which are then aggregated in a heuristically novel +manner to paint an overall picture of watermark robustness and attack potency. +Our comprehensive evaluation reveals previously undetected vulnerabilities of +several modern watermarking algorithms. We envision WAVES as a toolkit for the +future development of robust watermarking systems. The project is available at +https://wavesbench.github.io/ + +
+
+
+
+
+ + ♻ ☆ DFU: scale-robust diffusion model for zero-shot super-resolution image + generation + + +
+ Diffusion generative models have achieved remarkable success in generating +images with a fixed resolution. However, existing models have limited ability +to generalize to different resolutions when training data at those resolutions +are not available. Leveraging techniques from operator learning, we present a +novel deep-learning architecture, Dual-FNO UNet (DFU), which approximates the +score operator by combining both spatial and spectral information at multiple +resolutions. Comparisons of DFU to baselines demonstrate its scalability: 1) +simultaneously training on multiple resolutions improves FID over training at +any single fixed resolution; 2) DFU generalizes beyond its training +resolutions, allowing for coherent, high-fidelity generation at +higher-resolutions with the same model, i.e. zero-shot super-resolution +image-generation; 3) we propose a fine-tuning strategy to further enhance the +zero-shot super-resolution image-generation capability of our model, leading to +a FID of 11.3 at 1.66 times the maximum training resolution on FFHQ, which no +other method can come close to achieving. + +
+
+
+
+
+ + ♻ ☆ DTC: Deep Tracking Control + + +
+ Legged locomotion is a complex control problem that requires both accuracy +and robustness to cope with real-world challenges. Legged systems have +traditionally been controlled using trajectory optimization with inverse +dynamics. Such hierarchical model-based methods are appealing due to intuitive +cost function tuning, accurate planning, generalization, and most importantly, +the insightful understanding gained from more than one decade of extensive +research. However, model mismatch and violation of assumptions are common +sources of faulty operation. Simulation-based reinforcement learning, on the +other hand, results in locomotion policies with unprecedented robustness and +recovery skills. Yet, all learning algorithms struggle with sparse rewards +emerging from environments where valid footholds are rare, such as gaps or +stepping stones. In this work, we propose a hybrid control architecture that +combines the advantages of both worlds to simultaneously achieve greater +robustness, foot-placement accuracy, and terrain generalization. Our approach +utilizes a model-based planner to roll out a reference motion during training. +A deep neural network policy is trained in simulation, aiming to track the +optimized footholds. We evaluate the accuracy of our locomotion pipeline on +sparse terrains, where pure data-driven methods are prone to fail. Furthermore, +we demonstrate superior robustness in the presence of slippery or deformable +ground when compared to model-based counterparts. Finally, we show that our +proposed tracking controller generalizes across different trajectory +optimization methods not seen during training. In conclusion, our work unites +the predictive capabilities and optimality guarantees of online planning with +the inherent robustness attributed to offline learning. + +
+
+
+
+
+ + ♻ ☆ Better Batch for Deep Probabilistic Time Series Forecasting AISTATS 2024 + + +
+ Deep probabilistic time series forecasting has gained significant attention +due to its superior performance in nonlinear approximation and its ability to +provide valuable uncertainty quantification for decision-making tasks. However, +many existing models oversimplify the problem by assuming that the error +process is time-independent, thereby overlooking the serial correlation in the +error process. To overcome this limitation, we propose an innovative training +method that incorporates error autocorrelation to further enhance the accuracy +of probabilistic forecasting. Our method involves constructing a mini-batch as +a collection of $D$ consecutive time series segments for model training and +explicitly learning a time-varying covariance matrix over each mini-batch that +encodes the error correlation among adjacent time steps. The learned covariance +matrix can be used to improve prediction accuracy and enhance uncertainty +quantification. We evaluate our method on two different neural forecasting +models and multiple public datasets, and the experimental results confirm the +effectiveness of the proposed approach in enhancing the performance of both +models across a wide range of datasets, yielding notable improvements in +predictive accuracy. + +
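The core training idea, forming mini-batches of adjacent horizons and learning a covariance over their forecast errors, can be illustrated with a Gaussian negative log-likelihood in PyTorch. The low-rank-plus-diagonal parameterization and static covariance below are simplifying assumptions; the paper learns a time-varying covariance per mini-batch.

```python
import torch

def autocorrelated_nll(errors, cov_factor, cov_diag):
    """NLL of forecast errors over D consecutive steps under a learned covariance.

    errors:     (batch, D) residuals from D adjacent time steps in a mini-batch
    cov_factor: (D, r) low-rank factor; cov_diag: (D,) raw diagonal parameters.
    Sigma = cov_factor @ cov_factor.T + diag(softplus(cov_diag)) is an
    illustrative parameterization of the error covariance.
    """
    D = errors.shape[1]
    sigma = cov_factor @ cov_factor.T + torch.diag(torch.nn.functional.softplus(cov_diag))
    dist = torch.distributions.MultivariateNormal(torch.zeros(D), covariance_matrix=sigma)
    return -dist.log_prob(errors).mean()

# Usage: build each mini-batch from D consecutive segments of one series,
# compute residuals with the base forecaster, and add this NLL to the loss.
D, r = 8, 2
cov_factor = torch.randn(D, r, requires_grad=True)
cov_diag = torch.zeros(D, requires_grad=True)
errors = torch.randn(32, D)                     # stand-in residuals
loss = autocorrelated_nll(errors, cov_factor, cov_diag)
loss.backward()
```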
+
+ comment: 9 pages, 3 figures, camera-ready version, The 27th International + Conference on Artificial Intelligence and Statistics (AISTATS 2024) +
+
+
+
+
+ + ♻ ☆ The Effect of Intrinsic Dataset Properties on Generalization: Unraveling + Learning Differences Between Natural and Medical Images ICLR 2024 + + +
+ This paper investigates discrepancies in how neural networks learn from +different imaging domains, which are commonly overlooked when adopting computer +vision techniques from the domain of natural images to other specialized +domains such as medical images. Recent works have found that the generalization +error of a trained network typically increases with the intrinsic dimension +($d_{data}$) of its training set. Yet, the steepness of this relationship +varies significantly between medical (radiological) and natural imaging +domains, with no existing theoretical explanation. We address this gap in +knowledge by establishing and empirically validating a generalization scaling +law with respect to $d_{data}$, and propose that the substantial scaling +discrepancy between the two considered domains may be at least partially +attributed to the higher intrinsic "label sharpness" ($K_F$) of medical imaging +datasets, a metric which we propose. Next, we demonstrate an additional benefit +of measuring the label sharpness of a training set: it is negatively correlated +with the trained model's adversarial robustness, which notably leads to models +for medical images having a substantially higher vulnerability to adversarial +attack. Finally, we extend our $d_{data}$ formalism to the related metric of +learned representation intrinsic dimension ($d_{repr}$), derive a +generalization scaling law with respect to $d_{repr}$, and show that $d_{data}$ +serves as an upper bound for $d_{repr}$. Our theoretical results are supported +by thorough experiments with six models and eleven natural and medical imaging +datasets over a range of training set sizes. Our findings offer insights into +the influence of intrinsic dataset properties on generalization, representation +learning, and robustness in deep neural networks. + +
+
+ comment: ICLR 2024. Code: + https://github.com/mazurowski-lab/intrinsic-properties +
+
+
+
+
+ + ♻ ☆ Augment on Manifold: Mixup Regularization with UMAP ICASSP 2024 + + +
+ Data augmentation techniques play an important role in enhancing the performance of deep learning models. Despite their proven benefits in computer vision tasks, their application in other domains remains limited. This paper proposes a Mixup regularization scheme, referred to as UMAP Mixup, designed for ``on-manifold" automated data augmentation for deep learning predictive models. The proposed approach ensures that the Mixup operations result in synthesized samples that lie on the data manifold of the features and labels by utilizing a dimensionality reduction technique known as uniform manifold approximation and projection. Evaluations across diverse regression tasks show that UMAP Mixup is competitive with or outperforms other Mixup variants, showing promise as an effective tool for enhancing the generalization performance of deep learning models.
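One plausible way to realize such "on-manifold" mixup is to interpolate pairs in UMAP space and map the interpolations back to feature space; the sketch below, using the umap-learn package, is an illustration under that assumption and not necessarily the authors' exact construction.

```python
import numpy as np
import umap  # umap-learn

def umap_mixup(X, y, n_components=2, alpha=0.2, rng=None):
    """Generate mixup samples that stay close to the data manifold via UMAP.

    Pairs are interpolated in the UMAP embedding and mapped back with
    inverse_transform; y is assumed to be a 1-D regression target.
    """
    rng = rng or np.random.default_rng()
    reducer = umap.UMAP(n_components=n_components).fit(X)
    Z = reducer.transform(X)
    perm = rng.permutation(len(X))
    lam = rng.beta(alpha, alpha, size=len(X))
    Z_mix = lam[:, None] * Z + (1 - lam[:, None]) * Z[perm]   # mix in embedding space
    X_mix = reducer.inverse_transform(Z_mix)                   # back to feature space
    y_mix = lam * y + (1 - lam) * y[perm]                      # mix targets with same lambda
    return X_mix, y_mix
```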
+
+ comment: accepted paper to be published in the proceedings of ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Annotation Sensitivity: Training Data Collection Methods Affect Model + Performance EMNLP 2023 + + +
+ When training data are collected from human annotators, the design of the +annotation instrument, the instructions given to annotators, the +characteristics of the annotators, and their interactions can impact training +data. This study demonstrates that design choices made when creating an +annotation instrument also impact the models trained on the resulting +annotations. We introduce the term annotation sensitivity to refer to the +impact of annotation data collection methods on the annotations themselves and +on downstream model performance and predictions. We collect annotations of hate +speech and offensive language in five experimental conditions of an annotation +instrument, randomly assigning annotators to conditions. We then fine-tune BERT +models on each of the five resulting datasets and evaluate model performance on +a holdout portion of each condition. We find considerable differences between +the conditions for 1) the share of hate speech/offensive language annotations, +2) model performance, 3) model predictions, and 4) model learning curves. Our +results emphasize the crucial role played by the annotation instrument which +has received little attention in the machine learning literature. We call for +additional research into how and why the instrument impacts the annotations to +inform the development of best practices in instrument design. + +
+
+ comment: EMNLP 2023 Findings: + https://aclanthology.org/2023.findings-emnlp.992/ +
+
+
+
+
+ + ♻ ☆ Neural Stochastic Differential Equations with Change Points: A + Generative Adversarial Approach ICASSP 2024 + + +
+ Stochastic differential equations (SDEs) have been widely used to model real +world random phenomena. Existing works mainly focus on the case where the time +series is modeled by a single SDE, which might be restrictive for modeling time +series with distributional shift. In this work, we propose a change point +detection algorithm for time series modeled as neural SDEs. Given a time series +dataset, the proposed method jointly learns the unknown change points and the +parameters of distinct neural SDE models corresponding to each change point. +Specifically, the SDEs are learned under the framework of generative +adversarial networks (GANs) and the change points are detected based on the +output of the GAN discriminator in a forward pass. At each step of the proposed +algorithm, the change points and the SDE model parameters are updated in an +alternating fashion. Numerical results on both synthetic and real datasets are +provided to validate the performance of our algorithm in comparison to +classical change point detection benchmarks, standard GAN-based neural SDEs, +and other state-of-the-art deep generative models for time series data. + +
+
+ comment: accepted paper to be published in the proceedings of ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Decision Tree Search as a Markov Decision Problem + + +
+ Finding an optimal decision tree for a supervised learning task is a challenging combinatorial problem to solve at scale. It was recently proposed to frame the problem as a Markov Decision Problem (MDP) and use deep reinforcement learning to tackle scaling. Unfortunately, these methods are not competitive with the current branch-and-bound state-of-the-art. We propose instead to scale the resolution of such MDPs using an information-theoretic test-generating function that heuristically, and dynamically for every state, limits the set of admissible test actions to a few good candidates. As a solver, we show empirically that our algorithm is at the very least competitive with branch-and-bound alternatives. As a machine learning tool, a key advantage of our approach is to solve for multiple complexity-performance trade-offs at virtually no additional cost. With such a set of solutions, a user can then select the tree that generalizes best and which has the interpretability level that best suits their needs, which no current branch-and-bound method allows.
+
+
+
+
+ + ♻ ☆ Machine Learning-Based Analysis of Ebola Virus' Impact on Gene + Expression in Nonhuman Primates + + +
+ This study introduces the Supervised Magnitude-Altitude Scoring (SMAS) +methodology, a machine learning-based approach, for analyzing gene expression +data obtained from nonhuman primates (NHPs) infected with Ebola virus (EBOV). +We utilize a comprehensive dataset of NanoString gene expression profiles from +Ebola-infected NHPs, deploying the SMAS system for nuanced host-pathogen +interaction analysis. SMAS effectively combines gene selection based on +statistical significance and expression changes, employing linear classifiers +such as logistic regression to accurately differentiate between RT-qPCR +positive and negative NHP samples. A key finding of our research is the +identification of IFI6 and IFI27 as critical biomarkers, demonstrating +exceptional predictive performance with 100% accuracy and Area Under the Curve +(AUC) metrics in classifying various stages of Ebola infection. Alongside IFI6 +and IFI27, genes, including MX1, OAS1, and ISG15, were significantly +upregulated, highlighting their essential roles in the immune response to EBOV. +Our results underscore the efficacy of the SMAS method in revealing complex +genetic interactions and response mechanisms during EBOV infection. This +research provides valuable insights into EBOV pathogenesis and aids in +developing more precise diagnostic tools and therapeutic strategies to address +EBOV infection in particular and viral infection in general. + +
+
+ comment: 28 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian + Optimization Approach + + +
+ Solving large-scale capacity expansion problems (CEPs) is central to +cost-effective decarbonization of regional-scale energy systems. To ensure the +intended outcomes of CEPs, modeling uncertainty due to weather-dependent +variable renewable energy (VRE) supply and energy demand becomes crucially +important. However, the resulting stochastic optimization models are often less +computationally tractable than their deterministic counterparts. Here, we +propose a learning-assisted approximate solution method to tractably solve +two-stage stochastic CEPs. Our method identifies low-cost planning decisions by +constructing and solving a sequence of tractable temporally aggregated +surrogate problems. We adopt a Bayesian optimization approach to searching the +space of time series aggregation hyperparameters and compute approximate +solutions that minimize costs on a validation set of supply-demand projections. +Importantly, we evaluate solved planning outcomes on a held-out set of test +projections. We apply our approach to generation and transmission expansion +planning for a joint power-gas system spanning New England. We show that our +approach yields an estimated cost savings of up to 3.8% in comparison to +benchmark time series aggregation approaches. + +
+
+
+
+
+ + ♻ ☆ New Versions of Gradient Temporal Difference Learning + + +
+ Sutton, Szepesv\'{a}ri and Maei introduced the first gradient +temporal-difference (GTD) learning algorithms compatible with both linear +function approximation and off-policy training. The goal of this paper is (a) +to propose some variants of GTDs with extensive comparative analysis and (b) to +establish new theoretical analysis frameworks for the GTDs. These variants are +based on convex-concave saddle-point interpretations of GTDs, which effectively +unify all the GTDs into a single framework, and provide simple stability +analysis based on recent results on primal-dual gradient dynamics. Finally, +numerical comparative analysis is given to evaluate these approaches. + +
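For context, the classic GTD2 two-timescale update that such variants build on is only a few lines with linear features; the sketch below shows that baseline update, not one of the paper's new variants.

```python
import numpy as np

def gtd2_update(theta, w, phi, phi_next, reward, rho, gamma=0.99,
                alpha=0.01, beta=0.1):
    """One GTD2 step with linear function approximation (classic baseline).

    theta: value-function weights; w: auxiliary weights of the two-timescale /
    saddle-point formulation; rho: importance-sampling ratio for off-policy data.
    """
    delta = reward + gamma * phi_next @ theta - phi @ theta             # TD error
    w = w + beta * rho * (delta - phi @ w) * phi                        # fast timescale
    theta = theta + alpha * rho * (phi - gamma * phi_next) * (phi @ w)  # slow timescale
    return theta, w

# Usage: iterate over off-policy transitions (phi, phi_next, reward, rho),
# updating (theta, w) in place with gtd2_update.
d = 4
theta, w = np.zeros(d), np.zeros(d)
phi, phi_next = np.eye(d)[0], np.eye(d)[1]
theta, w = gtd2_update(theta, w, phi, phi_next, reward=1.0, rho=1.0)
```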
+
+
+
+
+ + ♻ ☆ Approximating Langevin Monte Carlo with ResNet-like Neural Network + architectures + + +
+ We sample from a given target distribution by constructing a neural network +which maps samples from a simple reference, e.g. the standard normal +distribution, to samples from the target. To that end, we propose using a +neural network architecture inspired by the Langevin Monte Carlo (LMC) +algorithm. Based on LMC perturbation results, we show approximation rates of +the proposed architecture for smooth, log-concave target distributions measured +in the Wasserstein-$2$ distance. The analysis heavily relies on the notion of +sub-Gaussianity of the intermediate measures of the perturbed LMC process. In +particular, we derive bounds on the growth of the intermediate variance proxies +under different assumptions on the perturbations. Moreover, we propose an +architecture similar to deep residual neural networks and derive expressivity +results for approximating the sample to target distribution map. + +
+
+
+
+
+ + ♻ ☆ Comparison analysis between standard polysomnographic data and + in-ear-EEG signals: A preliminary study + + +
+ Study Objectives: Polysomnography (PSG) currently serves as the benchmark for +evaluating sleep disorders. Its discomfort, impracticality for home-use, and +introduction of bias in sleep quality assessment necessitate the exploration of +less invasive, cost-effective, and portable alternatives. One promising +contender is the in-ear-EEG sensor, which offers advantages in terms of +comfort, fixed electrode positions, resistance to electromagnetic interference, +and user-friendliness. This study aims to establish a methodology to assess the +similarity between the in-ear-EEG signal and standard PSG. + Methods: We assess the agreement between the PSG and in-ear-EEG derived +hypnograms. We extract features in the time- and frequency- domain from PSG and +in-ear-EEG 30-second epochs. We only consider the epochs where the PSG-scorers +and the in-ear-EEG-scorers were in agreement. We introduce a methodology to +quantify the similarity between PSG derivations and the single-channel +in-ear-EEG. The approach relies on a comparison of distributions of selected +features -- extracted for each sleep stage and subject on both PSG and the +in-ear-EEG signals -- via a Jensen-Shannon Divergence Feature-based Similarity +Index (JSD-FSI). + Results: We found a high intra-scorer variability, mainly due to the +uncertainty the scorers had in evaluating the in-ear-EEG signals. We show that +the similarity between PSG and in-ear-EEG signals is high (JSD-FSI: 0.61 +/- +0.06 in awake, 0.60 +/- 0.07 in NREM and 0.51 +/- 0.08 in REM), and in line +with the similarity values computed independently on standard +PSG-channel-combinations. + Conclusions: In-ear-EEG is a valuable solution for home-based sleep +monitoring, however further studies with a larger and more heterogeneous +dataset are needed. + +
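The similarity index can be illustrated as follows: histogram one extracted feature over epochs of a given sleep stage for both signals, compute the Jensen-Shannon divergence between the two distributions, and report one minus the divergence. The binning, base-2 normalization, and aggregation below are assumptions rather than the study's exact computation.

```python
import numpy as np
from scipy.spatial.distance import jensenshannon

def jsd_fsi(feature_psg, feature_inear, bins=50):
    """Jensen-Shannon-Divergence Feature-based Similarity Index (sketch).

    Compares the distributions of one feature (e.g. band power) extracted from
    PSG and in-ear-EEG epochs of the same sleep stage and subject.
    """
    lo = min(feature_psg.min(), feature_inear.min())
    hi = max(feature_psg.max(), feature_inear.max())
    p, _ = np.histogram(feature_psg, bins=bins, range=(lo, hi))
    q, _ = np.histogram(feature_inear, bins=bins, range=(lo, hi))
    p, q = p / p.sum(), q / q.sum()
    jsd = jensenshannon(p, q, base=2) ** 2      # squared distance = divergence in [0, 1]
    return 1.0 - jsd                            # 1 = identical feature distributions
```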
+
+ comment: 29 pages, 12 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Physics-guided Noise Neural Proxy for Practical Low-light Raw Image + Denoising + + +
+ Recently, the mainstream practice for training low-light raw image denoising +methods has shifted towards employing synthetic data. Noise modeling, which +focuses on characterizing the noise distribution of real-world sensors, +profoundly influences the effectiveness and practicality of synthetic data. +Currently, physics-based noise modeling struggles to characterize the entire +real noise distribution, while learning-based noise modeling impractically +depends on paired real data. In this paper, we propose a novel strategy: +learning the noise model from dark frames instead of paired real data, to break +down the data dependency. Based on this strategy, we introduce an efficient +physics-guided noise neural proxy (PNNP) to approximate the real-world sensor +noise model. Specifically, we integrate physical priors into neural proxies and +introduce three efficient techniques: physics-guided noise decoupling (PND), +physics-guided proxy model (PPM), and differentiable distribution loss (DDL). +PND decouples the dark frame into different components and handles different +levels of noise flexibly, which reduces the complexity of noise modeling. PPM +incorporates physical priors to constrain the generated noise, which promotes +the accuracy of noise modeling. DDL provides explicit and reliable supervision +for noise distribution, which promotes the precision of noise modeling. PNNP +exhibits powerful potential in characterizing the real noise distribution. +Extensive experiments on public datasets demonstrate superior performance in +practical low-light raw image denoising. The code will be available at +\url{https://github.com/fenghansen/PNNP}. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Noise Contrastive Estimation-based Matching Framework for Low-resource + Security Attack Pattern Recognition EACL 2024 + + +
+ Tactics, Techniques and Procedures (TTPs) represent sophisticated attack +patterns in the cybersecurity domain, described encyclopedically in textual +knowledge bases. Identifying TTPs in cybersecurity writing, often called TTP +mapping, is an important and challenging task. Conventional learning approaches +often target the problem in the classical multi-class or multilabel +classification setting. This setting hinders the learning ability of the model +due to a large number of classes (i.e., TTPs), the inevitable skewness of the +label distribution and the complex hierarchical structure of the label space. +We formulate the problem in a different learning paradigm, where the assignment +of a text to a TTP label is decided by the direct semantic similarity between +the two, thus reducing the complexity of competing solely over the large +labeling space. To that end, we propose a neural matching architecture with an +effective sampling-based learn-to-compare mechanism, facilitating the learning +process of the matching model despite constrained resources. + +
+
+ comment: accepted at EACL 2024, in ARR October 2023 +
+
+
+
+
+ + ♻ ☆ Deep Reinforcement Learning with Swin Transformers + + +
+ Transformers are neural network models that utilize multiple layers of self-attention heads and have exhibited enormous potential in natural language processing tasks. Meanwhile, there have been efforts to adapt transformers to visual tasks of machine learning, including Vision Transformers and Swin Transformers. Although some researchers use Vision Transformers for reinforcement learning tasks, their experiments remain at a small scale due to the high computational cost. This article presents the first online reinforcement learning scheme that is based on Swin Transformers: Swin DQN. In contrast to existing research, our novel approach demonstrates superior performance with experiments on 49 games in the Arcade Learning Environment. The results show that our approach achieves significantly higher maximal evaluation scores than the baseline method in 45 of the 49 games (92%), and higher mean evaluation scores than the baseline method in 40 of the 49 games (82%).
+
+
+
+
+ + ♻ ☆ Catastrophic Interference is Mitigated in Naturalistic Power-Law + Learning Environments + + +
+ Neural networks often suffer from catastrophic interference (CI): performance +on previously learned tasks drops off significantly when learning a new task. +This contrasts strongly with humans, who can sequentially learn new tasks +without appreciably forgetting previous tasks. Prior work has explored various +techniques for mitigating CI such as regularization, rehearsal, generative +replay, and distillation methods. The current work takes a different approach, +one guided by cognitive science research showing that in naturalistic +environments, the probability of encountering a task decreases as a power-law +of the time since it was last performed. We argue that a realistic evaluation +of techniques for the mitigation of CI should be performed in simulated +naturalistic learning environments. Thus, we evaluate the extent of mitigation +of CI when training simple rehearsal-based methods in power-law environments +similar to the ones humans face. Our work explores this novel rehearsal-based +approach for a domain-incremental task: learning permutations in the MNIST +task. We compare our rehearsal environment with other baselines to show its +efficacy in promoting continual learning. Additionally, we investigate whether +this environment shows forward facilitation, i.e., faster learning of later +tasks. Next, we explore the robustness of our learning environment to the +number of tasks, model size, and amount of data rehearsed after each task. +Notably, our results show that the performance is comparable or superior to +that of models trained using popular regularization methods and also to +rehearsals in non-power-law environments. The benefits of this training +paradigm include simplicity and the lack of a need for extra neural circuitry. +In addition, because our method is orthogonal to other methods, future research +can combine training in power-law environments with other continual learning +mechanisms. + +
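The rehearsal environment described above can be sketched as a schedule generator: at each training step of the current task, one earlier task is chosen for rehearsal with probability decaying as a power law of the time since it was last practiced. The exponent, the one-rehearsal-per-step setup, and the bookkeeping are illustrative choices, not the paper's exact protocol.

```python
import numpy as np

def powerlaw_rehearsal_schedule(n_tasks, steps_per_task, alpha=1.0, rng=None):
    """Interleave rehearsal of earlier tasks with power-law recency weighting.

    Returns a list of (current_task, rehearsed_task or None) per training step,
    where rehearsal probability is proportional to (time since last seen)^(-alpha).
    """
    rng = rng or np.random.default_rng()
    last_seen, schedule, step = {}, [], 0
    for task in range(n_tasks):
        for _ in range(steps_per_task):
            rehearsed = None
            old = list(range(task))                       # previously learned tasks
            if old:
                gaps = np.array([step - last_seen[t] + 1 for t in old], dtype=float)
                probs = gaps ** (-alpha)                  # power-law recency weighting
                probs /= probs.sum()
                rehearsed = int(rng.choice(old, p=probs))
                last_seen[rehearsed] = step               # rehearsal refreshes recency
            last_seen[task] = step                        # current task is always practiced
            schedule.append((task, rehearsed))
            step += 1
    return schedule

schedule = powerlaw_rehearsal_schedule(n_tasks=5, steps_per_task=100)
```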
+
+
+
+
+ + ♻ ☆ Neural Algorithmic Reasoning for Combinatorial Optimisation + + +
+ Solving NP-hard/complete combinatorial problems with neural networks is a +challenging research area that aims to surpass classical approximate +algorithms. The long-term objective is to outperform hand-designed heuristics +for NP-hard/complete problems by learning to generate superior solutions solely +from training data. Current neural-based methods for solving CO problems often +overlook the inherent "algorithmic" nature of the problems. In contrast, +heuristics designed for CO problems, e.g. TSP, frequently leverage +well-established algorithms, such as those for finding the minimum spanning +tree. In this paper, we propose leveraging recent advancements in neural +algorithmic reasoning to improve the learning of CO problems. Specifically, we +suggest pre-training our neural model on relevant algorithms before training it +on CO instances. Our results demonstrate that by using this learning setup, we +achieve superior performance compared to non-algorithmically informed deep +learning models. + +
+
+
+
+
+ + ♻ ☆ Explaining RL Decisions with Trajectories ICLR + + +
+ Explanation is a key component for the adoption of reinforcement learning +(RL) in many real-world decision-making problems. In the literature, the +explanation is often provided by saliency attribution to the features of the RL +agent's state. In this work, we propose a complementary approach to these +explanations, particularly for offline RL, where we attribute the policy +decisions of a trained RL agent to the trajectories encountered by it during +training. To do so, we encode trajectories in offline training data +individually as well as collectively (encoding a set of trajectories). We then +attribute policy decisions to a set of trajectories in this encoded space by +estimating the sensitivity of the decision with respect to that set. Further, +we demonstrate the effectiveness of the proposed approach in terms of quality +of attributions as well as practical scalability in diverse environments that +involve both discrete and continuous state and action spaces such as +grid-worlds, video games (Atari) and continuous control (MuJoCo). We also +conduct a human study on a simple navigation task to observe how their +understanding of the task compares with data attributed for a trained RL +policy. Keywords -- Explainable AI, Verifiability of AI Decisions, Explainable +RL. + +
+
+ comment: Published at International Conference on Learning Representations + (ICLR), 2023 +
+
+
+
+
+ + ♻ ☆ ImpNet: Imperceptible and blackbox-undetectable backdoors in compiled + neural networks + + +
+ Early backdoor attacks against machine learning set off an arms race in +attack and defence development. Defences have since appeared demonstrating some +ability to detect backdoors in models or even remove them. These defences work +by inspecting the training data, the model, or the integrity of the training +procedure. In this work, we show that backdoors can be added during +compilation, circumventing any safeguards in the data preparation and model +training stages. The attacker can not only insert existing weight-based +backdoors during compilation, but also a new class of weight-independent +backdoors, such as ImpNet. These backdoors are impossible to detect during the +training or data preparation processes, because they are not yet present. Next, +we demonstrate that some backdoors, including ImpNet, can only be reliably +detected at the stage where they are inserted and removing them anywhere else +presents a significant challenge. We conclude that ML model security requires +assurance of provenance along the entire technical pipeline, including the +data, model architecture, compiler, and hardware specification. + +
+
+ comment: 10 pages, 7 figures, to be published in IEEE Secure and Trustworthy + Machine Learning 2024. For website see https://ml.backdoors.uk . For source + code, see https://git.sr.ht/~tim-clifford/impnet_source +
+
+
+
+
+ + ♻ ☆ Analytical Modelling of Raw Data for Flow-Guided In-body Nanoscale + Localization + + +
+ Advancements in nanotechnology and material science are paving the way toward +nanoscale devices that combine sensing, computing, data and energy storage, and +wireless communication. In precision medicine, these nanodevices show promise +for disease diagnostics, treatment, and monitoring from within the patients' +bloodstreams. Associating the location of a sensed biological event with the +event itself, which is the main proposition of flow-guided in-body nanoscale +localization, would be immensely beneficial from the perspective of precision +medicine. The nanoscale nature of the nanodevices and the challenging +environment that the bloodstream represents result in current flow-guided +localization approaches being constrained in their communication and +energy-related capabilities. These communication and energy constraints of the +nanodevices result in different features of raw data for flow-guided +localization, in turn affecting its performance. An analytical model of how imperfect communication and constrained energy, which cause intermittent +operation of the nanodevices, affect the raw data produced by the nanodevices would +be beneficial. Hence, we propose an analytical model of raw data for +flow-guided localization, where the raw data is modeled as a function of the +communication and energy-related capabilities of the nanodevice. We evaluate +the model by comparing its output with that obtained from a simulator for the objective evaluation of flow-guided localization, which features +a comparably higher level of realism. Our results across a number of scenarios +and heterogeneous performance metrics indicate high similarity between the +model-generated and simulator-generated raw datasets. + 
+
+ comment: 6 pages, 7 figures, 4 tables, 16 references +
+
+
+
+
+ + ♻ ☆ On the different regimes of Stochastic Gradient Descent + + +
+ Modern deep networks are trained with stochastic gradient descent (SGD), whose +key hyperparameters are the number of data points considered at each step, or batch +size $B$, and the step size, or learning rate, $\eta$. For small $B$ and large +$\eta$, SGD corresponds to a stochastic evolution of the parameters, whose +noise amplitude is governed by the `temperature' $T\equiv \eta/B$. Yet this +description is observed to break down for sufficiently large batches $B\geq +B^*$, or to simplify to gradient descent (GD) when the temperature is +sufficiently small. Understanding where these cross-overs take place remains a +central challenge. Here, we resolve these questions for a teacher-student +perceptron classification model and show empirically that our key predictions +still apply to deep networks. Specifically, we obtain a phase diagram in the +$B$-$\eta$ plane that separates three dynamical phases: \textit{(i)} a +noise-dominated SGD governed by temperature, \textit{(ii)} a +large-first-step-dominated SGD, and \textit{(iii)} GD. These different phases +also correspond to different regimes of generalization error. Remarkably, our +analysis reveals that the batch size $B^*$ separating regimes \textit{(i)} and +\textit{(ii)} scales with the size $P$ of the training set, with an exponent +that characterizes the hardness of the classification problem. + 
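The temperature $T\equiv \eta/B$ gives a simple operational reading of the phase diagram; the sketch below maps a few $(B, \eta)$ settings onto the three regimes using placeholder thresholds (the actual cross-over locations are what the paper derives).

```python
# Illustrative mapping of (batch size, learning rate) to the qualitative SGD
# regimes via the temperature T = eta / B. Thresholds are placeholders, not
# the values derived in the paper.
def sgd_regime(batch_size: int, lr: float, B_star: int = 512, T_gd: float = 1e-5) -> str:
    T = lr / batch_size  # noise temperature
    if batch_size >= B_star:
        return "large-first-step-dominated SGD"
    if T <= T_gd:
        return "effectively gradient descent"
    return "noise-dominated SGD (temperature-governed)"

for B, eta in [(32, 0.1), (32, 1e-4), (2048, 0.1)]:
    print(B, eta, sgd_regime(B, eta))
```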
+
+ comment: Main: 8 pages, 4 figures; Appendix: 20 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Robust Uncertainty Quantification Using Conformalised Monte Carlo + Prediction + + +
+ Deploying deep learning models in safety-critical applications remains a very +challenging task, mandating the provision of assurances for the dependable +operation of these models. Uncertainty quantification (UQ) methods estimate the +model's confidence per prediction, informing decision-making by considering the +effect of randomness and model misspecification. Despite the advances of +state-of-the-art UQ methods, they are computationally expensive or produce +conservative prediction sets/intervals. We introduce MC-CP, a novel hybrid UQ +method that combines a new adaptive Monte Carlo (MC) dropout method with +conformal prediction (CP). MC-CP adaptively modulates the traditional MC +dropout at runtime to save memory and computation resources, enabling +predictions to be consumed by CP, yielding robust prediction sets/intervals. +Through comprehensive experiments, we show that MC-CP delivers significant +improvements over advanced UQ methods, like MC dropout, RAPS and CQR, both in +classification and regression benchmarks. MC-CP can be easily added to existing +models, making its deployment simple. + 
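For orientation, a heavily simplified sketch of feeding Monte Carlo dropout probabilities into split conformal prediction for classification; the adaptive dropout modulation that defines MC-CP is omitted, and the model, coverage level, and score choice are illustrative.

```python
# Minimal MC-dropout + split conformal prediction sketch (classification).
# This is NOT the MC-CP algorithm itself; it only shows how MC-dropout
# probabilities can feed a conformal prediction-set construction.
import numpy as np
import torch

def mc_dropout_probs(model, x, n_samples=30):
    """Average softmax outputs over stochastic forward passes with dropout kept on."""
    model.train()  # keep dropout layers active at inference time
    with torch.no_grad():
        probs = torch.stack([torch.softmax(model(x), dim=-1) for _ in range(n_samples)])
    return probs.mean(dim=0).numpy()

def conformal_threshold(cal_probs, cal_labels, alpha=0.1):
    """Split-conformal quantile of the nonconformity score 1 - p(true class)."""
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]
    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)
    return np.quantile(scores, level)

def prediction_sets(test_probs, q):
    """Classes whose nonconformity score falls below the calibrated threshold."""
    return [np.where(1.0 - p <= q)[0].tolist() for p in test_probs]
```

Here `cal_probs` and `test_probs` are numpy arrays of MC-averaged class probabilities (e.g. outputs of `mc_dropout_probs`), and the prediction sets inherit the usual split-conformal coverage guarantee at level `1 - alpha`.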
+
+
+
+
+ + ♻ ☆ On Optimal Regularization Parameters via Bilevel Learning + + +
+ Variational regularization is commonly used to solve linear inverse problems, +and involves augmenting a data fidelity by a regularizer. The regularizer is +used to promote a priori information and is weighted by a regularization +parameter. Selection of an appropriate regularization parameter is critical, +with various choices leading to very different reconstructions. Classical +strategies used to determine a suitable parameter value include the discrepancy +principle and the L-curve criterion, and in recent years a supervised machine +learning approach called bilevel learning has been employed. Bilevel learning +is a powerful framework to determine optimal parameters and involves solving a +nested optimization problem. While previous strategies enjoy various +theoretical results, the well-posedness of bilevel learning in this setting is +still an open question. In particular, a necessary property is positivity of +the determined regularization parameter. In this work, we provide a new +condition that better characterizes positivity of optimal regularization +parameters than the existing theory. Numerical results verify and explore this +new condition for both small and high-dimensional problems. + +
+
+ comment: 34 pages, 11 figures. Version for publication +
+
+
+
+
+ + ♻ ☆ Beyond Expected Return: Accounting for Policy Reproducibility when + Evaluating Reinforcement Learning Algorithms + + +
+ Many applications in Reinforcement Learning (RL) have noise or +stochasticity present in the environment. Beyond their impact on learning, +these uncertainties lead the exact same policy to perform differently, i.e. +yield different returns, from one roll-out to another. Common evaluation +procedures in RL summarise the consequent return distributions using solely the +expected return, which does not account for the spread of the distribution. Our +work defines this spread as the policy reproducibility: the ability of a policy +to obtain similar performance when rolled out many times, a crucial property in +some real-world applications. We highlight that existing procedures that only +use the expected return are limited on two fronts: first, an infinite number of +return distributions with a wide range of performance-reproducibility +trade-offs can have the same expected return, limiting its effectiveness when +used for comparing policies; second, the expected return metric does not leave +any room for practitioners to choose the best trade-off value for considered +applications. In this work, we address these limitations by recommending the +use of the Lower Confidence Bound, a metric taken from Bayesian optimisation that +provides the user with a preference parameter to choose a desired +performance-reproducibility trade-off. We also formalise and quantify policy +reproducibility, and demonstrate the benefit of our metrics using extensive +experiments with popular RL algorithms on common uncertain RL tasks. + 
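A small sketch of the recommended metric: over a batch of roll-out returns, the Lower Confidence Bound penalises the mean by the spread, with a user-chosen preference parameter (the value of kappa and the toy return lists below are illustrative).

```python
# Lower Confidence Bound over roll-out returns: mean performance penalised by
# spread, with kappa acting as the user's performance/reproducibility trade-off.
import numpy as np

def lcb(returns, kappa=1.0):
    returns = np.asarray(returns, dtype=float)
    return returns.mean() - kappa * returns.std(ddof=1)

policy_a = [100, 102, 98, 101, 99]   # consistent policy
policy_b = [140, 60, 150, 55, 95]    # same mean return, far less reproducible
print(lcb(policy_a), lcb(policy_b))  # LCB favours the reproducible policy
```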
+
+
+
+
+ + ♻ ☆ Towards Cross Domain Generalization of Hamiltonian Representation via + Meta Learning ICLR 2024 + + +
+ Recent advances in deep learning for physics have focused on discovering +shared representations of target systems by incorporating physics priors or +inductive biases into neural networks. While effective, these methods are +limited to the system domain, where the type of system remains consistent and +thus cannot ensure the adaptation to new, or unseen physical systems governed +by different laws. For instance, a neural network trained on a mass-spring +system cannot guarantee accurate predictions for the behavior of a two-body +system or any other system with different physical laws. In this work, we take +a significant leap forward by targeting cross domain generalization within the +field of Hamiltonian dynamics. We model our system with a graph neural network +and employ a meta learning algorithm to enable the model to gain experience +over a distribution of tasks and make it adapt to new physics. Our approach +aims to learn a unified Hamiltonian representation that is generalizable across +multiple system domains, thereby overcoming the limitations of system-specific +models. Our results demonstrate that the meta-trained model not only adapts +effectively to new systems but also captures a generalized Hamiltonian +representation that is consistent across different physical domains. Overall, +through the use of meta learning, we offer a framework that achieves cross +domain generalization, providing a step towards a unified model for +understanding a wide array of dynamical systems via deep learning. + +
+
+ comment: Conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Convolve and Conquer: Data Comparison with Wiener Filters + + +
+ Quantitative evaluations of differences and/or similarities between data +samples define and shape optimisation problems associated with learning data +distributions. Current methods to compare data often suffer from limitations in +capturing such distributions or lack desirable mathematical properties for +optimisation (e.g. smoothness, differentiability, or convexity). In this paper, +we introduce a new method to measure (dis)similarities between paired samples +inspired by Wiener-filter theory. The convolutional nature of Wiener filters +allows us to comprehensively compare data samples in a globally correlated way. +We validate our approach in four machine learning applications: data +compression, medical imaging imputation, translated classification, and +non-parametric generative modelling. Our results demonstrate increased +resolution in reconstructed images with better perceptual quality and higher +data fidelity, as well as robustness against translations, compared to +conventional mean-squared-error analogue implementations. + +
+
+ comment: 10 pages, 5 figures, Medical Imaging Meets Neurips Workshop +
+
+
+
+
+ + ♻ ☆ 2D-3D Interlaced Transformer for Point Cloud Segmentation with + Scene-Level Supervision ICCV 2023 + + +
+ We present a Multimodal Interlaced Transformer (MIT) that jointly considers +2D and 3D data for weakly supervised point cloud segmentation. Research studies +have shown that 2D and 3D features are complementary for point cloud +segmentation. However, existing methods require extra 2D annotations to achieve +2D-3D information fusion. Considering the high annotation cost of point clouds, +effective 2D and 3D feature fusion based on weakly supervised learning is in +great demand. To this end, we propose a transformer model with two encoders and +one decoder for weakly supervised point cloud segmentation using only +scene-level class tags. Specifically, the two encoders compute the +self-attended features for 3D point clouds and 2D multi-view images, +respectively. The decoder implements interlaced 2D-3D cross-attention and +carries out implicit 2D and 3D feature fusion. We alternately switch the roles +of queries and key-value pairs in the decoder layers. It turns out that the 2D +and 3D features are iteratively enriched by each other. Experiments show that +it performs favorably against existing weakly supervised point cloud +segmentation methods by a large margin on the S3DIS and ScanNet benchmarks. The +project page will be available at https://jimmy15923.github.io/mit_web/. + +
+
+ comment: ICCV 2023 (main + supp). Website: + https://jimmy15923.github.io/mit_web/ +
+
+
+
+
+ + ♻ ☆ In-Context Learning for MIMO Equalization Using Transformer-Based + Sequence Models + + +
+ Large pre-trained sequence models, such as transformer-based architectures, +have been recently shown to have the capacity to carry out in-context learning +(ICL). In ICL, a decision on a new input is made via a direct mapping of the +input and of a few examples from the given task, serving as the task's context, +to the output variable. No explicit updates of the model parameters are needed +to tailor the decision to a new task. Pre-training, which amounts to a form of +meta-learning, is based on the observation of examples from several related +tasks. Prior work has shown ICL capabilities for linear regression. In this +study, we leverage ICL to address the inverse problem of multiple-input and +multiple-output (MIMO) equalization based on a context given by pilot symbols. +A task is defined by the unknown fading channel and by the signal-to-noise +ratio (SNR) level, which may be known. To highlight the practical potential of +the approach, we allow the presence of quantization of the received signals. We +demonstrate via numerical results that transformer-based ICL has a threshold +behavior, whereby, as the number of pre-training tasks grows, the performance +switches from that of a minimum mean squared error (MMSE) equalizer with a +prior determined by the pre-trained tasks to that of an MMSE equalizer with the +true data-generating prior. + +
+
+
+
+
+ + ♻ ☆ Beyond Vanilla Variational Autoencoders: Detecting Posterior Collapse in + Conditional and Hierarchical Variational Autoencoders ICLR + + +
+ The posterior collapse phenomenon in variational autoencoder (VAE), where the +variational posterior distribution closely matches the prior distribution, can +hinder the quality of the learned latent variables. As a consequence of +posterior collapse, the latent variables extracted by the encoder in VAE +preserve less information from the input data and thus fail to produce +meaningful representations as input to the reconstruction process in the +decoder. While this phenomenon has been an actively addressed topic related to +VAE performance, the theory for posterior collapse remains underdeveloped, +especially beyond the standard VAE. In this work, we advance the theoretical +understanding of posterior collapse to two important and prevalent yet less +studied classes of VAE: conditional VAE and hierarchical VAE. Specifically, via +a non-trivial theoretical analysis of linear conditional VAE and hierarchical +VAE with two levels of latent variables, we prove that the causes of posterior collapse +in these models include the correlation between the input and output of the +conditional VAE and the effect of learnable encoder variance in the +hierarchical VAE. We empirically validate our theoretical findings for linear +conditional and hierarchical VAE and demonstrate that these results are also +predictive for non-linear cases with extensive experiments. + 
+
+ comment: International Conference on Learning Representations (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Through the Dual-Prism: A Spectral Perspective on Graph Data + Augmentation for Graph Classification + + +
+ Graph Neural Networks (GNNs) have become the preferred tool to process graph +data, with their efficacy being boosted through graph data augmentation +techniques. Despite the evolution of augmentation methods, issues like graph +property distortions and restricted structural changes persist. This leads to +the question: Is it possible to develop more property-conserving and +structure-sensitive augmentation methods? Through a spectral lens, we +investigate the interplay between graph properties, their augmentation, and +their spectral behavior, and find that keeping the low-frequency eigenvalues +unchanged can preserve the critical properties at a large scale when generating +augmented graphs. These observations inform our introduction of the Dual-Prism +(DP) augmentation method, comprising DP-Noise and DP-Mask, which adeptly +retains essential graph properties while diversifying augmented graphs. +Extensive experiments validate the efficiency of our approach, providing a new +and promising direction for graph data augmentation. + 
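A rough sketch of the spectral idea, assuming an unnormalised Laplacian and a hard low/high-frequency split (both simplifications; DP-Noise and DP-Mask are defined differently in detail): perturb only the high-frequency eigenvalues and rebuild the graph from the modified spectrum.

```python
# Illustrative spectral augmentation: keep the low-frequency (small) Laplacian
# eigenvalues fixed and perturb only the high-frequency ones, then rebuild the
# adjacency from L = D - A. Simplified relative to DP-Noise / DP-Mask.
import numpy as np

def spectral_augment(adj, keep_ratio=0.5, noise_std=0.1, seed=0):
    rng = np.random.default_rng(seed)
    lap = np.diag(adj.sum(1)) - adj              # unnormalised Laplacian
    eigvals, eigvecs = np.linalg.eigh(lap)       # eigenvalues in ascending order
    k = int(keep_ratio * len(eigvals))           # low-frequency part to preserve
    new_vals = eigvals.copy()
    new_vals[k:] += rng.normal(0, noise_std, len(eigvals) - k)
    new_lap = eigvecs @ np.diag(new_vals) @ eigvecs.T
    return np.diag(np.diag(new_lap)) - new_lap   # invert L = D - A (approximately)

A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
print(np.round(spectral_augment(A), 2))
```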
+
+
+
+
+ + ♻ ☆ Starlit: Privacy-Preserving Federated Learning to Enhance Financial + Fraud Detection + + +
+ Federated Learning (FL) is a data-minimization approach enabling +collaborative model training across diverse clients with local data, avoiding +direct data exchange. However, state-of-the-art FL solutions to identify +fraudulent financial transactions exhibit a subset of the following +limitations. They (1) lack a formal security definition and proof, (2) assume +prior freezing of suspicious customers' accounts by financial institutions +(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$ +computationally expensive modular exponentiation (where $n$ is the total number +of financial institutions) or highly inefficient fully homomorphic encryption, +(4) assume the parties have already completed the identity alignment phase, +hence excluding it from the implementation, performance evaluation, and +security analysis, and (5) struggle to resist clients' dropouts. This work +introduces Starlit, a novel scalable privacy-preserving FL mechanism that +overcomes these limitations. It has various applications, such as enhancing +financial fraud detection, mitigating terrorism, and enhancing digital health. +We implemented Starlit and conducted a thorough performance analysis using +synthetic data from a key player in global financial transactions. The +evaluation indicates Starlit's scalability, efficiency, and accuracy. + +
+
+
+
+
+ + ♻ ☆ Medication Recommendation via Domain Knowledge Informed Deep Learning + + +
+ Medication recommendation is a fundamental yet crucial branch of healthcare, +which provides opportunities to support clinical physicians with more accurate +medication prescriptions for patients with complex health conditions. Learning +from electronic health records (EHR) to recommend medications is the most +common way in previous studies. However, most of them neglect incorporating +domain knowledge according to the clinical manifestations in the EHR of the +patient. To address these issues, we propose a novel \textbf{D}omain +\textbf{K}nowledge \textbf{I}nformed \textbf{Net}work (DKINet) to integrate +domain knowledge with observable clinical manifestations of the patient, which +is the first dynamic domain knowledge informed framework toward medication +recommendation. In particular, we first design a knowledge-driven encoder to +capture the domain information and then develop a data-driven encoder to +integrate domain knowledge into the observable EHR. To endow the model with the +capability of temporal decision, we design an explicit medication encoder for +learning the longitudinal dependence of the patient. Extensive experiments on +three publicly available datasets verify the superiority of our method. The +code will be public upon acceptance. + +
+
+ comment: 11 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ An Empirical Study of Using Large Language Models for Unit Test + Generation + + +
+ A code generation model generates code by taking a prompt from a code +comment, existing code, or a combination of both. Although code generation +models (e.g., GitHub Copilot) are increasingly being adopted in practice, it is +unclear whether they can successfully be used for unit test generation without +fine-tuning for a strongly typed language like Java. To fill this gap, we +investigated how well three models (Codex, GPT-3.5-Turbo, and StarCoder) can +generate unit tests. We used two benchmarks (HumanEval and Evosuite SF110) to +investigate the effect of context generation on the unit test generation +process. We evaluated the models based on compilation rates, test correctness, +test coverage, and test smells. We found that the Codex model achieved above +80% coverage for the HumanEval dataset, but no model had more than 2% coverage +for the EvoSuite SF110 benchmark. The generated tests also suffered from test +smells, such as Duplicated Asserts and Empty Tests. + +
+
+
+
+
+ + ♻ ☆ Langevin Unlearning: A New Perspective of Noisy Gradient Descent for + Machine Unlearning + + +
+ Machine unlearning has raised significant interest with the adoption of laws +ensuring the ``right to be forgotten''. Researchers have provided a +probabilistic notion of approximate unlearning under a definition similar to that of +Differential Privacy (DP), where privacy is defined as statistical +indistinguishability to retraining from scratch. We propose Langevin +unlearning, an unlearning framework based on noisy gradient descent with +privacy guarantees for approximate unlearning problems. Langevin unlearning +unifies the DP learning process and the privacy-certified unlearning process +with many algorithmic benefits. These include approximate certified unlearning +for non-convex problems, complexity savings compared to retraining, and sequential +and batch unlearning for multiple unlearning requests. We verify the +practicality of Langevin unlearning by studying its privacy-utility-complexity +trade-off via experiments on benchmark datasets, and also demonstrate its +superiority against gradient-descent-plus-output-perturbation based approximate +unlearning. + 
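The primitive underlying the framework is noisy (Langevin-style) gradient descent; a minimal sketch follows, with the step size, noise scale, and projection radius chosen arbitrarily rather than calibrated to a privacy/unlearning guarantee as in the paper.

```python
# Minimal noisy (Langevin-style) projected gradient descent step, the primitive
# that Langevin unlearning builds on. Noise scale and projection are
# illustrative; the paper derives them from the desired unlearning guarantee.
import numpy as np

def noisy_gd_step(theta, grad_fn, lr=0.05, noise_std=0.01, radius=10.0, rng=None):
    if rng is None:
        rng = np.random.default_rng(0)
    theta = theta - lr * grad_fn(theta) + noise_std * rng.standard_normal(theta.shape)
    norm = np.linalg.norm(theta)
    if norm > radius:                      # project back onto a bounded domain
        theta = theta * (radius / norm)
    return theta

# Toy quadratic objective: unlearning a point would amount to continuing these
# noisy steps on the loss recomputed without that point, instead of retraining.
grad = lambda th: 2 * (th - np.array([1.0, -2.0]))
theta = np.zeros(2)
rng = np.random.default_rng(0)
for _ in range(200):
    theta = noisy_gd_step(theta, grad, rng=rng)
print(theta)
```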
+
+
+
+
+ + ♻ ☆ A Latent Variable Approach for Non-Hierarchical Multi-Fidelity Adaptive + Sampling + + +
+ Multi-fidelity (MF) methods are gaining popularity for enhancing surrogate +modeling and design optimization by incorporating data from various +low-fidelity (LF) models. While most existing MF methods assume a fixed +dataset, adaptive sampling methods that dynamically allocate resources among +fidelity models can achieve higher efficiency in exploring and exploiting +the design space. However, most existing MF methods rely on the hierarchical +assumption of fidelity levels or fail to capture the intercorrelation between +multiple fidelity levels and utilize it to quantify the value of future +samples and navigate the adaptive sampling. To address this hurdle, we propose +a framework hinged on a latent embedding for different fidelity models and the +associated pre-posterior analysis to explicitly utilize their correlation for +adaptive sampling. In this framework, each infill sampling iteration includes +two steps: we first identify the location of interest with the greatest +potential improvement using the high-fidelity (HF) model, then we search for +the next sample across all fidelity levels that maximizes the improvement per +unit cost at the location identified in the first step. This is made possible +by a single Latent Variable Gaussian Process (LVGP) model that maps different +fidelity models into an interpretable latent space to capture their +correlations without assuming hierarchical fidelity levels. The LVGP enables us +to assess how LF sampling candidates will affect the HF response with pre-posterior +analysis and determine the next sample with the best benefit-to-cost ratio. +Through test cases, we demonstrate that the proposed method outperforms the +benchmark methods in both MF global fitting (GF) and Bayesian Optimization (BO) +problems in convergence rate and robustness. Moreover, the method offers the +flexibility to switch between GF and BO by simply changing the acquisition +function. + 
+
+
+
+
+ + ♻ ☆ Multi-UAV Speed Control with Collision Avoidance and Handover-aware Cell + Association: DRL with Action Branching + + +
+ This paper presents a deep reinforcement learning solution for optimizing +multi-UAV cell-association decisions and their moving velocity on a 3D aerial +highway. The objective is to enhance transportation and communication +performance, including collision avoidance, connectivity, and handovers. The +problem is formulated as a Markov decision process (MDP) with UAVs' states +defined by velocities and communication data rates. We propose a neural +architecture with a shared decision module and multiple network branches, each +dedicated to a specific action dimension in a 2D transportation-communication +space. This design efficiently handles the multi-dimensional action space, +allowing independence for individual action dimensions. We introduce two +models, Branching Dueling Q-Network (BDQ) and Branching Dueling Double Deep +Q-Network (Dueling DDQN), to demonstrate the approach. Simulation results show +a significant improvement of 18.32% compared to existing benchmarks. + +
+
+ comment: IEEE Globecom 2023 Accepted +
+
+
+
+
+ + ♻ ☆ Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov + Decision Processes AAMAS + + +
+ A fundamental (and largely open) challenge in sequential decision-making is +dealing with non-stationary environments, where exogenous environmental +conditions change over time. Such problems are traditionally modeled as +non-stationary Markov decision processes (NSMDP). However, existing approaches +for decision-making in NSMDPs have two major shortcomings: first, they assume +that the updated environmental dynamics at the current time are known (although +future dynamics can change); and second, planning is largely pessimistic, i.e., +the agent acts ``safely'' to account for the non-stationary evolution of the +environment. We argue that both these assumptions are invalid in practice -- +updated environmental conditions are rarely known, and as the agent interacts +with the environment, it can learn about the updated dynamics and avoid being +pessimistic, at least in states whose dynamics it is confident about. We +present a heuristic search algorithm called \textit{Adaptive Monte Carlo Tree +Search (ADA-MCTS)} that addresses these challenges. We show that the agent can +learn the updated dynamics of the environment over time and then act as it +learns, i.e., if the agent is in a region of the state space about which it has +updated knowledge, it can avoid being pessimistic. To quantify ``updated +knowledge,'' we disintegrate the aleatoric and epistemic uncertainty in the +agent's updated belief and show how the agent can use these estimates for +decision-making. We compare the proposed approach with the multiple +state-of-the-art approaches in decision-making across multiple well-established +open-source problems and empirically show that our approach is faster and +highly adaptive without sacrificing safety. + +
+
+ comment: Accepted for publication at the International Conference on + Autonomous Agents and MultiAgent Systems (AAMAS), 2024 +
+
+
+
+
+ + ♻ ☆ HashVFL: Defending Against Data Reconstruction Attacks in Vertical + Federated Learning + + +
+ Vertical Federated Learning (VFL) is a trending collaborative machine +learning model training solution. Existing industrial frameworks employ secure +multi-party computation techniques such as homomorphic encryption to ensure +data security and privacy. Despite these efforts, studies have revealed that +data leakage remains a risk in VFL due to the correlations between intermediate +representations and raw data. Neural networks can accurately capture these +correlations, allowing an adversary to reconstruct the data. This emphasizes +the need for continued research into securing VFL systems. + Our work shows that hashing is a promising solution to counter data +reconstruction attacks. The one-way nature of hashing makes it difficult for an +adversary to recover data from hash codes. However, implementing hashing in VFL +presents new challenges, including vanishing gradients and information loss. To +address these issues, we propose HashVFL, which integrates hashing and +simultaneously achieves learnability, bit balance, and consistency. + Experimental results indicate that HashVFL effectively maintains task +performance while defending against data reconstruction attacks. It also brings +additional benefits in reducing the degree of label leakage, mitigating +adversarial attacks, and detecting abnormal inputs. We hope our work will +inspire further research into the potential applications of HashVFL. + 
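The abstract does not spell out how the gradient issues of hashing are resolved; as background only, a common generic workaround (not necessarily HashVFL's design) is to binarise embeddings with a sign function and pass gradients straight through.

```python
# Generic sign-hashing layer with a straight-through gradient estimator (STE).
# A common trick for making binary codes trainable end-to-end; offered here as
# background only, not as HashVFL's actual mechanism.
import torch

class SignSTE(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return torch.sign(x)            # binary code in {-1, 0, +1}

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output              # identity gradient (straight-through)

def hash_embedding(z: torch.Tensor) -> torch.Tensor:
    """Map a real-valued party embedding to binary codes before sharing it."""
    return SignSTE.apply(z)

z = torch.randn(4, 8, requires_grad=True)
codes = hash_embedding(z)
codes.sum().backward()
print(codes, z.grad)  # gradients reach z despite the non-differentiable sign
```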
+
+
+
+
+ + ♻ ☆ Provably Convergent Federated Trilevel Learning AAAI 2024 + + +
+ Trilevel learning, also called trilevel optimization (TLO), has been +recognized as a powerful modelling tool for hierarchical decision process and +widely applied in many machine learning applications, such as robust neural +architecture search, hyperparameter optimization, and domain adaptation. +Tackling TLO problems has presented a great challenge due to their nested +decision-making structure. In addition, existing works on TLO face the +following key challenges: 1) they all focus on the non-distributed setting, +which may lead to privacy breach; 2) they do not offer any non-asymptotic +convergence analysis which characterizes how fast an algorithm converges. To +address the aforementioned challenges, this paper proposes an asynchronous +federated trilevel optimization method to solve TLO problems. The proposed +method utilizes $\mu$-cuts to construct a hyper-polyhedral approximation for +the TLO problem and solve it in an asynchronous manner. We demonstrate that the +proposed $\mu$-cuts are applicable to not only convex functions but also a wide +range of non-convex functions that meet the $\mu$-weakly convex assumption. +Furthermore, we theoretically analyze the non-asymptotic convergence rate for +the proposed method by showing its iteration complexity to obtain +$\epsilon$-stationary point is upper bounded by +$\mathcal{O}(\frac{1}{\epsilon^2})$. Extensive experiments on real-world +datasets have been conducted to elucidate the superiority of the proposed +method, e.g., it has a faster convergence rate with a maximum acceleration of +approximately 80$\%$. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Modulate Your Spectrum in Self-Supervised Learning ICLR 2024 + + +
+ Whitening loss offers a theoretical guarantee against feature collapse in +self-supervised learning (SSL) with joint embedding architectures. Typically, +it involves a hard whitening approach, transforming the embedding and applying +loss to the whitened output. In this work, we introduce Spectral Transformation +(ST), a framework to modulate the spectrum of embedding and to seek for +functions beyond whitening that can avoid dimensional collapse. We show that +whitening is a special instance of ST by definition, and our empirical +investigations unveil other ST instances capable of preventing collapse. +Additionally, we propose a novel ST instance named IterNorm with trace loss +(INTL). Theoretical analysis confirms INTL's efficacy in preventing collapse +and modulating the spectrum of embedding toward equal-eigenvalues during +optimization. Our experiments on ImageNet classification and COCO object +detection demonstrate INTL's potential in learning superior representations. +The code is available at https://github.com/winci-ai/INTL. + +
+
+ comment: Accepted at ICLR 2024. The code is available at + https://github.com/winci-ai/intl +
+
+
+
+
+ + ♻ ☆ Learning bounded-degree polytrees with known skeleton ALT 2024 + + +
+ We establish finite-sample guarantees for efficient proper learning of +bounded-degree polytrees, a rich class of high-dimensional probability +distributions and a subclass of Bayesian networks, a widely-studied type of +graphical model. Recently, Bhattacharyya et al. (2021) obtained finite-sample +guarantees for recovering tree-structured Bayesian networks, i.e., 1-polytrees. +We extend their results by providing an efficient algorithm which learns +$d$-polytrees in polynomial time and sample complexity for any bounded $d$ when +the underlying undirected graph (skeleton) is known. We complement our +algorithm with an information-theoretic sample complexity lower bound, showing +that the dependence on the dimension and target accuracy parameters are nearly +tight. + +
+
+ comment: Fixed some typos. Added some discussions. Accepted to ALT 2024 +
+
+
+
+
+ + ♻ ☆ ALEXR: An Optimal Single-Loop Algorithm for Convex Finite-Sum Coupled + Compositional Stochastic Optimization + + +
+ This paper revisits a class of convex Finite-Sum Coupled Compositional +Stochastic Optimization (cFCCO) problems with many applications, including +group distributionally robust optimization (GDRO), learning with imbalanced +data, reinforcement learning, and learning to rank. To better solve these +problems, we introduce an efficient single-loop primal-dual block-coordinate +proximal algorithm, dubbed ALEXR. This algorithm leverages block-coordinate +stochastic mirror ascent updates for the dual variable and stochastic proximal +gradient descent updates for the primal variable. We establish the convergence +rates of ALEXR in both convex and strongly convex cases under smoothness and +non-smoothness conditions of involved functions, which not only improve the +best rates in previous works on smooth cFCCO problems but also expand the realm +of cFCCO for solving more challenging non-smooth problems such as the dual form +of GDRO. Finally, we present lower complexity bounds to demonstrate that the +convergence rates of ALEXR are optimal among first-order block-coordinate +stochastic algorithms for the considered class of cFCCO problems. + +
+
+ comment: Fixed several typos; Added some numerical experiments +
+
+
+
+
+ + ♻ ☆ Task-Driven Causal Feature Distillation: Towards Trustworthy Risk + Prediction AAAI + + +
+ Since artificial intelligence has seen tremendous recent successes in many +areas, it has sparked great interest in its potential for trustworthy and +interpretable risk prediction. However, most models lack causal reasoning and +struggle with class imbalance, leading to poor precision and recall. To address +this, we propose a Task-Driven Causal Feature Distillation model (TDCFD) to +transform original feature values into causal feature attributions for the +specific risk prediction task. The causal feature attribution helps describe +how much contribution the value of this feature can make to the risk prediction +result. After the causal feature distillation, a deep neural network is applied +to produce trustworthy prediction results with causal interpretability and high +precision/recall. We evaluate the performance of our TDCFD method on several +synthetic and real datasets, and the results demonstrate its superiority over +the state-of-the-art methods regarding precision, recall, interpretability, and +causality. + +
+
+ comment: Proceedings of the 2024 AAAI Conference on Artificial Intelligence +
+
+
+
+
+ + ♻ ☆ Empirical Study of Named Entity Recognition Performance Using + Distribution-aware Word Embedding + + +
+ With the fast development of Deep Learning techniques, Named Entity +Recognition (NER) is becoming more and more important in the information +extraction task. The greatest difficulty that the NER task faces is maintaining +detectability even when the types of named entities and documents are unfamiliar. Realizing +that the specificity information may contain potential meanings of a word and +generate semantic-related features for word embedding, we develop a +distribution-aware word embedding and implement three different methods to make +use of the distribution information in a NER framework. The results show +that NER performance improves when word specificity is incorporated +into existing NER methods. + 
+
+ comment: Want to correct +
+
+
+
+
+ + ♻ ☆ Global Convergence of Natural Policy Gradient with Hessian-aided + Momentum Variance Reduction + + +
+ Natural policy gradient (NPG) and its variants are widely-used policy search +methods in reinforcement learning. Inspired by prior work, a new NPG variant +coined NPG-HM is developed in this paper, which utilizes the Hessian-aided +momentum technique for variance reduction, while the sub-problem is solved via +the stochastic gradient descent method. It is shown that NPG-HM can achieve the +global last iterate $\epsilon$-optimality with a sample complexity of +$\mathcal{O}(\epsilon^{-2})$, which is the best known result for natural policy +gradient type methods under the generic Fisher non-degenerate policy +parameterizations. The convergence analysis is built upon a relaxed weak +gradient dominance property tailored for NPG under the compatible function +approximation framework, as well as a neat way to decompose the error when +handling the sub-problem. Moreover, numerical experiments on Mujoco-based +environments demonstrate the superior performance of NPG-HM over other +state-of-the-art policy gradient methods. + +
+
+
+
+
+ + ♻ ☆ Look, Remember and Reason: Grounded reasoning in videos with language + models ICLR 2024 + + +
+ Multi-modal language models (LM) have recently shown promising performance in +high-level reasoning tasks on videos. However, existing methods still fall +short in tasks like causal or compositional spatiotemporal reasoning over +actions, in which model predictions need to be grounded in fine-grained +low-level details, such as object motions and object interactions. In this +work, we propose training an LM end-to-end on low-level surrogate tasks, +including object detection, re-identification, and tracking, to endow the model +with the required low-level visual capabilities. We show that a two-stream +video encoder with spatiotemporal attention is effective at capturing the +required static and motion-based cues in the video. By leveraging the LM's +ability to perform the low-level surrogate tasks, we can cast reasoning in +videos as the three-step process of Look, Remember, Reason wherein visual +information is extracted using low-level visual skills step-by-step and then +integrated to arrive at a final answer. We demonstrate the effectiveness of our +framework on diverse visual reasoning tasks from the ACRE, CATER, +Something-Else and STAR datasets. Our approach is trainable end-to-end and +surpasses state-of-the-art task-specific methods across these tasks by a large +margin. + +
+
+ comment: To appear at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Finite-Time Logarithmic Bayes Regret Upper Bounds + + +
+ We derive the first finite-time logarithmic Bayes regret upper bounds for +Bayesian bandits. In a multi-armed bandit, we obtain $O(c_\Delta \log n)$ and +$O(c_h \log^2 n)$ upper bounds for an upper confidence bound algorithm, where +$c_h$ and $c_\Delta$ are constants depending on the prior distribution and the +gaps of bandit instances sampled from it, respectively. The latter bound +asymptotically matches the lower bound of Lai (1987). Our proofs are a major +technical departure from prior works, while being simple and general. To show +the generality of our techniques, we apply them to linear bandits. Our results +provide insights on the value of prior in the Bayesian setting, both in the +objective and as a side information given to the learner. They significantly +improve upon existing $\tilde{O}(\sqrt{n})$ bounds, which have become standard +in the literature despite the logarithmic lower bound of Lai (1987). + +
+
+
+
+
+ + ♻ ☆ LRS: Enhancing Adversarial Transferability through Lipschitz Regularized + Surrogate AAAI 2024 + + +
+ The transferability of adversarial examples is of central importance to +transfer-based black-box adversarial attacks. Previous works for generating +transferable adversarial examples focus on attacking \emph{given} pretrained +surrogate models, while the connections between surrogate models and adversarial +transferability have been overlooked. In this paper, we propose {\em Lipschitz +Regularized Surrogate} (LRS) for transfer-based black-box attacks, a novel +approach that transforms surrogate models towards favorable adversarial +transferability. Using such transformed surrogate models, any existing +transfer-based black-box attack can run without any change, yet achieving much +better performance. Specifically, we impose Lipschitz regularization on the +loss landscape of surrogate models to enable a smoother and more controlled +optimization process for generating more transferable adversarial examples. In +addition, this paper also sheds light on the connection between the inner +properties of surrogate models and adversarial transferability, where three +factors are identified: a smaller local Lipschitz constant, a smoother loss +landscape, and stronger adversarial robustness. We evaluate our proposed LRS +approach by attacking state-of-the-art standard deep neural networks and +defense models. The results demonstrate significant improvement in attack +success rates and transferability. Our code is available at +https://github.com/TrustAIoT/LRS. + 
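One generic way to impose a Lipschitz-style penalty on a surrogate's loss landscape is a gradient-norm regulariser with respect to the input; the sketch below illustrates that idea only, and the concrete regularisation used in LRS may differ.

```python
# Gradient-norm penalty as a generic Lipschitz-style regulariser for a
# surrogate model's training loss (illustrative; not necessarily the exact
# LRS objective).
import torch
import torch.nn.functional as F

def lipschitz_regularised_loss(model, x, y, lam=0.1):
    x = x.clone().requires_grad_(True)
    loss = F.cross_entropy(model(x), y)
    grad_x, = torch.autograd.grad(loss, x, create_graph=True)
    penalty = grad_x.flatten(1).norm(dim=1).mean()   # local Lipschitz proxy
    return loss + lam * penalty

# Toy usage with a hypothetical surrogate classifier.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, 10))
x, y = torch.randn(16, 1, 28, 28), torch.randint(0, 10, (16,))
print(lipschitz_regularised_loss(model, x, y).item())
```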
+
+ comment: AAAI 2024 main track. Code available on Github (see abstract). + Appendix is included in this updated version +
+
+
+
+
+ + ♻ ☆ Using Twitter Data to Understand Public Perceptions of Approved versus + Off-label Use for COVID-19-related Medications + + +
+ Understanding public discourse on emergency use of unproven therapeutics is +crucial for monitoring safe use and combating misinformation. We developed a +natural language processing-based pipeline to comprehend public perceptions of +and stances on coronavirus disease 2019 (COVID-19)-related drugs on Twitter +over time. This retrospective study included 609,189 US-based tweets from +January 29, 2020, to November 30, 2021, about four drugs that garnered +significant public attention during the COVID-19 pandemic: (1) +Hydroxychloroquine and Ivermectin, therapies with anecdotal evidence; and (2) +Molnupiravir and Remdesivir, FDA-approved treatments for eligible patients. +Time-trend analysis was employed to understand popularity trends and related +events. Content and demographic analyses were conducted to explore potential +rationales behind people's stances on each drug. Time-trend analysis indicated +that Hydroxychloroquine and Ivermectin were discussed more than Molnupiravir +and Remdesivir, particularly during COVID-19 surges. Hydroxychloroquine and +Ivermectin discussions were highly politicized, related to conspiracy theories, +hearsay, and celebrity influences. The distribution of stances between the two +major US political parties was significantly different (P < .001); Republicans +were more likely to support Hydroxychloroquine (55%) and Ivermectin (30%) than +Democrats. People with healthcare backgrounds tended to oppose +Hydroxychloroquine (7%) more than the general population, while the general +population was more likely to support Ivermectin (14%). Our study found that +social media users have varying perceptions and stances on off-label versus +FDA-authorized drug use at different stages of COVID-19. This indicates that +health systems, regulatory agencies, and policymakers should design tailored +strategies to monitor and reduce misinformation to promote safe drug use. + +
+
+ comment: Full paper published in JAMIA +
+
+
+
+
+ + ♻ ☆ Learning an Inventory Control Policy with General Inventory Arrival + Dynamics + + +
+ In this paper we address the problem of learning and backtesting inventory +control policies in the presence of general arrival dynamics -- which we term +as a quantity-over-time arrivals model (QOT). We also allow for order +quantities to be modified as a post-processing step to meet vendor constraints +such as order minimum and batch size constraints -- a common practice in real +supply chains. To the best of our knowledge this is the first work to handle +either arbitrary arrival dynamics or an arbitrary downstream post-processing of +order quantities. Building upon recent work (Madeka et al., 2022) we similarly +formulate the periodic review inventory control problem as an exogenous +decision process, where most of the state is outside the control of the agent. +Madeka et al., 2022 show how to construct a simulator that replays historic +data to solve this class of problem. In our case, we incorporate a deep +generative model for the arrivals process as part of the history replay. By +formulating the problem as an exogenous decision process, we can apply results +from Madeka et al., 2022 to obtain a reduction to supervised learning. Via +simulation studies we show that this approach yields statistically significant +improvements in profitability over production baselines. Using data from a +real-world A/B test, we show that Gen-QOT generalizes well to off-policy data +and that the resulting buying policy outperforms traditional inventory +management systems in real world settings. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Benchmarking Large Multimodal Models against Common Corruptions + + +
+ This technical report aims to fill a deficiency in the assessment of large +multimodal models (LMMs) by specifically examining the self-consistency of +their outputs when subjected to common corruptions. We investigate the +cross-modal interactions between text, image, and speech, encompassing four +essential generation tasks: text-to-image, image-to-text, text-to-speech, and +speech-to-text. We create a comprehensive benchmark, named MMCBench, that +covers more than 100 popular LMMs (over 150 model checkpoints in total). A +thorough evaluation under common corruptions is critical for practical +deployment and facilitates a better understanding of the reliability of +cutting-edge LMMs. The benchmarking code is available at +https://github.com/sail-sg/MMCBench + 
+
+ comment: Technical report +
+
+
+
+
+ + ☆ MInD: Improving Multimodal Sentiment Analysis via Multimodal Information + Disentanglement + + +
+ Learning effective joint representations has been a central task in +multimodal sentiment analysis. Previous methods focus on leveraging the +correlations between different modalities and enhancing performance through +sophisticated fusion techniques. However, challenges still exist due to the +inherent heterogeneity of distinct modalities, which may lead to distributional +gap, impeding the full exploitation of inter-modal information and resulting in +redundancy and impurity in the information extracted from features. To address +this problem, we introduce the Multimodal Information Disentanglement (MInD) +approach. MInD decomposes the multimodal inputs into a modality-invariant +component, a modality-specific component, and a remnant noise component for +each modality through a shared encoder and multiple private encoders. The +shared encoder aims to explore the shared information and commonality across +modalities, while the private encoders are deployed to capture the distinctive +information and characteristic features. These representations thus furnish a +comprehensive perspective of the multimodal data, facilitating the fusion +process instrumental for subsequent prediction tasks. Furthermore, MInD +improves the learned representations by explicitly modeling the task-irrelevant +noise in an adversarial manner. Experimental evaluations conducted on benchmark +datasets, including CMU-MOSI, CMU-MOSEI, and UR-Funny, demonstrate MInD's +superior performance over existing state-of-the-art methods in both multimodal +emotion recognition and multimodal humor detection tasks. + +
+
+
+
+
+ + ☆ Identity-Driven Multimedia Forgery Detection via Reference Assistance + + +
+ Recent advancements in technologies, such as the 'deepfake' technique, have +paved the way for the generation of various media forgeries. In response to the +potential hazards of these media forgeries, many researchers engage in +exploring detection methods, increasing the demand for high-quality media +forgery datasets. Despite this, existing datasets have certain limitations. +Firstly, most datasets focus on the manipulation of the visual modality and +usually lack diversity, as only a few forgery approaches are considered. +Secondly, the quality of media is often inadequate in clarity and naturalness. +Meanwhile, the size of the dataset is also limited. Thirdly, while many +real-world forgeries are driven by identity, the identity information of the +subject in media is frequently neglected. For detection, identity information +could be an essential clue to boost accuracy. Moreover, official media +concerning certain identities on the Internet can serve as prior knowledge, +aiding both the audience and forgery detectors in determining the true +identity. Therefore, we propose an identity-driven multimedia forgery dataset, +IDForge, which contains 249,138 video shots. All video shots are sourced from +324 in-the-wild videos of 54 celebrities collected from the Internet. The fake video +shots involve 9 types of manipulation across visual, audio and textual +modalities. Additionally, IDForge provides an extra 214,438 real video shots as a +reference set for the 54 celebrities. Correspondingly, we design an effective +multimedia detection network, the Reference-assisted Multimodal Forgery Detection +Network (R-MFDN). Through extensive experiments on the proposed dataset, we +demonstrate the effectiveness of R-MFDN on the multimedia detection task. + 
+
+
+
+
+ + ♻ ☆ Beyond Task Performance: Evaluating and Reducing the Flaws of Large + Multimodal Models with In-Context Learning ICLR 2024 + + +
+ Following the success of Large Language Models (LLMs), Large Multimodal +Models (LMMs), such as the Flamingo model and its subsequent competitors, have +started to emerge as natural steps towards generalist agents. However, +interacting with recent LMMs reveals major limitations that are hardly captured +by the current evaluation benchmarks. Indeed, task performances (e.g., VQA +accuracy) alone do not provide enough clues to understand their real +capabilities, limitations, and to which extent such models are aligned to human +expectations. To refine our understanding of those flaws, we deviate from the +current evaluation paradigm, and (1) evaluate 10 recent open-source LMMs from +3B up to 80B parameter scale, on 5 different axes: hallucinations, abstention, +compositionality, explainability and instruction following. Our evaluation on +these axes reveals major flaws in LMMs. While the current go-to solution to +align these models is based on training, such as instruction tuning or RLHF, we +rather (2) explore the training-free in-context learning (ICL) as a solution, +and study how it affects these limitations. Based on our ICL study, (3) we push +ICL further and propose new multimodal ICL variants such as Multitask-ICL, +Chain-of-Hindsight-ICL, and Self-Correcting-ICL. Our findings are as follows. +(1) Despite their success, LMMs have flaws that remain unsolved with scaling +alone. (2) The effect of ICL on LMMs' flaws is nuanced; despite its +effectiveness for improving explainability and answer abstention, ICL only slightly +improves instruction following, does not improve compositional abilities, and +actually even amplifies hallucinations. (3) The proposed ICL variants are +promising as post-hoc approaches to efficiently tackle some of those flaws. The +code is available here: https://github.com/mshukor/EvALign-ICL. + 
+
+ comment: ICLR 2024. Project Page: https://evalign-icl.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 34 + +
+
+
+ + ☆ Text-to-Image Cross-Modal Generation: A Systematic Review + + +
+ We review research on generating visual data from text from the angle of +"cross-modal generation." This point of view allows us to draw parallels +between various methods geared towards working on input text and producing +visual output, without limiting the analysis to narrow sub-areas. It also +results in the identification of common templates in the field, which are then +compared and contrasted both within pools of similar methods and across lines +of research. We provide a breakdown of text-to-image generation into various +flavors of image-from-text methods, video-from-text methods, image editing, +self-supervised and graph-based approaches. In this discussion, we focus on +research papers published at 8 leading machine learning conferences in the +years 2016-2022, also incorporating a number of relevant papers not matching +the outlined search criteria. The conducted review suggests a significant +increase in the number of papers published in the area and highlights research +gaps and potential lines of investigation. To our knowledge, this is the first +review to systematically look at text-to-image generation from the perspective +of "cross-modal generation." + +
+
+
+
+
+ + ☆ Freely Long-Thinking Transformer (FraiLT) + + +
+ Freely Long-Thinking Transformer (FraiLT) is an improved transformer model +designed to enhance processing capabilities without scaling up size. It +utilizes a recursive approach, iterating over a subset of layers multiple +times, and introduces iteration encodings to maintain awareness across these +cycles. Iteration encoding allows FraiLT to achieve the interpretive depth of +larger models in a compact form. When evaluated on a synthetic story dataset, +FraiLT outperformed larger models, showcasing its ability to deliver +high-quality performance while reducing memory demands. This model represents a +step forward towards more efficient and accessible language models. + +
+
+
+
+
+ + ☆ In-context Learning with Retrieved Demonstrations for Language Models: A + Survey + + +
+ Language models, especially pre-trained large language models, have showcased +remarkable abilities as few-shot in-context learners (ICL), adept at adapting +to new tasks with just a few demonstrations in the input context. However, the +model's ability to perform ICL is sensitive to the choice of the few-shot +demonstrations. Instead of using a fixed set of demonstrations, one recent +development is to retrieve demonstrations tailored to each input query. The +implementation of demonstration retrieval is relatively straightforward, +leveraging existing databases and retrieval systems. This not only improves the +efficiency and scalability of the learning process but also has been shown to +reduce biases inherent in manual example selection. In light of the encouraging +results and growing research in ICL with retrieved demonstrations, we conduct +an extensive review of studies in this area. In this survey, we discuss and +compare different design choices for retrieval models, retrieval training +procedures, and inference algorithms. + +
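As a rough illustration of the retrieval step discussed above, the sketch below selects the demonstrations most similar to the input query before assembling the prompt. TF-IDF similarity stands in for the learned dense retrievers surveyed in the paper, and the toy sentiment pool is invented for the example.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy demonstration pool; a real system would use a task-specific bank
# and usually a learned dense retriever instead of TF-IDF.
pool = [
    ("The movie was a delight from start to finish.", "positive"),
    ("I want my two hours back.", "negative"),
    ("A tender, funny portrait of family life.", "positive"),
    ("The plot collapses in the final act.", "negative"),
]

def build_prompt(query, k=2):
    texts = [t for t, _ in pool]
    vec = TfidfVectorizer().fit(texts + [query])
    sims = cosine_similarity(vec.transform([query]), vec.transform(texts))[0]
    top = sims.argsort()[::-1][:k]             # most similar demonstrations first
    demos = "\n".join(f"Review: {pool[i][0]}\nSentiment: {pool[i][1]}" for i in top)
    return f"{demos}\nReview: {query}\nSentiment:"

print(build_prompt("Two hours I will never get back."))
```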
+
+
+
+
+ + ☆ Robust Evaluation Measures for Evaluating Social Biases in Masked + Language Models + + +
+ Many evaluation measures are used to evaluate social biases in masked
+language models (MLMs). However, we find that these previously proposed
+evaluation measures lack robustness in scenarios with limited datasets. This is
+because these measures are obtained by comparing the pseudo-log-likelihood
+(PLL) scores of the stereotypical and anti-stereotypical samples using an
+indicator function, which exploits only part of the PLL score sets and fails to
+capture their distributional information. In this paper, we represent a PLL
+score set as a Gaussian distribution and use Kullback-Leibler (KL) divergence
+and Jensen-Shannon (JS) divergence to construct evaluation measures for the
+distributions of stereotypical and anti-stereotypical PLL scores. Experimental
+results on the publicly available datasets StereoSet (SS) and CrowS-Pairs (CP)
+show that our proposed measures are significantly more robust and interpretable
+than those proposed previously.
+
+
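To make the Gaussian comparison concrete, the sketch below fits a Gaussian to each PLL score set and computes a closed-form KL divergence plus a Monte-Carlo estimate of the JS divergence (which has no closed form for Gaussians). The toy scores and the specific estimators are illustrative assumptions; the paper's exact formulation may differ.

```python
import numpy as np

def gaussian_kl(mu1, var1, mu2, var2):
    """KL( N(mu1,var1) || N(mu2,var2) ) in closed form."""
    return 0.5 * (np.log(var2 / var1) + (var1 + (mu1 - mu2) ** 2) / var2 - 1.0)

def gaussian_js(mu1, var1, mu2, var2, n=100_000, seed=0):
    """Monte-Carlo JS divergence between two Gaussians."""
    rng = np.random.default_rng(seed)
    def logpdf(x, mu, var):
        return -0.5 * (np.log(2 * np.pi * var) + (x - mu) ** 2 / var)
    def logmix(x):
        return np.logaddexp(logpdf(x, mu1, var1), logpdf(x, mu2, var2)) - np.log(2)
    xp = rng.normal(mu1, np.sqrt(var1), n)
    xq = rng.normal(mu2, np.sqrt(var2), n)
    return 0.5 * np.mean(logpdf(xp, mu1, var1) - logmix(xp)) + \
           0.5 * np.mean(logpdf(xq, mu2, var2) - logmix(xq))

# PLL scores of stereotypical / anti-stereotypical sentences (toy numbers).
stereo = np.array([-42.1, -39.8, -41.0, -40.2, -38.9])
anti   = np.array([-44.5, -43.9, -45.2, -44.0, -43.1])
m1, v1 = stereo.mean(), stereo.var(ddof=1)
m2, v2 = anti.mean(), anti.var(ddof=1)
print(gaussian_kl(m1, v1, m2, v2), gaussian_js(m1, v1, m2, v2))
```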
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray + Report Labeling + + +
+ Free-text radiology reports present a rich data source for various medical +tasks, but effectively labeling these texts remains challenging. Traditional +rule-based labeling methods fall short of capturing the nuances of diverse +free-text patterns. Moreover, models using expert-annotated data are limited by +data scarcity and pre-defined classes, impacting their performance, flexibility +and scalability. To address these issues, our study offers three main +contributions: 1) We demonstrate the potential of GPT as an adept labeler using +carefully designed prompts. 2) Utilizing only the data labeled by GPT, we +trained a BERT-based labeler, CheX-GPT, which operates faster and more +efficiently than its GPT counterpart. 3) To benchmark labeler performance, we +introduced a publicly available expert-annotated test set, MIMIC-500, +comprising 500 cases from the MIMIC validation set. Our findings demonstrate +that CheX-GPT not only excels in labeling accuracy over existing models, but +also showcases superior efficiency, flexibility, and scalability, supported by +our introduction of the MIMIC-500 dataset for robust benchmarking. Code and +models are available at https://github.com/kakaobrain/CheXGPT. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ With Greater Text Comes Greater Necessity: Inference-Time Training Helps + Long Text Generation + + +
+ Long text generation, such as novel writing or discourse-level translation +with extremely long contexts, presents significant challenges to current +language models. Existing methods mainly focus on extending the model's context +window through strategies like length extrapolation. However, these approaches +demand substantial hardware resources during the training and/or inference +phases. Our proposed method, Temp-Lora, introduces an alternative concept. +Instead of relying on the KV cache to store all context information, Temp-Lora +embeds this information directly into the model's parameters. In the process of +long text generation, we use a temporary Lora module, progressively trained +with text generated previously. This approach not only efficiently preserves +contextual knowledge but also prevents any permanent alteration to the model's +parameters given that the module is discarded post-generation. Extensive +experiments on the PG19 language modeling benchmark and the GuoFeng +discourse-level translation benchmark validate the effectiveness of Temp-Lora. +Our results show that: 1) Temp-Lora substantially enhances generation quality +for long texts, as indicated by a 13.2% decrease in perplexity on a subset of +PG19, and a 29.6% decrease in perplexity along with a 53.2% increase in BLEU +score on GuoFeng, 2) Temp-Lora is compatible with and enhances most existing +long text generation methods, and 3) Temp-Lora can greatly reduce computational +costs by shortening the context window. While ensuring a slight improvement in +generation quality (a decrease of 3.8% in PPL), it enables a reduction of 70.5% +in the FLOPs required for inference and a 51.5% decrease in latency. + +
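The following is a rough sketch of the Temp-Lora loop as I read the abstract: generate a chunk, briefly train a temporary LoRA adapter on the text produced so far, and discard the adapter once generation finishes. The model name, chunk size, context window, optimiser settings, and the exact peft calls are placeholders, not the paper's configuration.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

tok = AutoTokenizer.from_pretrained("gpt2")
base = AutoModelForCausalLM.from_pretrained("gpt2")
model = get_peft_model(base, LoraConfig(r=8, lora_alpha=16, task_type="CAUSAL_LM"))
opt = torch.optim.AdamW((p for p in model.parameters() if p.requires_grad), lr=1e-4)

text = "Chapter 1. The expedition left the harbour at dawn."
for step in range(3):                        # a few generation/update rounds
    ids = tok(text, return_tensors="pt").input_ids[:, -512:]   # recent context only
    out = model.generate(ids, max_new_tokens=64, do_sample=True)
    new_text = tok.decode(out[0, ids.shape[1]:], skip_special_tokens=True)
    text += new_text
    # train the temporary adapter on the freshly generated chunk
    batch = tok(new_text, return_tensors="pt")
    loss = model(**batch, labels=batch["input_ids"]).loss
    loss.backward(); opt.step(); opt.zero_grad()

# the temporary adapter is discarded after generation, leaving the base model
base_only = model.unload()
```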
+
+
+
+
+ + ☆ Towards Better Inclusivity: A Diverse Tweet Corpus of English Varieties + + +
+ The prevalence of social media presents a growing opportunity to collect and
+analyse examples of English varieties. Whilst these varieties were - and, in
+many cases, still are - used only in spoken contexts or hard-to-access private
+messages, social media sites like Twitter provide a platform for users to
+communicate informally in a scrapeable format. Notably, Indian English
+(Hinglish), Singaporean English (Singlish), and African-American English (AAE)
+can be commonly found online. These varieties pose a challenge to existing
+natural language processing (NLP) tools as they often differ orthographically
+and syntactically from the standard English for which the majority of these
+tools are built. NLP models trained on standard English texts produce biased
+outcomes for users of underrepresented varieties. Some research has aimed to
+overcome the inherent biases caused by unrepresentative data through techniques
+like data augmentation or adjusting training models.
+ We aim to address the issue of bias at its root - the data itself. We curate
+a dataset of tweets from countries with high proportions of underserved English
+variety speakers, and propose an annotation framework of six categorical
+classifications along a pseudo-spectrum that measures the degree of standard
+English and thereby indirectly aims to surface the manifestations of English
+varieties in these tweets. Following best annotation practices, our growing
+corpus features 170,800 tweets taken from 7 countries, labeled by annotators
+who are from those countries and can communicate in regionally-dominant
+varieties of English. Our corpus highlights the accuracy discrepancies in
+pre-trained language identifiers between western English and non-western
+(i.e., less standard) English varieties. We hope to contribute to the growing
+literature identifying and reducing the implicit demographic discrepancies in
+NLP.
+
+
+
+ comment: 10 pages (including limitations, references and appendices), 2 + figures +
+
+
+
+
+ + ☆ Over-Reasoning and Redundant Calculation of Large Language Models EACL 2024 + + +
+ Large language models (LLMs) can solve problems step-by-step. While this +chain-of-thought (CoT) reasoning boosts LLMs' performance, it is unclear if +LLMs \textit{know} when to use CoT and whether those CoT are always necessary +to answer the question. This paper shows that LLMs tend to generate redundant +calculations and reasoning on a manually constructed math QA dataset, +GSM8K-Zero. GSM8K-Zero is constructed such that the questions can be answered +without any calculations, but LLMs, including Llama-2 models and Claude-2, tend +to generate lengthy and unnecessary calculations to answer the questions. We +also conduct experiments to explain why LLMs generate redundant calculations +and reasonings. GSM8K-Zero is publicly available at +https://github.com/d223302/Over-Reasoning-of-LLMs and +https://huggingface.co/datasets/dcml0714/GSM8K-Zero. + +
+
+ comment: EACL 2024 main conference paper. Camera-ready version +
+
+
+
+
+ + ☆ Estimating the Usefulness of Clarifying Questions and Answers for + Conversational Search ECIR '24 + + +
+ While the body of research directed towards constructing and generating
+clarifying questions in mixed-initiative conversational search systems is vast,
+research aimed at processing and comprehending users' answers to such questions
+is scarce. To this end, we present a simple yet effective method for processing
+answers to clarifying questions, moving away from previous work that simply
+appends answers to the original query and thus potentially degrades retrieval
+performance. Specifically, we propose a classifier for assessing the usefulness
+of the prompted clarifying question and the answer given by the user. Useful
+questions or answers are further appended to the conversation history and
+passed to a transformer-based query rewriting module. Results demonstrate
+significant improvements over strong non-mixed-initiative baselines.
+Furthermore, the proposed approach mitigates the performance drops when
+non-useful questions and answers are utilized.
+
+
+
+ comment: This is the author's version of the work. The definitive version is + published in: Proceedings of the 46th European Conference on Information + Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland +
+
+
+
+
+ + ☆ Linear Alignment: A Closed-form Solution for Aligning Human Preferences + without Tuning and Feedback + + +
+ The success of AI assistants based on Large Language Models (LLMs) hinges on
+Reinforcement Learning from Human Feedback (RLHF) to comprehend and align with
+user intentions. However, traditional alignment algorithms, such as PPO, are
+hampered by complex annotation and training requirements. This reliance limits
+the applicability of RLHF and hinders the development of professional
+assistants tailored to diverse human preferences. In this work, we introduce
+\textit{Linear Alignment}, a novel algorithm that aligns language models with
+human preferences in a single inference step, eliminating the reliance on
+data annotation and model training. Linear alignment incorporates a new
+parameterization for policy optimization under divergence constraints, which
+enables the extraction of the optimal policy in closed form and facilitates the
+direct estimation of the aligned response. Extensive experiments on both
+general and personalized preference datasets demonstrate that linear alignment
+significantly enhances the performance and efficiency of LLM alignment across
+diverse scenarios. Our code and dataset will be published on
+\url{https://github.com/Wizardcoast/Linear_Alignment.git}.
+
+
+
+
+
+
+ + ☆ Towards Reliable and Factual Response Generation: Detecting Unanswerable + Questions in Information-Seeking Conversations ECIR '24 + + +
+ Generative AI models face the challenge of hallucinations that can undermine +users' trust in such systems. We approach the problem of conversational +information seeking as a two-step process, where relevant passages in a corpus +are identified first and then summarized into a final system response. This way +we can automatically assess if the answer to the user's question is present in +the corpus. Specifically, our proposed method employs a sentence-level +classifier to detect if the answer is present, then aggregates these +predictions on the passage level, and eventually across the top-ranked passages +to arrive at a final answerability estimate. For training and evaluation, we +develop a dataset based on the TREC CAsT benchmark that includes answerability +labels on the sentence, passage, and ranking levels. We demonstrate that our +proposed method represents a strong baseline and outperforms a state-of-the-art +LLM on the answerability prediction task. + +
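The aggregation described above (sentence-level predictions rolled up to passage level and then across the top-ranked passages) can be illustrated with a few lines of code. The max/mean aggregation choices and the 0.5 decision threshold are assumptions for the example, not necessarily the paper's configuration.

```python
import numpy as np

def answerability(sentence_probs_per_passage, passage_agg="max", rank_agg="mean"):
    """Aggregate sentence-level answerability probabilities to a final score.

    sentence_probs_per_passage: list of 1-D arrays, one per top-ranked passage,
    holding the classifier's per-sentence 'answer is here' probabilities.
    """
    agg = {"max": np.max, "mean": np.mean}
    passage_scores = [agg[passage_agg](p) for p in sentence_probs_per_passage]
    return agg[rank_agg](passage_scores)

# Three retrieved passages, with per-sentence classifier outputs.
probs = [np.array([0.05, 0.10, 0.08]),      # passage 1: likely unanswerable
         np.array([0.92, 0.15]),            # passage 2: one sentence answers it
         np.array([0.30, 0.22, 0.11])]
score = answerability(probs)
print("answerable" if score > 0.5 else "unanswerable", round(float(score), 3))
```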
+
+ comment: This is the author's version of the work. The definitive version is + published in: Proceedings of the 46th European Conference on Information + Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland +
+
+
+
+
+ + ☆ Majority or Minority: Data Imbalance Learning Method for Named Entity + Recognition + + +
+ Data imbalance presents a significant challenge in various machine learning
+(ML) tasks, particularly named entity recognition (NER) within natural language
+processing (NLP). NER exhibits a data imbalance with a long-tail distribution,
+featuring numerous minority classes (i.e., entity classes) and a single
+majority class (i.e., O-class). The imbalance leads to the misclassification of
+entity classes as the O-class. To tackle the imbalance, we propose a simple and
+effective learning method, named majority or minority (MoM) learning. MoM
+learning incorporates the loss computed only for samples whose ground truth is
+the majority class (i.e., the O-class) into the loss of the conventional ML
+model. Evaluation experiments on four NER datasets (Japanese and English)
+showed that MoM learning improves the prediction performance of the minority
+classes without sacrificing the performance of the majority class, and is more
+effective than widely known and state-of-the-art methods. We also evaluated MoM
+learning using frameworks such as sequential labeling and machine reading
+comprehension, which are commonly used in NER. Furthermore, MoM learning
+achieves consistent performance improvements regardless of language, model, or
+framework.
+
+
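As described, MoM learning adds a loss term computed only on samples whose gold label is the majority O-class to the conventional loss. A minimal PyTorch sketch of that idea follows; the O-class index and the weight on the extra term are assumptions, not values from the paper.

```python
import torch
import torch.nn.functional as F

O_CLASS = 0   # index of the majority "O" label (dataset-specific assumption)

def mom_loss(logits, labels, weight=1.0):
    """Cross-entropy plus an extra term computed only on O-class tokens,
    following the description of MoM learning in the abstract."""
    logits = logits.view(-1, logits.size(-1))
    labels = labels.view(-1)
    base = F.cross_entropy(logits, labels)
    o_mask = labels == O_CLASS
    if o_mask.any():
        majority = F.cross_entropy(logits[o_mask], labels[o_mask])
    else:
        majority = logits.new_zeros(())
    return base + weight * majority

logits = torch.randn(2, 7, 5, requires_grad=True)   # (batch, seq, num_labels)
labels = torch.randint(0, 5, (2, 7))
mom_loss(logits, labels).backward()
```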
+
+ comment: 6 pages, 1 figures, 6 tables +
+
+
+
+
+ + ☆ SEBERTNets: Sequence Enhanced BERT Networks for Event Entity Extraction + Tasks Oriented to the Finance Field + + +
+ Event extraction lies at the core of investment analysis and asset management
+in the financial field, and thus has received much attention. The 2019 China
+conference on knowledge graph and semantic computing (CCKS) challenge sets up
+an evaluation competition for the event entity extraction task oriented to the
+finance field. In this task, we mainly focus on how to extract the event entity
+accurately and recall all the corresponding event entities effectively. In this
+paper, we propose a novel model, Sequence Enhanced BERT Networks (SEBERTNets
+for short), which inherits the advantages of BERT while capturing sequence
+semantic information. In addition, motivated by recommendation systems, we
+propose Hybrid Sequence Enhanced BERT Networks (HSEBERTNets for short), which
+uses a multi-channel recall method to recall all the corresponding event
+entities. The experimental results show that the F1 score of SEBERTNets is
+0.905 and the F1 score of HSEBERTNets is 0.934 in the first stage, which
+demonstrates the effectiveness of our methods.
+
+
+
+ comment: CCKS 2019 +
+
+
+
+
+ + ☆ MolTailor: Tailoring Chemical Molecular Representation to Specific Tasks + via Text Prompts AAAI 2024 + + +
+ Deep learning is now widely used in drug discovery, providing significant +acceleration and cost reduction. As the most fundamental building block, +molecular representation is essential for predicting molecular properties to +enable various downstream applications. Most existing methods attempt to +incorporate more information to learn better representations. However, not all +features are equally important for a specific task. Ignoring this would +potentially compromise the training efficiency and predictive accuracy. To +address this issue, we propose a novel approach, which treats language models +as an agent and molecular pretraining models as a knowledge base. The agent +accentuates task-relevant features in the molecular representation by +understanding the natural language description of the task, just as a tailor +customizes clothes for clients. Thus, we call this approach MolTailor. +Evaluations demonstrate MolTailor's superior performance over baselines, +validating the efficacy of enhancing relevance for molecular representation +learning. This illustrates the potential of language model guided optimization +to better exploit and unleash the capabilities of existing powerful molecular +representation methods. Our codes and appendix are available at +https://github.com/SCIR-HI/MolTailor. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ MedLM: Exploring Language Models for Medical Question Answering Systems + + +
+ In the face of rapidly expanding online medical literature, automated systems +for aggregating and summarizing information are becoming increasingly crucial +for healthcare professionals and patients. Large Language Models (LLMs), with +their advanced generative capabilities, have shown promise in various NLP +tasks, and their potential in the healthcare domain, particularly for +Closed-Book Generative QnA, is significant. However, the performance of these +models in domain-specific tasks such as medical Q&A remains largely unexplored. +This study aims to fill this gap by comparing the performance of general and +medical-specific distilled LMs for medical Q&A. We aim to evaluate the +effectiveness of fine-tuning domain-specific LMs and compare the performance of +different families of Language Models. The study will address critical +questions about these models' reliability, comparative performance, and +effectiveness in the context of medical Q&A. The findings will provide valuable +insights into the suitability of different LMs for specific applications in the +medical domain. + +
+
+
+
+
+ + ☆ Using Large Language Model for End-to-End Chinese ASR and NER + + +
+ Mapping speech tokens to the same feature space as text tokens has become the
+paradigm for the integration of the speech modality into decoder-only large
+language models (LLMs). An alternative approach is to use an encoder-decoder
+architecture that incorporates speech features through cross-attention. This
+approach, however, has received less attention in the literature. In this work,
+we connect the Whisper encoder with ChatGLM3 and provide in-depth comparisons
+of these two approaches using Chinese automatic speech recognition (ASR) and
+named entity recognition (NER) tasks. We evaluate them not only by conventional
+metrics like the F1 score but also by a novel fine-grained taxonomy of ASR-NER
+errors. Our experiments reveal that the encoder-decoder architecture
+outperforms the decoder-only architecture with a short context, while the
+decoder-only architecture benefits from a long context as it fully exploits all
+layers of the LLM. By using an LLM, we significantly reduce entity omission
+errors and improve entity ASR accuracy compared to the Conformer baseline.
+Additionally, we obtained a state-of-the-art (SOTA) F1 score of 0.805 on the
+AISHELL-NER test set by using chain-of-thought (CoT) NER, which first infers
+long-form ASR transcriptions and then predicts NER labels.
+
+
+
+ comment: 5 pages, 2 figures +
+
+
+
+
+ + ☆ Language Models as Hierarchy Encoders + + +
+ Interpreting hierarchical structures latent in language is a key limitation +of current language models (LMs). While previous research has implicitly +leveraged these hierarchies to enhance LMs, approaches for their explicit +encoding are yet to be explored. To address this, we introduce a novel approach +to re-train transformer encoder-based LMs as Hierarchy Transformer encoders +(HiTs), harnessing the expansive nature of hyperbolic space. Our method +situates the output embedding space of pre-trained LMs within a Poincar\'e ball +with a curvature that adapts to the embedding dimension, followed by +re-training on hyperbolic cluster and centripetal losses. These losses are +designed to effectively cluster related entities (input as texts) and organise +them hierarchically. We evaluate HiTs against pre-trained and fine-tuned LMs, +focusing on their capabilities in simulating transitive inference, predicting +subsumptions, and transferring knowledge across hierarchies. The results +demonstrate that HiTs consistently outperform both pre-trained and fine-tuned +LMs in these tasks, underscoring the effectiveness and transferability of our +re-trained hierarchy encoders. + +
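The core ingredients named above are the Poincaré ball geometry and the two re-training losses. The sketch below gives the standard Poincaré distance and heavily simplified, triplet-style stand-ins for the cluster and centripetal losses (unit curvature, arbitrary margins); the paper's actual formulation, with dimension-adapted curvature, will differ.

```python
import torch

def poincare_dist(u, v, eps=1e-5):
    """Distance in the Poincare ball (unit curvature for simplicity)."""
    uu = (u * u).sum(-1).clamp(max=1 - eps)
    vv = (v * v).sum(-1).clamp(max=1 - eps)
    duv = ((u - v) ** 2).sum(-1)
    x = 1 + 2 * duv / ((1 - uu) * (1 - vv))
    return torch.acosh(x.clamp(min=1 + eps))

def cluster_loss(child, parent, negative, margin=0.5):
    """Simplified stand-in for the hyperbolic cluster loss: pull a child
    towards its parent, push it away from an unrelated entity."""
    pos = poincare_dist(child, parent)
    neg = poincare_dist(child, negative)
    return torch.relu(pos - neg + margin).mean()

def centripetal_loss(child, parent, margin=0.1):
    """Simplified centripetal loss: parents should sit closer to the origin
    (smaller norm) than their children."""
    return torch.relu(parent.norm(dim=-1) - child.norm(dim=-1) + margin).mean()

child, parent, neg = (0.1 * torch.randn(8, 16) for _ in range(3))
print(cluster_loss(child, parent, neg), centripetal_loss(child, parent))
```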
+
+
+
+
+ + ☆ Finding a Needle in the Adversarial Haystack: A Targeted Paraphrasing + Approach For Uncovering Edge Cases with Minimal Distribution Distortion EACL 2024 + + +
+ Adversarial attacks against NLP Deep Learning models are a significant +concern. In particular, adversarial samples exploit the model's sensitivity to +small input changes. While these changes appear insignificant on the semantics +of the input sample, they result in significant decay in model performance. In +this paper, we propose Targeted Paraphrasing via RL (TPRL), an approach to +automatically learn a policy to generate challenging samples that most likely +improve the model's performance. TPRL leverages FLAN T5, a language model, as a +generator and employs a self learned policy using a proximal policy gradient to +generate the adversarial examples automatically. TPRL's reward is based on the +confusion induced in the classifier, preserving the original text meaning +through a Mutual Implication score. We demonstrate and evaluate TPRL's +effectiveness in discovering natural adversarial attacks and improving model +performance through extensive experiments on four diverse NLP classification +tasks via Automatic and Human evaluation. TPRL outperforms strong baselines, +exhibits generalizability across classifiers and datasets, and combines the +strengths of language modeling and reinforcement learning to generate diverse +and influential adversarial examples. + +
+
+ comment: EACL 2024 - Main conference +
+
+
+
+
+ + ☆ Confidence Preservation Property in Knowledge Distillation Abstractions + + +
+ Social media platforms prevent malicious activities by detecting harmful
+content in posts and comments. To that end, they employ large-scale deep neural
+network language models for sentiment analysis and content understanding. Some
+models, like BERT, are complex and have numerous parameters, which makes them
+expensive to operate and maintain. To overcome these deficiencies, industry
+experts employ a knowledge distillation compression technique, where a
+distilled model is trained to reproduce the classification behavior of the
+original model. The distillation process terminates when the distillation loss
+function reaches the stopping criterion. This function is mainly designed to
+ensure that the original and the distilled models exhibit similar
+classification behaviors. However, besides classification accuracy, there are
+additional properties of the original model that the distilled model should
+preserve to be considered an appropriate abstraction. In this work, we explore
+whether distilled TinyBERT models preserve the confidence values of the
+original BERT models, and investigate how this confidence preservation property
+could guide tuning hyperparameters of the distillation process.
+
+
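One simple way to quantify the confidence preservation property described above is to compare the two models' top-class probabilities on the same inputs. The metric below (mean absolute difference of max softmax probability) is an illustrative assumption; the paper may use a different statistic.

```python
import torch
import torch.nn.functional as F

def confidence_gap(teacher_logits, student_logits):
    """Mean absolute difference between the models' top-class confidences."""
    t_conf = F.softmax(teacher_logits, dim=-1).max(dim=-1).values
    s_conf = F.softmax(student_logits, dim=-1).max(dim=-1).values
    return (t_conf - s_conf).abs().mean()

teacher = torch.tensor([[2.0, 0.1, -1.0], [0.3, 0.2, 0.1]])
student = torch.tensor([[1.2, 0.4, -0.5], [0.9, 0.0, -0.2]])
print(float(confidence_gap(teacher, student)))
```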
+
+
+
+
+ + ☆ Revolutionizing API Documentation through Summarization + + +
+ This study tackles the challenges associated with interpreting Application +Programming Interface (API) documentation, an integral aspect of software +development. Official API documentation, while essential, can be lengthy and +challenging to navigate, prompting developers to seek unofficial sources such +as Stack Overflow. Leveraging the vast user-generated content on Stack +Overflow, including code snippets and discussions, we employ BERTopic and +extractive summarization to automatically generate concise and informative API +summaries. These summaries encompass key insights like general usage, common +developer issues, and potential solutions, sourced from the wealth of knowledge +on Stack Overflow. Software developers evaluate these summaries for +performance, coherence, and interoperability, providing valuable feedback on +the practicality of our approach. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2308.09070 +
+
+
+
+
+ + ☆ ProLex: A Benchmark for Language Proficiency-oriented Lexical + Substitution + + +
+ Lexical Substitution discovers appropriate substitutes for a given target +word in a context sentence. However, the task fails to consider substitutes +that are of equal or higher proficiency than the target, an aspect that could +be beneficial for language learners looking to improve their writing. To bridge +this gap, we propose a new task, language proficiency-oriented lexical +substitution. We also introduce ProLex, a novel benchmark designed to assess +systems' ability to generate not only appropriate substitutes but also +substitutes that demonstrate better language proficiency. Besides the +benchmark, we propose models that can automatically perform the new task. We +show that our best model, a Llama2-13B model fine-tuned with task-specific +synthetic data, outperforms ChatGPT by an average of 3.2% in F-score and +achieves comparable results with GPT-4 on ProLex. + +
+
+
+
+
+ + ♻ ☆ GenSim: Generating Robotic Simulation Tasks via Large Language Models + + +
+ Collecting large amounts of real-world interaction data to train general +robotic policies is often prohibitively expensive, thus motivating the use of +simulation data. However, existing methods for data generation have generally +focused on scene-level diversity (e.g., object instances and poses) rather than +task-level diversity, due to the human effort required to come up with and +verify novel tasks. This has made it challenging for policies trained on +simulation data to demonstrate significant task-level generalization. In this +paper, we propose to automatically generate rich simulation environments and +expert demonstrations by exploiting a large language models' (LLM) grounding +and coding ability. Our approach, dubbed GenSim, has two modes: goal-directed +generation, wherein a target task is given to the LLM and the LLM proposes a +task curriculum to solve the target task, and exploratory generation, wherein +the LLM bootstraps from previous tasks and iteratively proposes novel tasks +that would be helpful in solving more complex tasks. We use GPT4 to expand the +existing benchmark by ten times to over 100 tasks, on which we conduct +supervised finetuning and evaluate several LLMs including finetuned GPTs and +Code Llama on code generation for robotic simulation tasks. Furthermore, we +observe that LLMs-generated simulation programs can enhance task-level +generalization significantly when used for multitask policy training. We +further find that with minimal sim-to-real adaptation, the multitask policies +pretrained on GPT4-generated simulation tasks exhibit stronger transfer to +unseen long-horizon tasks in the real world and outperform baselines by 25%. +See the project website (https://liruiw.github.io/gensim) for code, demos, and +videos. + +
+
+ comment: See our project website (https://liruiw.github.io/gensim), demo and + datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code + (https://github.com/liruiw/GenSim) for more details +
+
+
+
+
+ + ♻ ☆ ChaCha: Leveraging Large Language Models to Prompt Children to Share + Their Emotions about Personal Events + + +
+ Children typically learn to identify and express emotions through sharing +their stories and feelings with others, particularly their family. However, it +is challenging for parents or siblings to have emotional communication with +children since children are still developing their communication skills. We +present ChaCha, a chatbot that encourages and guides children to share personal +events and associated emotions. ChaCha combines a state machine and large +language models (LLMs) to keep the dialogue on track while carrying on +free-form conversations. Through an exploratory study with 20 children (aged +8-12), we examine how ChaCha prompts children to share personal events and +guides them to describe associated emotions. Participants perceived ChaCha as a +close friend and shared their stories on various topics, such as family trips +and personal achievements. Based on the findings, we discuss opportunities for +leveraging LLMs to design child-friendly chatbots to support children in +sharing emotions. + +
+
+ comment: 16 pages, 5 figures, 2 tables; Accepted at ACM CHI 2024 +
+
+
+
+
+ + ♻ ☆ Code Simulation Challenges for Large Language Models + + +
+ We investigate the extent to which Large Language Models (LLMs) can simulate +the execution of computer code and algorithms. We begin by looking at straight +line programs, and show that current LLMs demonstrate poor performance even +with such simple programs -- performance rapidly degrades with the length of +code. We then investigate the ability of LLMs to simulate programs that contain +critical paths and redundant instructions. We also go beyond straight line +program simulation with sorting algorithms and nested loops, and we show the +computational complexity of a routine directly affects the ability of an LLM to +simulate its execution. We observe that LLMs execute instructions sequentially +and with a low error margin only for short programs or standard procedures. +LLMs' code simulation is in tension with their pattern recognition and +memorisation capabilities: on tasks where memorisation is detrimental, we +propose a novel prompting method to simulate code execution line by line. +Empirically, our new Chain of Simulation (CoSm) method improves on the standard +Chain of Thought prompting approach by avoiding the pitfalls of memorisation. + +
+
+ comment: main paper (10 pages) + Appendix (11 pages) +
+
+
+
+
+ + ♻ ☆ VivesDebate-Speech: A Corpus of Spoken Argumentation to Leverage Audio + Features for Argument Mining EMNLP 2023 + + +
+ In this paper, we describe VivesDebate-Speech, a corpus of spoken +argumentation created to leverage audio features for argument mining tasks. The +creation of this corpus represents an important contribution to the +intersection of speech processing and argument mining communities, and one of +the most complete publicly available resources in this topic. Moreover, we have +performed a set of first-of-their-kind experiments which show an improvement +when integrating audio features into the argument mining pipeline. The provided +results can be used as a baseline for future research. + +
+
+ comment: 5 pages; EMNLP 2023 Accepted Version +
+
+
+
+
+ + ♻ ☆ Automatic Debate Evaluation with Argumentation Semantics and Natural + Language Argument Graph Networks EMNLP 2023 + + +
+ The lack of annotated data on professional argumentation and complete +argumentative debates has led to the oversimplification and the inability of +approaching more complex natural language processing tasks. Such is the case of +the automatic debate evaluation. In this paper, we propose an original hybrid +method to automatically evaluate argumentative debates. For that purpose, we +combine concepts from argumentation theory such as argumentation frameworks and +semantics, with Transformer-based architectures and neural graph networks. +Furthermore, we obtain promising results that lay the basis on an unexplored +new instance of the automatic analysis of natural language arguments. + +
+
+ comment: EMNLP 2023 Accepted Version +
+
+
+
+
+ + ♻ ☆ DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning ICLR 2024 + + +
+ Prompt tuning (PT), where a small amount of trainable soft (continuous)
+prompt vectors is affixed to the input of language models (LMs), has shown
+promising results across various tasks and models for parameter-efficient
+fine-tuning (PEFT). PT stands out from other PEFT approaches because it
+maintains competitive performance with fewer trainable parameters and does not
+drastically scale up its parameters as the model size expands. However, PT
+introduces additional soft prompt tokens, leading to longer input sequences,
+which significantly impacts training and inference time and memory usage due to
+the Transformer's quadratic complexity. This is particularly concerning for
+Large Language Models (LLMs) that face heavy daily querying. To address this
+issue, we propose Decomposed Prompt Tuning (DePT), which decomposes the soft
+prompt into a shorter soft prompt and a pair of low-rank matrices that are then
+optimised with two different learning rates. This allows DePT to achieve better
+performance while saving substantial memory and time costs compared to vanilla
+PT and its variants, without changing trainable parameter sizes. Through
+extensive experiments on 23 natural language processing (NLP) and
+vision-language (VL) tasks, we demonstrate that DePT outperforms
+state-of-the-art PEFT approaches, including the full fine-tuning baseline, in
+some scenarios. Additionally, we empirically show that DePT grows more
+efficient as the model size increases. Our further study reveals that DePT
+integrates seamlessly with parameter-efficient transfer learning in the
+few-shot learning setting and highlights its adaptability to various model
+architectures and sizes.
+
+
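The decomposition described above can be sketched as a shorter soft prompt plus a low-rank pair of matrices, trained with two learning rates. In the sketch below the low-rank product is read as an additive update to the frozen input token embeddings, which is my interpretation of the abstract; shapes, learning rates, and initialisation are placeholders.

```python
import torch
import torch.nn as nn

d_model, vocab = 768, 32000
short_prompt_len, rank, max_len = 20, 8, 256

embed = nn.Embedding(vocab, d_model)
embed.weight.requires_grad_(False)                 # frozen backbone embeddings

soft_prompt = nn.Parameter(torch.randn(short_prompt_len, d_model) * 0.02)
lora_a = nn.Parameter(torch.randn(max_len, rank) * 0.02)   # low-rank pair that
lora_b = nn.Parameter(torch.zeros(rank, d_model))          # updates input embeddings

def build_inputs(input_ids):                       # input_ids: (batch, seq)
    seq = input_ids.size(1)
    tok = embed(input_ids) + (lora_a[:seq] @ lora_b)        # low-rank additive update
    prompt = soft_prompt.unsqueeze(0).expand(input_ids.size(0), -1, -1)
    return torch.cat([prompt, tok], dim=1)         # shorter prompt + updated tokens

# two learning rates, as mentioned in the abstract
opt = torch.optim.AdamW([
    {"params": [soft_prompt], "lr": 3e-1},
    {"params": [lora_a, lora_b], "lr": 3e-4},
])

x = torch.randint(0, vocab, (2, 64))
print(build_inputs(x).shape)                       # torch.Size([2, 84, 768])
```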
+
+ comment: ICLR 2024. Code is available at https://github.com/ZhengxiangShi/DePT +
+
+
+
+
+ + ♻ ☆ Rosetta Stone at KSAA-RD Shared Task: A Hop From Language Modeling To + Word--Definition Alignment + + +
+ A Reverse Dictionary is a tool enabling users to discover a word based on its +provided definition, meaning, or description. Such a technique proves valuable +in various scenarios, aiding language learners who possess a description of a +word without its identity, and benefiting writers seeking precise terminology. +These scenarios often encapsulate what is referred to as the +"Tip-of-the-Tongue" (TOT) phenomena. In this work, we present our winning +solution for the Arabic Reverse Dictionary shared task. This task focuses on +deriving a vector representation of an Arabic word from its accompanying +description. The shared task encompasses two distinct subtasks: the first +involves an Arabic definition as input, while the second employs an English +definition. For the first subtask, our approach relies on an ensemble of +finetuned Arabic BERT-based models, predicting the word embedding for a given +definition. The final representation is obtained through averaging the output +embeddings from each model within the ensemble. In contrast, the most effective +solution for the second subtask involves translating the English test +definitions into Arabic and applying them to the finetuned models originally +trained for the first subtask. This straightforward method achieves the highest +score across both subtasks. + +
+
+ comment: Proceedings of ArabicNLP 2023 +
+
+
+
+
+ + ♻ ☆ Topic-VQ-VAE: Leveraging Latent Codebooks for Flexible Topic-Guided + Document Generation AAAI + + +
+ This paper introduces a novel approach for topic modeling utilizing latent +codebooks from Vector-Quantized Variational Auto-Encoder~(VQ-VAE), discretely +encapsulating the rich information of the pre-trained embeddings such as the +pre-trained language model. From the novel interpretation of the latent +codebooks and embeddings as conceptual bag-of-words, we propose a new +generative topic model called Topic-VQ-VAE~(TVQ-VAE) which inversely generates +the original documents related to the respective latent codebook. The TVQ-VAE +can visualize the topics with various generative distributions including the +traditional BoW distribution and the autoregressive image generation. Our +experimental results on document analysis and image generation demonstrate that +TVQ-VAE effectively captures the topic context which reveals the underlying +structures of the dataset and supports flexible forms of document generation. +Official implementation of the proposed TVQ-VAE is available at +https://github.com/clovaai/TVQ-VAE. + +
+
+ comment: Published in the 38th annual AAAI conference on Artificial + Intelligence +
+
+
+
+
+ + ♻ ☆ AV-data2vec: Self-supervised Learning of Audio-Visual Speech + Representations with Contextualized Target Representations + + +
+ Self-supervision has shown great potential for audio-visual speech +recognition by vastly reducing the amount of labeled data required to build +good systems. However, existing methods are either not entirely end-to-end or +do not train joint representations of both modalities. In this paper, we +introduce AV-data2vec which addresses these challenges and builds audio-visual +representations based on predicting contextualized representations which has +been successful in the uni-modal case. The model uses a shared transformer +encoder for both audio and video and can combine both modalities to improve +speech recognition. Results on LRS3 show that AV-data2vec consistently +outperforms existing methods under all settings with the same amount of data +and model size. + +
+
+ comment: 2023 ASRU +
+
+
+
+
+ + ♻ ☆ Towards Hierarchical Spoken Language Dysfluency Modeling EACL + + +
+ Speech disfluency modeling is the bottleneck for both speech therapy and
+language learning. However, there is no effective AI solution to systematically
+tackle this problem. We solidify the concept of disfluent speech and disfluent
+speech modeling. We then present the Hierarchical Unconstrained Disfluency
+Modeling (H-UDM) approach, a hierarchical extension of UDM that addresses both
+disfluency transcription and detection to eliminate the need for extensive
+manual annotation. Our experimental findings serve as clear evidence of the
+effectiveness and reliability of the methods we have introduced, encompassing
+both transcription and detection tasks.
+
+
+
+ comment: 2024 EACL. Hierarchical extension of our previous workshop paper + arXiv:2312.12810 +
+
+
+
+
+ + ♻ ☆ Towards Optimal Statistical Watermarking + + +
+ We study statistical watermarking by formulating it as a hypothesis testing +problem, a general framework which subsumes all previous statistical +watermarking methods. Key to our formulation is a coupling of the output tokens +and the rejection region, realized by pseudo-random generators in practice, +that allows non-trivial trade-off between the Type I error and Type II error. +We characterize the Uniformly Most Powerful (UMP) watermark in the general +hypothesis testing setting and the minimax Type II error in the model-agnostic +setting. In the common scenario where the output is a sequence of $n$ tokens, +we establish nearly matching upper and lower bounds on the number of i.i.d. +tokens required to guarantee small Type I and Type II errors. Our rate of +$\Theta(h^{-1} \log (1/h))$ with respect to the average entropy per token $h$ +highlights potentials for improvement from the rate of $h^{-2}$ in the previous +works. Moreover, we formulate the robust watermarking problem where users are +allowed to perform a class of perturbations on the generated texts, and +characterize the optimal type II error of robust UMP tests via a linear +programming problem. To the best of our knowledge, this is the first systematic +statistical treatment on the watermarking problem with near-optimal rates in +the i.i.d. setting, which might be of interest for future works. + +
+
+
+
+
+ + ♻ ☆ MathVista: Evaluating Mathematical Reasoning of Foundation Models in + Visual Contexts ICLR 2024 + + +
+ Large Language Models (LLMs) and Large Multimodal Models (LMMs) exhibit +impressive problem-solving skills in many tasks and domains, but their ability +in mathematical reasoning in visual contexts has not been systematically +studied. To bridge this gap, we present MathVista, a benchmark designed to +combine challenges from diverse mathematical and visual tasks. It consists of +6,141 examples, derived from 28 existing multimodal datasets involving +mathematics and 3 newly created datasets (i.e., IQTest, FunctionQA, and +PaperQA). Completing these tasks requires fine-grained, deep visual +understanding and compositional reasoning, which all state-of-the-art +foundation models find challenging. With MathVista, we have conducted a +comprehensive, quantitative evaluation of 12 prominent foundation models. The +best-performing GPT-4V model achieves an overall accuracy of 49.9%, +substantially outperforming Bard, the second-best performer, by 15.1%. Our +in-depth analysis reveals that the superiority of GPT-4V is mainly attributed +to its enhanced visual perception and mathematical reasoning. However, GPT-4V +still falls short of human performance by 10.4%, as it often struggles to +understand complex figures and perform rigorous reasoning. This significant gap +underscores the critical role that MathVista will play in the development of +general-purpose AI agents capable of tackling mathematically intensive and +visually rich real-world tasks. We further explore the new ability of +self-verification, the application of self-consistency, and the interactive +chatbot capabilities of GPT-4V, highlighting its promising potential for future +research. The project is available at https://mathvista.github.io/. + +
+
+ comment: 116 pages, 120 figures. Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through + Text Reconstruction EACL 2024 + + +
+ Fine-grained few-shot entity extraction in the chemical domain faces two +unique challenges. First, compared with entity extraction tasks in the general +domain, sentences from chemical papers usually contain more entities. Moreover, +entity extraction models usually have difficulty extracting entities of +long-tailed types. In this paper, we propose Chem-FINESE, a novel +sequence-to-sequence (seq2seq) based few-shot entity extraction approach, to +address these two challenges. Our Chem-FINESE has two components: a seq2seq +entity extractor to extract named entities from the input sentence and a +seq2seq self-validation module to reconstruct the original input sentence from +extracted entities. Inspired by the fact that a good entity extraction system +needs to extract entities faithfully, our new self-validation module leverages +entity extraction results to reconstruct the original input sentence. Besides, +we design a new contrastive loss to reduce excessive copying during the +extraction process. Finally, we release ChemNER+, a new fine-grained chemical +entity extraction dataset that is annotated by domain experts with the ChemNER +schema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets +show that our newly proposed framework has contributed up to 8.26% and 6.84% +absolute F1-score gains respectively. + +
+
+ comment: 16 pages. Accepted by Findings of the Association for Computational + Linguistics: EACL 2024. Code and resources are available at + https://github.com/EagleW/Chem-FINESE +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 46 + +
+
+
+ + ☆ Text-to-Image Cross-Modal Generation: A Systematic Review + + +
+ We review research on generating visual data from text from the angle of +"cross-modal generation." This point of view allows us to draw parallels +between various methods geared towards working on input text and producing +visual output, without limiting the analysis to narrow sub-areas. It also +results in the identification of common templates in the field, which are then +compared and contrasted both within pools of similar methods and across lines +of research. We provide a breakdown of text-to-image generation into various +flavors of image-from-text methods, video-from-text methods, image editing, +self-supervised and graph-based approaches. In this discussion, we focus on +research papers published at 8 leading machine learning conferences in the +years 2016-2022, also incorporating a number of relevant papers not matching +the outlined search criteria. The conducted review suggests a significant +increase in the number of papers published in the area and highlights research +gaps and potential lines of investigation. To our knowledge, this is the first +review to systematically look at text-to-image generation from the perspective +of "cross-modal generation." + +
+
+
+
+
+ + ☆ A Survey on African Computer Vision Datasets, Topics and Researchers + + +
+ Computer vision encompasses a range of tasks such as object detection,
+semantic segmentation, and 3D reconstruction. Despite its relevance to African
+communities, research in this field within Africa represents only 0.06% of
+top-tier publications over the past decade. This study undertakes a thorough
+analysis of 63,000 Scopus-indexed computer vision publications from Africa,
+spanning from 2012 to 2022. The aim is to provide a survey of African computer
+vision topics, datasets and researchers. A key aspect of our study is the
+identification and categorization of African Computer Vision datasets using
+large language models that automatically parse abstracts of these publications.
+We also provide a compilation of unofficial African Computer Vision datasets
+distributed through challenges or data hosting platforms, and provide a full
+taxonomy of dataset categories. Our survey also pinpoints computer vision topic
+trends specific to different African regions, indicating their unique focus
+areas. Additionally, we carried out an extensive survey to capture the views of
+African researchers on the current state of computer vision research on the
+continent and the structural barriers they believe need urgent attention. In
+conclusion, this study catalogs and categorizes Computer Vision datasets and
+topics contributed or initiated by African institutions and identifies barriers
+to publishing in top-tier Computer Vision venues. This survey underscores the
+importance of encouraging African researchers and institutions to advance
+computer vision research on the continent. It also stresses the need for
+research topics to be more aligned with the needs of African communities.
+
+
+
+ comment: Under Review, Community Work of Ro'ya Grassroots, + https://ro-ya-cv4africa.github.io/homepage/. arXiv admin note: text overlap + with arXiv:2305.06773 +
+
+
+
+
+ + ☆ Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass + Diffusion Transformers + + +
+ We present the Hourglass Diffusion Transformer (HDiT), an image generative +model that exhibits linear scaling with pixel count, supporting training at +high-resolution (e.g. $1024 \times 1024$) directly in pixel-space. Building on +the Transformer architecture, which is known to scale to billions of +parameters, it bridges the gap between the efficiency of convolutional U-Nets +and the scalability of Transformers. HDiT trains successfully without typical +high-resolution training techniques such as multiscale architectures, latent +autoencoders or self-conditioning. We demonstrate that HDiT performs +competitively with existing models on ImageNet $256^2$, and sets a new +state-of-the-art for diffusion models on FFHQ-$1024^2$. + +
+
+ comment: 20 pages, 13 figures, project page and code available at + https://crowsonkb.github.io/hourglass-diffusion-transformers/ +
+
+
+
+
+ + ☆ TetraLoss: Improving the Robustness of Face Recognition against Morphing + Attacks + + +
+ Face recognition systems are widely deployed in high-security applications
+such as for biometric verification at border controls. Despite their high
+accuracy on pristine data, it is well-known that digital manipulations, such as
+face morphing, pose a security threat to face recognition systems. Malicious
+actors can exploit the facilities offered by the identity document issuance
+process to obtain identity documents containing morphed images. Thus, subjects
+who contributed to the creation of the morphed image can, with high
+probability, use the identity document to bypass automated face recognition
+systems. In recent years, no-reference (i.e., single image) and differential
+morphing attack detectors have been proposed to tackle this risk. These systems
+are typically evaluated in isolation from the face recognition system with
+which they have to operate jointly, and do not consider the face recognition
+process. Contrary to most existing works, we present a novel method for
+adapting deep learning-based face recognition systems to be more robust against
+face morphing attacks. To this end, we introduce TetraLoss, a novel loss
+function that learns to separate morphed face images from their contributing
+subjects in the embedding space while still preserving high biometric
+verification performance. In a comprehensive evaluation, we show that the
+proposed method can significantly enhance the original system while also
+significantly outperforming other tested baseline methods.
+
+
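The abstract only states the behaviour of the loss (separate morph embeddings from their contributing subjects while keeping genuine pairs close), not its formula. The sketch below is therefore a generic margin-based stand-in for that behaviour, not the published TetraLoss; the margin and the cosine-distance choice are assumptions.

```python
import torch
import torch.nn.functional as F

def morph_separation_loss(anchor, genuine, morph, margin=0.4):
    """Generic stand-in: keep an identity's genuine embedding close to its
    anchor while pushing the morph embedding at least `margin` further away."""
    d_gen = 1 - F.cosine_similarity(anchor, genuine)     # pull genuine pair together
    d_morph = 1 - F.cosine_similarity(anchor, morph)     # push the morph away
    return (d_gen + torch.relu(d_gen - d_morph + margin)).mean()

emb = lambda n: F.normalize(torch.randn(n, 512), dim=-1)  # mock face embeddings
print(float(morph_separation_loss(emb(4), emb(4), emb(4))))
```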
+
+ comment: Accepted to the IEEE International Conference on Automatic Face & + Gesture Recognition 2024 (FG'24) +
+
+
+
+
+ + ☆ Thermal Image Calibration and Correction using Unpaired Cycle-Consistent + Adversarial Networks + + +
+ Unmanned aerial vehicles (UAVs) offer a flexible and cost-effective solution +for wildfire monitoring. However, their widespread deployment during wildfires +has been hindered by a lack of operational guidelines and concerns about +potential interference with aircraft systems. Consequently, the progress in +developing deep-learning models for wildfire detection and characterization +using aerial images is constrained by the limited availability, size, and +quality of existing datasets. This paper introduces a solution aimed at +enhancing the quality of current aerial wildfire datasets to align with +advancements in camera technology. The proposed approach offers a solution to +create a comprehensive, standardized large-scale image dataset. This paper +presents a pipeline based on CycleGAN to enhance wildfire datasets and a novel +fusion method that integrates paired RGB images as attribute conditioning in +the generators of both directions, improving the accuracy of the generated +images. + +
+
+ comment: This paper has been accepted at the Asilomar 2023 Conference and will + be published +
+
+
+
+
+ + ☆ Hierarchical Prompts for Rehearsal-free Continual Learning + + +
+ Continual learning endeavors to equip the model with the capability to
+integrate current task knowledge while mitigating the forgetting of past task
+knowledge. Inspired by prompt tuning, prompt-based methods maintain a frozen
+backbone and train with slight learnable prompts to minimize the catastrophic
+forgetting that arises due to updating a large number of backbone parameters.
+Nonetheless, these learnable prompts tend to concentrate on the discriminative
+knowledge of the current task while ignoring past task knowledge, so the
+learnable prompts still suffer from catastrophic forgetting. This paper
+introduces a novel rehearsal-free paradigm for continual learning termed
+Hierarchical Prompts (H-Prompts), comprising three categories of prompts --
+class prompt, task prompt, and general prompt. To effectively depict the
+knowledge of past classes, the class prompt leverages Bayesian Distribution
+Alignment to model the distribution of classes in each task. To reduce the
+forgetting of past task knowledge, the task prompt employs Cross-task Knowledge
+Excavation to amalgamate the knowledge encapsulated in the learned class
+prompts of past tasks and current task knowledge. Furthermore, the general
+prompt utilizes Generalized Knowledge Exploration to deduce highly generalized
+knowledge in a self-supervised manner. Evaluations on two benchmarks
+substantiate the efficacy of the proposed H-Prompts, exemplified by an average
+accuracy of 87.8% in Split CIFAR-100 and 70.6% in Split ImageNet-R.
+
+
+
+ comment: Submitted to TPAMI +
+
+
+
+
+ + ☆ How Robust Are Energy-Based Models Trained With Equilibrium Propagation? + + +
+ Deep neural networks (DNNs) are easily fooled by adversarial perturbations +that are imperceptible to humans. Adversarial training, a process where +adversarial examples are added to the training set, is the current +state-of-the-art defense against adversarial attacks, but it lowers the model's +accuracy on clean inputs, is computationally expensive, and offers less +robustness to natural noise. In contrast, energy-based models (EBMs), which +were designed for efficient implementation in neuromorphic hardware and +physical systems, incorporate feedback connections from each layer to the +previous layer, yielding a recurrent, deep-attractor architecture which we +hypothesize should make them naturally robust. Our work is the first to explore +the robustness of EBMs to both natural corruptions and adversarial attacks, +which we do using the CIFAR-10 and CIFAR-100 datasets. We demonstrate that EBMs +are more robust than transformers and display comparable robustness to +adversarially-trained DNNs on gradient-based (white-box) attacks, query-based +(black-box) attacks, and natural perturbations without sacrificing clean +accuracy, and without the need for adversarial training or additional training +techniques. + +
+
+
+
+
+ + ☆ Multi-View Neural 3D Reconstruction of Micro-/Nanostructures with Atomic + Force Microscopy + + +
+ Atomic Force Microscopy (AFM) is a widely employed tool for micro-/nanoscale +topographic imaging. However, conventional AFM scanning struggles to +reconstruct complex 3D micro-/nanostructures precisely due to limitations such +as incomplete sample topography capturing and tip-sample convolution artifacts. +Here, we propose a multi-view neural-network-based framework with AFM +(MVN-AFM), which accurately reconstructs surface models of intricate +micro-/nanostructures. Unlike previous works, MVN-AFM does not depend on any +specially shaped probes or costly modifications to the AFM system. To achieve +this, MVN-AFM uniquely employs an iterative method to align multi-view data and +eliminate AFM artifacts simultaneously. Furthermore, we pioneer the application +of neural implicit surface reconstruction in nanotechnology and achieve +markedly improved results. Extensive experiments show that MVN-AFM effectively +eliminates artifacts present in raw AFM images and reconstructs various +micro-/nanostructures including complex geometrical microstructures printed via +Two-photon Lithography and nanoparticles such as PMMA nanospheres and ZIF-67 +nanocrystals. This work presents a cost-effective tool for micro-/nanoscale 3D +analysis. + +
+
+
+
+
+ + ☆ Deformable Endoscopic Tissues Reconstruction with Gaussian Splatting + + +
+ Surgical 3D reconstruction is a critical area of research in robotic surgery, +with recent works adopting variants of dynamic radiance fields to achieve +success in 3D reconstruction of deformable tissues from single-viewpoint +videos. However, these methods often suffer from time-consuming optimization or +inferior quality, limiting their adoption in downstream tasks. Inspired by 3D +Gaussian Splatting, a recent trending 3D representation, we present EndoGS, +applying Gaussian Splatting for deformable endoscopic tissue reconstruction. +Specifically, our approach incorporates deformation fields to handle dynamic +scenes, depth-guided supervision to optimize 3D targets with a single +viewpoint, and a spatial-temporal weight mask to mitigate tool occlusion. As a +result, EndoGS reconstructs and renders high-quality deformable endoscopic +tissues from a single-viewpoint video, estimated depth maps, and labeled tool +masks. Experiments on DaVinci robotic surgery videos demonstrate that EndoGS +achieves superior rendering quality. Code is available at +https://github.com/HKU-MedAI/EndoGS. + +
+
+ comment: Work in progress. 10 pages, 4 figures +
+
+
+
+
+ + ☆ CaBuAr: California Burned Areas dataset for delineation + + +
+ Forest wildfires represent one of the catastrophic events that, over the last +decades, caused huge environmental and humanitarian damages. In addition to a +significant amount of carbon dioxide emission, they are a source of risk to +society in both short-term (e.g., temporary city evacuation due to fire) and +long-term (e.g., higher risks of landslides) cases. Consequently, the +availability of tools to support local authorities in automatically identifying +burned areas plays an important role in the continuous monitoring requirement +to alleviate the aftereffects of such catastrophic events. The great +availability of satellite acquisitions coupled with computer vision techniques +represents an important step in developing such tools. This paper introduces a +novel open dataset that tackles the burned area delineation problem, a binary +segmentation problem applied to satellite imagery. The presented resource +consists of pre- and post-fire Sentinel-2 L2A acquisitions of California forest +fires that took place starting in 2015. Raster annotations were generated from +the data released by California's Department of Forestry and Fire Protection. +Moreover, in conjunction with the dataset, we release three different baselines +based on spectral indexes analyses, SegFormer, and U-Net models. + +
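The abstract mentions spectral-index baselines alongside the learned models. A common index-based baseline for burned-area delineation on Sentinel-2 is the (delta) Normalized Burn Ratio, sketched below; the band choice (B8A/B12) and the 0.27 threshold are conventional rules of thumb, not values taken from the paper.

```python
import numpy as np

def nbr(nir, swir, eps=1e-6):
    """Normalized Burn Ratio: (NIR - SWIR) / (NIR + SWIR)."""
    return (nir - swir) / (nir + swir + eps)

def burned_mask(pre, post, threshold=0.27):
    """Delta-NBR baseline. `pre`/`post` are dicts with Sentinel-2 reflectance
    arrays for bands B8A (NIR) and B12 (SWIR)."""
    dnbr = nbr(pre["B8A"], pre["B12"]) - nbr(post["B8A"], post["B12"])
    return dnbr > threshold

shape = (128, 128)
pre = {"B8A": np.random.rand(*shape), "B12": np.random.rand(*shape) * 0.5}
post = {"B8A": np.random.rand(*shape) * 0.4, "B12": np.random.rand(*shape)}
print(burned_mask(pre, post).mean())   # fraction of pixels flagged as burned
```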
+
+ comment: Accepted at the IEEE Geoscience and Remote Sensing Magazine +
+
+
+
+
+ + ☆ MobileARLoc: On-device Robust Absolute Localisation for Pervasive + Markerless Mobile AR + + +
+ Recent years have seen significant improvement in absolute camera pose estimation, paving the way for pervasive markerless Augmented Reality (AR). However, accurate absolute pose estimation techniques are computation- and storage-heavy, requiring computation offloading. As such, AR systems rely on visual-inertial odometry (VIO) to track the device's relative pose between requests to the server. However, VIO suffers from drift, requiring frequent absolute repositioning. This paper introduces MobileARLoc, a new framework for on-device large-scale markerless mobile AR that combines an absolute pose regressor (APR) with a local VIO tracking system. Absolute pose regressors (APRs) provide fast on-device pose estimation at the cost of reduced accuracy. To address APR accuracy and reduce VIO drift, MobileARLoc creates a feedback loop where VIO pose estimations refine the APR predictions. The VIO system identifies reliable APR predictions, which are then used to compensate for the VIO drift. We comprehensively evaluate MobileARLoc through dataset simulations. MobileARLoc halves the error compared to the underlying APR and achieves fast (80 ms) on-device inference speed.
+
+ comment: Accepted for publication at the 3rd edition of the Pervasive and + Resource-Constrained AI (PerConAI) workshop (co-located with PerCom 2024). + arXiv admin note: substantial text overlap with arXiv:2308.05394 +
+
+
+
+
+ + ☆ Self-Supervised Bird's Eye View Motion Prediction with Cross-Modality + Signals + + +
+ Learning the dense bird's eye view (BEV) motion flow in a self-supervised manner is an emerging research topic for robotics and autonomous driving. Current self-supervised methods mainly rely on point correspondences between point clouds, which may introduce the problems of fake flow and inconsistency, hindering the model's ability to learn accurate and realistic motion. In this paper, we introduce a novel cross-modality self-supervised training framework that effectively addresses these issues by leveraging multi-modality data to obtain supervision signals. We design three innovative supervision signals to preserve the inherent properties of scene motion, including the masked Chamfer distance loss, the piecewise rigidity loss, and the temporal consistency loss. Through extensive experiments, we demonstrate that our proposed self-supervised framework outperforms all previous self-supervision methods for the motion prediction task.
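As an illustration of one of the supervision signals named above, a plain bidirectional Chamfer distance between a flow-warped source cloud and a target cloud can be sketched as follows; the masking and the other two losses are omitted, and all names and shapes are placeholders rather than the authors' code.

```python
import torch

def chamfer_distance(p: torch.Tensor, q: torch.Tensor) -> torch.Tensor:
    """Symmetric Chamfer distance between point clouds p (N, 3) and q (M, 3)."""
    d = torch.cdist(p, q)                  # (N, M) pairwise Euclidean distances
    return d.min(dim=1).values.mean() + d.min(dim=0).values.mean()

# Toy usage: warp a source cloud by a predicted per-point flow, then compare
# the warped cloud with the target cloud.
src = torch.rand(1024, 3)
flow = torch.zeros_like(src)               # predicted motion (dummy values here)
tgt = torch.rand(2048, 3)
loss = chamfer_distance(src + flow, tgt)
print(float(loss))
```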
+
+
+
+
+ + ☆ Edge-Enabled Real-time Railway Track Segmentation + + +
+ Accurate and rapid railway track segmentation can assist automatic train driving and is a key step in providing early warning of fixed or moving obstacles on the railway track. However, existing algorithms tailored for track segmentation often struggle to meet real-time and efficiency requirements on resource-constrained edge devices. Considering this challenge, we propose an edge-enabled real-time railway track segmentation algorithm, adapted to edge applications by streamlining the network structure and quantizing the model after training. Initially, Ghost convolution is introduced to reduce the complexity of the backbone, thereby achieving the extraction of key information from the region of interest at a lower cost. To further reduce the model complexity and calculation, a new lightweight detection head is proposed to achieve the best balance between accuracy and efficiency. Subsequently, we introduce quantization techniques to map the model's floating-point weights and activation values into lower bit-width fixed-point representations, reducing computational demands and memory footprint, and ultimately accelerating the model's inference. Finally, we draw inspiration from GPU parallel programming principles to expedite the pre-processing and post-processing stages of the algorithm through parallel processing. The approach is evaluated on the public and challenging RailSem19 dataset and tested on a Jetson Nano. Experimental results demonstrate that our enhanced algorithm achieves an accuracy level of 83.3% while achieving a real-time inference rate of 25 frames per second when the input size is 480x480, thereby effectively meeting the requirements for real-time and high-efficiency operation.
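The quantization step described above maps floating-point weights and activations to low-bit fixed-point values; a generic uniform affine quantizer (illustrative only, not the specific toolchain used for the Jetson Nano deployment) can be sketched as:

```python
import numpy as np

def quantize_uniform(x: np.ndarray, num_bits: int = 8):
    """Map float values to unsigned fixed-point integers via a scale and zero-point."""
    qmin, qmax = 0, 2 ** num_bits - 1
    scale = (x.max() - x.min()) / (qmax - qmin)
    zero_point = int(round(qmin - x.min() / scale))
    q = np.clip(np.round(x / scale) + zero_point, qmin, qmax).astype(np.uint8)
    return q, scale, zero_point

def dequantize(q: np.ndarray, scale: float, zero_point: int) -> np.ndarray:
    """Recover approximate float values from the fixed-point representation."""
    return (q.astype(np.float32) - zero_point) * scale

# Toy usage on a random weight tensor.
w = np.random.randn(64, 64).astype(np.float32)
q, s, z = quantize_uniform(w)
print("max abs reconstruction error:", np.abs(w - dequantize(q, s, z)).max())
```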
+
+
+
+
+ + ☆ MapChange: Enhancing Semantic Change Detection with Temporal-Invariant + Historical Maps Based on Deep Triplet Network + + +
+ Semantic Change Detection (SCD) is recognized as both a crucial and +challenging task in the field of image analysis. Traditional methods for SCD +have predominantly relied on the comparison of image pairs. However, this +approach is significantly hindered by substantial imaging differences, which +arise due to variations in shooting times, atmospheric conditions, and angles. +Such discrepancies lead to two primary issues: the under-detection of minor yet +significant changes, and the generation of false alarms due to temporal +variances. These factors often result in unchanged objects appearing markedly +different in multi-temporal images. In response to these challenges, the +MapChange framework has been developed. This framework introduces a novel +paradigm that synergizes temporal-invariant historical map data with +contemporary high-resolution images. By employing this combination, the +temporal variance inherent in conventional image pair comparisons is +effectively mitigated. The efficacy of the MapChange framework has been +empirically validated through comprehensive testing on two public datasets. +These tests have demonstrated the framework's marked superiority over existing +state-of-the-art SCD methods. + +
+
+
+
+
+ + ☆ ColorVideoVDP: A visual difference predictor for image, video and + display distortions + + +
+ ColorVideoVDP is a video and image quality metric that models spatial and +temporal aspects of vision, for both luminance and color. The metric is built +on novel psychophysical models of chromatic spatiotemporal contrast sensitivity +and cross-channel contrast masking. It accounts for the viewing conditions, +geometric, and photometric characteristics of the display. It was trained to +predict common video streaming distortions (e.g. video compression, rescaling, +and transmission errors), and also 8 new distortion types related to AR/VR +displays (e.g. light source and waveguide non-uniformities). To address the +latter application, we collected our novel XR-Display-Artifact-Video quality +dataset (XR-DAVID), comprised of 336 distorted videos. Extensive testing on +XR-DAVID, as well as several datasets from the literature, indicate a +significant gain in prediction performance compared to existing metrics. +ColorVideoVDP opens the doors to many novel applications which require the +joint automated spatiotemporal assessment of luminance and color distortions, +including video streaming, display specification and design, visual comparison +of results, and perceptually-guided quality optimization. + +
+
+ comment: 28 pages +
+
+
+
+
+ + ☆ Exploring Missing Modality in Multimodal Egocentric Datasets + + +
+ Multimodal video understanding is crucial for analyzing egocentric videos, +where integrating multiple sensory signals significantly enhances action +recognition and moment localization. However, practical applications often +grapple with incomplete modalities due to factors like privacy concerns, +efficiency demands, or hardware malfunctions. Addressing this, our study delves +into the impact of missing modalities on egocentric action recognition, +particularly within transformer-based models. We introduce a novel concept +-Missing Modality Token (MMT)-to maintain performance even when modalities are +absent, a strategy that proves effective in the Ego4D, Epic-Kitchens, and +Epic-Sounds datasets. Our method mitigates the performance loss, reducing it +from its original $\sim 30\%$ drop to only $\sim 10\%$ when half of the test +set is modal-incomplete. Through extensive experimentation, we demonstrate the +adaptability of MMT to different training scenarios and its superiority in +handling missing modalities compared to current methods. Our research +contributes a comprehensive analysis and an innovative approach, opening +avenues for more resilient multimodal systems in real-world settings. + +
+
+
+
+
+ + ☆ Task-specific regularization loss towards model calibration for reliable + lung cancer detection + + +
+ Lung cancer is one of the significant causes of cancer-related deaths globally. Early detection and treatment improve the chances of survival. Traditionally, CT scans have been used to extract the most significant lung infection information and diagnose cancer. This process is carried out manually by an expert radiologist. The imbalance in the radiologists-to-population ratio in a country like India implies significant work pressure on them and thus raises the need to automate a few of their responsibilities. The tendency of modern-day deep neural networks to make overconfident mistakes limits their usefulness for cancer detection. In this paper, we propose a new task-specific loss function to calibrate the neural network to reduce the risk of overconfident mistakes. We use the state-of-the-art Multi-class Difference in Confidence and Accuracy (MDCA) loss in conjunction with the proposed task-specific loss function to achieve the same. We also integrate post-hoc calibration by performing temperature scaling on top of the train-time calibrated model. We demonstrate a 5.98% improvement in Expected Calibration Error (ECE) and a 17.9% improvement in Maximum Calibration Error (MCE) compared to the best-performing SOTA algorithm.
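For reference, the Expected Calibration Error (ECE) reported above is conventionally computed by binning predictions by confidence and averaging the per-bin gap between accuracy and confidence; a standard sketch (not the authors' code, bin count is a placeholder):

```python
import numpy as np

def expected_calibration_error(confidences, predictions, labels, n_bins: int = 15) -> float:
    """Equal-width-bin ECE: bin-weight-averaged |accuracy - confidence| per bin."""
    confidences = np.asarray(confidences, dtype=float)
    correct = (np.asarray(predictions) == np.asarray(labels)).astype(float)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
    return ece

# Toy usage with four predictions.
conf = np.array([0.90, 0.80, 0.55, 0.95])
pred = np.array([1, 0, 1, 1])
label = np.array([1, 0, 0, 1])
print(expected_calibration_error(conf, pred, label))
```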
+
+
+
+
+ + ☆ Inter-Domain Mixup for Semi-Supervised Domain Adaptation + + +
+ Semi-supervised domain adaptation (SSDA) aims to bridge source and target +domain distributions, with a small number of target labels available, achieving +better classification performance than unsupervised domain adaptation (UDA). +However, existing SSDA work fails to make full use of label information from +both source and target domains for feature alignment across domains, resulting +in label mismatch in the label space during model testing. This paper presents +a novel SSDA approach, Inter-domain Mixup with Neighborhood Expansion (IDMNE), +to tackle this issue. Firstly, we introduce a cross-domain feature alignment +strategy, Inter-domain Mixup, that incorporates label information into model +adaptation. Specifically, we employ sample-level and manifold-level data mixing +to generate compatible training samples. These newly established samples, +combined with reliable and actual label information, display diversity and +compatibility across domains, while such extra supervision thus facilitates +cross-domain feature alignment and mitigates label mismatch. Additionally, we +utilize Neighborhood Expansion to leverage high-confidence pseudo-labeled +samples in the target domain, diversifying the label information of the target +domain and thereby further increasing the performance of the adaptation model. +Accordingly, the proposed approach outperforms existing state-of-the-art +methods, achieving significant accuracy improvements on popular SSDA +benchmarks, including DomainNet, Office-Home, and Office-31. + +
+
+ comment: Published in Elsevier PR2024, available at +https://www.sciencedirect.com/science/article/pii/S0031320323007203?via%3Dihub
+
+
+
+
+ + ☆ Adaptive Betweenness Clustering for Semi-Supervised Domain Adaptation + + +
+ Compared to unsupervised domain adaptation, semi-supervised domain adaptation (SSDA) aims to significantly improve the classification performance and generalization capability of the model by leveraging the presence of a small amount of labeled data from the target domain. Several SSDA approaches have been developed to enable semantic-aligned feature confusion between labeled (or pseudo-labeled) samples across domains; nevertheless, owing to the scarcity of semantic label information in the target domain, they have struggled to fully realize their potential. In this study, we propose a novel SSDA approach named Graph-based Adaptive Betweenness Clustering (G-ABC) for achieving categorical domain alignment, which enables cross-domain semantic alignment by mandating semantic transfer from labeled data of both the source and target domains to unlabeled target samples. In particular, a heterogeneous graph is initially constructed to reflect the pairwise relationships between labeled samples from both domains and unlabeled ones of the target domain. Then, to reduce the noisy connectivity in the graph, connectivity refinement is conducted by introducing two strategies, namely Confidence Uncertainty based Node Removal and Prediction Dissimilarity based Edge Pruning. Once the graph has been refined, Adaptive Betweenness Clustering is introduced to facilitate semantic transfer by using across-domain betweenness clustering and within-domain betweenness clustering, thereby propagating semantic label information from labeled samples across domains to unlabeled target data. Extensive experiments on three standard benchmark datasets, namely DomainNet, Office-Home, and Office-31, indicate that our method outperforms previous state-of-the-art SSDA approaches, demonstrating the superiority of the proposed G-ABC algorithm.
+
+ comment: 16 pages, 9 figures, published to IEEE TIP +
+
+
+
+
+ + ☆ General Flow as Foundation Affordance for Scalable Robot Learning + + +
+ We address the challenge of acquiring real-world manipulation skills with a scalable framework. Inspired by the success of large-scale auto-regressive prediction in Large Language Models (LLMs), we hold the belief that identifying an appropriate prediction target capable of leveraging large-scale datasets is crucial for achieving efficient and universal learning. Therefore, we propose to utilize flow, which represents the future trajectories of 3D points on objects of interest, as an ideal prediction target in robot learning. To exploit scalable data resources, we turn our attention to cross-embodiment datasets. We develop, for the first time, a language-conditioned prediction model directly from large-scale RGBD human video datasets. Our predicted flow offers actionable geometric and physics guidance, thus facilitating stable zero-shot skill transfer in real-world scenarios. We deploy our method with a policy based on closed-loop flow prediction. Remarkably, without any additional training, our method achieves an impressive 81% success rate in human-to-robot skill transfer, covering 18 tasks in 6 scenes. Our framework features the following benefits: (1) scalability: leveraging cross-embodiment data resources; (2) universality: multiple object categories, including rigid, articulated, and soft bodies; (3) stable skill transfer: providing actionable guidance with a small inference domain gap. These lead to a new pathway towards scalable general robot learning. Data, code, and model weights will be made publicly available.
+
+
+
+
+ + ☆ Geometric Prior Guided Feature Representation Learning for Long-Tailed + Classification + + +
+ Real-world data are long-tailed, and the lack of tail samples leads to a significant limitation in the generalization ability of the model. Although numerous class re-balancing approaches perform well for moderate class imbalance problems, additional knowledge needs to be introduced to help the tail class recover the underlying true distribution when the observed distribution from a few tail samples does not represent its true distribution properly, thus allowing the model to learn valuable information outside the observed domain. In this work, we propose to leverage the geometric information of the feature distribution of the well-represented head class to guide the model to learn the underlying distribution of the tail class. Specifically, we first systematically define the geometry of the feature distribution and the similarity measures between geometries, and discover four phenomena regarding the relationship between the geometries of different feature distributions. Then, based on these four phenomena, a feature uncertainty representation is proposed to perturb the tail features by utilizing the geometry of the head class feature distribution. It aims to make the perturbed features cover the underlying distribution of the tail class as much as possible, thus improving the model's generalization performance in the test domain. Finally, we design a three-stage training scheme enabling feature uncertainty modeling to be successfully applied. Experiments on CIFAR-10/100-LT, ImageNet-LT, and iNaturalist2018 show that our proposed approach outperforms other similar methods on most metrics. In addition, the experimental phenomena we discovered are able to provide new perspectives and theoretical foundations for subsequent studies.
+
+ comment: This work was accepted by the IJCV +
+
+
+
+
+ + ☆ Exploring Diffusion Time-steps for Unsupervised Representation Learning ICLR 2024 + + +
+ Representation learning is all about discovering the hidden modular +attributes that generate the data faithfully. We explore the potential of +Denoising Diffusion Probabilistic Model (DM) in unsupervised learning of the +modular attributes. We build a theoretical framework that connects the +diffusion time-steps and the hidden attributes, which serves as an effective +inductive bias for unsupervised learning. Specifically, the forward diffusion +process incrementally adds Gaussian noise to samples at each time-step, which +essentially collapses different samples into similar ones by losing attributes, +e.g., fine-grained attributes such as texture are lost with less noise added +(i.e., early time-steps), while coarse-grained ones such as shape are lost by +adding more noise (i.e., late time-steps). To disentangle the modular +attributes, at each time-step t, we learn a t-specific feature to compensate +for the newly lost attribute, and the set of all 1,...,t-specific features, +corresponding to the cumulative set of lost attributes, are trained to make up +for the reconstruction error of a pre-trained DM at time-step t. On CelebA, +FFHQ, and Bedroom datasets, the learned feature significantly improves +attribute classification and enables faithful counterfactual generation, e.g., +interpolating only one specified attribute between two images, validating the +disentanglement quality. Codes are in https://github.com/yue-zhongqi/diti. + +
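For context, the forward diffusion process referred to here follows the standard DDPM closed form x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps; a generic sketch with an illustrative noise schedule (not necessarily the one used in the paper):

```python
import torch

T = 1000
betas = torch.linspace(1e-4, 0.02, T)           # a common linear noise schedule
alphas_bar = torch.cumprod(1.0 - betas, dim=0)  # cumulative products \bar{alpha}_t

def q_sample(x0: torch.Tensor, t: int) -> torch.Tensor:
    """Sample x_t ~ q(x_t | x_0): progressively noisier versions of x0."""
    eps = torch.randn_like(x0)
    return alphas_bar[t].sqrt() * x0 + (1.0 - alphas_bar[t]).sqrt() * eps

# Toy usage: early time-steps keep coarse structure while fine detail is lost
# first; late time-steps erase coarse structure too, which is the fine-to-coarse
# attribute-loss behavior the paper builds its inductive bias on.
x0 = torch.rand(3, 64, 64)
x_early, x_late = q_sample(x0, 50), q_sample(x0, 900)
```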
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ Grayscale Image Colorization with GAN and CycleGAN in Different Image + Domain + + +
+ Automatic colorization of grayscale images has been a challenging task. Previous research has applied supervised methods to this problem [1]. In this paper, we reproduce a GAN-based colorization model and experiment with one of its variants. We also propose a CycleGAN-based model and evaluate these methods on various datasets. The results show that the proposed CycleGAN model performs well on human-face and comic colorization, but lacks the ability to produce diverse colorizations.
+
+
+
+
+ + ☆ Enhancing the vision-language foundation model with key semantic + knowledge-emphasized report refinement + + +
+ Recently, vision-language representation learning has made remarkable advancements in building up medical foundation models, holding immense potential for transforming the landscape of clinical research and medical care. The underlying hypothesis is that the rich knowledge embedded in radiology reports can effectively assist and guide the learning process, reducing the need for additional labels. However, these reports tend to be complex and sometimes even contain redundant descriptions, making it challenging for representation learning to capture the key semantic information. This paper develops a novel iterative vision-language representation learning framework by proposing a key semantic knowledge-emphasized report refinement method. Particularly, raw radiology reports are refined to highlight the key information according to a constructed clinical dictionary and two model-optimized knowledge-enhancement metrics. The iterative framework is designed to learn progressively, starting from a general understanding of the patient's condition based on raw reports and gradually refining and extracting the critical information essential to fine-grained analysis tasks. The effectiveness of the proposed framework is validated on various downstream medical image analysis tasks, including disease classification, region-of-interest segmentation, and phrase grounding. Our framework surpasses seven state-of-the-art methods in both fine-tuning and zero-shot settings, demonstrating its encouraging potential for different clinical applications.
+
+
+
+
+ + ☆ Embedded Hyperspectral Band Selection with Adaptive Optimization for + Image Semantic Segmentation + + +
+ Hyperspectral band selection plays a pivotal role in remote sensing and image analysis, aiming to identify the most informative spectral bands while minimizing computational overhead. In this paper, we introduce a pioneering approach for hyperspectral band selection that offers an embedded solution, making it well-suited for resource-constrained or real-time applications. Our proposed method, embedded Hyperspectral Band Selection (EHBS), excels in selecting the best bands without the need for prior processing, seamlessly integrating with the downstream task model. This is achieved through the adaptation of the Stochastic Gates (STG) algorithm, originally designed for feature selection, to hyperspectral band selection in the context of image semantic segmentation, and the integration of a dynamic optimizer, DoG, which removes the need to tune the learning rate. To assess the performance of our method, we introduce a novel metric for evaluating band selection methods across different target numbers of selected bands, quantified by the Area Under the Curve (AUC). We conduct experiments on two distinct semantic-segmentation hyperspectral benchmark datasets, demonstrating its superiority in terms of accuracy and ease of use compared to many common and state-of-the-art methods. Furthermore, our contributions extend beyond the realm of hyperspectral band selection. The adaptability of our approach to other tasks, especially those involving grouped features, opens up promising avenues for broader applications within the realm of deep learning, such as feature selection for feature groups. The demonstrated success on the tested datasets and the potential for application to a variety of tasks underscore the value of our method as a substantial addition to the field of computer vision.
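The abstract builds on the Stochastic Gates (STG) idea of relaxed per-feature (here, per-band) gates trained jointly with the task loss; the following is a simplified, approximate sketch in that spirit, with the parameterization, noise level, and regularizer all being assumptions rather than the paper's implementation:

```python
import torch

class BandGates(torch.nn.Module):
    """Relaxed per-band gates with a sparsity penalty (simplified STG-style sketch)."""
    def __init__(self, num_bands: int, sigma: float = 0.5):
        super().__init__()
        self.mu = torch.nn.Parameter(0.5 * torch.ones(num_bands))
        self.sigma = sigma

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, bands, H, W); sample noisy gates during training, clip to [0, 1]
        noise = torch.randn_like(self.mu) * self.sigma if self.training else 0.0
        z = torch.clamp(self.mu + noise, 0.0, 1.0)
        return x * z.view(1, -1, 1, 1)

    def sparsity_penalty(self) -> torch.Tensor:
        # Expected number of open gates, added to the downstream task loss.
        normal = torch.distributions.Normal(0.0, 1.0)
        return normal.cdf(self.mu / self.sigma).sum()

# Toy usage: gate a hyperspectral cube and combine a dummy task loss with the penalty.
gates = BandGates(num_bands=200)
out = gates(torch.randn(2, 200, 32, 32))
loss = out.mean() + 1e-2 * gates.sparsity_penalty()
```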
+
+
+
+
+ + ☆ S$^3$M-Net: Joint Learning of Semantic Segmentation and Stereo Matching + for Autonomous Driving + + +
+ Semantic segmentation and stereo matching are two essential components of 3D +environmental perception systems for autonomous driving. Nevertheless, +conventional approaches often address these two problems independently, +employing separate models for each task. This approach poses practical +limitations in real-world scenarios, particularly when computational resources +are scarce or real-time performance is imperative. Hence, in this article, we +introduce S$^3$M-Net, a novel joint learning framework developed to perform +semantic segmentation and stereo matching simultaneously. Specifically, +S$^3$M-Net shares the features extracted from RGB images between both tasks, +resulting in an improved overall scene understanding capability. This feature +sharing process is realized using a feature fusion adaption (FFA) module, which +effectively transforms the shared features into semantic space and subsequently +fuses them with the encoded disparity features. The entire joint learning +framework is trained by minimizing a novel semantic consistency-guided (SCG) +loss, which places emphasis on the structural consistency in both tasks. +Extensive experimental results conducted on the vKITTI2 and KITTI datasets +demonstrate the effectiveness of our proposed joint learning framework and its +superior performance compared to other state-of-the-art single-task networks. +Our project webpage is accessible at mias.group/S3M-Net. + +
+
+ comment: accepted to IEEE Trans. on Intelligent Vehicles (T-IV) +
+
+
+
+
+ + ☆ Adversarial Augmentation Training Makes Action Recognition Models More + Robust to Realistic Video Distribution Shifts + + +
+ Despite recent advances in video action recognition achieving strong +performance on existing benchmarks, these models often lack robustness when +faced with natural distribution shifts between training and test data. We +propose two novel evaluation methods to assess model resilience to such +distribution disparity. One method uses two different datasets collected from +different sources and uses one for training and validation, and the other for +testing. More precisely, we created dataset splits of HMDB-51 or UCF-101 for +training, and Kinetics-400 for testing, using the subset of the classes that +are overlapping in both train and test datasets. The other proposed method +extracts the feature mean of each class from the target evaluation dataset's +training data (i.e. class prototype) and estimates test video prediction as a +cosine similarity score between each sample to the class prototypes of each +target class. This procedure does not alter model weights using the target +dataset and it does not require aligning overlapping classes of two different +datasets, thus is a very efficient method to test the model robustness to +distribution shifts without prior knowledge of the target distribution. We +address the robustness problem by adversarial augmentation training - +generating augmented views of videos that are "hard" for the classification +model by applying gradient ascent on the augmentation parameters - as well as +"curriculum" scheduling the strength of the video augmentations. We +experimentally demonstrate the superior performance of the proposed adversarial +augmentation approach over baselines across three state-of-the-art action +recognition models - TSM, Video Swin Transformer, and Uniformer. The presented +work provides critical insight into model robustness to distribution shifts and +presents effective techniques to enhance video action recognition performance +in a real-world deployment. + +
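The class-prototype evaluation protocol described in this abstract can be illustrated with a generic sketch: average the features of each class from the target dataset's training split, then score each test clip by cosine similarity to those prototypes. Feature dimensions and names below are placeholders, not the authors' code.

```python
import torch
import torch.nn.functional as F

def build_prototypes(features: torch.Tensor, labels: torch.Tensor, num_classes: int) -> torch.Tensor:
    """Mean feature per class, computed from the target dataset's training split."""
    protos = torch.stack([features[labels == c].mean(dim=0) for c in range(num_classes)])
    return F.normalize(protos, dim=1)

def prototype_predict(test_features: torch.Tensor, prototypes: torch.Tensor) -> torch.Tensor:
    """Predict the class whose prototype has the highest cosine similarity."""
    sims = F.normalize(test_features, dim=1) @ prototypes.t()
    return sims.argmax(dim=1)

# Toy usage with random features standing in for a video backbone's outputs.
feats, labels = torch.randn(100, 512), torch.randint(0, 5, (100,))
protos = build_prototypes(feats, labels, num_classes=5)
print(prototype_predict(torch.randn(8, 512), protos))
```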
+
+
+
+
+ + ♻ ☆ MaskDiff: Modeling Mask Distribution with Diffusion Probabilistic Model + for Few-Shot Instance Segmentation AAAI 2024 + + +
+ Few-shot instance segmentation extends the few-shot learning paradigm to the instance segmentation task, which tries to segment instance objects from a query image with a few annotated examples of novel categories. Conventional approaches have attempted to address the task via prototype learning, known as point estimation. However, this mechanism depends on prototypes (e.g., the mean of the $K$-shot examples) for prediction, leading to performance instability. To overcome the disadvantage of the point estimation mechanism, we propose a novel approach, dubbed MaskDiff, which models the underlying conditional distribution of a binary mask conditioned on an object region and $K$-shot information. Inspired by augmentation approaches that perturb data with Gaussian noise for populating low data density regions, we model the mask distribution with a diffusion probabilistic model. We also propose to utilize classifier-free guided mask sampling to integrate category information into the binary mask generation process. Without bells and whistles, our proposed method consistently outperforms state-of-the-art methods on both base and novel classes of the COCO dataset while simultaneously being more stable than existing methods. The source code is available at: https://github.com/minhquanlecs/MaskDiff.
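Classifier-free guided sampling, which the method uses to inject category information into mask generation, combines conditional and unconditional noise predictions at each denoising step. A generic form is shown below; the model, embeddings, and guidance scale are placeholders, not the authors' implementation.

```python
import torch

def cfg_noise(model, x_t, t, class_emb, null_emb, guidance_scale: float = 3.0):
    """Classifier-free guidance: push the conditional prediction away from the
    unconditional one by `guidance_scale`."""
    eps_cond = model(x_t, t, class_emb)    # noise prediction with category information
    eps_uncond = model(x_t, t, null_emb)   # noise prediction with a null/empty condition
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

# Toy usage with a stand-in "model" that ignores its conditioning input.
model = lambda x, t, c: torch.randn_like(x)
x_t = torch.randn(1, 1, 64, 64)            # a noisy binary-mask latent
eps = cfg_noise(model, x_t, t=500, class_emb=None, null_emb=None)
print(eps.shape)
```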
+
+ comment: Accepted at AAAI 2024 (oral presentation) +
+
+
+
+
+ + ♻ ☆ Predicting Age from White Matter Diffusivity with Residual Learning SP + + +
+ Imaging findings inconsistent with those expected at specific chronological +age ranges may serve as early indicators of neurological disorders and +increased mortality risk. Estimation of chronological age, and deviations from +expected results, from structural MRI data has become an important task for +developing biomarkers that are sensitive to such deviations. Complementary to +structural analysis, diffusion tensor imaging (DTI) has proven effective in +identifying age-related microstructural changes within the brain white matter, +thereby presenting itself as a promising additional modality for brain age +prediction. Although early studies have sought to harness DTI's advantages for +age estimation, there is no evidence that the success of this prediction is +owed to the unique microstructural and diffusivity features that DTI provides, +rather than the macrostructural features that are also available in DTI data. +Therefore, we seek to develop white-matter-specific age estimation to capture +deviations from normal white matter aging. Specifically, we deliberately +disregard the macrostructural information when predicting age from DTI scalar +images, using two distinct methods. The first method relies on extracting only +microstructural features from regions of interest. The second applies 3D +residual neural networks (ResNets) to learn features directly from the images, +which are non-linearly registered and warped to a template to minimize +macrostructural variations. When tested on unseen data, the first method yields +mean absolute error (MAE) of 6.11 years for cognitively normal participants and +MAE of 6.62 years for cognitively impaired participants, while the second +method achieves MAE of 4.69 years for cognitively normal participants and MAE +of 4.96 years for cognitively impaired participants. We find that the ResNet +model captures subtler, non-macrostructural features for brain age prediction. + +
+
+ comment: SPIE Medical Imaging: Image Processing. San Diego, CA. February 2024 + (accepted as poster presentation) +
+
+
+
+
+ + ♻ ☆ GenSim: Generating Robotic Simulation Tasks via Large Language Models + + +
+ Collecting large amounts of real-world interaction data to train general robotic policies is often prohibitively expensive, thus motivating the use of simulation data. However, existing methods for data generation have generally focused on scene-level diversity (e.g., object instances and poses) rather than task-level diversity, due to the human effort required to come up with and verify novel tasks. This has made it challenging for policies trained on simulation data to demonstrate significant task-level generalization. In this paper, we propose to automatically generate rich simulation environments and expert demonstrations by exploiting the grounding and coding abilities of large language models (LLMs). Our approach, dubbed GenSim, has two modes: goal-directed generation, wherein a target task is given to the LLM and the LLM proposes a task curriculum to solve the target task, and exploratory generation, wherein the LLM bootstraps from previous tasks and iteratively proposes novel tasks that would be helpful in solving more complex tasks. We use GPT4 to expand the existing benchmark by ten times to over 100 tasks, on which we conduct supervised finetuning and evaluate several LLMs, including finetuned GPTs and Code Llama, on code generation for robotic simulation tasks. Furthermore, we observe that LLM-generated simulation programs can enhance task-level generalization significantly when used for multitask policy training. We further find that with minimal sim-to-real adaptation, the multitask policies pretrained on GPT4-generated simulation tasks exhibit stronger transfer to unseen long-horizon tasks in the real world and outperform baselines by 25%. See the project website (https://liruiw.github.io/gensim) for code, demos, and videos.
+
+ comment: See our project website (https://liruiw.github.io/gensim), demo and + datasets (https://huggingface.co/spaces/Gen-Sim/Gen-Sim), and code + (https://github.com/liruiw/GenSim) for more details +
+
+
+
+
+ + ♻ ☆ Dominating Set Database Selection for Visual Place Recognition + + +
+ This paper presents an approach for creating a visual place recognition (VPR) database for localization in indoor environments from RGBD scanning sequences. The proposed approach is formulated as a minimization problem in terms of a dominating set algorithm for a graph constructed from spatial information, and is referred to as DominatingSet. Our algorithm shows better scene coverage in comparison to other methodologies used for database creation. We also demonstrate that with DominatingSet, the database size can be up to 250-1400 times smaller than the original scanning sequence while maintaining a recall rate of more than 80% on testing sequences. We evaluated our algorithm on the 7-scenes and BundleFusion datasets and on an additionally recorded sequence in a highly repetitive office setting. In addition, the database selection can produce weakly-supervised labels for fine-tuning neural place recognition algorithms to particular settings, further improving their accuracy. The paper also presents a fully automated pipeline for VPR database creation from RGBD scanning sequences, as well as a set of metrics for VPR database evaluation. The code and released data are available on our web page: https://prime-slam.github.io/place-recognition-db/
+
+
+
+
+ + ♻ ☆ Gated Cross-Attention Network for Depth Completion + + +
+ Depth completion is a popular research direction in the field of depth +estimation. The fusion of color and depth features is the current critical +challenge in this task, mainly due to the asymmetry between the rich scene +details in color images and the sparse pixels in depth maps. To tackle this +issue, we design an efficient Gated Cross-Attention Network that propagates +confidence via a gating mechanism, simultaneously extracting and refining key +information in both color and depth branches to achieve local spatial feature +fusion. Additionally, we employ an attention network based on the Transformer +in low-dimensional space to effectively fuse global features and increase the +network's receptive field. With a simple yet efficient gating mechanism, our +proposed method achieves fast and accurate depth completion without the need +for additional branches or post-processing steps. At the same time, we use the +Ray Tune mechanism with the AsyncHyperBandScheduler scheduler and the +HyperOptSearch algorithm to automatically search for the optimal number of +module iterations, which also allows us to achieve performance comparable to +state-of-the-art methods. We conduct experiments on both indoor and outdoor +scene datasets. Our fast network achieves Pareto-optimal solutions in terms of +time and accuracy, and at the time of submission, our accurate network ranks +first among all published papers on the KITTI official website in terms of +accuracy. + +
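The confidence-gating idea for fusing color and depth features can be illustrated with a much simpler generic gated fusion block; this is a stand-in sketch under assumed shapes and names, not the paper's gated cross-attention architecture.

```python
import torch
import torch.nn as nn

class GatedFusion(nn.Module):
    """Fuse color and depth feature maps with a learned per-pixel confidence gate."""
    def __init__(self, channels: int):
        super().__init__()
        self.gate = nn.Sequential(
            nn.Conv2d(2 * channels, channels, kernel_size=3, padding=1),
            nn.Sigmoid(),
        )

    def forward(self, color_feat: torch.Tensor, depth_feat: torch.Tensor) -> torch.Tensor:
        # Gate values near 1 trust the color branch, values near 0 trust depth.
        g = self.gate(torch.cat([color_feat, depth_feat], dim=1))
        return g * color_feat + (1.0 - g) * depth_feat

# Toy usage on random feature maps.
fuse = GatedFusion(64)
out = fuse(torch.randn(1, 64, 32, 32), torch.randn(1, 64, 32, 32))
print(out.shape)
```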
+
+
+
+
+ + ♻ ☆ Promptable Game Models: Text-Guided Game Simulation via Masked Diffusion + Models + + +
+ Neural video game simulators have emerged as powerful tools to generate and edit videos. Their idea is to represent games as the evolution of an environment's state driven by the actions of its agents. While such a paradigm enables users to play a game action-by-action, its rigidity precludes more semantic forms of control. To overcome this limitation, we augment game models with prompts specified as a set of natural language actions and desired states. The result, a Promptable Game Model (PGM), makes it possible for a user to play the game by prompting it with high- and low-level action sequences. Most captivatingly, our PGM unlocks the director's mode, where the game is played by specifying goals for the agents in the form of a prompt. This requires learning "game AI", encapsulated by our animation model, to navigate the scene using high-level constraints, play against an adversary, and devise a strategy to win a point. To render the resulting state, we use a compositional NeRF representation encapsulated in our synthesis model. To foster future research, we present newly collected, annotated and calibrated Tennis and Minecraft datasets. Our method significantly outperforms existing neural video game simulators in terms of rendering quality and unlocks applications beyond the capabilities of the current state of the art. Our framework, data, and models are available at https://snap-research.github.io/promptable-game-models/.
+
+ comment: ACM Transactions on Graphics. © Copyright is held by the owner/author(s) 2023. This is the author's version of the work. It is posted here for your personal use. Not for redistribution. The definitive Version of Record was published in ACM Transactions on Graphics, http://dx.doi.org/10.1145/3635705
+
+
+
+
+ + ♻ ☆ DePT: Decomposed Prompt Tuning for Parameter-Efficient Fine-tuning ICLR 2024 + + +
+ Prompt tuning (PT), where a small number of trainable soft (continuous) prompt vectors is affixed to the input of language models (LMs), has shown promising results across various tasks and models for parameter-efficient fine-tuning (PEFT). PT stands out from other PEFT approaches because it maintains competitive performance with fewer trainable parameters and does not drastically scale up its parameters as the model size expands. However, PT introduces additional soft prompt tokens, leading to longer input sequences, which significantly impacts training and inference time and memory usage due to the Transformer's quadratic complexity. This is particularly concerning for Large Language Models (LLMs) that face heavy daily querying. To address this issue, we propose Decomposed Prompt Tuning (DePT), which decomposes the soft prompt into a shorter soft prompt and a pair of low-rank matrices that are then optimised with two different learning rates. This allows DePT to achieve better performance while saving substantial memory and time costs compared to vanilla PT and its variants, without changing trainable parameter sizes. Through extensive experiments on 23 natural language processing (NLP) and vision-language (VL) tasks, we demonstrate that DePT outperforms state-of-the-art PEFT approaches, including the full fine-tuning baseline, in some scenarios. Additionally, we empirically show that DePT grows more efficient as the model size increases. Our further study reveals that DePT integrates seamlessly with parameter-efficient transfer learning in the few-shot learning setting and highlights its adaptability to various model architectures and sizes.
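The decomposition described here can be sketched roughly as follows: keep a short trainable soft prompt, add a low-rank product to the frozen input word embeddings, and give the two parts separate learning rates via optimizer parameter groups. The shapes, rank, learning rates, and exact placement of the low-rank update below are assumptions based on the abstract, not the paper's settings.

```python
import torch

d_model, prompt_len, seq_len, rank = 768, 20, 128, 8

soft_prompt = torch.nn.Parameter(torch.randn(prompt_len, d_model) * 0.02)
lora_a = torch.nn.Parameter(torch.randn(seq_len, rank) * 0.02)  # low-rank pair
lora_b = torch.nn.Parameter(torch.zeros(rank, d_model))

def build_inputs(word_embeddings: torch.Tensor) -> torch.Tensor:
    """word_embeddings: (seq_len, d_model) frozen embeddings of the input tokens."""
    updated = word_embeddings + lora_a @ lora_b       # low-rank update of the input
    return torch.cat([soft_prompt, updated], dim=0)   # prepend the shorter soft prompt

# Two learning rates for the two decomposed parts, as the method prescribes.
optimizer = torch.optim.AdamW(
    [{"params": [soft_prompt], "lr": 3e-1},
     {"params": [lora_a, lora_b], "lr": 5e-4}]
)

inputs = build_inputs(torch.randn(seq_len, d_model))  # (prompt_len + seq_len, d_model)
```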
+
+ comment: ICLR 2024. Code is available at https://github.com/ZhengxiangShi/DePT +
+
+
+
+
+ + ♻ ☆ DiffuMask: Synthesizing Images with Pixel-level Annotations for Semantic + Segmentation Using Diffusion Models + + +
+ Collecting and annotating images with pixel-wise labels is time-consuming and laborious. In contrast, synthetic data can be made freely available using a generative model (e.g., DALL-E, Stable Diffusion). In this paper, we show that it is possible to automatically obtain accurate semantic masks of synthetic images generated by the off-the-shelf Stable Diffusion model, which uses only text-image pairs during training. Our approach, called DiffuMask, exploits the potential of the cross-attention map between text and image, making it natural and seamless to extend text-driven image synthesis to semantic mask generation. DiffuMask uses text-guided cross-attention information to localize class/word-specific regions, which are combined with practical techniques to create a novel high-resolution and class-discriminative pixel-wise mask. These methods help to reduce data collection and annotation costs considerably. Experiments demonstrate that existing segmentation methods trained on the synthetic data of DiffuMask can achieve performance competitive with their counterparts trained on real data (VOC 2012, Cityscapes). For some classes (e.g., bird), DiffuMask presents promising performance, close to the state-of-the-art result obtained with real data (within a 3% mIoU gap). Moreover, in the open-vocabulary segmentation (zero-shot) setting, DiffuMask achieves a new SOTA result on the unseen classes of VOC 2012. The project website can be found at https://weijiawu.github.io/DiffusionMask/.
+
+
+
+
+ + ♻ ☆ Enhancing Visibility in Nighttime Haze Images Using Guided APSF and + Gradient Adaptive Convolution + + +
+ Visibility in hazy nighttime scenes is frequently reduced by multiple factors, including low light, intense glow, light scattering, and the presence of multicolored light sources. Existing nighttime dehazing methods often struggle with handling glow or low-light conditions, resulting in either excessively dark visuals or unsuppressed glow outputs. In this paper, we enhance the visibility from a single nighttime haze image by suppressing glow and enhancing low-light regions. To handle glow effects, our framework learns from rendered glow pairs. Specifically, a light-source-aware network is proposed to detect the light sources of night images, followed by APSF (Atmospheric Point Spread Function)-guided glow rendering. Our framework is then trained on the rendered images, resulting in glow suppression. Moreover, we utilize gradient-adaptive convolution to capture edges and textures in hazy scenes. By leveraging extracted edges and textures, we enhance the contrast of the scene without losing important structural details. To boost intensity in low-light regions, our network learns an attention map that is then adjusted by gamma correction. This attention map has high values in low-light regions and low values in haze and glow regions. Extensive evaluation on real nighttime haze images demonstrates the effectiveness of our method. Our experiments show that our method achieves a PSNR of 30.38dB, outperforming state-of-the-art methods by 13% on the GTA5 nighttime haze dataset. Our data and code are available at https://github.com/jinyeying/nighttime_dehaze.
+
+ comment: Accepted to ACM'MM2023, https://github.com/jinyeying/nighttime_dehaze +
+
+
+
+
+ + ♻ ☆ Ultrafast and Ultralight Network-Based Intelligent System for Real-time + Diagnosis of Ear diseases in Any Devices + + +
+ Traditional ear disease diagnosis heavily depends on experienced specialists +and specialized equipment, frequently resulting in misdiagnoses, treatment +delays, and financial burdens for some patients. Utilizing deep learning models +for efficient ear disease diagnosis has proven effective and affordable. +However, existing research overlooked model inference speed and parameter size +required for deployment. To tackle these challenges, we constructed a +large-scale dataset comprising eight ear disease categories and normal ear +canal samples from two hospitals. Inspired by ShuffleNetV2, we developed +Best-EarNet, an ultrafast and ultralight network enabling real-time ear disease +diagnosis. Best-EarNet incorporates the novel Local-Global Spatial Feature +Fusion Module which can capture global and local spatial information +simultaneously and guide the network to focus on crucial regions within feature +maps at various levels, mitigating low accuracy issues. Moreover, our network +uses multiple auxiliary classification heads for efficient parameter +optimization. With 0.77M parameters, Best-EarNet achieves an average frames per +second of 80 on CPU. Employing transfer learning and five-fold cross-validation +with 22,581 images from Hospital-1, the model achieves an impressive 95.23% +accuracy. External testing on 1,652 images from Hospital-2 validates its +performance, yielding 92.14% accuracy. Compared to state-of-the-art networks, +Best-EarNet establishes a new state-of-the-art (SOTA) in practical +applications. Most importantly, we developed an intelligent diagnosis system +called Ear Keeper, which can be deployed on common electronic devices. By +manipulating a compact electronic otoscope, users can perform comprehensive +scanning and diagnosis of the ear canal using real-time video. This study +provides a novel paradigm for ear endoscopy and other medical endoscopic image +recognition applications. + +
+
+
+
+
+ + ♻ ☆ Generic Knowledge Boosted Pre-training For Remote Sensing Images + + +
+ Deep learning models are essential for scene classification, change +detection, land cover segmentation, and other remote sensing image +understanding tasks. Most backbones of existing remote sensing deep learning +models are typically initialized by pre-trained weights obtained from ImageNet +pre-training (IMP). However, domain gaps exist between remote sensing images +and natural images (e.g., ImageNet), making deep learning models initialized by +pre-trained weights of IMP perform poorly for remote sensing image +understanding. Although some pre-training methods are studied in the remote +sensing community, current remote sensing pre-training methods face the problem +of vague generalization by only using remote sensing images. In this paper, we +propose a novel remote sensing pre-training framework, Generic Knowledge +Boosted Remote Sensing Pre-training (GeRSP), to learn robust representations +from remote sensing and natural images for remote sensing understanding tasks. +GeRSP contains two pre-training branches: (1) A self-supervised pre-training +branch is adopted to learn domain-related representations from unlabeled remote +sensing images. (2) A supervised pre-training branch is integrated into GeRSP +for general knowledge learning from labeled natural images. Moreover, GeRSP +combines two pre-training branches using a teacher-student architecture to +simultaneously learn representations with general and special knowledge, which +generates a powerful pre-trained model for deep learning model initialization. +Finally, we evaluate GeRSP and other remote sensing pre-training methods on +three downstream tasks, i.e., object detection, semantic segmentation, and +scene classification. The extensive experimental results consistently +demonstrate that GeRSP can effectively learn robust representations in a +unified manner, improving the performance of remote sensing downstream tasks. + +
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ GaussianHead: High-fidelity Head Avatars with Learnable Gaussian + Derivation + + +
+ Constructing vivid 3D head avatars for given subjects and realizing a series +of animations on them is valuable yet challenging. This paper presents +GaussianHead, which models the actional human head with anisotropic 3D +Gaussians. In our framework, a motion deformation field and multi-resolution +tri-plane are constructed respectively to deal with the head's dynamic geometry +and complex texture. Notably, we impose an exclusive derivation scheme on each +Gaussian, which generates its multiple doppelgangers through a set of learnable +parameters for position transformation. With this design, we can compactly and +accurately encode the appearance information of Gaussians, even those fitting +the head's particular components with sophisticated structures. In addition, an +inherited derivation strategy for newly added Gaussians is adopted to +facilitate training acceleration. Extensive experiments show that our method +can produce high-fidelity renderings, outperforming state-of-the-art approaches +in reconstruction, cross-identity reenactment, and novel view synthesis tasks. +Our code is available at: https://github.com/chiehwangs/gaussian-head. + +
+
+
+
+
+ + ♻ ☆ Toward Sufficient Spatial-Frequency Interaction for Gradient-aware + Underwater Image Enhancement ICASSP 2024 + + +
+ Underwater images suffer from complex and diverse degradation, which +inevitably affects the performance of underwater visual tasks. However, most +existing learning-based Underwater image enhancement (UIE) methods mainly +restore such degradations in the spatial domain, and rarely pay attention to +the fourier frequency information. In this paper, we develop a novel UIE +framework based on spatial-frequency interaction and gradient maps, namely +SFGNet, which consists of two stages. Specifically, in the first stage, we +propose a dense spatial-frequency fusion network (DSFFNet), mainly including +our designed dense fourier fusion block and dense spatial fusion block, +achieving sufficient spatial-frequency interaction by cross connections between +these two blocks. In the second stage, we propose a gradient-aware corrector +(GAC) to further enhance perceptual details and geometric structures of images +by gradient map. Experimental results on two real-world underwater image +datasets show that our approach can successfully enhance underwater images, and +achieves competitive performance in visual quality improvement. The code is +available at https://github.com/zhihefang/SFGNet. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ CLID: Controlled-Length Image Descriptions with Limited Data + + +
+ Controllable image captioning models generate human-like image descriptions, +enabling some kind of control over the generated captions. This paper focuses +on controlling the caption length, i.e. a short and concise description or a +long and detailed one. Since existing image captioning datasets contain mostly +short captions, generating long captions is challenging. To address the +shortage of long training examples, we propose to enrich the dataset with +varying-length self-generated captions. These, however, might be of varying +quality and are thus unsuitable for conventional training. We introduce a novel +training strategy that selects the data points to be used at different times +during the training. Our method dramatically improves the length-control +abilities, while exhibiting SoTA performance in terms of caption quality. Our +approach is general and is shown to be applicable also to paragraph generation. + +
+
+
+
+
+ + ♻ ☆ Deep Tiny Network for Recognition-Oriented Face Image Quality Assessment + + +
+ Face recognition has made significant progress in recent years due to deep +convolutional neural networks (CNN). In many face recognition (FR) scenarios, +face images are acquired from a sequence with huge intra-variations. These +intra-variations, which are mainly affected by the low-quality face images, +cause instability of recognition performance. Previous works have focused on +ad-hoc methods to select frames from a video or use face image quality +assessment (FIQA) methods, which consider only a particular or combination of +several distortions. + In this work, we present an efficient non-reference image quality assessment +for FR that directly links image quality assessment (IQA) and FR. More +specifically, we propose a new measurement to evaluate image quality without +any reference. Based on the proposed quality measurement, we propose a deep +Tiny Face Quality network (tinyFQnet) to learn a quality prediction function +from data. + We evaluate the proposed method for different powerful FR models on two +classical video-based (or template-based) benchmark: IJB-B and YTF. Extensive +experiments show that, although the tinyFQnet is much smaller than the others, +the proposed method outperforms state-of-the-art quality assessment methods in +terms of effectiveness and efficiency. + +
+
+
+
+
+ + ♻ ☆ Learning to Generalize over Subpartitions for Heterogeneity-aware Domain + Adaptive Nuclei Segmentation + + +
+ Annotation scarcity and cross-modality/stain data distribution shifts are two +major obstacles hindering the application of deep learning models for nuclei +analysis, which holds a broad spectrum of potential applications in digital +pathology. Recently, unsupervised domain adaptation (UDA) methods have been +proposed to mitigate the distributional gap between different imaging +modalities for unsupervised nuclei segmentation in histopathology images. +However, existing UDA methods are built upon the assumption that data +distributions within each domain should be uniform. Based on the +over-simplified supposition, they propose to align the histopathology target +domain with the source domain integrally, neglecting severe intra-domain +discrepancy over subpartitions incurred by mixed cancer types and sampling +organs. In this paper, for the first time, we propose to explicitly consider +the heterogeneity within the histopathology domain and introduce open compound +domain adaptation (OCDA) to resolve the crux. In specific, a two-stage +disentanglement framework is proposed to acquire domain-invariant feature +representations at both image and instance levels. The holistic design +addresses the limitations of existing OCDA approaches which struggle to capture +instance-wise variations. Two regularization strategies are specifically +devised herein to leverage the rich subpartition-specific characteristics in +histopathology images and facilitate subdomain decomposition. Moreover, we +propose a dual-branch nucleus shape and structure preserving module to prevent +nucleus over-generation and deformation in the synthesized images. Experimental +results on both cross-modality and cross-stain scenarios over a broad range of +diverse datasets demonstrate the superiority of our method compared with +state-of-the-art UDA and OCDA methods. + +
+
+
+
+
+ + ♻ ☆ MVBench: A Comprehensive Multi-modal Video Understanding Benchmark + + +
+ With the rapid development of Multi-modal Large Language Models (MLLMs), a +number of diagnostic benchmarks have recently emerged to evaluate the +comprehension capabilities of these models. However, most benchmarks +predominantly assess spatial understanding in the static image tasks, while +overlooking temporal understanding in the dynamic video tasks. To alleviate +this issue, we introduce a comprehensive Multi-modal Video understanding +Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot +be effectively solved with a single frame. Specifically, we first introduce a +novel static-to-dynamic method to define these temporal-related tasks. By +transforming various static tasks into dynamic ones, we enable the systematic +generation of video tasks that require a broad spectrum of temporal skills, +ranging from perception to cognition. Then, guided by the task definition, we +automatically convert public video annotations into multiple-choice QA to +evaluate each task. On one hand, such a distinct paradigm allows us to build +MVBench efficiently, without much manual intervention. On the other hand, it +guarantees evaluation fairness with ground-truth video annotations, avoiding +the biased scoring of LLMs. Moreover, we further develop a robust video MLLM +baseline, i.e., VideoChat2, by progressive multi-modal training with diverse +instruction-tuning data. The extensive results on our MVBench reveal that, the +existing MLLMs are far from satisfactory in temporal understanding, while our +VideoChat2 largely surpasses these leading models by over 15% on MVBench. All +models and data are available at https://github.com/OpenGVLab/Ask-Anything. + +
+
+ comment: 18 pages, 7 figures, 19 tables +
+
+
+
+
+ + ♻ ☆ Towards Identifiable Unsupervised Domain Translation: A Diversified + Distribution Matching Approach + + +
+ Unsupervised domain translation (UDT) aims to find functions that convert +samples from one domain (e.g., sketches) to another domain (e.g., photos) +without changing the high-level semantic meaning (also referred to as +``content''). The translation functions are often sought by probability +distribution matching of the transformed source domain and target domain. +CycleGAN stands as arguably the most representative approach among this line of +work. However, it was noticed in the literature that CycleGAN and variants +could fail to identify the desired translation functions and produce +content-misaligned translations. This limitation arises due to the presence of +multiple translation functions -- referred to as ``measure-preserving +automorphism" (MPA) -- in the solution space of the learning criteria. Despite +awareness of such identifiability issues, solutions have remained elusive. This +study delves into the core identifiability inquiry and introduces an MPA +elimination theory. Our analysis shows that MPA is unlikely to exist, if +multiple pairs of diverse cross-domain conditional distributions are matched by +the learning function. Our theory leads to a UDT learner using distribution +matching over auxiliary variable-induced subsets of the domains -- other than +over the entire data domains as in the classical approaches. The proposed +framework is the first to rigorously establish translation identifiability +under reasonable UDT settings, to our best knowledge. Experiments corroborate +with our theoretical claims. + +
+
+
+
+
+ + ♻ ☆ Animal3D: A Comprehensive Dataset of 3D Animal Pose and Shape + + +
+ Accurately estimating the 3D pose and shape is an essential step towards +understanding animal behavior, and can potentially benefit many downstream +applications, such as wildlife conservation. However, research in this area is +held back by the lack of a comprehensive and diverse dataset with high-quality +3D pose and shape annotations. In this paper, we propose Animal3D, the first +comprehensive dataset for mammal animal 3D pose and shape estimation. Animal3D +consists of 3379 images collected from 40 mammal species, high-quality +annotations of 26 keypoints, and importantly the pose and shape parameters of +the SMAL model. All annotations were labeled and checked manually in a +multi-stage process to ensure highest quality results. Based on the Animal3D +dataset, we benchmark representative shape and pose estimation models at: (1) +supervised learning from only the Animal3D data, (2) synthetic to real transfer +from synthetically generated images, and (3) fine-tuning human pose and shape +estimation models. Our experimental results demonstrate that predicting the 3D +shape and pose of animals across species remains a very challenging task, +despite significant advances in human pose estimation. Our results further +demonstrate that synthetic pre-training is a viable strategy to boost the model +performance. Overall, Animal3D opens new directions for facilitating future +research in animal 3D pose and shape estimation, and is publicly available. + +
+
+ comment: 11 pages, 5 figures, link to the dataset: + https://xujiacong.github.io/Animal3D/ +
+
+
+
+
+
+
+
+ + Information Retrieval 9 + +
+
+
+ + ☆ What Are We Optimizing For? A Human-centric Evaluation Of Deep + Learning-based Recommender Systems + + +
+ Deep learning-based (DL) models in recommender systems (RecSys) have gained
+significant recognition for their remarkable accuracy in predicting user
+preferences. However, their performance often lacks a comprehensive evaluation
+from a human-centric perspective, which encompasses various dimensions beyond
+simple interest matching. In this work, we have developed a robust
+human-centric evaluation framework that incorporates seven diverse metrics to
+assess the quality of recommendations generated by five recent open-sourced DL
+models. Our evaluation datasets consist of both offline benchmark data and
+personalized online recommendation feedback collected from 445 real users. We
+find that (1) different DL models have different pros and cons in the
+multi-dimensional metrics that we test with; (2) users generally want accuracy
+combined with at least one other human value in the recommendation; and (3) the
+degree to which different values are combined needs to be carefully tuned to
+users' preferred levels.
+
+
+
+
+
+ + ☆ In-context Learning with Retrieved Demonstrations for Language Models: A + Survey + + +
+ Language models, especially pre-trained large language models, have showcased +remarkable abilities as few-shot in-context learners (ICL), adept at adapting +to new tasks with just a few demonstrations in the input context. However, the +model's ability to perform ICL is sensitive to the choice of the few-shot +demonstrations. Instead of using a fixed set of demonstrations, one recent +development is to retrieve demonstrations tailored to each input query. The +implementation of demonstration retrieval is relatively straightforward, +leveraging existing databases and retrieval systems. This not only improves the +efficiency and scalability of the learning process but also has been shown to +reduce biases inherent in manual example selection. In light of the encouraging +results and growing research in ICL with retrieved demonstrations, we conduct +an extensive review of studies in this area. In this survey, we discuss and +compare different design choices for retrieval models, retrieval training +procedures, and inference algorithms. + +
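To make the retrieval step concrete, here is a minimal sketch that selects demonstrations by cosine similarity of sentence embeddings and assembles them into a prompt. The encoder choice, the demonstration pool, and the prompt format below are illustrative assumptions, not a specific method from the survey.

```python
# Minimal sketch of retrieval-based demonstration selection for in-context
# learning. The pool, query, and prompt format are invented examples.
from sentence_transformers import SentenceTransformer
import numpy as np

encoder = SentenceTransformer("all-MiniLM-L6-v2")

# Hypothetical labeled pool of (input, label) demonstrations.
pool = [
    ("The movie was a delight from start to finish.", "positive"),
    ("I want my money back, total waste of time.", "negative"),
    ("A gripping plot with flat characters.", "mixed"),
]

def build_prompt(query: str, k: int = 2) -> str:
    """Retrieve the k most similar demonstrations and prepend them to the query."""
    texts = [x for x, _ in pool]
    emb = encoder.encode(texts + [query], normalize_embeddings=True)
    sims = emb[:-1] @ emb[-1]                # cosine similarity to the query
    top = np.argsort(-sims)[:k]              # indices of the closest demonstrations
    demos = "\n".join(f"Review: {pool[i][0]}\nSentiment: {pool[i][1]}" for i in top)
    return f"{demos}\nReview: {query}\nSentiment:"

print(build_prompt("Great acting, but the ending felt rushed."))
```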
+
+
+
+
+ + ☆ Simple Domain Adaptation for Sparse Retrievers ECIR 2024 + + +
+ In Information Retrieval, and more generally in Natural Language Processing, +adapting models to specific domains is conducted through fine-tuning. Despite +the successes achieved by this method and its versatility, the need for +human-curated and labeled data makes it impractical to transfer to new tasks, +domains, and/or languages when training data doesn't exist. Using the model +without training (zero-shot) is another option that however suffers an +effectiveness cost, especially in the case of first-stage retrievers. Numerous +research directions have emerged to tackle these issues, most of them in the +context of adapting to a task or a language. However, the literature is scarcer +for domain (or topic) adaptation. In this paper, we address this issue of +cross-topic discrepancy for a sparse first-stage retriever by transposing a +method initially designed for language adaptation. By leveraging pre-training +on the target data to learn domain-specific knowledge, this technique +alleviates the need for annotated data and expands the scope of domain +adaptation. Despite their relatively good generalization ability, we show that +even sparse retrievers can benefit from our simple domain adaptation method. + +
+
+ comment: Accepted at ECIR 2024 +
+
+
+
+
+ + ☆ Enhancing Recommendation Diversity by Re-ranking with Large Language + Models + + +
+ It has long been recognized that it is not enough for a Recommender System +(RS) to provide recommendations based only on their relevance to users. Among +many other criteria, the set of recommendations may need to be diverse in order +to handle uncertainty and offer a meaningful choice. The literature reports +many ways of measuring diversity and ways of improving the diversity of a set +of recommendations, most notably by re-ranking and selecting from a larger set +of candidate recommendations. Driven by promising insights from the literature +on how to incorporate versatile Large Language Models (LLMs) into the RS +pipeline, in this paper, we show how LLMs can be used for diversity re-ranking. + We begin with an informal study that verifies that LLMs can be used for +re-ranking tasks and do have some understanding of the concept of diversity. +Then, we design a more rigorous methodology where LLMs are prompted to generate +a diverse ranking from a candidate ranking using various prompt templates with +different re-ranking instructions in a zero-shot fashion. We conduct +comprehensive experiments testing state-of-the-art conversational LLMs from the +GPT and Llama families. We compare their re-ranking capabilities with random +re-ranking and various traditional re-ranking methods from the literature (MMR, +xQuAD and RxQuAD). We find that LLM-based re-ranking outperforms random +re-ranking across all the metrics that we use but does not perform as well as +the traditional re-ranking methods. We gain insight into prompt design for this +task (e.g.\ on the whole, it is better to prompt for diversity rather than a +balance of diversity and relevance). Given that no special knowledge +engineering is needed, we conclude that LLM-based re-ranking is a promising +approach, and we highlight directions for future research. We open-source the +code of our experiments for reproducibility. + +
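To illustrate the zero-shot re-ranking setup, the sketch below builds one possible diversity re-ranking prompt from a candidate list. The item catalogue, the prompt wording, and the `call_llm` stub are invented for illustration; they are not the templates evaluated in the paper.

```python
# Illustrative zero-shot prompt for LLM-based diversity re-ranking.
candidates = [
    "The Godfather (crime drama)",
    "Goodfellas (crime drama)",
    "Spirited Away (animated fantasy)",
    "Casino (crime drama)",
    "Amelie (romantic comedy)",
]

def diversity_rerank_prompt(items, top_k=3):
    """Format the candidate ranking and ask the model for a diverse subset."""
    numbered = "\n".join(f"{i + 1}. {item}" for i, item in enumerate(items))
    return (
        "You are re-ranking movie recommendations for a user.\n"
        f"Candidate list:\n{numbered}\n\n"
        f"Return the numbers of the {top_k} items that together form the most "
        "diverse set in terms of genre, keeping items the user is likely to enjoy. "
        "Answer with a comma-separated list of numbers only."
    )

def call_llm(prompt: str) -> str:
    # Placeholder: plug in whichever chat model is being evaluated.
    raise NotImplementedError

print(diversity_rerank_prompt(candidates))
```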
+
+ comment: 32 pages, 2 figures +
+
+
+
+
+ + ☆ CheX-GPT: Harnessing Large Language Models for Enhanced Chest X-ray + Report Labeling + + +
+ Free-text radiology reports present a rich data source for various medical +tasks, but effectively labeling these texts remains challenging. Traditional +rule-based labeling methods fall short of capturing the nuances of diverse +free-text patterns. Moreover, models using expert-annotated data are limited by +data scarcity and pre-defined classes, impacting their performance, flexibility +and scalability. To address these issues, our study offers three main +contributions: 1) We demonstrate the potential of GPT as an adept labeler using +carefully designed prompts. 2) Utilizing only the data labeled by GPT, we +trained a BERT-based labeler, CheX-GPT, which operates faster and more +efficiently than its GPT counterpart. 3) To benchmark labeler performance, we +introduced a publicly available expert-annotated test set, MIMIC-500, +comprising 500 cases from the MIMIC validation set. Our findings demonstrate +that CheX-GPT not only excels in labeling accuracy over existing models, but +also showcases superior efficiency, flexibility, and scalability, supported by +our introduction of the MIMIC-500 dataset for robust benchmarking. Code and +models are available at https://github.com/kakaobrain/CheXGPT. + +
+
+ comment: 16 pages, 3 figures +
+
+
+
+
+ + ☆ D2K: Turning Historical Data into Retrievable Knowledge for Recommender + Systems + + +
+ A vast amount of user behavior data is constantly accumulating on today's
+large recommendation platforms, recording users' various interests and tastes.
+Preserving knowledge from the old data while new data continually arrives is a
+vital problem for recommender systems. Existing approaches generally seek to
+save the knowledge implicitly in the model parameters. However, such a
+parameter-centric approach lacks scalability and flexibility -- the capacity is
+hard to scale, and the knowledge cannot be utilized flexibly. Hence, in this
+work, we propose a framework that turns massive user behavior data into retrievable
+knowledge (D2K). It is a data-centric approach that is model-agnostic and easy
+to scale up. Different from only storing unary knowledge such as the user-side
+or item-side information, D2K proposes to store ternary knowledge for
+recommendation, which is determined by the complete recommendation factors --
+user, item, and context. The knowledge retrieved by target samples can be
+directly used to enhance the performance of any recommendation algorithm.
+Specifically, we introduce a Transformer-based knowledge encoder to transform
+the old data into knowledge with the user-item-context cross features. A
+personalized knowledge adaptation unit is devised to effectively exploit the
+information from the knowledge base by adapting the retrieved knowledge to the
+target samples. Extensive experiments on two public datasets show that D2K
+significantly outperforms existing baselines and is compatible with a major
+collection of recommendation algorithms.
+
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ Estimating the Usefulness of Clarifying Questions and Answers for + Conversational Search ECIR '24 + + +
+ While the body of research directed towards constructing and generating
+clarifying questions in mixed-initiative conversational search systems is vast,
+research aimed at processing and comprehending users' answers to such questions
+is scarce. To this end, we present a simple yet effective method for processing
+answers to clarifying questions, moving away from previous work that simply
+appends answers to the original query and thus potentially degrades retrieval
+performance. Specifically, we propose a classifier for assessing the usefulness
+of the prompted clarifying question and the answer given by the user. Useful
+questions or answers are further appended to the conversation history and
+passed to a transformer-based query rewriting module. Results demonstrate
+significant improvements over strong non-mixed-initiative baselines.
+Furthermore, the proposed approach mitigates the performance drops when
+non-useful questions and answers are utilized.
+
+
+ comment: This is the author's version of the work. The definitive version is + published in: Proceedings of the 46th European Conference on Information + Retrieval (ECIR '24), March 24-28, 2024, Glasgow, Scotland +
+
+
+
+
+ + ☆ Towards Reliable and Factual Response Generation: Detecting Unanswerable + Questions in Information-Seeking Conversations ECIR '24 + + +
+ Generative AI models face the challenge of hallucinations that can undermine +users' trust in such systems. We approach the problem of conversational +information seeking as a two-step process, where relevant passages in a corpus +are identified first and then summarized into a final system response. This way +we can automatically assess if the answer to the user's question is present in +the corpus. Specifically, our proposed method employs a sentence-level +classifier to detect if the answer is present, then aggregates these +predictions on the passage level, and eventually across the top-ranked passages +to arrive at a final answerability estimate. For training and evaluation, we +develop a dataset based on the TREC CAsT benchmark that includes answerability +labels on the sentence, passage, and ranking levels. We demonstrate that our +proposed method represents a strong baseline and outperforms a state-of-the-art +LLM on the answerability prediction task. + +
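The sentence-to-passage-to-ranking aggregation can be sketched in a few lines. The max/mean aggregators and the threshold below are illustrative assumptions rather than the exact scheme from the paper.

```python
# Minimal sketch of aggregating sentence-level answerability scores to the
# passage level and then across the top-ranked passages.
from statistics import mean

def passage_answerability(sentence_probs):
    """A passage is as answerable as its most answerable sentence (assumption)."""
    return max(sentence_probs)

def ranking_answerability(passages, top_k=3, threshold=0.5):
    """Aggregate over the top-ranked passages and decide answerability."""
    scores = [passage_answerability(p) for p in passages[:top_k]]
    score = mean(scores)
    return score, score >= threshold

# Hypothetical sentence-level classifier outputs for three ranked passages.
ranked_passages = [
    [0.1, 0.8, 0.3],   # passage 1: one sentence likely contains the answer
    [0.2, 0.1],        # passage 2
    [0.05, 0.4, 0.2],  # passage 3
]
print(ranking_answerability(ranked_passages))  # -> (score, is_answerable)
```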
+
+ comment: This is the author's version of the work. The definitive version is + published in: Proceedings of the 46th European Conference on Information + Retrieval} (ECIR '24), March 24--28, 2024, Glasgow, Scotland +
+
+
+
+
+ + ☆ On-Device Recommender Systems: A Comprehensive Survey + + +
+ Recommender systems have been widely deployed in various real-world +applications to help users identify content of interest from massive amounts of +information. Traditional recommender systems work by collecting user-item +interaction data in a cloud-based data center and training a centralized model +to perform the recommendation service. However, such cloud-based recommender +systems (CloudRSs) inevitably suffer from excessive resource consumption, +response latency, as well as privacy and security risks concerning both data +and models. Recently, driven by the advances in storage, communication, and +computation capabilities of edge devices, there has been a shift of focus from +CloudRSs to on-device recommender systems (DeviceRSs), which leverage the +capabilities of edge devices to minimize centralized data storage requirements, +reduce the response latency caused by communication overheads, and enhance user +privacy and security by localizing data processing and model training. Despite +the rapid rise of DeviceRSs, there is a clear absence of timely literature +reviews that systematically introduce, categorize and contrast these methods. +To bridge this gap, we aim to provide a comprehensive survey of DeviceRSs, +covering three main aspects: (1) the deployment and inference of DeviceRSs (2) +the training and update of DeviceRSs (3) the security and privacy of DeviceRSs. +Furthermore, we provide a fine-grained and systematic taxonomy of the methods +involved in each aspect, followed by a discussion regarding challenges and +future research directions. This is the first comprehensive survey on DeviceRSs +that covers a spectrum of tasks to fit various needs. We believe this survey +will help readers effectively grasp the current research status in this field, +equip them with relevant technical foundations, and stimulate new research +ideas for developing DeviceRSs. + +
+
+
+
+
+
+
+
+ + Machine Learning 13 + +
+
+
+ + ☆ What Are We Optimizing For? A Human-centric Evaluation Of Deep + Learning-based Recommender Systems + + +
+ Deep learning-based (DL) models in recommender systems (RecSys) have gained
+significant recognition for their remarkable accuracy in predicting user
+preferences. However, their performance often lacks a comprehensive evaluation
+from a human-centric perspective, which encompasses various dimensions beyond
+simple interest matching. In this work, we have developed a robust
+human-centric evaluation framework that incorporates seven diverse metrics to
+assess the quality of recommendations generated by five recent open-sourced DL
+models. Our evaluation datasets consist of both offline benchmark data and
+personalized online recommendation feedback collected from 445 real users. We
+find that (1) different DL models have different pros and cons in the
+multi-dimensional metrics that we test with; (2) users generally want accuracy
+combined with at least one other human value in the recommendation; and (3) the
+degree to which different values are combined needs to be carefully tuned to
+users' preferred levels.
+
+
+
+
+
+ + ☆ Text-to-Image Cross-Modal Generation: A Systematic Review + + +
+ We review research on generating visual data from text from the angle of +"cross-modal generation." This point of view allows us to draw parallels +between various methods geared towards working on input text and producing +visual output, without limiting the analysis to narrow sub-areas. It also +results in the identification of common templates in the field, which are then +compared and contrasted both within pools of similar methods and across lines +of research. We provide a breakdown of text-to-image generation into various +flavors of image-from-text methods, video-from-text methods, image editing, +self-supervised and graph-based approaches. In this discussion, we focus on +research papers published at 8 leading machine learning conferences in the +years 2016-2022, also incorporating a number of relevant papers not matching +the outlined search criteria. The conducted review suggests a significant +increase in the number of papers published in the area and highlights research +gaps and potential lines of investigation. To our knowledge, this is the first +review to systematically look at text-to-image generation from the perspective +of "cross-modal generation." + +
+
+
+
+
+ + ☆ Reframing Offline Reinforcement Learning as a Regression Problem + + +
+ The study proposes the reformulation of offline reinforcement learning as a +regression problem that can be solved with decision trees. Aiming to predict +actions based on input states, return-to-go (RTG), and timestep information, we +observe that with gradient-boosted trees, the agent training and inference are +very fast, the former taking less than a minute. Despite the simplification +inherent in this reformulated problem, our agent demonstrates performance that +is at least on par with established methods. This assertion is validated by +testing it across standard datasets associated with D4RL Gym-MuJoCo tasks. We +further discuss the agent's ability to generalize by testing it on two extreme +cases, how it learns to model the return distributions effectively even with +highly skewed expert datasets, and how it exhibits robust performance in +scenarios with sparse/delayed rewards. + +
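A toy sketch of the reformulation on synthetic data is shown below (the paper's experiments use D4RL Gym-MuJoCo). The feature layout, the behaviour policy, and the target return used at inference are invented for illustration.

```python
# Offline RL as regression: fit a gradient-boosted model mapping
# (state, return-to-go, timestep) to an action on synthetic data.
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

rng = np.random.default_rng(0)
n, state_dim = 5000, 4
states = rng.normal(size=(n, state_dim))
rtg = rng.uniform(0.0, 100.0, size=(n, 1))            # return-to-go
timestep = rng.integers(0, 1000, size=(n, 1)).astype(float)
X = np.hstack([states, rtg, timestep])
# Hypothetical behaviour policy: action depends on state and scaled RTG.
y = states[:, 0] * 0.5 + 0.01 * rtg[:, 0] + rng.normal(scale=0.05, size=n)

model = GradientBoostingRegressor(n_estimators=200)
model.fit(X, y)

# At inference, condition on a high target return to elicit "expert" actions.
query = np.hstack([rng.normal(size=(1, state_dim)), [[95.0]], [[0.0]]])
print("predicted action:", model.predict(query))
```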
+
+
+
+
+ + ☆ Tight Verification of Probabilistic Robustness in Bayesian Neural + Networks AISTATS 2024 + + +
+ We introduce two algorithms for computing tight guarantees on the +probabilistic robustness of Bayesian Neural Networks (BNNs). Computing +robustness guarantees for BNNs is a significantly more challenging task than +verifying the robustness of standard Neural Networks (NNs) because it requires +searching the parameters' space for safe weights. Moreover, tight and complete +approaches for the verification of standard NNs, such as those based on +Mixed-Integer Linear Programming (MILP), cannot be directly used for the +verification of BNNs because of the polynomial terms resulting from the +consecutive multiplication of variables encoding the weights. Our algorithms +efficiently and effectively search the parameters' space for safe weights by +using iterative expansion and the network's gradient and can be used with any +verification algorithm of choice for BNNs. In addition to proving that our +algorithms compute tighter bounds than the SoA, we also evaluate our algorithms +against the SoA on standard benchmarks, such as MNIST and CIFAR10, showing that +our algorithms compute bounds up to 40% tighter than the SoA. + +
+
+ comment: Accepted at AISTATS 2024 +
+
+
+
+
+ + ☆ Freely Long-Thinking Transformer (FraiLT) + + +
+ Freely Long-Thinking Transformer (FraiLT) is an improved transformer model +designed to enhance processing capabilities without scaling up size. It +utilizes a recursive approach, iterating over a subset of layers multiple +times, and introduces iteration encodings to maintain awareness across these +cycles. Iteration encoding allows FraiLT to achieve the interpretive depth of +larger models in a compact form. When evaluated on a synthetic story dataset, +FraiLT outperformed larger models, showcasing its ability to deliver +high-quality performance while reducing memory demands. This model represents a +step forward towards more efficient and accessible language models. + +
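A conceptual sketch of the core idea, iterating a shared transformer block while injecting a learned iteration encoding so the model knows which pass it is on, is given below. Layer sizes and the iteration count are arbitrary; this is not the released FraiLT code.

```python
# Re-apply one shared transformer block several times, adding a learned
# per-iteration embedding before each pass.
import torch
import torch.nn as nn

class RecurrentBlock(nn.Module):
    def __init__(self, d_model=128, n_head=4, n_iter=3):
        super().__init__()
        self.block = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=n_head, batch_first=True
        )
        self.iter_emb = nn.Embedding(n_iter, d_model)  # one vector per iteration
        self.n_iter = n_iter

    def forward(self, x):                               # x: (batch, seq, d_model)
        for i in range(self.n_iter):
            idx = torch.full((1,), i, dtype=torch.long, device=x.device)
            x = self.block(x + self.iter_emb(idx))      # broadcast over batch and seq
        return x

h = RecurrentBlock()(torch.randn(2, 16, 128))
print(h.shape)  # torch.Size([2, 16, 128])
```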
+
+
+
+
+ + ☆ Efficient local linearity regularization to overcome catastrophic + overfitting ICLR 2024 + + +
+ Catastrophic overfitting (CO) in single-step adversarial training (AT)
+results in abrupt drops in the adversarial test accuracy (even down to 0%). For
+models trained with multi-step AT, it has been observed that the loss function
+behaves locally linearly with respect to the input; however, this property is
+lost in single-step AT. To address CO in single-step AT, several methods have
+been proposed to enforce local linearity of the loss via regularization. However,
+these regularization terms considerably slow down training due to Double
+Backpropagation. Instead, in this work, we introduce a regularization term,
+called ELLE, to mitigate CO effectively and efficiently in classical AT
+evaluations, as well as some more difficult regimes, e.g., large adversarial
+perturbations and long training schedules. Our regularization term can be
+theoretically linked to curvature of the loss function and is computationally
+cheaper than previous methods by avoiding Double Backpropagation. Our thorough
+experimental validation demonstrates that our work does not suffer from CO,
+even in challenging settings where previous works suffer from it. We also
+notice that adapting our regularization parameter during training (ELLE-A)
+greatly improves the performance, especially in large $\epsilon$ setups. Our
+implementation is available at https://github.com/LIONS-EPFL/ELLE .
+
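One way to realize such a local-linearity penalty without double backpropagation is to compare the loss at a convex combination of two perturbed inputs with the same convex combination of the individual losses, using forward passes only. The sketch below follows that spirit; the sampling scheme and weighting are simplified assumptions rather than the exact ELLE formulation.

```python
# Local-linearity penalty computed from forward passes only.
import torch
import torch.nn.functional as F

def local_linearity_penalty(model, x, y, eps=8 / 255):
    delta_a = torch.empty_like(x).uniform_(-eps, eps)
    delta_b = torch.empty_like(x).uniform_(-eps, eps)
    alpha = torch.rand(x.size(0), *([1] * (x.dim() - 1)), device=x.device)

    loss_a = F.cross_entropy(model(x + delta_a), y, reduction="none")
    loss_b = F.cross_entropy(model(x + delta_b), y, reduction="none")
    x_mix = x + alpha * delta_a + (1 - alpha) * delta_b
    loss_mix = F.cross_entropy(model(x_mix), y, reduction="none")

    # Squared deviation of the mixed loss from the linear interpolation.
    a = alpha.flatten()
    return ((loss_mix - (a * loss_a + (1 - a) * loss_b)) ** 2).mean()

# Toy usage with a linear "model" on flat inputs.
model = torch.nn.Linear(32, 10)
x, y = torch.rand(4, 32), torch.randint(0, 10, (4,))
print(local_linearity_penalty(model, x, y))
```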
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ☆ Continuous Field Reconstruction from Sparse Observations with Implicit + Neural Networks + + +
+ Reliably reconstructing physical fields from sparse sensor data is a +challenge that frequently arises in many scientific domains. In practice, the +process generating the data often is not understood to sufficient accuracy. +Therefore, there is a growing interest in using the deep neural network route +to address the problem. This work presents a novel approach that learns a +continuous representation of the physical field using implicit neural +representations (INRs). Specifically, after factorizing spatiotemporal +variability into spatial and temporal components using the separation of +variables technique, the method learns relevant basis functions from sparsely +sampled irregular data points to develop a continuous representation of the +data. In experimental evaluations, the proposed model outperforms recent INR +methods, offering superior reconstruction quality on simulation data from a +state-of-the-art climate model and a second dataset that comprises ultra-high +resolution satellite-based sea surface temperature fields. + +
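A minimal sketch of the separation-of-variables idea is shown below: the field is modelled as a sum of products of learned spatial and temporal basis functions, each parameterized by a small MLP, fitted to sparse observations. The architecture sizes and the synthetic target are illustrative only.

```python
# Separation-of-variables implicit neural representation: u(x, t) ~ sum_k f_k(x) g_k(t).
import torch
import torch.nn as nn

K = 8  # number of learned basis functions

def mlp(in_dim, out_dim):
    return nn.Sequential(nn.Linear(in_dim, 64), nn.GELU(), nn.Linear(64, out_dim))

spatial_net, temporal_net = mlp(2, K), mlp(1, K)         # f_k(x), g_k(t)

def field(xy, t):
    return (spatial_net(xy) * temporal_net(t)).sum(dim=-1)

# Fit to sparse "sensor" observations of a synthetic travelling wave.
opt = torch.optim.Adam(
    list(spatial_net.parameters()) + list(temporal_net.parameters()), lr=1e-3
)
xy, t = torch.rand(256, 2), torch.rand(256, 1)
target = torch.sin(4 * (xy[:, 0] - t[:, 0]))

for step in range(200):
    opt.zero_grad()
    loss = ((field(xy, t) - target) ** 2).mean()
    loss.backward()
    opt.step()
print(float(loss))
```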
+
+ comment: 25 pages,21 figures +
+
+
+
+
+ + ☆ Graph Edits for Counterfactual Explanations: A Unified GNN Approach + + +
+ Counterfactuals have been established as a popular explainability technique +which leverages a set of minimal edits to alter the prediction of a classifier. +When considering conceptual counterfactuals, the edits requested should +correspond to salient concepts present in the input data. At the same time, +conceptual distances are defined by knowledge graphs, ensuring the optimality +of conceptual edits. In this work, we extend previous endeavors on conceptual +counterfactuals by introducing \textit{graph edits as counterfactual +explanations}: should we represent input data as graphs, which is the shortest +graph edit path that results in an alternative classification label as provided +by a black-box classifier? + +
+
+
+
+
+ + ☆ $\texttt{immrax}$: A Parallelizable and Differentiable Toolbox for + Interval Analysis and Mixed Monotone Reachability in JAX + + +
+ We present an implementation of interval analysis and mixed monotone interval +reachability analysis as function transforms in Python, fully composable with +the computational framework JAX. The resulting toolbox inherits several key +features from JAX, including computational efficiency through Just-In-Time +Compilation, GPU acceleration for quick parallelized computations, and +Automatic Differentiability. We demonstrate the toolbox's performance on +several case studies, including a reachability problem on a vehicle model +controlled by a neural network, and a robust closed-loop optimal control +problem for a swinging pendulum. + +
+
+
+
+
+ + ☆ Scalable High-Resolution Pixel-Space Image Synthesis with Hourglass + Diffusion Transformers + + +
+ We present the Hourglass Diffusion Transformer (HDiT), an image generative +model that exhibits linear scaling with pixel count, supporting training at +high-resolution (e.g. $1024 \times 1024$) directly in pixel-space. Building on +the Transformer architecture, which is known to scale to billions of +parameters, it bridges the gap between the efficiency of convolutional U-Nets +and the scalability of Transformers. HDiT trains successfully without typical +high-resolution training techniques such as multiscale architectures, latent +autoencoders or self-conditioning. We demonstrate that HDiT performs +competitively with existing models on ImageNet $256^2$, and sets a new +state-of-the-art for diffusion models on FFHQ-$1024^2$. + +
+
+ comment: 20 pages, 13 figures, project page and code available at + https://crowsonkb.github.io/hourglass-diffusion-transformers/ +
+
+
+
+
+ + ♻ ☆ Generator Identification for Linear SDEs with Additive and + Multiplicative Noise + + +
+ In this paper, we present conditions for identifying the generator of a +linear stochastic differential equation (SDE) from the distribution of its +solution process with a given fixed initial state. These identifiability +conditions are crucial in causal inference using linear SDEs as they enable the +identification of the post-intervention distributions from its observational +distribution. Specifically, we derive a sufficient and necessary condition for +identifying the generator of linear SDEs with additive noise, as well as a +sufficient condition for identifying the generator of linear SDEs with +multiplicative noise. We show that the conditions derived for both types of +SDEs are generic. Moreover, we offer geometric interpretations of the derived +identifiability conditions to enhance their understanding. To validate our +theoretical results, we perform a series of simulations, which support and +substantiate the established findings. + +
+
+
+
+
+ + ♻ ☆ PDE Generalization of In-Context Operator Networks: A Study on 1D Scalar + Nonlinear Conservation Laws + + +
+ Can we build a single large model for a wide range of PDE-related scientific +learning tasks? Can this model generalize to new PDEs, even of new forms, +without any fine-tuning? In-context operator learning and the corresponding +model In-Context Operator Networks (ICON) represent an initial exploration of +these questions. The capability of ICON regarding the first question has been +demonstrated previously. In this paper, we present a detailed methodology for +solving PDE problems with ICON, and show how a single ICON model can make +forward and reverse predictions for different equations with different strides, +provided with appropriately designed data prompts. We show the positive +evidence to the second question, i.e., ICON can generalize well to some PDEs +with new forms without any fine-tuning. This is exemplified through a study on +1D scalar nonlinear conservation laws, a family of PDEs with temporal +evolution. We also show how to broaden the range of problems that an ICON model +can address, by transforming functions and equations to ICON's capability +scope. We believe that the progress in this paper is a significant step towards +the goal of training a foundation model for PDE-related tasks under the +in-context operator learning framework. + +
+
+
+
+
+ + ♻ ☆ The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet + Transits + + +
+ This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase
+Folding and Convolutional Neural Network (CNN) system to detect exoplanets
+using the transit method. We devise a fast folding algorithm parallelized on a
+GPU to amplify low signal-to-noise ratio transit signals, allowing a search at
+high precision and speed. A CNN trained on two million synthetic light curves
+reports a score indicating the likelihood of a planetary signal at each period.
+While the GPFC method has broad applicability across period ranges, this
+research specifically focuses on detecting ultra-short-period planets with
+orbital periods less than one day. GPFC improves on speed by three orders of
+magnitude over the predominant Box-fitting Least Squares (BLS) method. Our
+simulation results show GPFC achieves $97\%$ training accuracy, higher true
+positive rate at the same false positive rate of detection, and higher
+precision at the same recall rate when compared to BLS. GPFC recovers $100\%$
+of known ultra-short-period planets in $\textit{Kepler}$ light curves from a
+blind search. These results highlight the promise of GPFC as an alternative
+approach to the traditional BLS algorithm for finding new transiting exoplanets
+in data taken with $\textit{Kepler}$ and other space transit missions such as
+K2, TESS and future PLATO and Earth 2.0.
+
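The core folding operation can be illustrated in plain NumPy (the paper parallelizes it over many trial periods on a GPU). The injected transit, trial period, and binning scheme below are synthetic examples, not the GPFC pipeline.

```python
# Phase folding: wrap a light curve at a trial period and average flux in phase bins.
import numpy as np

rng = np.random.default_rng(1)
time = np.sort(rng.uniform(0, 30.0, 20000))           # observation times in days
flux = 1.0 + rng.normal(scale=1e-3, size=time.size)   # noisy baseline flux
true_period, duration, depth = 0.7365, 0.02, 5e-4     # an ultra-short-period planet
in_transit = (time % true_period) < duration
flux[in_transit] -= depth                              # inject the transit signal

def fold_and_bin(time, flux, period, n_bins=200):
    """Fold the light curve at `period` and average the flux in phase bins."""
    phase = (time % period) / period
    bins = np.minimum((phase * n_bins).astype(int), n_bins - 1)
    summed = np.bincount(bins, weights=flux, minlength=n_bins)
    counts = np.bincount(bins, minlength=n_bins)
    return summed / np.maximum(counts, 1)

folded = fold_and_bin(time, flux, true_period)
print("deepest phase-bin depth:", 1.0 - folded.min())  # roughly the injected depth
```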
+
+ comment: 16 pages, 19 figures; Accepted for publication in the peer-reviewed + journal, Monthly Notices of the Royal Astronomical Society (MNRAS), on + January 20, 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 28 + +
+
+
+ + ☆ Analyzing Task-Encoding Tokens in Large Language Models + + +
+ In-context learning (ICL) has become an effective solution for few-shot +learning in natural language processing. Past work has found that, during this +process, representations of the last prompt token are utilized to store task +reasoning procedures, thereby explaining the working mechanism of in-context +learning. In this paper, we seek to locate and analyze other task-encoding +tokens whose representations store task reasoning procedures. Supported by +experiments that ablate the representations of different token types, we find +that template and stopword tokens are the most prone to be task-encoding +tokens. In addition, we demonstrate experimentally that lexical cues, +repetition, and text formats are the main distinguishing characteristics of +these tokens. Our work provides additional insights into how large language +models (LLMs) leverage task reasoning procedures in ICL and suggests that +future work may involve using task-encoding tokens to improve the computational +efficiency of LLMs at inference time and their ability to handle long +sequences. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ PRILoRA: Pruned and Rank-Increasing Low-Rank Adaptation EACL 2024 + + +
+ With the proliferation of large pre-trained language models (PLMs), +fine-tuning all model parameters becomes increasingly inefficient, particularly +when dealing with numerous downstream tasks that entail substantial training +and storage costs. Several approaches aimed at achieving parameter-efficient +fine-tuning (PEFT) have been proposed. Among them, Low-Rank Adaptation (LoRA) +stands out as an archetypal method, incorporating trainable rank decomposition +matrices into each target module. Nevertheless, LoRA does not consider the +varying importance of each layer. To address these challenges, we introduce +PRILoRA, which linearly allocates a different rank for each layer, in an +increasing manner, and performs pruning throughout the training process, +considering both the temporary magnitude of weights and the accumulated +statistics of the input to any given layer. We validate the effectiveness of +PRILoRA through extensive experiments on eight GLUE benchmarks, setting a new +state of the art. + +
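The "linearly increasing rank per layer" idea can be sketched as a simple schedule over layers; the rounding rule and bounds below are illustrative assumptions, not the exact allocation used in PRILoRA.

```python
# Allocate LoRA ranks that increase linearly with layer depth while keeping
# a target average rank budget.
def linear_rank_schedule(n_layers: int, avg_rank: int, min_rank: int = 2):
    """Return one LoRA rank per layer, increasing linearly, averaging ~avg_rank."""
    max_rank = 2 * avg_rank - min_rank
    return [
        round(min_rank + (max_rank - min_rank) * i / (n_layers - 1))
        for i in range(n_layers)
    ]

print(linear_rank_schedule(n_layers=12, avg_rank=8))
# e.g. [2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14] -> mean rank 8
```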
+
+ comment: EACL 2024 +
+
+
+
+
+ + ☆ Progress in Privacy Protection: A Review of Privacy Preserving + Techniques in Recommender Systems, Edge Computing, and Cloud Computing + + +
+ As digital technology evolves, the increasing use of connected devices brings
+both challenges and opportunities in the areas of mobile crowdsourcing, edge
+computing, and recommender systems. This survey focuses on these dynamic
+fields, emphasizing the critical need for privacy protection in our
+increasingly data-oriented world. It explores the latest trends in these
+interconnected areas, with a special emphasis on privacy and data security. Our
+method involves an in-depth analysis of various academic works, which helps us
+to gain a comprehensive understanding of these sectors and their shifting focus
+towards privacy concerns. We present new insights and mark a significant
+advancement in addressing privacy issues within these technologies. The survey
+is a valuable resource for researchers, industry practitioners, and policy
+makers, offering an extensive overview of these fields and their related
+privacy challenges, catering to a wide audience in the modern digital era.
+
+
+
+
+
+ + ☆ Word-Level ASR Quality Estimation for Efficient Corpus Sampling and + Post-Editing through Analyzing Attentions of a Reference-Free Metric + + +
+ In the realm of automatic speech recognition (ASR), the quest for models that +not only perform with high accuracy but also offer transparency in their +decision-making processes is crucial. The potential of quality estimation (QE) +metrics is introduced and evaluated as a novel tool to enhance explainable +artificial intelligence (XAI) in ASR systems. Through experiments and analyses, +the capabilities of the NoRefER (No Reference Error Rate) metric are explored +in identifying word-level errors to aid post-editors in refining ASR +hypotheses. The investigation also extends to the utility of NoRefER in the +corpus-building process, demonstrating its effectiveness in augmenting datasets +with insightful annotations. The diagnostic aspects of NoRefER are examined, +revealing its ability to provide valuable insights into model behaviors and +decision patterns. This has proven beneficial for prioritizing hypotheses in +post-editing workflows and fine-tuning ASR models. The findings suggest that +NoRefER is not merely a tool for error detection but also a comprehensive +framework for enhancing ASR systems' transparency, efficiency, and +effectiveness. To ensure the reproducibility of the results, all source codes +of this study are made publicly available. + +
+
+
+
+
+ + ☆ Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense + Passage Retrieval + + +
+ Masked auto-encoder pre-training has emerged as a prevalent technique for +initializing and enhancing dense retrieval systems. It generally utilizes +additional Transformer decoder blocks to provide sustainable supervision +signals and compress contextual information into dense representations. +However, the underlying reasons for the effectiveness of such a pre-training +technique remain unclear. The usage of additional Transformer-based decoders +also incurs significant computational costs. In this study, we aim to shed +light on this issue by revealing that masked auto-encoder (MAE) pre-training +with enhanced decoding significantly improves the term coverage of input tokens +in dense representations, compared to vanilla BERT checkpoints. Building upon +this observation, we propose a modification to the traditional MAE by replacing +the decoder of a masked auto-encoder with a completely simplified Bag-of-Word +prediction task. This modification enables the efficient compression of lexical +signals into dense representations through unsupervised pre-training. +Remarkably, our proposed method achieves state-of-the-art retrieval performance +on several large-scale retrieval benchmarks without requiring any additional +parameters, which provides a 67% training speed-up compared to standard masked +auto-encoder pre-training with enhanced decoding. + +
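A rough sketch of the simplified pre-training head described above: project the pooled dense vector onto the vocabulary and train it to recover which tokens occurred in the passage. The dimensions and the random stand-in for the encoder output are placeholders, not the paper's implementation.

```python
# Bag-of-words prediction head over the dense representation.
import torch
import torch.nn as nn

vocab_size, hidden = 30522, 768
bow_head = nn.Linear(hidden, vocab_size)

def bag_of_words_loss(cls_vector, input_ids):
    """cls_vector: (batch, hidden); input_ids: (batch, seq_len) token ids."""
    logits = bow_head(cls_vector)                          # (batch, vocab)
    targets = torch.zeros(input_ids.size(0), vocab_size)
    targets.scatter_(1, input_ids, 1.0)                    # multi-hot token presence
    return nn.functional.binary_cross_entropy_with_logits(logits, targets)

cls_vector = torch.randn(4, hidden)                        # stand-in for encoder output
input_ids = torch.randint(0, vocab_size, (4, 128))
print(bag_of_words_loss(cls_vector, input_ids))
```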
+
+ comment: Work in progress. Our code will be available at
+ https://github.com/ma787639046/bowdpr
+
+
+
+
+ + ☆ Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented + Generation in Niche Domains, Exemplified by Korean Medicine + + +
+ We propose a natural language prompt-based retrieval augmented generation +(Prompt-RAG), a novel approach to enhance the performance of generative large +language models (LLMs) in niche domains. Conventional RAG methods mostly +require vector embeddings, yet the suitability of generic LLM-based embedding +representations for specialized domains remains uncertain. To explore and +exemplify this point, we compared vector embeddings from Korean Medicine (KM) +and Conventional Medicine (CM) documents, finding that KM document embeddings +correlated more with token overlaps and less with human-assessed document +relatedness, in contrast to CM embeddings. Prompt-RAG, distinct from +conventional RAG models, operates without the need for embedding vectors. Its +performance was assessed through a Question-Answering (QA) chatbot application, +where responses were evaluated for relevance, readability, and informativeness. +The results showed that Prompt-RAG outperformed existing models, including +ChatGPT and conventional vector embedding-based RAGs, in terms of relevance and +informativeness. Despite challenges like content structuring and response +latency, the advancements in LLMs are expected to encourage the use of +Prompt-RAG, making it a promising tool for other domains in need of RAG +methods. + +
+
+ comment: 26 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ End-to-End Argument Mining over Varying Rhetorical Structures + + +
+ Rhetorical Structure Theory implies no single discourse interpretation of a +text, and the limitations of RST parsers further exacerbate inconsistent +parsing of similar structures. Therefore, it is important to take into account +that the same argumentative structure can be found in semantically similar +texts with varying rhetorical structures. In this work, the differences between +paraphrases within the same argument scheme are evaluated from a rhetorical +perspective. The study proposes a deep dependency parsing model to assess the +connection between rhetorical and argument structures. The model utilizes +rhetorical relations; RST structures of paraphrases serve as training data +augmentations. The method allows for end-to-end argumentation analysis using a +rhetorical tree instead of a word sequence. It is evaluated on the bilingual +Microtexts corpus, and the first results on fully-fledged argument parsing for +the Russian version of the corpus are reported. The results suggest that +argument mining can benefit from multiple variants of discourse structure. + +
+
+
+
+
+ + ☆ Unfair TOS: An Automated Approach using Customized BERT + + +
+ Terms of Service (ToS) form an integral part of any agreement as they define
+the legal relationship between a service provider and an end-user. Not only do
+they establish and delineate reciprocal rights and responsibilities, but they
+also provide users with information on essential aspects of contracts that
+pertain to the use of digital spaces. These aspects cover a wide range of
+topics, including limitation of liability, data protection, etc. Users tend to
+accept the ToS without reading them before using any application or
+service. Such ignorance puts them in a potentially weaker situation in case any
+action is required. Existing methodologies for the detection or classification
+of unfair clauses are however obsolete and show modest performance. In this
+research paper, we present SOTA (state-of-the-art) results on unfair clause
+detection from ToS documents based on fine-tuning BERT in
+combination with an SVC (Support Vector Classifier). The study shows proficient
+performance with a macro F1-score of 0.922 for unfair clause detection, and
+superior performance is also shown in the classification of unfair clauses by
+each tag. Further, a comparative analysis is performed by answering research
+questions on the Transformer models utilized. To support further research and
+experimentation, the code and results are made available at
+https://github.com/batking24/Unfair-TOS-An-Automated-Approach-based-on-Fine-tuning-BERT-in-conjunction-with-ML.
+
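A rough sketch of a "BERT features plus SVC" pipeline using off-the-shelf sentence embeddings is given below; the paper fine-tunes BERT itself, and the clauses and labels here are invented examples rather than the dataset used in the study.

```python
# Encode ToS clauses with a pre-trained sentence encoder and fit an SVC on top.
from sentence_transformers import SentenceTransformer
from sklearn.svm import SVC

clauses = [
    ("We may terminate your account at any time without notice.", 1),   # unfair
    ("You can export your data at any time from the settings page.", 0),
    ("Any dispute must be resolved by arbitration chosen solely by us.", 1),
    ("We will notify you 30 days before changing these terms.", 0),
]
texts, labels = zip(*clauses)

encoder = SentenceTransformer("all-MiniLM-L6-v2")
X = encoder.encode(list(texts))

clf = SVC(kernel="linear").fit(X, labels)
print(clf.predict(encoder.encode(
    ["We may change prices at any time without informing you."]
)))
```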
+
+
+
+
+ + ☆ InferAligner: Inference-Time Alignment for Harmlessness through + Cross-Model Guidance + + +
+ With the rapid development of large language models (LLMs), they are not only +used as general-purpose AI assistants but are also customized through further +fine-tuning to meet the requirements of different applications. A pivotal +factor in the success of current LLMs is the alignment process. Current +alignment methods, such as supervised fine-tuning (SFT) and reinforcement +learning from human feedback (RLHF), focus on training-time alignment and are +often complex and cumbersome to implement. Therefore, we develop +\textbf{InferAligner}, a novel inference-time alignment method that utilizes +cross-model guidance for harmlessness alignment. InferAligner utilizes safety +steering vectors extracted from safety-aligned model to modify the activations +of the target model when responding to harmful inputs, thereby guiding the +target model to provide harmless responses. Experimental results show that our +method can be very effectively applied to domain-specific models in finance, +medicine, and mathematics, as well as to multimodal large language models +(MLLMs) such as LLaVA. It significantly diminishes the Attack Success Rate +(ASR) of both harmful instructions and jailbreak attacks, while maintaining +almost unchanged performance in downstream tasks. + +
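The inference-time intervention can be illustrated with a forward hook that adds a steering vector to one layer's activations. The toy model, the random vector, and the fixed strength below are placeholders chosen for illustration; this is not the InferAligner implementation, where the vector is extracted from a safety-aligned model and applied conditionally on harmful inputs.

```python
# Add a steering vector to intermediate activations at inference time.
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 8))
steering_vector = torch.randn(32)        # stand-in for a safety steering vector
steering_strength = 4.0

def steer(module, inputs, output):
    # Returning a tensor from a forward hook replaces the module's output.
    return output + steering_strength * steering_vector

x = torch.randn(1, 16)
print("original:", model(x)[0, :4])

handle = model[0].register_forward_hook(steer)   # intervene after the first layer
print("steered :", model(x)[0, :4])
handle.remove()
```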
+
+
+
+
+ + ☆ How the Advent of Ubiquitous Large Language Models both Stymie and + Turbocharge Dynamic Adversarial Question Generation + + +
+ Dynamic adversarial question generation, where humans write examples to stump +a model, aims to create examples that are realistic and informative. However, +the advent of large language models (LLMs) has been a double-edged sword for +human authors: more people are interested in seeing and pushing the limits of +these models, but because the models are so much stronger an opponent, they are +harder to defeat. To understand how these models impact adversarial question +writing process, we enrich the writing guidance with LLMs and retrieval models +for the authors to reason why their questions are not adversarial. While +authors could create interesting, challenging adversarial questions, they +sometimes resort to tricks that result in poor questions that are ambiguous, +subjective, or confusing not just to a computer but also to humans. To address +these issues, we propose new metrics and incentives for eliciting good, +challenging questions and present a new dataset of adversarially authored +questions. + +
+
+
+
+
+ + ☆ Gaussian Adaptive Attention is All You Need: Robust Contextual + Representations Across Multiple Modalities + + +
+ We propose the Multi-Head Gaussian Adaptive Attention Mechanism (GAAM), a +novel probabilistic attention framework, and the Gaussian Adaptive Transformer +(GAT), designed to enhance information aggregation across multiple modalities, +including Speech, Text and Vision. GAAM integrates learnable mean and variance +into its attention mechanism, implemented in a Multi-Headed framework enabling +it to collectively model any Probability Distribution for dynamic recalibration +of feature significance. This method demonstrates significant improvements, +especially with highly non-stationary data, surpassing the state-of-the-art +attention techniques in model performance (up to approximately +20% in +accuracy) by identifying key elements within the feature space. GAAM's +compatibility with dot-product-based attention models and relatively low number +of parameters showcases its adaptability and potential to boost existing +attention frameworks. Empirically, GAAM exhibits superior adaptability and +efficacy across a diverse range of tasks, including emotion recognition in +speech, image classification, and text classification, thereby establishing its +robustness and versatility in handling multi-modal data. Furthermore, we +introduce the Importance Factor (IF), a new learning-based metric that enhances +the explainability of models trained with GAAM-based methods. Overall, GAAM +represents an advancement towards development of better performing and more +explainable attention models across multiple modalities. + +
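A simplified single-head sketch of the idea is shown below: attention weights are drawn from a Gaussian over (normalized) positions with a learnable mean and variance rather than from query-key dot products. This conceptual illustration, with arbitrary sizes, is not the released GAAM code.

```python
# Single-head Gaussian attention with learnable mean and variance over positions.
import torch
import torch.nn as nn

class GaussianAttention(nn.Module):
    def __init__(self, d_model=64):
        super().__init__()
        self.mu = nn.Parameter(torch.tensor(0.5))       # learnable mean in [0, 1]
        self.log_var = nn.Parameter(torch.tensor(0.0))  # learnable log-variance
        self.value = nn.Linear(d_model, d_model)

    def forward(self, x):                                # x: (batch, seq, d_model)
        seq = x.size(1)
        pos = torch.linspace(0, 1, seq, device=x.device)
        scores = -((pos - self.mu) ** 2) / (2 * self.log_var.exp())
        weights = torch.softmax(scores, dim=-1)          # (seq,)
        pooled = torch.einsum("s,bsd->bd", weights, self.value(x))
        return pooled                                    # attention-pooled summary

out = GaussianAttention()(torch.randn(2, 10, 64))
print(out.shape)  # torch.Size([2, 64])
```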
+
+
+
+
+ + ☆ Enhancing Large Language Models for Clinical Decision Support by + Incorporating Clinical Practice Guidelines + + +
+ Background Large Language Models (LLMs), enhanced with Clinical Practice +Guidelines (CPGs), can significantly improve Clinical Decision Support (CDS). +However, methods for incorporating CPGs into LLMs are not well studied. Methods +We develop three distinct methods for incorporating CPGs into LLMs: Binary +Decision Tree (BDT), Program-Aided Graph Construction (PAGC), and +Chain-of-Thought-Few-Shot Prompting (CoT-FSP). To evaluate the effectiveness of +the proposed methods, we create a set of synthetic patient descriptions and +conduct both automatic and human evaluation of the responses generated by four +LLMs: GPT-4, GPT-3.5 Turbo, LLaMA, and PaLM 2. Zero-Shot Prompting (ZSP) was +used as the baseline method. We focus on CDS for COVID-19 outpatient treatment +as the case study. Results All four LLMs exhibit improved performance when +enhanced with CPGs compared to the baseline ZSP. BDT outperformed both CoT-FSP +and PAGC in automatic evaluation. All of the proposed methods demonstrated high +performance in human evaluation. Conclusion LLMs enhanced with CPGs demonstrate +superior performance, as compared to plain LLMs with ZSP, in providing accurate +recommendations for COVID-19 outpatient treatment, which also highlights the +potential for broader applications beyond the case study. + +
+
+
+
+
+ + ☆ Exploiting Duality in Open Information Extraction with Predicate Prompt + + +
+ Open information extraction (OpenIE) aims to extract the schema-free triplets
+in the form of (\emph{subject}, \emph{predicate}, \emph{object}) from a given
+sentence. Compared with general information extraction (IE), OpenIE poses more
+challenges for the IE models, especially when multiple complicated triplets
+exist in a sentence. To extract these complicated triplets more effectively, in
+this paper we propose a novel generative OpenIE model, namely \emph{DualOIE},
+which achieves a dual task at the same time as extracting some triplets from
+the sentence, i.e., converting the triplets into the sentence. Such a dual task
+encourages the model to correctly recognize the structure of the given sentence
+and thus is helpful to extract all potential triplets from the sentence.
+Specifically, DualOIE extracts the triplets in two steps: 1) first extracting a
+sequence of all potential predicates, 2) then using the predicate sequence as a
+prompt to induce the generation of triplets. Our experiments on two benchmarks
+and our dataset constructed from Meituan demonstrate that DualOIE achieves the
+best performance among the state-of-the-art baselines. Furthermore, the online
+A/B test on the Meituan platform shows that a 0.93\% improvement of QV-CTR and a
+0.56\% improvement of UV-CTR have been obtained when the triplets extracted by
+DualOIE were leveraged in Meituan's search system.
+
+
+
+
+
+ + ♻ ☆ Beyond Shared Vocabulary: Increasing Representational Word Similarities + across Languages for Multilingual Machine Translation + + +
+ Using a vocabulary that is shared across languages is common practice in +Multilingual Neural Machine Translation (MNMT). In addition to its simple +design, shared tokens play an important role in positive knowledge transfer, +assuming that shared tokens refer to similar meanings across languages. +However, when word overlap is small, especially due to different writing +systems, transfer is inhibited. In this paper, we define word-level information +transfer pathways via word equivalence classes and rely on graph networks to +fuse word embeddings across languages. Our experiments demonstrate the +advantages of our approach: 1) embeddings of words with similar meanings are +better aligned across languages, 2) our method achieves consistent BLEU +improvements of up to 2.3 points for high- and low-resource MNMT, and 3) less +than 1.0\% additional trainable parameters are required with a limited increase +in computational costs, while inference time remains identical to the baseline. +We release the codebase to the community. + +
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Developing ChatGPT for Biology and Medicine: A Complete Review of + Biomedical Question Answering + + +
+ ChatGPT explores a strategic blueprint of question answering (QA) in +delivering medical diagnosis, treatment recommendations, and other healthcare +support. This is achieved through the increasing incorporation of medical +domain data via natural language processing (NLP) and multimodal paradigms. By +transitioning the distribution of text, images, videos, and other modalities +from the general domain to the medical domain, these techniques have expedited +the progress of medical domain question answering (MDQA). They bridge the gap +between human natural language and sophisticated medical domain knowledge or +expert manual annotations, handling large-scale, diverse, unbalanced, or even +unlabeled data analysis scenarios in medical contexts. Central to our focus is +the utilizing of language models and multimodal paradigms for medical question +answering, aiming to guide the research community in selecting appropriate +mechanisms for their specific medical research requirements. Specialized tasks +such as unimodal-related question answering, reading comprehension, reasoning, +diagnosis, relation extraction, probability modeling, and others, as well as +multimodal-related tasks like vision question answering, image caption, +cross-modal retrieval, report summarization, and generation, are discussed in +detail. Each section delves into the intricate specifics of the respective +method under consideration. This paper highlights the structures and +advancements of medical domain explorations against general domain methods, +emphasizing their applications across different tasks and datasets. It also +outlines current challenges and opportunities for future medical domain +research, paving the way for continued innovation and application in this +rapidly evolving field. + +
+
+ comment: 50 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ GNN2R: Weakly-Supervised Rationale-Providing Question Answering over + Knowledge Graphs + + +
+ Most current methods for multi-hop question answering (QA) over knowledge +graphs (KGs) only provide final conclusive answers without explanations, such +as a set of KG entities that is difficult for normal users to review and +comprehend. This issue severely limits the application of KG-based QA in +real-world scenarios. However, it is non-trivial to solve due to two +challenges: First, annotations of reasoning chains of multi-hop questions, +which could serve as supervision for explanation generation, are usually +lacking. Second, it is difficult to maintain high efficiency when explicit KG +triples need to be retrieved to generate explanations. In this paper, we +propose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to +solve this issue. GNN2R can provide both final answers and reasoning subgraphs +as a rationale behind final answers efficiently with only weak supervision that +is available through question-final answer pairs. We extensively evaluated +GNN2R with detailed analyses in experiments. The results demonstrate that, in +terms of effectiveness, efficiency, and quality of generated explanations, +GNN2R outperforms existing state-of-the-art methods that are applicable to this +task. Our code and pre-trained models are available at +https://github.com/ruijie-wang-uzh/GNN2R. + +
+
+
+
+
+ + ♻ ☆ The Impact of Reasoning Step Length on Large Language Models + + +
+ Chain of Thought (CoT) is significant in improving the reasoning abilities of +large language models (LLMs). However, the correlation between the +effectiveness of CoT and the length of reasoning steps in prompts remains +largely unknown. To shed light on this, we have conducted several empirical +experiments to explore the relations. Specifically, we design experiments that +expand and compress the rationale reasoning steps within CoT demonstrations, +while keeping all other factors constant. We have the following key findings. +First, the results indicate that lengthening the reasoning steps in prompts, +even without adding new information into the prompt, considerably enhances +LLMs' reasoning abilities across multiple datasets. Alternatively, shortening +the reasoning steps, even while preserving the key information, significantly +diminishes the reasoning abilities of models. This finding highlights the +importance of the number of steps in CoT prompts and provides practical +guidance to make better use of LLMs' potential in complex problem-solving +scenarios. Second, we also investigated the relationship between the +performance of CoT and the rationales used in demonstrations. Surprisingly, the +result shows that even incorrect rationales can yield favorable outcomes if +they maintain the requisite length of inference. Third, we observed that the +advantages of increasing reasoning steps are task-dependent: simpler tasks +require fewer steps, whereas complex tasks gain significantly from longer +inference sequences. + +
+
+
+
+
+ + ♻ ☆ Assertion Enhanced Few-Shot Learning: Instructive Technique for Large + Language Models to Generate Educational Explanations + + +
+ Human educators possess an intrinsic ability to anticipate and seek
+educational explanations from students, which drives them to pose
+thought-provoking questions when students cannot articulate these explanations
+independently. We aim to imbue Intelligent Tutoring Systems with this ability
+using the few-shot learning capability of Large Language Models. Our work proposes
+a novel prompting technique, Assertion Enhanced Few-Shot Learning, to
+facilitate the generation of accurate, detail-oriented educational
+explanations. Our central hypothesis is that, in the educational domain, few-shot
+demonstrations are necessary but not a sufficient condition for quality
+explanation generation. We conducted a study involving 12 in-service teachers,
+comparing our approach to Traditional Few-Shot Learning. The results show that
+Assertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and
+yields higher-quality explanations, as evaluated by teachers. We also conduct a
+qualitative ablation study to analyze the impact of assertions and provide
+educator-friendly prompting guidelines for generating explanations in their
+domain of interest.
+
+
+
+
+
+ + ♻ ☆ Machines Do See Color: A Guideline to Classify Different Forms of Racist + Discourse in Large Corpora + + +
+ Current methods to identify and classify racist language in text rely on +small-n qualitative approaches or large-n approaches focusing exclusively on +overt forms of racist discourse. This article provides a step-by-step +generalizable guideline to identify and classify different forms of racist +discourse in large corpora. In our approach, we start by conceptualizing racism +and its different manifestations. We then contextualize these racist +manifestations to the time and place of interest, which allows researchers to +identify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a +cross-lingual model for supervised text classification with a cutting-edge +contextual understanding of text. We show that XLM-R and XLM-R-Racismo, our +pretrained model, outperform other state-of-the-art approaches in classifying +racism in large corpora. We illustrate our approach using a corpus of tweets +relating to the Ecuadorian ind\'igena community between 2018 and 2021. + +
+
+ comment: 37 pages, 5 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Large language models in biomedical natural language processing: + benchmarks, baselines, and recommendations + + +
+ Biomedical literature is growing rapidly, making it challenging to curate and +extract knowledge manually. Biomedical natural language processing (BioNLP) +techniques that can automatically extract information from biomedical +literature help alleviate this burden. Recently, Large Language Models (LLMs), +such as GPT-3 and GPT-4, have gained significant attention for their impressive +performance. However, their effectiveness in BioNLP tasks and impact on method +development and downstream users remain understudied. This pilot study (1) +establishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and +one-shot settings in eight BioNLP datasets across four applications: named +entity recognition, relation extraction, multi-label document classification, +and semantic similarity and reasoning, (2) examines the errors produced by the +LLMs and categorizes the errors into three types: missingness, inconsistencies, +and unwanted artificial content, and (3) provides suggestions for using LLMs in +BioNLP applications. We make the datasets, baselines, and results publicly +available to the community via +https://github.com/qingyu-qc/gpt_bionlp_benchmark. + +
+
+
+
+
+ + ♻ ☆ MR-GSM8K: A Meta-Reasoning Revolution in Large Language Model Evaluation + + +
+ In this work, we introduce a novel evaluation paradigm for Large Language +Models, one that challenges them to engage in meta-reasoning. This approach +addresses critical shortcomings in existing math problem-solving benchmarks, +traditionally used to evaluate the cognitive capabilities of agents. Our +paradigm shifts the focus from result-oriented assessments, which often +overlook the reasoning process, to a more holistic evaluation that effectively +differentiates the cognitive capabilities among models. For example, in our +benchmark, GPT-4 demonstrates a performance five times better than GPT-3.5. The +significance of this new paradigm lies in its ability to reveal potential +cognitive deficiencies in LLMs that current benchmarks, such as GSM8K, fail to +uncover due to their saturation and lack of effective differentiation among +varying reasoning abilities. Our comprehensive analysis includes several +state-of-the-art math models from both open-source and closed-source +communities, uncovering fundamental deficiencies in their training and +evaluation approaches. This paper not only advocates for a paradigm shift in +the assessment of LLMs but also contributes to the ongoing discourse on the +trajectory towards Artificial General Intelligence (AGI). By promoting the +adoption of meta-reasoning evaluation methods similar to ours, we aim to +facilitate a more accurate assessment of the true cognitive abilities of LLMs. + +
+
+ comment: Code: https://github.com/dvlab-research/MR-GSM8K +
+
+
+
+
+ + ♻ ☆ Universal Vulnerabilities in Large Language Models: In-context Learning + Backdoor Attacks + + +
+ In-context learning, a paradigm bridging the gap between pre-training and +fine-tuning, has demonstrated high efficacy in several NLP tasks, especially in +few-shot settings. Unlike traditional fine-tuning methods, in-context learning +adapts pre-trained models to unseen tasks without updating any parameters. +Despite being widely applied, in-context learning is vulnerable to malicious +attacks. In this work, we raise security concerns regarding this paradigm. Our +studies demonstrate that an attacker can manipulate the behavior of large +language models by poisoning the demonstration context, without the need for +fine-tuning the model. Specifically, we have designed a new backdoor attack +method, named ICLAttack, to target large language models based on in-context +learning. Our method encompasses two types of attacks: poisoning demonstration +examples and poisoning prompts, which can make models behave in accordance with +predefined intentions. ICLAttack does not require additional fine-tuning to +implant a backdoor, thus preserving the model's generality. Furthermore, the +poisoned examples are correctly labeled, enhancing the natural stealth of our +attack method. Extensive experimental results across several language models, +ranging in size from 1.3B to 40B parameters, demonstrate the effectiveness of +our attack method, exemplified by a high average attack success rate of 95.0% +across the three datasets on OPT models. Our findings highlight the +vulnerabilities of language models, and we hope this work will raise awareness +of the possible security threats associated with in-context learning. + +
+
+
+
+
+ + ♻ ☆ Agent Alignment in Evolving Social Norms + + +
+ Agents based on Large Language Models (LLMs) are increasingly permeating +various domains of human production and life, highlighting the importance of +aligning them with human values. The current alignment of AI systems primarily +focuses on passively aligning LLMs through human intervention. However, agents +possess characteristics like receiving environmental feedback and +self-evolution, rendering the LLM alignment methods inadequate. In response, we +propose an evolutionary framework for agent evolution and alignment, named +EvolutionaryAgent, which transforms agent alignment into a process of evolution +and selection under the principle of survival of the fittest. In an environment +where social norms continuously evolve, agents better adapted to the current +social norms will have a higher probability of survival and proliferation, +while those inadequately aligned dwindle over time. Experimental results +assessing the agents from multiple perspectives in aligning with social norms +demonstrate that EvolutionaryAgent can align progressively better with the +evolving social norms while maintaining its proficiency in general tasks. +Effectiveness tests conducted on various open and closed-source LLMs as the +foundation for agents also prove the applicability of our approach. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Augmenting Math Word Problems via Iterative Question Composing + + +
+ Despite recent progress in improving the mathematical reasoning ability of +large language models(LLMs), solving competition-level math problems without +the use of external tools remains challenging for open-source LLMs. In this +work, we introduce the MMIQC dataset, a mixture of processed web data and +synthetic question-response pairs, to equip base models with better +mathematical reasoning skills. In different model sizes, the models fine-tuned +on MMIQC consistently outperform their counterparts by a clear margin on MATH +test set. Notably, DeepSeek-67B-MMIQC achieves a 41.0% accuracy, 4.2% higher +than the previous open-source SOTA. Our experiments also show that a large part +of the improvement can be attributed to our novel augmentation method +IQC(Iterative Question Composing), where we iteratively ask an LLM to compose +new questions from the given seed problems and do rejection sampling from +another LLM. MMIQC has now been released on +https://huggingface.co/datasets/Vivacem/MMIQC. + +
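A hedged sketch of the IQC loop as the abstract describes it: one model composes new questions from seed problems, a second model answers them, and answers are kept only if they pass a check (rejection sampling). The functions compose_llm, answer_llm, and verify below are hypothetical placeholders standing in for real LLM calls, not the authors' components.

```python
# Hedged sketch of Iterative Question Composing (IQC); all three functions
# below are placeholders for real LLM calls and checks.

def compose_llm(seed_question, round_idx):
    # Placeholder: in practice, prompt an LLM to write a new question that
    # builds on the seed problem.
    return f"[round {round_idx}] harder variant of: {seed_question}"

def answer_llm(question, n_samples=4):
    # Placeholder: sample several candidate solutions from a second LLM.
    return [f"candidate solution {i} for: {question}" for i in range(n_samples)]

def verify(question, answer):
    # Placeholder rejection check; a real check might compare final answers
    # or use self-consistency. Here every candidate is accepted.
    return True

def iterative_question_composing(seeds, rounds=3):
    pairs = []
    frontier = list(seeds)
    for r in range(rounds):
        next_frontier = []
        for q in frontier:
            new_q = compose_llm(q, r)
            kept = [a for a in answer_llm(new_q) if verify(new_q, a)]
            if kept:
                pairs.append((new_q, kept[0]))       # keep one accepted pair
                next_frontier.append(new_q)          # compose again next round
        frontier = next_frontier
    return pairs

print(len(iterative_question_composing(["Compute 3 + 4 * 2."])))
```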
+
+
+
+
+ + ♻ ☆ EMO: Earth Mover Distance Optimization for Auto-Regressive Language + Modeling ICLR 2024 + + +
+ Neural language models are probabilistic models of human text. They are +predominantly trained using maximum likelihood estimation (MLE), which is +equivalent to minimizing the forward cross-entropy between the empirical data +distribution and the model distribution. However, various degeneration +phenomena are still widely observed when decoding from the distributions +learned by such models. We establish that the forward cross-entropy is +suboptimal as a distance metric for aligning human and model distributions due +to its (1) recall prioritization, (2) negative diversity ignorance, and (3) +train-test mismatch. In this paper, we propose Earth Mover Distance +Optimization (EMO) for auto-regressive language modeling. EMO capitalizes on +the inherent properties of earth mover distance to address the aforementioned +challenges. Due to the high complexity of direct computation, we further +introduce a feasible upper bound for EMO to ease end-to-end training. Upon +extensive evaluation of language models trained using EMO and MLE, we find that +EMO demonstrates a consistently better language modeling performance than MLE +across domains. Moreover, EMO demonstrates noteworthy enhancements in +downstream performance with minimal fine-tuning on merely 25,000 sentences. +This highlights the tremendous potential of EMO as a lightweight calibration +method for enhancing large-scale pre-trained language models. + +
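As a toy illustration of the underlying idea (not the paper's actual bound or loss), the snippet below contrasts the forward cross-entropy with an earth-mover-style penalty in which probability mass placed on a token is charged by that token's embedding distance to the gold token; all tensors are random stand-ins.

```python
import torch
import torch.nn.functional as F

# Toy contrast between MLE and an EMD-flavored penalty; this is an
# illustrative simplification, not the EMO objective from the paper.
vocab, dim = 100, 16
emb = torch.randn(vocab, dim)        # token embedding table (stand-in)
logits = torch.randn(vocab)          # model logits at one position (stand-in)
target = 7                           # gold next-token id

p = F.softmax(logits, dim=-1)

# Standard MLE / forward cross-entropy term.
mle_loss = -torch.log(p[target])

# Transport cost of moving probability mass from each token to the gold token,
# here 1 - cosine similarity of their embeddings (a common simple choice).
gold = emb[target].unsqueeze(0).expand_as(emb)
cost = 1.0 - F.cosine_similarity(emb, gold, dim=-1)
emd_style_loss = (p * cost).sum()    # expected transport cost under the model

print(float(mle_loss), float(emd_style_loss))
```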
+
+ comment: To appear at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ A Comprehensive Analysis of the Effectiveness of Large Language Models + as Automatic Dialogue Evaluators AAAI-2024 + + +
+ Automatic evaluation is an integral aspect of dialogue system research. The +traditional reference-based NLG metrics are generally found to be unsuitable +for dialogue assessment. Consequently, recent studies have suggested various +unique, reference-free neural metrics that better align with human evaluations. +Notably among them, large language models (LLMs), particularly the +instruction-tuned variants like ChatGPT, are shown to be promising substitutes +for human judges. Yet, existing works on utilizing LLMs for automatic dialogue +evaluation are limited in their scope in terms of the number of meta-evaluation +datasets, mode of evaluation, coverage of LLMs, etc. Hence, it remains +inconclusive how effective these LLMs are. To this end, we conduct a +comprehensive study on the application of LLMs for automatic dialogue +evaluation. Specifically, we analyze the multi-dimensional evaluation +capability of 30 recently emerged LLMs at both turn and dialogue levels, using +a comprehensive set of 12 meta-evaluation datasets. Additionally, we probe the +robustness of the LLMs in handling various adversarial perturbations at both +turn and dialogue levels. Finally, we explore how model-level and +dimension-level ensembles impact the evaluation performance. All resources are +available at https://github.com/e0397123/comp-analysis. + +
+
+ comment: An extended version of AAAI-2024 camera-ready paper (appendix + included, 16 pages) +
+
+
+
+
+ + ♻ ☆ IDEAL: Influence-Driven Selective Annotations Empower In-Context + Learners in Large Language Models ICLR 2024 + + +
+ In-context learning is a promising paradigm that utilizes in-context examples +as prompts for the predictions of large language models. These prompts are +crucial for achieving strong performance. However, since the prompts need to be +sampled from a large volume of annotated examples, finding the right prompt may +result in high annotation costs. To address this challenge, this paper +introduces an influence-driven selective annotation method that aims to +minimize annotation costs while improving the quality of in-context examples. +The essence of our method is to select a pivotal subset from a large-scale +unlabeled data pool to annotate for the subsequent sampling of prompts. +Specifically, a directed graph is first constructed to represent unlabeled +data. Afterward, the influence of candidate unlabeled subsets is quantified +with a diffusion process. A simple yet effective greedy algorithm for unlabeled +data selection is lastly introduced. It iteratively selects the data if it +provides a maximum marginal gain with respect to quantified influence. Compared +with previous efforts on selective annotations, our influence-driven method +works in an end-to-end manner, avoids an intractable explicit balance between +data diversity and representativeness, and enjoys theoretical support. +Experiments confirm the superiority of the proposed method on various +benchmarks, achieving better performance under lower time consumption during +subset selection. The project page is available at +https://skzhang1.github.io/IDEAL/. + +
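A schematic sketch of the selection loop described above, under simplifying assumptions: unlabeled examples form a directed graph, a few diffusion steps estimate the influence of a candidate subset, and a greedy loop adds the point with maximum marginal gain. The random graph, diffusion depth, and influence measure here are illustrative stand-ins, not the authors' implementation.

```python
import numpy as np

# Schematic influence-driven greedy selection over a toy directed graph.
rng = np.random.default_rng(0)
n = 50
A = (rng.random((n, n)) < 0.08).astype(float)            # random directed graph
np.fill_diagonal(A, 0.0)
P = A / np.maximum(A.sum(axis=1, keepdims=True), 1e-9)   # row-stochastic transitions

def influence(subset, steps=3):
    # Propagate a seed indicator a few steps and measure total reached mass.
    x = np.zeros(n)
    x[list(subset)] = 1.0
    reached = x.copy()
    for _ in range(steps):
        x = x @ P
        reached = np.maximum(reached, x)
    return reached.sum()

def greedy_select(budget=5):
    chosen = set()
    for _ in range(budget):
        base = influence(chosen) if chosen else 0.0
        gains = {v: influence(chosen | {v}) - base
                 for v in range(n) if v not in chosen}
        chosen.add(max(gains, key=gains.get))             # maximum marginal gain
    return sorted(chosen)

print(greedy_select())   # indices of examples to send for annotation
```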
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ For Generated Text, Is NLI-Neutral Text the Best Text? + + +
+ We explore incorporating natural language inference (NLI) into the text +generative pipeline by using a pre-trained NLI model to assess whether a +generated sentence entails, contradicts, or is neutral to the prompt and +preceding text. First, we show that the NLI task is predictive of generation +errors made by GPT-3. We use these results to develop an NLI-informed +generation procedure for GPT-J. Then, we evaluate these generations by +obtaining human annotations on error types and overall quality. We find that an +NLI strategy of maximizing entailment improves text generation when the nucleus +sampling randomness parameter value is high, while one which maximizes +contradiction is in fact productive when the parameter value is low. Overall, +though, we demonstrate that an NLI strategy of maximizing the neutral class +provides the highest quality of generated text (significantly better than the +vanilla generations), regardless of parameter value. + +
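A minimal sketch of NLI-guided selection in the spirit of the abstract: each candidate continuation is scored against the preceding text with an NLI classifier and the most neutral one is kept. The function nli_probs is a hypothetical placeholder for a real pre-trained NLI model.

```python
# Minimal NLI-guided reranking sketch; nli_probs is a stand-in for a real
# pre-trained NLI classifier.

def nli_probs(premise, hypothesis):
    # Placeholder returning entailment/neutral/contradiction probabilities.
    # A real implementation would run a pre-trained NLI model here.
    return {"entailment": 0.2, "neutral": 0.7, "contradiction": 0.1}

def pick_most_neutral(context, candidates):
    scored = [(nli_probs(context, c)["neutral"], c) for c in candidates]
    return max(scored)[1]

context = "The museum opened a new wing dedicated to maritime history."
candidates = [
    "The new wing features ship models from the 18th century.",
    "The museum has always refused to exhibit anything about the sea.",
]
print(pick_most_neutral(context, candidates))
```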
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Drop your Decoder: Pre-training with Bag-of-Word Prediction for Dense + Passage Retrieval + + +
+ Masked auto-encoder pre-training has emerged as a prevalent technique for +initializing and enhancing dense retrieval systems. It generally utilizes +additional Transformer decoder blocks to provide sustainable supervision +signals and compress contextual information into dense representations. +However, the underlying reasons for the effectiveness of such a pre-training +technique remain unclear. The usage of additional Transformer-based decoders +also incurs significant computational costs. In this study, we aim to shed +light on this issue by revealing that masked auto-encoder (MAE) pre-training +with enhanced decoding significantly improves the term coverage of input tokens +in dense representations, compared to vanilla BERT checkpoints. Building upon +this observation, we propose a modification to the traditional MAE by replacing +the decoder of a masked auto-encoder with a completely simplified Bag-of-Word +prediction task. This modification enables the efficient compression of lexical +signals into dense representations through unsupervised pre-training. +Remarkably, our proposed method achieves state-of-the-art retrieval performance +on several large-scale retrieval benchmarks without requiring any additional +parameters, which provides a 67% training speed-up compared to standard masked +auto-encoder pre-training with enhanced decoding. + +
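A simplified sketch of the bag-of-words objective described above: instead of a Transformer decoder, a single linear head predicts, from the dense passage vector, which vocabulary tokens occur in the passage. The encoder, shapes, and data below are random stand-ins.

```python
import torch
import torch.nn.functional as F

# Bag-of-words prediction head in place of a decoder (simplified sketch).
vocab, hidden, batch, seq = 1000, 64, 2, 12
cls_vec = torch.randn(batch, hidden, requires_grad=True)   # dense passage vectors
input_ids = torch.randint(0, vocab, (batch, seq))          # token ids of each passage

bow_head = torch.nn.Linear(hidden, vocab)
logits = bow_head(cls_vec)                                  # (batch, vocab)

# Multi-hot target: 1 for every token id that occurs in the passage.
targets = torch.zeros(batch, vocab)
targets.scatter_(1, input_ids, 1.0)

loss = F.binary_cross_entropy_with_logits(logits, targets)
loss.backward()   # gradients flow into the encoder that produced cls_vec
print(float(loss))
```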
+
+ comment: Working in progress. Our code will be available at + https://github.com/ma787639046/bowdpr +
+
+
+
+
+ + ☆ Prompt-RAG: Pioneering Vector Embedding-Free Retrieval-Augmented + Generation in Niche Domains, Exemplified by Korean Medicine + + +
+ We propose a natural language prompt-based retrieval augmented generation +(Prompt-RAG), a novel approach to enhance the performance of generative large +language models (LLMs) in niche domains. Conventional RAG methods mostly +require vector embeddings, yet the suitability of generic LLM-based embedding +representations for specialized domains remains uncertain. To explore and +exemplify this point, we compared vector embeddings from Korean Medicine (KM) +and Conventional Medicine (CM) documents, finding that KM document embeddings +correlated more with token overlaps and less with human-assessed document +relatedness, in contrast to CM embeddings. Prompt-RAG, distinct from +conventional RAG models, operates without the need for embedding vectors. Its +performance was assessed through a Question-Answering (QA) chatbot application, +where responses were evaluated for relevance, readability, and informativeness. +The results showed that Prompt-RAG outperformed existing models, including +ChatGPT and conventional vector embedding-based RAGs, in terms of relevance and +informativeness. Despite challenges like content structuring and response +latency, the advancements in LLMs are expected to encourage the use of +Prompt-RAG, making it a promising tool for other domains in need of RAG +methods. + +
+
+ comment: 26 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Navigating the Thin Line: Examining User Behavior in Search to Detect + Engagement and Backfire Effects ECIR2024 + + +
+ Opinionated users often seek information that aligns with their preexisting +beliefs while dismissing contradictory evidence due to confirmation bias. This +conduct hinders their ability to consider alternative stances when searching +the web. Despite this, few studies have analyzed how the diversification of +search results on disputed topics influences the search behavior of highly +opinionated users. To this end, we present a preregistered user study (n = 257) +investigating whether different levels (low and high) of bias metrics and +search results presentation (with or without AI-predicted stances labels) can +affect the stance diversity consumption and search behavior of opinionated +users on three debated topics (i.e., atheism, intellectual property rights, and +school uniforms). Our results show that exposing participants to +(counter-attitudinally) biased search results increases their consumption of +attitude-opposing content, but we also found that bias was associated with a +trend toward overall fewer interactions within the search page. We also found +that 19% of users interacted with queries and search pages but did not select +any search results. When we removed these participants in a post-hoc analysis, +we found that stance labels increased the diversity of stances consumed by +users, particularly when the search results were biased. Our findings highlight +the need for future research to explore distinct search scenario settings to +gain insight into opinionated users' behavior. + +
+
+ comment: 17 pages, 3 figures, ECIR2024 (46th European Conference on + Information Retrieval - IR4Good track) +
+
+
+
+
+ + ☆ A Deep Learning Approach for Selective Relevance Feedback + + +
+ Pseudo-relevance feedback (PRF) can enhance average retrieval effectiveness +over a sufficiently large number of queries. However, PRF often introduces a +drift into the original information need, thus hurting the retrieval +effectiveness of several queries. While a selective application of PRF can +potentially alleviate this issue, previous approaches have largely relied on +unsupervised or feature-based learning to determine whether a query should be +expanded. In contrast, we revisit the problem of selective PRF from a deep +learning perspective, presenting a model that is entirely data-driven and +trained in an end-to-end manner. The proposed model leverages a +transformer-based bi-encoder architecture. Additionally, to further improve +retrieval effectiveness with this selective PRF approach, we make use of the +model's confidence estimates to combine the information from the original and +expanded queries. In our experiments, we apply this selective feedback on a +number of different combinations of ranking and feedback models, and show that +our proposed approach consistently improves retrieval effectiveness for both +sparse and dense ranking models, with the feedback models being either sparse, +dense or generative. + +
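A schematic sketch of the selective-feedback idea (not the authors' exact model): a confidence that expansion will help is produced for each query, and that confidence interpolates between the original and the feedback-expanded query representation. Here expansion_confidence is a hypothetical stand-in for the trained bi-encoder classifier.

```python
import numpy as np

# Confidence-gated combination of original and PRF-expanded query vectors.
def expansion_confidence(query_vec, feedback_vec):
    # Placeholder: a real system would score (query, feedback) with a trained
    # bi-encoder and classification head; here a squashed cosine is used.
    cos = float(query_vec @ feedback_vec /
                (np.linalg.norm(query_vec) * np.linalg.norm(feedback_vec) + 1e-9))
    return (cos + 1.0) / 2.0          # map to [0, 1]

rng = np.random.default_rng(1)
q_orig = rng.normal(size=128)         # original query embedding (stand-in)
q_expanded = rng.normal(size=128)     # PRF-expanded query embedding (stand-in)

c = expansion_confidence(q_orig, q_expanded)
q_final = c * q_expanded + (1.0 - c) * q_orig   # confidence-weighted combination
print(round(c, 3), q_final.shape)
```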
+
+
+
+
+ + ☆ Document Set Expansion with Positive-Unlabeled Learning: A Density + Estimation-based Approach + + +
+ Document set expansion aims to identify relevant documents from a large +collection based on a small set of documents that are on a fine-grained topic. +Previous work shows that PU learning is a promising method for this task. +However, some serious issues remain unresolved, i.e., typical challenges from which PU +methods suffer, such as unknown class prior and imbalanced data, and the need +for transductive experimental settings. In this paper, we propose a novel PU +learning framework based on density estimation, called puDE, that can handle +the above issues. The advantage of puDE is that it is neither constrained by the +SCAR assumption nor requires any class prior knowledge. We demonstrate the +effectiveness of the proposed method using a series of real-world datasets and +conclude that our method is a better alternative for the DSE task. + +
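A toy sketch of density-estimation-based PU scoring in this spirit (illustrative only, not the paper's estimator): kernel density estimates are fit on the positive seed set and on the unlabeled pool, and unlabeled documents are ranked by the log-density ratio.

```python
import numpy as np

# Rank unlabeled documents by how "positive-like" their embedding is,
# using simple Gaussian kernel density estimates (toy sketch).
def kde_logpdf(points, data, bandwidth=0.5):
    # Gaussian KDE fit on `data`, evaluated at `points` (both [n, d] arrays).
    d = data.shape[1]
    diffs = points[:, None, :] - data[None, :, :]
    sq = (diffs ** 2).sum(-1) / (2 * bandwidth ** 2)
    log_kernel = -sq - 0.5 * d * np.log(2 * np.pi * bandwidth ** 2)
    return np.logaddexp.reduce(log_kernel, axis=1) - np.log(data.shape[0])

rng = np.random.default_rng(0)
positives = rng.normal(loc=1.0, size=(40, 8))     # embeddings of seed documents
unlabeled = rng.normal(loc=0.0, size=(200, 8))    # embeddings of the large pool

score = kde_logpdf(unlabeled, positives) - kde_logpdf(unlabeled, unlabeled)
top = np.argsort(-score)[:10]                     # candidates to add to the set
print(top)
```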
+
+
+
+
+ + ☆ Exploiting Duality in Open Information Extraction with Predicate Prompt + + +
+ Open information extraction (OpenIE) aims to extract schema-free triplets +in the form of (subject, predicate, object) from a given +sentence. Compared with general information extraction (IE), OpenIE poses more +challenges for the IE models, especially when multiple complicated triplets +exist in a sentence. To extract these complicated triplets more effectively, in +this paper we propose a novel generative OpenIE model, namely DualOIE, +which, while extracting triplets from the sentence, simultaneously performs the +dual task of converting the triplets back into the sentence. This dual task +encourages the model to correctly recognize the structure of the given sentence +and thus helps to extract all potential triplets from the sentence. +Specifically, DualOIE extracts the triplets in two steps: 1) first extracting a +sequence of all potential predicates, 2) then using the predicate sequence as a +prompt to induce the generation of triplets. Our experiments on two benchmarks +and our dataset constructed from Meituan demonstrate that DualOIE achieves the +best performance among the state-of-the-art baselines. Furthermore, an online +A/B test on the Meituan platform shows that a 0.93% improvement of QV-CTR and a 0.56% +improvement of UV-CTR were obtained when the triplets extracted by DualOIE +were leveraged in Meituan's search system. + +
+
+
+
+
+ + ☆ FedRKG: A Privacy-preserving Federated Recommendation Framework via + Knowledge Graph Enhancement + + +
+ Federated Learning (FL) has emerged as a promising approach for preserving +data privacy in recommendation systems by training models locally. Recently, +Graph Neural Networks (GNN) have gained popularity in recommendation tasks due +to their ability to capture high-order interactions between users and items. +However, privacy concerns prevent the global sharing of the entire user-item +graph. To address this limitation, some methods create pseudo-interacted items +or users in the graph to compensate for missing information for each client. +Unfortunately, these methods introduce random noise and raise privacy concerns. +In this paper, we propose FedRKG, a novel federated recommendation system, +where a global knowledge graph (KG) is constructed and maintained on the server +using publicly available item information, enabling higher-order user-item +interactions. On the client side, a relation-aware GNN model leverages diverse +KG relationships. To protect local interaction items and obscure gradients, we +employ pseudo-labeling and Local Differential Privacy (LDP). Extensive +experiments conducted on three real-world datasets demonstrate the competitive +performance of our approach compared to centralized algorithms while ensuring +privacy preservation. Moreover, FedRKG achieves an average accuracy improvement +of 4% compared to existing federated learning baselines. + +
+
+
+
+
+ + ♻ ☆ Large language models in biomedical natural language processing: + benchmarks, baselines, and recommendations + + +
+ Biomedical literature is growing rapidly, making it challenging to curate and +extract knowledge manually. Biomedical natural language processing (BioNLP) +techniques that can automatically extract information from biomedical +literature help alleviate this burden. Recently, Large Language Models (LLMs), +such as GPT-3 and GPT-4, have gained significant attention for their impressive +performance. However, their effectiveness in BioNLP tasks and impact on method +development and downstream users remain understudied. This pilot study (1) +establishes the baseline performance of GPT-3 and GPT-4 at both zero-shot and +one-shot settings in eight BioNLP datasets across four applications: named +entity recognition, relation extraction, multi-label document classification, +and semantic similarity and reasoning, (2) examines the errors produced by the +LLMs and categorizes the errors into three types: missingness, inconsistencies, +and unwanted artificial content, and (3) provides suggestions for using LLMs in +BioNLP applications. We make the datasets, baselines, and results publicly +available to the community via +https://github.com/qingyu-qc/gpt_bionlp_benchmark. + +
+
+
+
+
+ + ♻ ☆ Learning Graph ODE for Continuous-Time Sequential Recommendation + + +
+ Sequential recommendation aims at understanding user preference by capturing +successive behavior correlations, which are usually represented as the item +purchasing sequences based on their past interactions. Existing efforts +generally predict the next item via modeling the sequential patterns. Despite +effectiveness, there exist two natural deficiencies: (i) user preference is +dynamic in nature, and the evolution of collaborative signals is often ignored; +and (ii) the observed interactions are often irregularly-sampled, while +existing methods model item transitions assuming uniform intervals. Thus, how +to effectively model and predict the underlying dynamics for user preference +becomes a critical research problem. To tackle the above challenges, in this +paper, we focus on continuous-time sequential recommendation and propose a +principled graph ordinary differential equation framework named GDERec. +Technically, GDERec is characterized by an autoregressive graph ordinary +differential equation consisting of two components, which are parameterized by +two tailored graph neural networks (GNNs) respectively to capture user +preference from the perspective of hybrid dynamical systems. The two customized +GNNs are trained alternately in an autoregressive manner to track the evolution +of the underlying system from irregular observations, and thus learn effective +representations of users and items beneficial to the sequential recommendation. +Extensive experiments on five benchmark datasets demonstrate the superiority of +our model over various state-of-the-art recommendation methods. + +
+
+ comment: Accepted by IEEE Transactions on Knowledge and Data Engineering (TKDE + 2024) +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Reasoning Based on Attention GCN + + +
+ We propose a novel technique to enhance Knowledge Graph Reasoning by +combining Graph Convolution Neural Network (GCN) with the Attention Mechanism. +This approach utilizes the Attention Mechanism to examine the relationships +between entities and their neighboring nodes, which helps to develop detailed +feature vectors for each entity. The GCN uses shared parameters to effectively +represent the characteristics of adjacent entities. We first learn the +similarity of entities for node representation learning. By integrating the +attributes of the entities and their interactions, this method generates +extensive implicit feature vectors for each entity, improving performance in +tasks including entity classification and link prediction, outperforming +traditional neural network models. To conclude, this work provides crucial +methodological support for a range of applications, such as search engines, +question-answering systems, recommendation systems, and data integration tasks. + +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 61 + +
+
+
+ + ☆ Reinforcement learning for question answering in programming domain + using public community scoring as a human feedback + + +
+ In this study, we investigate the enhancement of the GPT Neo 125M performance +in Community Question Answering (CQA) with a focus on programming, through the +integration of Reinforcement Learning from Human Feedback (RLHF) and the +utilization of scores from Stack Overflow. Two distinct reward model training +strategies are employed for fine-tuning with Proximal Policy Optimization +(PPO). Notably, the improvements in performance achieved through this method +are comparable to those of GPT Neo 2.7B parameter variant. Additionally, an +auxiliary scoring mechanism is introduced, which demonstrates the limitations +of conventional linguistic metrics in evaluating responses in the programming +domain. Through accurate analysis, this paper looks at the divergence between +traditional linguistic metrics and our human-preferences-based reward model, +underscoring the imperative for domain-specific evaluation methods. By +elucidating the complexities involved in applying RLHF to programming CQA and +accentuating the significance of context-aware evaluation, this study +contributes to the ongoing efforts in refining Large Language Models through +focused human feedback. + +
+
+
+
+
+ + ☆ Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs + Without Fine-Tuning + + +
+ Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type +of attack that can coax these models into generating harmful and illegal +content. In this paper, we show that pruning up to 20% of LLM parameters +markedly increases their resistance to such attacks without additional training +and without sacrificing their performance in standard benchmarks. Intriguingly, +we discovered that the enhanced safety observed post-pruning correlates to the +initial safety training level of the model, hinting that the effect of pruning +could be more general and may hold for other LLM behaviors beyond safety. +Additionally, we introduce a curated dataset of 225 harmful tasks across five +categories, inserted into ten different Jailbreaking prompts, showing that +pruning aids LLMs in concentrating attention on task-relevant tokens in +jailbreaking prompts. Lastly, our experiments reveal that the prominent chat +models, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high +susceptibility to jailbreaking attacks, with some categories achieving nearly +70-100% success rate. These insights underline the potential of pruning as a +generalizable approach for improving LLM safety, reliability, and potentially +other desired behaviors. + +
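A minimal magnitude-pruning sketch showing the mechanical operation on a toy model; the paper's exact pruning criterion and its application to full-scale LLMs may differ from this simple per-layer scheme.

```python
import torch

# Zero out the smallest-magnitude 20% of weights in each linear layer
# (toy illustration of unstructured magnitude pruning).
def prune_linear_layers(model, sparsity=0.2):
    with torch.no_grad():
        for module in model.modules():
            if isinstance(module, torch.nn.Linear):
                w = module.weight
                k = int(sparsity * w.numel())
                if k == 0:
                    continue
                threshold = w.abs().flatten().kthvalue(k).values
                module.weight.mul_((w.abs() > threshold).float())

toy_model = torch.nn.Sequential(
    torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8)
)
prune_linear_layers(toy_model, sparsity=0.2)
zeros = sum(int((m.weight == 0).sum()) for m in toy_model.modules()
            if isinstance(m, torch.nn.Linear))
print("zeroed weights:", zeros)
```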
+
+
+
+
+ + ☆ Advancements in eHealth Data Analytics through Natural Language + Processing and Deep Learning + + +
+ The healthcare environment is commonly referred to as "information-rich" but +also "knowledge poor". Healthcare systems collect huge amounts of data from +various sources: lab reports, medical letters, logs of medical tools or +programs, medical prescriptions, etc. These massive sets of data can provide +great knowledge and information that can improve the medical services, and +overall the healthcare domain, such as disease prediction by analyzing the +patient's symptoms or disease prevention, by facilitating the discovery of +behavioral factors for diseases. Unfortunately, only a relatively small volume +of the textual eHealth data is processed and interpreted, an important factor +being the difficulty in efficiently performing Big Data operations. In the +medical field, detecting domain-specific multi-word terms is a crucial task as +they can define an entire concept with a few words. A term can be defined as a +linguistic structure or a concept, and it is composed of one or more words with +a specific meaning to a domain. All the terms of a domain create its +terminology. This chapter offers a critical study of the current, most +performant solutions for analyzing unstructured (image and textual) eHealth +data. This study also provides a comparison of the current Natural Language +Processing and Deep Learning techniques in the eHealth context. Finally, we +examine and discuss some of the current issues, and we define a set of research +directions in this area. + +
+
+
+
+
+ + ☆ Using LLMs to discover emerging coded antisemitic hate-speech emergence + in extremist social media + + +
+ Online hate speech proliferation has created a difficult problem for social +media platforms. A particular challenge relates to the use of coded language by +groups interested in both creating a sense of belonging for its users and +evading detection. Coded language evolves quickly and its use varies over time. +This paper proposes a methodology for detecting emerging coded hate-laden +terminology. The methodology is tested in the context of online antisemitic +discourse. The approach considers posts scraped from social media platforms, +often used by extremist users. The posts are scraped using seed expressions +related to previously known discourse of hatred towards Jews. The method begins +by identifying the expressions most representative of each post and calculating +their frequency in the whole corpus. It filters out grammatically incoherent +expressions as well as previously encountered ones so as to focus on emergent +well-formed terminology. This is followed by an assessment of semantic +similarity to known antisemitic terminology using a fine-tuned large language +model, and subsequent filtering out of the expressions that are too distant +from known expressions of hatred. Emergent antisemitic expressions containing +terms clearly relating to Jewish topics are then removed to return only coded +expressions of hatred. + +
+
+ comment: 9 pages, 4 figures, 2 algorithms, 3 tables +
+
+
+
+
+ + ☆ A survey on recent advances in named entity recognition + + +
+ Named Entity Recognition seeks to extract substrings within a text that name +real-world objects and to determine their type (for example, whether they refer +to persons or organizations). In this survey, we first present an overview of +recent popular approaches, but we also look at graph- and transformer- based +methods including Large Language Models (LLMs) that have not had much coverage +in other surveys. Second, we focus on methods designed for datasets with scarce +annotations. Third, we evaluate the performance of the main NER implementations +on a variety of datasets with differing characteristics (as regards their +domain, their size, and their number of classes). We thus provide a deep +comparison of algorithms that are never considered together. Our experiments +shed some light on how the characteristics of datasets affect the behavior of +the methods that we compare. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Medusa: Simple LLM Inference Acceleration Framework with Multiple + Decoding Heads + + +
+ The inference process in Large Language Models (LLMs) is often limited due to +the absence of parallelism in the auto-regressive decoding process, resulting +in most operations being restricted by the memory bandwidth of accelerators. +While methods such as speculative decoding have been suggested to address this +issue, their implementation is impeded by the challenges associated with +acquiring and maintaining a separate draft model. In this paper, we present +Medusa, an efficient method that augments LLM inference by adding extra +decoding heads to predict multiple subsequent tokens in parallel. Using a +tree-based attention mechanism, Medusa constructs multiple candidate +continuations and verifies them simultaneously in each decoding step. By +leveraging parallel processing, Medusa introduces only minimal overhead in +terms of single-step latency while substantially reducing the number of +decoding steps required. + We present two levels of fine-tuning procedures for Medusa to meet the needs +of different use cases: Medusa-1: Medusa is directly fine-tuned on top of a +frozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa +is fine-tuned together with the backbone LLM, enabling better prediction +accuracy of Medusa heads and higher speedup but needing a special training +recipe that preserves the backbone model's capabilities. + Moreover, we propose several extensions that improve or expand the utility of +Medusa, including a self-distillation to handle situations where no training +data is available and a typical acceptance scheme to boost the acceptance rate +while maintaining generation quality. We evaluate Medusa on models of various +sizes and training procedures. Our experiments demonstrate that Medusa-1 can +achieve over 2.2x speedup without compromising generation quality, while +Medusa-2 further improves the speedup to 2.3-3.6x. + +
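A conceptual sketch of the extra-decoding-heads idea (greatly simplified; the real method also uses tree-based attention and verification against the base model): each additional head proposes the token k steps ahead from the same final hidden state, so several candidate tokens are produced per forward pass. All sizes and tensors are stand-ins.

```python
import torch

# Several lightweight heads propose tokens for positions t+1 ... t+k
# from one hidden state (simplified Medusa-style sketch).
hidden_size, vocab, num_heads = 64, 1000, 4
heads = torch.nn.ModuleList(
    torch.nn.Linear(hidden_size, vocab) for _ in range(num_heads)
)

last_hidden = torch.randn(1, hidden_size)       # hidden state of final position
proposals = []
for k, head in enumerate(heads, start=1):
    logits = head(last_hidden)                  # head k proposes token at step t+k
    topk = logits.topk(3, dim=-1).indices[0]    # a few candidates per position
    proposals.append(topk.tolist())

print(proposals)   # candidate tokens, to be verified by the base LLM
```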
+
+ comment: The code for this implementation is available at + https://github.com/FasterDecoding/Medusa +
+
+
+
+
+ + ☆ Mitigating Hallucinations of Large Language Models via Knowledge + Consistent Alignment + + +
+ While Large Language Models (LLMs) have proven to be exceptional on a variety +of tasks after alignment, they may still produce responses that contradict the +context or world knowledge confidently, a phenomenon known as +"hallucination". In this paper, we demonstrate that reducing the +inconsistency between the external knowledge encapsulated in the training data +and the intrinsic knowledge inherited in the pretraining corpus could mitigate +hallucination in alignment. Specifically, we introduce a novel knowledge +consistent alignment (KCA) approach, which involves automatically formulating +examinations based on external knowledge for assessing the comprehension of +LLMs. For data encompassing knowledge inconsistency, KCA implements several +simple yet efficient strategies for processing. We illustrate the superior +performance of the proposed KCA approach in mitigating hallucinations across +six benchmarks using LLMs of different backbones and scales. Furthermore, we +confirm the correlation between knowledge inconsistency and hallucination, +signifying the effectiveness of reducing knowledge inconsistency in alleviating +hallucinations. Our code, model weights, and data are public at +https://github.com/fanqiwan/KCA. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Structured Code Representations Enable Data-Efficient Adaptation of Code + Language Models + + +
+ Current language models tailored for code tasks often adopt the +pre-training-then-fine-tuning paradigm from natural language processing, +modeling source code as plain text. This approach, however, overlooks the +unambiguous structures inherent in programming languages. In this work, we +explore data-efficient adaptation of pre-trained code models by further +pre-training and fine-tuning them with program structures. Specifically, we +represent programs as parse trees -- also known as concrete syntax trees (CSTs) +-- and adapt pre-trained models on serialized CSTs. Although the models that we +adapt have been pre-trained only on the surface form of programs, we find that +a small amount of continual pre-training and fine-tuning on CSTs without +changing the model architecture yields improvements over the baseline approach +across various code tasks. The improvements are found to be particularly +significant when there are limited training examples, demonstrating the +effectiveness of integrating program structures with plain-text representation +even when working with backbone models that have not been pre-trained with +structures. + +
+
+
+
+
+ + ☆ Q&A Prompts: Discovering Rich Visual Clues through Mining + Question-Answer Prompts for VQA requiring Diverse World Knowledge + + +
+ With the breakthrough of multi-modal large language models, answering complex +visual questions that demand advanced reasoning abilities and world knowledge +has become a much more important testbed for developing AI models than ever. +However, equipping AI models with robust cross-modality reasoning ability +remains challenging since the cognition scheme of humans has not been +understood systematically. In this paper, we believe that if we can collect +visual clues in the given image as much as possible, we will recognize the +image more accurately, understand the question better, recall relevant +knowledge more easily, and finally reason out the answer. We discover these +rich visual clues by mining question-answer pairs in images and sending them +into multi-modal large language models as prompts. We call the proposed method +Q&A Prompts. Specifically, we first use the image-answer pairs and the +corresponding questions in the training set as inputs and outputs to train a +visual question generation model. Then, we use an image tagging model to +identify various instances and send packaged image-tag pairs into the visual +question generation model to generate relevant questions with the extracted +image tags as answers. Finally, we encode these generated question-answer pairs +as prompts with a visual-aware prompting module and send them into pre-trained +multi-modal large language models to reason out the final answers. Experimental +results show that, compared with state-of-the-art methods, our Q&A Prompts +achieves substantial improvements on the challenging visual question answering +datasets requiring reasoning over diverse world knowledge, such as OK-VQA and +A-OKVQA. + +
+
+
+
+
+ + ☆ Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal + Models for Video Question Answering + + +
+ Video Question Answering (VideoQA) aims to answer natural language questions +based on the information observed in videos. Despite the recent success of +Large Multimodal Models (LMMs) in image-language understanding and reasoning, +they deal with VideoQA insufficiently by simply taking uniformly sampled frames +as visual inputs, which ignores question-relevant visual clues. Moreover, there +are no human annotations for question-critical timestamps in existing VideoQA +datasets. In light of this, we propose a novel weakly supervised framework to +enforce the LMMs to reason out the answers with question-critical moments as +visual inputs. Specifically, we fuse the question and answer pairs as event +descriptions to find multiple keyframes as target moments, which will be +pseudo-labels. With these pseudo-labels as additionally weak supervision, we +devise a lightweight Gaussian-based Contrastive Grounding (GCG) module. GCG +learns multiple Gaussian functions to characterize the temporal structure of +the video, and sample question-critical frames as positive moments to be the +visual inputs of LMMs. Extensive experiments on several VideoQA benchmarks +verify the effectiveness of our framework, and we achieve substantial +improvements compared to previous state-of-the-art methods. + +
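A simplified illustration of the Gaussian-based frame weighting described above; only the weighting and keyframe-selection mechanics are shown, while the contrastive training against pseudo-labeled moments is omitted. All values are toy stand-ins.

```python
import torch

# Gaussian weighting over frame positions and selection of keyframes
# (simplified sketch of the weighting mechanics).
num_frames, num_gaussians, keep = 32, 3, 8
t = torch.linspace(0, 1, num_frames)                     # normalized frame times

# Learnable centers and widths of the Gaussians (random initial values).
centers = torch.rand(num_gaussians, requires_grad=True)
widths = torch.full((num_gaussians,), 0.1, requires_grad=True)

# Each frame's weight is its maximum response over the Gaussian mixture.
gauss = torch.exp(-0.5 * ((t[None, :] - centers[:, None]) / widths[:, None]) ** 2)
frame_weights = gauss.max(dim=0).values                  # (num_frames,)

keyframe_ids = frame_weights.topk(keep).indices.sort().values
print(keyframe_ids.tolist())   # indices of frames passed to the LMM as input
```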
+
+
+
+
+ + ☆ LangBridge: Multilingual Reasoning Without Multilingual Supervision + + +
+ We introduce LangBridge, a zero-shot approach to adapt language models for +multilingual reasoning tasks without multilingual supervision. LangBridge +operates by bridging two models, each specialized in different aspects: (1) one +specialized in understanding multiple languages (e.g., mT5 encoder) and (2) one +specialized in reasoning (e.g., Orca 2). LangBridge connects the two models by +introducing minimal trainable parameters between them. Despite utilizing only +English data for training, LangBridge considerably enhances the performance of +language models on low-resource languages across mathematical reasoning, +coding, and logical reasoning. Our analysis suggests that the efficacy of +LangBridge stems from the language-agnostic characteristics of multilingual +representations. We publicly release our code and models. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ A Simple Framework to Accelerate Multilingual Language Model for + Monolingual Text Generation + + +
+ Recent advancements in large language models have facilitated the execution +of complex language tasks, not only in English but also in non-English +languages. However, the tokenizers of most language models, such as Llama, +trained on English-centric corpora, tend to excessively fragment tokens in +non-English languages. This issue is especially pronounced in non-roman +alphabetic languages, which are often divided at a character or even Unicode +level, leading to slower text generation. To address this, our study introduces +a novel framework designed to expedite text generation in these languages. This +framework predicts larger linguistic units than those of conventional +multilingual tokenizers and is specifically tailored to the target language, +thereby reducing the number of decoding steps required. Our empirical results +demonstrate that the proposed framework increases the generation speed by a +factor of 1.9 compared to standard decoding while maintaining the performance +of a pre-trained multilingual model on monolingual tasks. + +
+
+
+
+
+ + ☆ Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech + Detection + + +
+ With the recent surge and exponential growth of social media usage, +scrutinizing social media content for the presence of any hateful content is of +utmost importance. Researchers have been working diligently over the past +decade to distinguish between content that promotes hatred and content that +does not. Traditionally, the main focus has been on analyzing textual content. +However, recent research has also begun to address the identification +of audio-based content. Nevertheless, studies have shown that relying solely on +audio or text-based content may be ineffective, as a recent upsurge indicates +that individuals often employ sarcasm in their speech and writing. To overcome +these challenges, we present an approach to identify whether a speech promotes +hate or not, utilizing both audio and textual representations. Our methodology +is based on the Transformer framework that incorporates both audio and text +sampling, accompanied by our own layer called "Attentive Fusion". The +results of our study surpassed previous state-of-the-art techniques, achieving +an impressive macro F1 score of 0.927 on the Test Set. + +
+
+ comment: Accepted in 20th International Conference on Natural Language + Processing (ICON) +
+
+
+
+
+ + ☆ Sowing the Wind, Reaping the Whirlwind: The Impact of Editing Language + Models + + +
+ In the rapidly advancing field of artificial intelligence, the concept of +Red-Teaming or Jailbreaking large language models (LLMs) has emerged as a +crucial area of study. This approach is especially significant in terms of +assessing and enhancing the safety and robustness of these models. This paper +investigates the intricate consequences of such modifications through model +editing, uncovering a complex relationship between enhancing model accuracy and +preserving its ethical integrity. Our in-depth analysis reveals a striking +paradox: while injecting accurate information is crucial for model reliability, +it can paradoxically destabilize the model's foundational framework, resulting +in unpredictable and potentially unsafe behaviors. Additionally, we propose a +benchmark dataset NicheHazardQA to investigate this unsafe behavior both within +the same and cross topical domain. This aspect of our research sheds light on +how the edits, impact the model's safety metrics and guardrails. Our findings +show that model editing serves as a cost-effective tool for topical red-teaming +by methodically applying targeted edits and evaluating the resultant model +behavior + +
+
+
+
+
+ + ☆ PHOENIX: Open-Source Language Adaption for Direct Preference + Optimization + + +
+ Large language models have gained immense importance in recent years and have +demonstrated outstanding results in solving various tasks. However, despite +these achievements, many questions remain unanswered in the context of large +language models. Besides the optimal use of the models for inference and the +alignment of the results to the desired specifications, the transfer of models +to other languages is still an underdeveloped area of research. The recent +publication of models such as Llama-2 and Zephyr has provided new insights into +architectural improvements and the use of human feedback. However, insights +into adapting these techniques to other languages remain scarce. In this paper, +we build on latest improvements and apply the Direct Preference +Optimization(DPO) approach to the German language. The model is available at +https://huggingface.co/DRXD1000/Phoenix. + +
+
+
+
+
+ + ☆ Self-training from Self-memory in Data-to-text Generation + + +
+ This paper introduces a novel training model, self-training from self-memory +(STSM) in data-to-text generation (DTG), allowing the model to self-train on +subsets, including self-memory as outputs inferred directly from the trained +models and/or the new data. The quality of self-memory is validated by two +models, data-to-text (D2T) and text-to-data (T2D), by two pre-defined +conditions: (1) the appearance of all source values in the outputs of the D2T +model and (2) the ability to convert back to source data in the outputs in the +T2D model. We utilize a greedy algorithm to generate shorter D2T outputs if +they contain all source values. Subsequently, we use the T2D model to confirm +that these outputs can capture input relationships by demonstrating their +capacity to convert text back into data. With 30% of the dataset, we can train +the D2T model with a competitive performance compared to full training in the +same setup. We experiment with our model on two datasets, E2E NLG and DART. +STSM offers the D2T model a generalization capability from its subset memory +while reducing training data volume. Ultimately, we anticipate that this paper +will contribute to continual learning solutions that adapt to new training +data, incorporating it as a form of self-memory in DTG tasks. The curated +dataset is publicly available at: https://github.com/hoangthangta/STSM. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy + + +
+ We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel +multi-adapter method, OrchMoE, which capitalizes on modular skill architecture +for enhanced forward transfer in neural networks. Unlike prior models that +depend on explicit task identification inputs, OrchMoE automatically discerns +task categories, streamlining the learning process. This is achieved through an +integrated mechanism comprising an Automatic Task Classification module and a +Task-Skill Allocation module, which collectively deduce task-specific +classifications and tailor skill allocation matrices. Our extensive evaluations +on the 'Super Natural Instructions' dataset, featuring 1,600 diverse +instructional tasks, indicate that OrchMoE substantially outperforms comparable +multi-adapter baselines in terms of both performance and sample utilization +efficiency, all while operating within the same parameter constraints. These +findings suggest that OrchMoE offers a significant leap forward in multi-task +learning efficiency. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Multilingual acoustic word embeddings for zero-resource languages + + +
+ This research addresses the challenge of developing speech applications for +zero-resource languages that lack labelled data. It specifically uses acoustic +word embeddings (AWEs) -- fixed-dimensional representations of variable-duration +speech segments -- employing multilingual transfer, where labelled data from +several well-resourced languages are used for pretraining. The study introduces +a new neural network that outperforms existing AWE models on zero-resource +languages. It explores the impact of the choice of well-resourced languages. +AWEs are applied to a keyword-spotting system for hate speech detection in +Swahili radio broadcasts, demonstrating robustness in real-world scenarios. +Additionally, novel semantic AWE models improve semantic query-by-example +search. + +
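A toy sketch of AWE-based keyword spotting as the abstract describes it: the query keyword and candidate speech segments are each mapped to fixed-dimensional embeddings, and segments are flagged by cosine similarity to the query. The random vectors stand in for the output of a trained multilingual AWE model, and the threshold is arbitrary.

```python
import numpy as np

# Query-by-example keyword spotting with fixed-dimensional embeddings
# (toy sketch; embeddings and threshold are stand-ins).
def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))

rng = np.random.default_rng(2)
query_awe = rng.normal(size=128)              # embedding of the spoken keyword
segment_awes = rng.normal(size=(500, 128))    # embeddings of radio segments

scores = np.array([cosine(query_awe, s) for s in segment_awes])
hits = np.flatnonzero(scores > 0.2)           # segments flagged for human review
print(len(hits), "candidate segments")
```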
+
+
+
+
+ + ☆ Speech Swin-Transformer: Exploring a Hierarchical Transformer with + Shifted Windows for Speech Emotion Recognition ICASSP 2024 + + +
+ Swin-Transformer has demonstrated remarkable success in computer vision by +leveraging its hierarchical feature representation based on Transformer. In +speech signals, emotional information is distributed across different scales of +speech features, e.g., word, phrase, and utterance. Drawing on the above +inspiration, this paper presents a hierarchical speech Transformer with shifted +windows to aggregate multi-scale emotion features for speech emotion +recognition (SER), called Speech Swin-Transformer. Specifically, we first +divide the speech spectrogram into segment-level patches in the time domain, +composed of multiple frame patches. These segment-level patches are then +encoded using a stack of Swin blocks, in which a local window Transformer is +utilized to explore local inter-frame emotional information across frame +patches of each segment patch. After that, we also design a shifted window +Transformer to compensate for patch correlations near the boundaries of segment +patches. Finally, we employ a patch merging operation to aggregate +segment-level emotional features for hierarchical speech representation by +expanding the receptive field of Transformer from frame-level to segment-level. +Experimental results demonstrate that our proposed Speech Swin-Transformer +outperforms the state-of-the-art methods. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ The "Colonial Impulse" of Natural Language Processing: An Audit of + Bengali Sentiment Analysis Tools and Their Identity-based Biases + + +
+ While colonization has sociohistorically impacted people's identities across +various dimensions, those colonial values and biases continue to be perpetuated +by sociotechnical systems. One category of sociotechnical systems--sentiment +analysis tools--can also perpetuate colonial values and bias, yet less +attention has been paid to how such tools may be complicit in perpetuating +coloniality, although they are often used to guide various practices (e.g., +content moderation). In this paper, we explore potential bias in sentiment +analysis tools in the context of Bengali communities that have experienced and +continue to experience the impacts of colonialism. Drawing on identity +categories most impacted by colonialism amongst local Bengali communities, we +focused our analytic attention on gender, religion, and nationality. We +conducted an algorithmic audit of all sentiment analysis tools for Bengali, +available on the Python package index (PyPI) and GitHub. Despite similar +semantic content and structure, our analyses showed that in addition to +inconsistencies in output from different tools, Bengali sentiment analysis +tools exhibit bias between different identity categories and respond +differently to different ways of identity expression. Connecting our findings +with colonially shaped sociocultural structures of Bengali communities, we +discuss the implications of downstream bias of sentiment analysis tools. + +
+
+
+
+
+ + ☆ Mementos: A Comprehensive Benchmark for Multimodal Large Language Model + Reasoning over Image Sequences + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated proficiency in +handling a variety of visual-language tasks. However, current MLLM benchmarks +are predominantly designed to evaluate reasoning based on static information +about a single image, and the ability of modern MLLMs to extrapolate from image +sequences, which is essential for understanding our ever-changing world, has +been less investigated. To address this challenge, this paper introduces +Mementos, a new benchmark designed to assess MLLMs' sequential image reasoning +abilities. Mementos features 4,761 diverse image sequences with varying +lengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning +performance. Through a careful evaluation of nine recent MLLMs on Mementos, +including GPT-4V and Gemini, we find that they struggle to accurately describe +dynamic information about given image sequences, often leading to +hallucinations/misrepresentations of objects and their corresponding behaviors. +Our quantitative analysis and case studies identify three key factors impacting +MLLMs' sequential image reasoning: the correlation between object and +behavioral hallucinations, the influence of cooccurring behaviors, and the +compounding impact of behavioral hallucinations. Our dataset is available at +https://github.com/umd-huang-lab/Mementos. + +
+
+ comment: 27 pages, 23 figures +
+
+
+
+
+ + ☆ Cross-lingual Editing in Multilingual Language Models EACL 2024 + + +
+ The training of large language models (LLMs) necessitates substantial data +and computational resources, and updating outdated LLMs entails significant +efforts and resources. While numerous model editing techniques (METs) have +emerged to efficiently update model outputs without retraining, their +effectiveness in multilingual LLMs, where knowledge is stored in diverse +languages, remains an underexplored research area. This research paper +introduces the cross-lingual model editing (XME) paradigm, wherein a +fact is edited in one language, and the subsequent update propagation is +observed across other languages. To investigate the XME paradigm, we conducted +experiments using BLOOM, mBERT, and XLM-RoBERTa using the two writing scripts: +Latin (English, French, and Spanish) and Indic (Hindi, +Gujarati, and Bengali). The results reveal notable performance limitations of +state-of-the-art METs under the XME setting, mainly when the languages involved +belong to two distinct script families. These findings highlight the need for +further research and development of XME techniques to address these challenges. +For more comprehensive information, the dataset used in this research and the +associated code are publicly available at the following +URL: https://github.com/lingo-iitgn/XME. + +
+
+ comment: Accepted at EACL 2024 +
+
+
+
+
+ + ☆ A match made in consistency heaven: when large language models meet + evolutionary algorithms + + +
+ Pre-trained large language models (LLMs) have powerful capabilities for +generating creative natural text. Evolutionary algorithms (EAs) can discover +diverse solutions to complex real-world problems. Motivated by the common +collective and directionality of text sequence generation and evolution, this +paper illustrates the strong consistency of LLMs and EAs, which includes +multiple one-to-one key characteristics: token embedding and genotype-phenotype +mapping, position encoding and fitness shaping, position embedding and +selection, attention and crossover, feed-forward neural network and mutation, +model training and parameter update, and multi-task learning and +multi-objective optimization. Based on this consistency perspective, existing +coupling studies are analyzed, including evolutionary fine-tuning and +LLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap +for future research in coupling LLMs and EAs, while highlighting key challenges +along the way. The consistency not only reveals the evolution mechanism behind +LLMs but also facilitates the development of evolved artificial agents that +approach or surpass biological organisms. + +
+
+ comment: A perspective article under review +
+
+
+
+
+ + ☆ FinSQL: Model-Agnostic LLMs-based Text-to-SQL Framework for Financial + Analysis + + +
+ Text-to-SQL, which provides a zero-code interface for operating relational +databases, has gained much attention in financial analysis, because financial +professionals may not be well skilled in SQL programming. However, until now, +there has been no practical Text-to-SQL benchmark dataset for financial analysis, and +existing Text-to-SQL methods have not considered the unique characteristics of +databases in financial applications, such as commonly occurring wide tables. To +address these issues, we collect a practical Text-to-SQL benchmark dataset and +propose a model-agnostic Large Language Model (LLM)-based Text-to-SQL +framework for financial analysis. The benchmark dataset, BULL, is collected +from the practical financial analysis business of Hundsun Technologies Inc., +including databases for funds, stocks, and the macro economy. In addition, the proposed +LLM-based Text-to-SQL framework, FinSQL, provides a systematic treatment for +financial Text-to-SQL from the perspectives of prompt construction, +parameter-efficient fine-tuning, and output calibration. Extensive experimental +results on BULL demonstrate that FinSQL achieves state-of-the-art +Text-to-SQL performance at a small cost; furthermore, FinSQL can bring up to +36.64% performance improvement in scenarios requiring few-shot cross-database +model transfer. + +
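The abstract does not spell out FinSQL's prompt-construction step, so the following is only a hedged sketch of the general idea for wide financial tables: trim the schema to the columns that look relevant to the question before asking an LLM for SQL. The function name, toy schema, and keyword-overlap heuristic are illustrative assumptions, not FinSQL's actual components.

```python
def build_text_to_sql_prompt(question: str, table: str, columns: list, max_cols: int = 8) -> str:
    """Keep only schema columns that share a token with the question (a crude
    stand-in for schema linking on wide tables), then format a plain prompt."""
    keywords = set(question.lower().replace("?", "").split())
    relevant = [c for c in columns if set(c.lower().split("_")) & keywords] or columns
    schema = f"{table}({', '.join(relevant[:max_cols])})"
    return (
        "Translate the question into SQL.\n"
        f"Schema: {schema}\n"
        f"Question: {question}\n"
        "SQL:"
    )

# Toy usage with a hypothetical fund table.
print(build_text_to_sql_prompt(
    "What is the net asset value of fund F001?",
    "fund_daily",
    ["fund_code", "trade_date", "net_asset_value", "accumulated_nav", "manager_id"],
))
```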
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ☆ Knowledge Fusion of Large Language Models ICLR 2024 + + +
+ While training large language models (LLMs) from scratch can generate models +with distinct functionalities and strengths, it comes at significant costs and +may result in redundant capabilities. Alternatively, a cost-effective and +compelling approach is to merge existing pre-trained LLMs into a more potent +model. However, due to the varying architectures of these LLMs, directly +blending their weights is impractical. In this paper, we introduce the notion +of knowledge fusion for LLMs, aimed at combining the capabilities of existing +LLMs and transferring them into a single LLM. By leveraging the generative +distributions of source LLMs, we externalize their collective knowledge and +unique strengths, thereby potentially elevating the capabilities of the target +model beyond those of any individual source LLM. We validate our approach using +three popular LLMs with different architectures--Llama-2, MPT, and +OpenLLaMA--across various benchmarks and tasks. Our findings confirm that the +fusion of LLMs can improve the performance of the target model across a range +of capabilities such as reasoning, commonsense, and code generation. Our code, +model weights, and data are public at +\url{https://github.com/fanqiwan/FuseLLM}. + +
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ☆ Generative Dense Retrieval: Memory Can Be a Burden EACL 2024 + + +
+ Generative Retrieval (GR), autoregressively decoding relevant document +identifiers given a query, has been shown to perform well under the setting of +small-scale corpora. By memorizing the document corpus with model parameters, +GR implicitly achieves deep interaction between query and document. However, +such a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for +fine-grained features of documents; (2) Memory confusion gets worse as the +corpus size increases; (3) Huge memory update costs for new documents. To +alleviate these problems, we propose the Generative Dense Retrieval (GDR) +paradigm. Specifically, GDR first uses the limited memory volume to achieve +inter-cluster matching from query to relevant document clusters. +Memorizing-free matching mechanism from Dense Retrieval (DR) is then introduced +to conduct fine-grained intra-cluster matching from clusters to relevant +documents. The coarse-to-fine process maximizes the advantages of GR's deep +interaction and DR's scalability. Besides, we design a cluster identifier +constructing strategy to facilitate corpus memory and a cluster-adaptive +negative sampling strategy to enhance the intra-cluster mapping ability. +Empirical results show that GDR obtains an average of 3.0 R@100 improvement on +NQ dataset under multiple settings and has better scalability. + +
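A toy sketch of the coarse-to-fine idea described above, assuming precomputed document embeddings and cluster assignments: the query is first matched to document clusters, and dense matching is then run only inside the best cluster. The centroid-similarity step below merely stands in for GDR's generative decoding of cluster identifiers, and all data are random placeholders.

```python
import numpy as np

rng = np.random.default_rng(0)
doc_emb = rng.normal(size=(1000, 128))
doc_emb /= np.linalg.norm(doc_emb, axis=1, keepdims=True)   # unit-norm document vectors
cluster_of_doc = rng.integers(0, 32, size=1000)             # hypothetical cluster ids
centroids = np.stack([doc_emb[cluster_of_doc == c].mean(axis=0) for c in range(32)])

query = rng.normal(size=128)
query /= np.linalg.norm(query)

best_cluster = int(np.argmax(centroids @ query))            # coarse inter-cluster matching
members = np.where(cluster_of_doc == best_cluster)[0]
scores = doc_emb[members] @ query                           # fine intra-cluster dense matching
top10 = members[np.argsort(-scores)[:10]]
print(best_cluster, top10)
```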
+
+ comment: EACL 2024 main +
+
+
+
+
+ + ☆ Escape Sky-high Cost: Early-stopping Self-Consistency for Multi-step + Reasoning ICLR 2024 + + +
+ Self-consistency (SC) has been a widely used decoding strategy for +chain-of-thought reasoning. Despite bringing significant performance +improvements across a variety of multi-step reasoning tasks, it is a high-cost +method that requires drawing multiple samples with a preset sample size. In this paper, we +propose a simple and scalable sampling process, \textbf{E}arly-Stopping +\textbf{S}elf-\textbf{C}onsistency (ESC), to greatly reduce the cost of SC +without sacrificing performance. On this basis, a control scheme for ESC is +further derived to dynamically choose the performance-cost balance for +different tasks and models. To demonstrate ESC's effectiveness, we conducted +extensive experiments on three popular categories of reasoning tasks: +arithmetic, commonsense, and symbolic reasoning, over language models of +varying scales. The empirical results show that ESC reduces the average number +of chain-of-thought samples by a significant margin on six +benchmarks, including MATH (-33.8%), GSM8K (-80.1%), StrategyQA (-76.8%), +CommonsenseQA (-78.5%), Coin Flip (-84.2%) and Last Letters (-67.4%), while +attaining comparable performance. + +
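A minimal sketch of the windowed early-stopping idea: draw chain-of-thought samples in small windows and stop as soon as a window is unanimous, otherwise fall back to the usual majority vote over the full budget. The stub sampler, window size, and budget below are illustrative assumptions, not the paper's exact settings.

```python
import random
from collections import Counter

def early_stopping_self_consistency(sample_answer, window=5, max_samples=40):
    """sample_answer() is a placeholder for one chain-of-thought sample whose
    final answer has been parsed out; sampling stops early once a window agrees."""
    answers = []
    while len(answers) < max_samples:
        batch = [sample_answer() for _ in range(window)]
        answers.extend(batch)
        if len(set(batch)) == 1:      # unanimous window -> stop sampling early
            break
    return Counter(answers).most_common(1)[0][0]

# Toy usage with a sampler that returns the right answer 70% of the time.
print(early_stopping_self_consistency(lambda: "42" if random.random() < 0.7 else "41"))
```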
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Name Tagging Under Domain Shift via Metric Learning for Life Sciences + + +
+ Name tagging is a key component of Information Extraction (IE), particularly +in scientific domains such as biomedicine and chemistry, where large language +models (LLMs), e.g., ChatGPT, fall short. We investigate the applicability of +transfer learning for enhancing a name tagging model trained in the biomedical +domain (the source domain) to be used in the chemical domain (the target +domain). A common practice for training such a model in a few-shot learning +setting is to pretrain the model on the labeled source data and then +finetune it on a handful of labeled target examples. In our experiments we +observed that such a model is prone to mislabeling the source entities, which +often appear in the text, as target entities. To alleviate this +problem, we propose a model that transfers knowledge from the source domain to +the target domain while, at the same time, projecting the source entities +and target entities into separate regions of the feature space. This diminishes +the risk of mislabeling the source entities as the target entities. Our model +consists of two stages: 1) entity grouping in the source domain, which +incorporates knowledge from annotated events to establish relations between +entities, and 2) entity discrimination in the target domain, which relies on +pseudo labeling and contrastive learning to enhance discrimination between the +entities in the two domains. We carry out extensive experiments across +three source and three target datasets, and demonstrate that our method +outperforms the baselines, in some scenarios by 5\% in absolute terms. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ DeepEdit: Knowledge Editing as Decoding with Constraints + + +
+ We develop a new perspective of knowledge editing for large language models +(LLMs) as decoding with constraints. We propose DeepEdit (Depth-first Search +based Progressive Decoding for Knowledge Editing), a neuro-symbolic method that +improves knowledge editing with better coherence of reasoning, relevance to the +question, and awareness of updated knowledge. DeepEdit can be flexibly applied +to all black-box LLMs: it does not require any access to the model parameters, +representations, or output vocabulary distributions. DeepEdit progressively +produces the high-quality reasoning steps towards effective knowledge editing. +It utilizes a depth-first search to revise the LLMs' output, which improves the +output's informativeness to the input question and awareness of the updated +knowledge. Qualitatively, DeepEdit effectively controls LLMs to produce more +succinct reasoning in accord with knowledge editing. Quantitatively, DeepEdit +yields significant gains on MQuaKE, a challenging multi-hop question-answering +dataset with knowledge editing. We release the source code at +https://github.com/wangywUST/DeepEdit. + +
+
+
+
+
+ + ☆ Data-driven grapheme-to-phoneme representations for a lexicon-free + text-to-speech ICASSP 2024 + + +
+ Grapheme-to-Phoneme (G2P) is an essential first step in any modern, +high-quality Text-to-Speech (TTS) system. Most of the current G2P systems rely +on carefully hand-crafted lexicons developed by experts. This poses a two-fold +problem. Firstly, the lexicons are generated using a fixed phoneme set, +usually ARPABET or IPA, which might not be the optimal way to represent +phonemes for all languages. Secondly, the man-hours required to produce such an +expert lexicon are very high. In this paper, we eliminate both of these issues +by using recent advances in self-supervised learning to obtain data-driven +phoneme representations instead of fixed representations. We compare our +lexicon-free approach against strong baselines that utilize a well-crafted +lexicon. Furthermore, we show that our data-driven lexicon-free method performs +as well as or even marginally better than the conventional rule-based or +lexicon-based neural G2Ps in terms of Mean Opinion Score (MOS) while using no +prior language lexicon or phoneme set, i.e., no linguistic expertise. + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Critical Data Size of Language Models from a Grokking Perspective + + +
+ We explore the critical data size in language models, a threshold that marks +a fundamental shift from quick memorization to slow generalization. We +formalize the phase transition under the grokking configuration into the Data +Efficiency Hypothesis and identify data insufficiency, sufficiency, and surplus +regimes in the training dynamics of language models. We develop a grokking +configuration to stably reproduce grokking on simple language models by +rescaling initialization and weight decay. We show that generalization occurs +only when language models reach a critical size. We analyze grokking in both +sample-wise and model-wise settings, verifying the proposed data efficiency hypothesis. +Our experiments reveal smoother phase transitions occurring at the critical +dataset size for language datasets. As the model size increases, this critical +point also becomes larger, indicating that larger models require more data. Our +results deepen the understanding of language model training, offering a novel +perspective on the role of data in the learning mechanism of language models. + +
+
+
+
+
+ + ☆ Contextualized Automatic Speech Recognition with Attention-Based Bias + Phrase Boosted Beam Search ICASSP 2024 + + +
+ End-to-end (E2E) automatic speech recognition (ASR) methods exhibit +remarkable performance. However, since the performance of such methods is +intrinsically linked to the context present in the training data, E2E-ASR +methods do not perform as desired for unseen user contexts (e.g., technical +terms, personal names, and playlists). Thus, E2E-ASR methods must be easily +contextualized by the user or developer. This paper proposes an attention-based +contextual biasing method that can be customized using an editable phrase list +(referred to as a bias list). The proposed method can be trained effectively by +combining a bias phrase index loss and special tokens to detect the bias +phrases in the input speech data. In addition, to further improve the contextualization +performance during inference, we propose a bias phrase boosted (BPB) +beam search algorithm based on the bias phrase index probability. Experimental +results demonstrate that the proposed method consistently improves the word +error rate and the character error rate of the target phrases in the bias list +on both the Librispeech-960 (English) dataset and our in-house (Japanese) +dataset. + +
+
+ comment: accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Investigating Training Strategies and Model Robustness of Low-Rank + Adaptation for Language Modeling in Speech Recognition + + +
+ The use of low-rank adaptation (LoRA) with frozen pretrained language models +(PLMs) has become increasingly popular as a mainstream, resource-efficient +modeling approach for memory-constrained hardware. In this study, we first +explore how to enhance model performance by introducing various LoRA training +strategies, achieving relative word error rate reductions of 3.50\% on the +public Librispeech dataset and of 3.67\% on an internal dataset in the +messaging domain. To further characterize the stability of LoRA-based +second-pass speech recognition models, we examine robustness against input +perturbations. These perturbations are rooted in homophone replacements and a +novel metric called N-best Perturbation-based Rescoring Robustness (NPRR), both +designed to measure the relative degradation in the performance of rescoring +models. Our experimental results indicate that while advanced variants of LoRA, +such as dynamic rank-allocated LoRA, lead to performance degradation in +$1$-best perturbation, they alleviate the degradation in $N$-best perturbation. +This finding is in comparison to fully-tuned models and vanilla LoRA tuning +baselines, suggesting that a comprehensive selection is needed when using +LoRA-based adaptation for compute-cost savings and robust language modeling. + +
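For readers unfamiliar with the underlying adapter, a minimal LoRA layer looks roughly like the sketch below: the pretrained weight stays frozen and only a low-rank update is trained. The rank, scaling, and initialization are generic defaults and do not correspond to the specific training strategies or dynamic rank-allocated variants studied in the paper.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """y = W x + (B A) x * (alpha / r), with W frozen and only A, B trainable."""
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                      # frozen pretrained weight
        self.lora_A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scaling = alpha / r

    def forward(self, x):
        return self.base(x) + (x @ self.lora_A.T @ self.lora_B.T) * self.scaling

layer = LoRALinear(nn.Linear(768, 768))
print(layer(torch.randn(2, 10, 768)).shape)              # torch.Size([2, 10, 768])
```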
+
+
+
+
+ + ☆ Large Language Models are Efficient Learners of Noise-Robust Speech + Recognition ICLR 2024 + + +
+ Recent advances in large language models (LLMs) have promoted generative +error correction (GER) for automatic speech recognition (ASR), which leverages +the rich linguistic knowledge and powerful reasoning ability of LLMs to improve +recognition results. The latest work proposes a GER benchmark with the HyPoradise +dataset to learn the mapping from ASR N-best hypotheses to ground-truth +transcription by efficient LLM finetuning, which shows great effectiveness but +lacks specificity on noise-robust ASR. In this work, we extend the benchmark to +noisy conditions and investigate if we can teach LLMs to perform denoising for +GER just as robust ASR models do, where one solution is introducing noise +information as a conditioner into the LLM. However, directly incorporating noise +embeddings from the audio encoder could harm LLM tuning due to the cross-modality +gap. To this end, we propose to extract a language-space noise embedding from +the N-best list to represent the noise conditions of source speech, which can +promote the denoising process in GER. Furthermore, in order to enhance its +representation ability of audio noise, we design a knowledge distillation (KD) +approach via mutual information estimation to distill the real noise +information in audio embeddings to our language embedding. Experiments on +various recent LLMs demonstrate that our approach achieves a new breakthrough with +up to 53.9% correction improvement in terms of word error rate with +limited training data. Analysis shows that our language-space noise embedding +can well represent the noise conditions of source speech, under which +off-the-shelf LLMs show a strong ability for language-space denoising. + +
+
+ comment: Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be + open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license +
+
+
+
+
+ + ☆ Breaking the Curse of Multilinguality with Cross-lingual Expert Language + Models + + +
+ Despite their popularity in non-English NLP, multilingual language models +often underperform monolingual ones due to inter-language competition for model +parameters. We propose Cross-lingual Expert Language Models (X-ELM), which +mitigate this competition by independently training language models on subsets +of the multilingual corpus. This process specializes X-ELMs to different +languages while remaining effective as a multilingual ensemble. Our experiments +show that when given the same compute budget, X-ELM outperforms jointly trained +multilingual models across all considered languages and that these gains +transfer to downstream tasks. X-ELM provides additional benefits over +performance improvements: new experts can be iteratively added, adapting X-ELM +to new languages without catastrophic forgetting. Furthermore, training is +asynchronous, reducing the hardware requirements for multilingual training and +democratizing multilingual modeling. + +
+
+
+
+
+ + ☆ Mining experimental data from Materials Science literature with Large + Language Models + + +
+ This study is dedicated to evaluating the capabilities of advanced large +language models (LLMs) such as GPT-3.5-Turbo, GPT-4, and GPT-4-Turbo in the +extraction of structured information from scientific documents within the field +of materials science. We introduce a novel methodology for the comparative +analysis of intricate material expressions, emphasising the standardisation of +chemical formulas to tackle the complexities inherent in materials science +information assessment. To this end, we primarily focus on two critical tasks +of information extraction: (i) a named entity recognition (NER) of studied +materials and physical properties and (ii) a relation extraction (RE) between +these entities. The performance of LLMs in executing these tasks is benchmarked +against traditional models based on the BERT architecture and rule-based +approaches. For NER, LLMs fail to outperform the baseline with zero-shot +prompting and exhibit only limited improvement with few-shot prompting. +However, for RE, a GPT-3.5-Turbo fine-tuned with the appropriate strategy +outperforms all models, including the baseline. Without any fine-tuning, GPT-4 +and GPT-4-Turbo display remarkable reasoning and relationship extraction +capabilities after being provided with merely a couple of examples, surpassing +the baseline. Overall, the results suggest that although LLMs demonstrate +relevant reasoning skills in connecting concepts, for tasks requiring +extracting complex domain-specific entities like materials, specialised models +are currently a better choice. + +
+
+
+
+
+ + ☆ PubTator 3.0: an AI-powered Literature Resource for Unlocking Biomedical + Knowledge + + +
+ PubTator 3.0 (https://www.ncbi.nlm.nih.gov/research/pubtator3/) is a +biomedical literature resource using state-of-the-art AI techniques to offer +semantic and relation searches for key concepts like proteins, genetic +variants, diseases, and chemicals. It currently provides over one billion +entity and relation annotations across approximately 36 million PubMed +abstracts and 6 million full-text articles from the PMC open access subset, +updated weekly. PubTator 3.0's online interface and API utilize these +precomputed entity relations and synonyms to provide advanced search +capabilities and enable large-scale analyses, streamlining many complex +information needs. We showcase the retrieval quality of PubTator 3.0 using a +series of entity pair queries, demonstrating that PubTator 3.0 retrieves a +greater number of articles than either PubMed or Google Scholar, with higher +precision in the top 20 results. We further show that integrating ChatGPT +(GPT-4) with PubTator APIs dramatically improves the factuality and +verifiability of its responses. In summary, PubTator 3.0 offers a comprehensive +set of features and tools that allow researchers to navigate the ever-expanding +wealth of biomedical literature, expediting research and unlocking valuable +insights for scientific discovery. + +
+
+
+
+
+ + ☆ FAIR Enough: How Can We Develop and Assess a FAIR-Compliant Dataset for + Large Language Models' Training? + + +
+ Advancements in Large Language Models (LLMs) highlight the need for ethical +practices and data integrity. We introduce a framework that embeds FAIR +(Findable, Accessible, Interoperable, Reusable) data principles into LLM +training. This approach marks a shift towards practices compliant with FAIR +standards. Our framework presents guidelines for integrating FAIR data +principles into LLM training. This initiative includes a checklist for +researchers and developers. We also demonstrate its practical application +through a case study focused on bias identification and mitigation in our +FAIR-compliant dataset. This work is a significant contribution to AI ethics +and data science, advocating for balanced and ethical training methods in LLMs. + +
+
+
+
+
+ + ☆ Analysis and Detection of Multilingual Hate Speech Using Transformer + Based Deep Learning + + +
+ Hate speech is harmful content that directly attacks or promotes hatred +against members of groups or individuals based on actual or perceived aspects +of identity, such as race, religion, or sexual orientation. This can affect +social life on social media platforms, as hateful content shared through social +media can harm both individuals and communities. As the prevalence of hate +speech increases online, the demand for automated detection as an NLP task is +increasing. In this work, the proposed method uses a transformer-based model +to detect hate speech on social media platforms such as Twitter, Facebook, WhatsApp, +and Instagram. The proposed model is language-independent and has been +tested on Italian, English, German, and Bengali. The gold-standard datasets were +collected from the renowned researchers Zeerak Talat, Sara Tonelli, Melanie Siegel, +and Rezaul Karim. The success rate of the proposed model for hate speech +detection is higher than that of the existing baseline and state-of-the-art models, with +accuracies of 89% on the Bengali dataset, 91% on English, 91% on German, and +77% on Italian. The proposed algorithm shows a substantial +improvement over the benchmark method. + +
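As a hedged illustration of how such a language-independent classifier is commonly set up, the sketch below fine-tunes a multilingual encoder with a two-class head; the XLM-R backbone, toy inputs, and labels are assumptions, since the abstract does not name the exact architecture.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

texts = ["an example post in English", "a second toy post"]   # placeholder inputs
labels = torch.tensor([0, 1])                                 # 0 = not hateful, 1 = hateful

batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
loss = model(**batch, labels=labels).loss
loss.backward()        # one optimizer step would follow in a real training loop
```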
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ The Radiation Oncology NLP Database + + +
+ We present the Radiation Oncology NLP Database (ROND), the first dedicated +Natural Language Processing (NLP) dataset for radiation oncology, an important +medical specialty that has received limited attention from the NLP community in +the past. With the advent of Artificial General Intelligence (AGI), there is an +increasing need for specialized datasets and benchmarks to facilitate research +and development. ROND is specifically designed to address this gap in the +domain of radiation oncology, a field that offers many opportunities for NLP +exploration. It encompasses various NLP tasks including Logic Reasoning, Text +Classification, Named Entity Recognition (NER), Question Answering (QA), Text +Summarization, and Patient-Clinician Conversations, each with a distinct focus +on radiation oncology concepts and application cases. In addition, we have +developed an instruction-tuning dataset consisting of over 20k instruction +pairs (based on ROND) and trained a large language model, CancerChat. This +serves to demonstrate the potential of instruction-tuning large language models +within a highly-specialized medical domain. The evaluation results in this +study could serve as baseline results for future research. ROND aims to +stimulate advancements in radiation oncology and clinical NLP by offering a +platform for testing and improving algorithms and models in a domain-specific +context. The ROND dataset is a joint effort of multiple U.S. health +institutions. The data is available at +https://github.com/zl-liu/Radiation-Oncology-NLP-Database. + +
+
+ comment: 10 pages, 7 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language + Models + + +
+ The carbon footprint associated with large language models (LLMs) is a +significant concern, encompassing emissions from their training, inference, +experimentation, and storage processes, including operational and embodied +carbon emissions. An essential aspect is accurately estimating the carbon +impact of emerging LLMs even before their training, which heavily relies on GPU +usage. Existing studies have reported the carbon footprint of LLM training, but +only one tool, mlco2, can predict the carbon footprint of new neural networks +prior to physical training. However, mlco2 has several serious limitations. It +cannot extend its estimation to dense or mixture-of-experts (MoE) LLMs, +disregards critical architectural parameters, focuses solely on GPUs, and +cannot model embodied carbon footprints. Addressing these gaps, we introduce +\textit{LLMCarbon}, an end-to-end carbon footprint projection model designed for +both dense and MoE LLMs. Compared to mlco2, LLMCarbon significantly enhances the +accuracy of carbon footprint estimations for various LLMs. The source code is +released at \url{https://github.com/SotaroKaneda/MLCarbon}. + +
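As a rough point of reference for what such a projection involves on the operational side only (LLMCarbon itself also models embodied carbon and architectural parameters), a back-of-the-envelope estimate multiplies GPU energy by data-centre overhead and grid carbon intensity. Every number below is an illustrative assumption, not a value from the paper.

```python
gpu_hours = 1_000_000      # hypothetical total GPU-hours for a training run
gpu_power_kw = 0.4         # assumed average draw per GPU, in kW
pue = 1.1                  # assumed data-centre power usage effectiveness
grid_intensity = 0.4       # assumed kg CO2e per kWh of electricity

energy_kwh = gpu_hours * gpu_power_kw * pue
operational_tco2e = energy_kwh * grid_intensity / 1000
print(f"{operational_tco2e:.0f} tCO2e (operational only, embodied carbon excluded)")
```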
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ INACIA: Integrating Large Language Models in Brazilian Audit Courts: + Opportunities and Challenges + + +
+ This paper introduces INACIA (Instrução Assistida com Inteligência +Artificial), a groundbreaking system designed to integrate Large Language +Models (LLMs) into the operational framework of the Brazilian Federal Court of +Accounts (TCU). The system automates various stages of case analysis, including +basic information extraction, admissibility examination, Periculum in mora and +Fumus boni iuris analyses, and recommendation generation. Through a series of +experiments, we demonstrate INACIA's potential in extracting relevant +information from case documents, evaluating its legal plausibility, and +formulating propositions for judicial decision-making. Utilizing a validation +dataset alongside LLMs, our evaluation methodology presents an innovative +approach to assessing system performance, correlating highly with human +judgment. The results highlight INACIA's proficiency in handling complex legal +tasks, indicating its suitability for augmenting efficiency and judicial +fairness within legal systems. The paper also discusses potential enhancements +and future applications, positioning INACIA as a model for worldwide AI +integration in legal domains. + +
+
+
+
+
+ + ♻ ☆ How Transferable are Attribute Controllers on Pretrained Multilingual + Translation Models? EACL 2024 + + +
+ Customizing machine translation models to comply with fine-grained attributes +such as formality has seen tremendous progress recently. However, current +approaches mostly rely on at least some supervised data with attribute +annotation. Data scarcity therefore remains a bottleneck to democratizing such +customization possibilities to a wider range of languages, lower-resource ones +in particular. Given recent progress in pretrained massively multilingual +translation models, we use them as a foundation to transfer the attribute +controlling capabilities to languages without supervised data. In this work, we +present a comprehensive analysis of transferring attribute controllers based on +a pretrained NLLB-200 model. We investigate both training- and inference-time +control techniques under various data scenarios, and uncover their relative +strengths and weaknesses in zero-shot performance and domain robustness. We +show that both paradigms are complementary, as shown by consistent improvements +on 5 zero-shot directions. Moreover, a human evaluation on a real low-resource +language, Bengali, confirms our findings on zero-shot transfer to new target +languages. The code is +$\href{https://github.com/dannigt/attribute-controller-transfer}{\text{here}}$. + +
+
+ comment: EACL 2024 +
+
+
+
+
+ + ♻ ☆ MCWDST: a Minimum-Cost Weighted Directed Spanning Tree Algorithm for + Real-Time Fake News Mitigation in Social Media + + +
+ The widespread availability of internet access and handheld devices confers +on social media a power similar to the one newspapers used to have. People seek +affordable information on social media and can reach it within seconds. Yet +this convenience comes with dangers; any user may freely post whatever they +please and the content can stay online for a long period, regardless of its +truthfulness. A need to detect untruthful information, also known as fake news, +arises. In this paper, we present an end-to-end solution that accurately +detects fake news and immunizes, in real time, the network nodes that spread it. To +detect fake news, we propose two new stacked deep learning architectures that +utilize convolutional and bidirectional LSTM layers. To mitigate the spread of +fake news, we propose a real-time network-aware strategy that (1) constructs a +minimum-cost weighted directed spanning tree for a detected node, and (2) +immunizes nodes in that tree by scoring their harmfulness using a novel ranking +function. We demonstrate the effectiveness of our solution on five real-world +datasets. + +
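A small sketch of the mitigation step using networkx's Edmonds-based minimum spanning arborescence; the toy graph, edge weights, and the out-degree "harmfulness" proxy are illustrative assumptions and do not reproduce the paper's ranking function.

```python
import networkx as nx

G = nx.DiGraph()
G.add_weighted_edges_from([
    ("detected", "a", 1.0), ("detected", "b", 2.0),
    ("a", "c", 1.5), ("b", "c", 0.5), ("a", "d", 2.5),
])

tree = nx.minimum_spanning_arborescence(G)       # minimum-cost directed spanning tree
ranked = sorted(tree.nodes, key=G.out_degree, reverse=True)  # crude harmfulness proxy
print(sorted(tree.edges), ranked)                # nodes earlier in `ranked` get immunized first
```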
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 2 +
+
+
+
+
+ + ♻ ☆ Generative User-Experience Research for Developing Domain-specific + Natural Language Processing Applications + + +
+ User experience (UX) is a part of human-computer interaction (HCI) research +and focuses on increasing intuitiveness, transparency, simplicity, and trust +for the system users. Most UX research for machine learning (ML) or natural +language processing (NLP) focuses on a data-driven methodology. It engages +domain users mainly for usability evaluation. Moreover, more typical UX methods +tailor the systems towards user usability, unlike learning about the user needs +first. This paper proposes a new methodology for integrating generative UX +research into developing domain NLP applications. Generative UX research +employs domain users at the initial stages of prototype development, i.e., +ideation and concept evaluation, and the last stage for evaluating system +usefulness and user utility. The methodology emerged from and is evaluated on a +case study about the full-cycle prototype development of a domain-specific +semantic search for daily operations in the process industry. A key finding of +our case study is that involving domain experts increases their interest and +trust in the final NLP application. The combined UX+NLP research of the +proposed method efficiently considers data- and user-driven opportunities and +constraints, which can be crucial for developing NLP applications. + +
+
+
+
+
+ + ♻ ☆ Efficient slot labelling + + +
+ Slot labelling is an essential component of any dialogue system, aiming to +find important arguments in every user turn. Common approaches involve large +pre-trained language models (PLMs) like BERT or RoBERTa, but they face +challenges such as high computational requirements and dependence on +pre-training data. In this work, we propose a lightweight method which performs +on par with or better than the state-of-the-art PLM-based methods, while having +almost 10x fewer trainable parameters. This makes it especially applicable for +real-life industry scenarios. + +
+
+
+
+
+ + ♻ ☆ Exploring Iterative Enhancement for Improving Learnersourced + Multiple-Choice Question Explanations with Large Language Models + + +
+ Large language models exhibit superior capabilities in processing and +understanding language, yet their applications in educational contexts remain +underexplored. Learnersourcing enhances learning by engaging students in +creating their own educational content. When learnersourcing multiple-choice +questions, creating explanations for the solution of a question is a crucial +step; it helps other students understand the solution and promotes a deeper +understanding of related concepts. However, it is often difficult for students +to craft effective solution explanations, due to limited subject understanding. +To help scaffold the task of automated explanation generation, we present and +evaluate a framework called "ILearner-LLM", that iteratively enhances the +generated explanations for the given questions with large language models. +Comprising an explanation generation model and an explanation evaluation model, +the framework generates high-quality student-aligned explanations by +iteratively feeding the quality rating score from the evaluation model back +into the instruction prompt of the explanation generation model. Experimental +results demonstrate the effectiveness of our ILearner-LLM on LLaMA2-13B and +GPT-4 to generate higher quality explanations that are closer to those written +by students on five PeerWise datasets. Our findings represent a promising path +to enrich the learnersourcing experience for students and to enhance the +capabilities of large language models for educational applications. + +
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ♻ ☆ Measuring the Robustness of NLP Models to Domain Shifts + + +
+ Existing research on Domain Robustness (DR) suffers from disparate setups, +lack of task variety, and scarce research on recent models and capabilities +such as few-shot learning. Furthermore, we claim that the common practice of +measuring DR might further obscure the picture. Current research focuses on +challenge sets and relies solely on the Source Drop (SD): Using the source +in-domain performance as a reference point for degradation. However, the Target +Drop (TD) should be used as a complementary point of view. To understand the DR +challenge in modern NLP models, we developed a benchmark comprised of seven NLP +tasks, including classification, QA, and generation. Our benchmark focuses on +natural topical domain shifts and enables measuring both the SD and the TD. Our +comprehensive study, involving over 14,000 domain shifts across 18 fine-tuned +and few-shot models, shows that both models suffer from drops upon domain +shifts. While fine-tuned models excel in-domain, few-shot LLMs often surpass +them cross-domain, showing better robustness. In addition, we found that a +large SD can be explained by shifting to a harder domain rather than a genuine +DR challenge. Thus, the TD is a more reliable metric. + +
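The two reference points can be written down directly. Whether the drop is measured absolutely or relatively is a detail the abstract leaves open, so the sketch below shows a relative version with made-up scores; it illustrates the idea rather than the benchmark's exact formula.

```python
def source_drop(source_in_domain, cross_domain):
    """SD: degradation relative to the same model's in-domain (source) score."""
    return (source_in_domain - cross_domain) / source_in_domain

def target_drop(target_in_domain, cross_domain):
    """TD: degradation relative to a model trained and evaluated on the target domain."""
    return (target_in_domain - cross_domain) / target_in_domain

# A large SD paired with a small TD suggests the target domain is simply harder,
# rather than a genuine robustness failure.
print(source_drop(0.90, 0.72), target_drop(0.75, 0.72))
```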
+
+
+
+
+ + ♻ ☆ A ripple in time: a discontinuity in American history + + +
+ In this note we use the State of the Union Address (SOTU) dataset from Kaggle +to make some surprising (and some not so surprising) observations pertaining to +the general timeline of American history, and the character and nature of the +addresses themselves. Our main approach uses vector embeddings, such as +BERT (DistilBERT) and GPT-2. + While it is widely believed that BERT (and its variations) is most suitable +for NLP classification tasks, we find that GPT-2 in conjunction with +nonlinear dimension reduction methods such as UMAP provides better separation +and stronger clustering. This makes GPT-2 + UMAP an interesting alternative. In +our case, no model fine-tuning is required, and the pre-trained out-of-the-box +GPT-2 model is enough. + We also used a fine-tuned DistilBERT model for classification, detecting which +President delivered which address, with very good results (accuracy 93\% - 95\% +depending on the run). An analogous task was performed to determine the year of +writing, and we were able to pin it down to about 4 years (which is a single +presidential term). + It is worth noting that SOTU addresses provide relatively small writing +samples (with about 8000 words on average, and varying widely from under 2000 +words to more than 20000), and that the number of authors is relatively large +(we used SOTU addresses of 42 US presidents). This shows that the techniques +employed turn out to be rather efficient, while all the computations described +in this note can be performed using a single GPU instance of Google Colab. + The accompanying code is available on GitHub. + +
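A minimal sketch of the GPT-2 + UMAP pipeline described above, using mean-pooled final hidden states as the address embedding; the pooling choice, UMAP settings, and text snippets are assumptions, so this is not necessarily the note's exact setup.

```python
import torch
import umap
from transformers import GPT2Tokenizer, GPT2Model

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2").eval()

addresses = ["Fellow citizens ...", "The state of our union is strong ...",
             "My fellow Americans ...", "To the Congress of the United States ..."]  # toy snippets

with torch.no_grad():
    vecs = [model(**tokenizer(t, return_tensors="pt", truncation=True,
                              max_length=1024)).last_hidden_state.mean(dim=1).squeeze(0)
            for t in addresses]
embeddings = torch.stack(vecs).numpy()

coords = umap.UMAP(n_neighbors=2, n_components=2, init="random").fit_transform(embeddings)
print(coords.shape)   # (4, 2): one 2-D point per address, ready for plotting or clustering
```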
+
+ comment: 7 pages, 8 figures; GitHub repository + https://github.com/sashakolpakov/ripple_in_time +
+
+
+
+
+ + ♻ ☆ Enhancing Summarization Performance through Transformer-Based Prompt + Engineering in Automated Medical Reporting ALT + + +
+ Customized medical prompts enable Large Language Models (LLM) to effectively +address medical dialogue summarization. The process of medical reporting is +often time-consuming for healthcare professionals. Implementing medical +dialogue summarization techniques presents a viable solution to alleviate this +time constraint by generating automated medical reports. The effectiveness of +LLMs in this process is significantly influenced by the formulation of the +prompt, which plays a crucial role in determining the quality and relevance of +the generated reports. In this research, we used a combination of two distinct +prompting strategies, known as shot prompting and pattern prompting to enhance +the performance of automated medical reporting. The evaluation of the automated +medical reports is carried out using the ROUGE score and a human evaluation +with the help of an expert panel. The two-shot prompting approach in +combination with scope and domain context outperforms other methods and +achieves the highest score when compared to the human reference set by a +general practitioner. However, the automated reports are approximately twice as +long as the human references, due to the addition of both redundant and +relevant statements that are added to the report. + +
+
+ comment: 12 pages, 4 figures, to be presented at HEALTHINF 2024, author + contributions: research conducted and written by Daphne van Zandvoort and + Laura Wiersema, research suggested and used software created by Tom Huibers, + data provided and feedback provided by Sandra van Dulmen, supervision and + feedback provided by Sjaak Brinkkemper +
+
+
+
+
+ + ♻ ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graph plays a significant role in representing and analyzing complex +relationships in real-world applications such as citation networks, social +networks, and biological data. Recently, Large Language Models (LLMs), which +have achieved tremendous success in various domains, have also been leveraged +in graph-related tasks to surpass traditional Graph Neural Networks (GNNs) +based methods and yield state-of-the-art performance. In this survey, we first +present a comprehensive review and analysis of existing methods that integrate +LLMs with graphs. First of all, we propose a new taxonomy, which organizes +existing methods into three categories based on the role (i.e., enhancer, +predictor, and alignment component) played by LLMs in graph-related tasks. Then +we systematically survey the representative methods along the three categories +of the taxonomy. Finally, we discuss the remaining limitations of existing +studies and highlight promising avenues for future research. The relevant +papers are summarized and will be consistently updated at: +https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks. + +
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Aligning Large Language Models with Counterfactual DPO + + +
+ Advancements in large language models (LLMs) have demonstrated remarkable +capabilities across a diverse range of applications. These models excel in +generating text completions that are contextually coherent and cover an +extensive array of subjects. However, the vast datasets required for their +training make aligning response styles during the pretraining and instruction +tuning phases challenging. Consequently, an additional alignment phase is +typically employed, wherein the model is further trained with human preference +data to better align its outputs with human expectations. While this process +doesn't introduce new capabilities per se, it does accentuate generation styles +innate to the model. This paper explores the utilization of counterfactual +prompting within the framework of Direct Preference Optimization (DPO) to align +the model's style without relying on human intervention. We demonstrate that +this method effectively instils desirable behaviour, mitigates undesirable +ones, and encourages the model to disregard inappropriate instructions. Our +findings suggest that counterfactual prompting with DPO presents a low-resource +way to fine-tune LLMs to meet the demands for responsible and ethically aligned +AI systems. + +
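For reference, the standard DPO objective that the paper builds on can be written compactly as below; the counterfactual element lies in how the preferred and dispreferred completions are constructed (via counterfactual prompts rather than human labels), which this sketch does not reproduce, and the beta value and log-probabilities are illustrative.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logp, policy_rejected_logp,
             ref_chosen_logp, ref_rejected_logp, beta=0.1):
    """-log sigmoid(beta * [(log pi/pi_ref)(chosen) - (log pi/pi_ref)(rejected)])"""
    chosen_ratio = policy_chosen_logp - ref_chosen_logp
    rejected_ratio = policy_rejected_logp - ref_rejected_logp
    return -F.logsigmoid(beta * (chosen_ratio - rejected_ratio)).mean()

loss = dpo_loss(torch.tensor([-4.2]), torch.tensor([-7.9]),
                torch.tensor([-4.5]), torch.tensor([-7.1]))
print(loss)
```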
+
+
+
+
+ + ♻ ☆ RoTBench: A Multi-Level Benchmark for Evaluating the Robustness of Large + Language Models in Tool Learning + + +
+ Tool learning has generated widespread interest as a vital means of +interaction between Large Language Models (LLMs) and the physical world. +Current research predominantly emphasizes LLMs' capacity to utilize tools in +well-structured environments while overlooking their stability when confronted +with the inevitable noise of the real world. To bridge this gap, we introduce +RoTBench, a multi-level benchmark for evaluating the robustness of LLMs in tool +learning. Specifically, we establish five external environments, each featuring +varying levels of noise (i.e., Clean, Slight, Medium, Heavy, and Union), +providing an in-depth analysis of the model's resilience across three critical +phases: tool selection, parameter identification, and content filling. +Experiments involving six widely-used models underscore the urgent necessity +for enhancing the robustness of LLMs in tool learning. For instance, the +performance of GPT-4 even drops significantly from 80.00 to 58.10 when there is +no substantial change in manual accuracy. More surprisingly, the noise +correction capability inherent in the GPT family paradoxically impedes its +adaptability in the face of mild noise. In light of these findings, we propose +RoTTuning, a strategy that enriches the diversity of training environments to +bolster the robustness of LLMs in tool learning. The code and data are +available at https://github.com/Junjie-Ye/RoTBench. + +
+
+
+
+
+ + ♻ ☆ TransNormerLLM: A Faster and Better Large Language Model with Improved + TransNormer + + +
+ We present TransNormerLLM, the first linear attention-based Large Language +Model (LLM) that outperforms conventional softmax attention-based models in +terms of both accuracy and efficiency. TransNormerLLM evolves from the previous +linear attention architecture TransNormer by making advanced modifications that +include positional embedding, linear attention acceleration, gating mechanisms, +tensor normalization, and inference acceleration and stabilization. +Specifically, we use LRPE together with an exponential decay to avoid attention +dilution issues while allowing the model to retain global interactions between +tokens. Additionally, we propose Lightning Attention, a cutting-edge technique +that accelerates linear attention by more than twice in runtime and reduces +memory usage by a remarkable four times. To further enhance the performance of +TransNormer, we leverage a gating mechanism for smooth training and a new +tensor normalization scheme to accelerate the model, resulting in an impressive +acceleration of over $20\%$. Furthermore, we develop a robust inference +algorithm that ensures numerical stability and consistent inference speed, +regardless of the sequence length, showcasing superior efficiency during both +training and inference stages. We also implement an efficient model parallel +schema for TransNormerLLM, enabling seamless deployment on large-scale clusters +and facilitating expansion to even more extensive models, i.e., LLMs with 175B +parameters. We validate our model design through a series of ablations and +train models with sizes of 385M, 1B, and 7B on our self-collected corpus. +Benchmark results demonstrate that our models not only match the performance of +state-of-the-art LLMs with Transformer but are also significantly faster. Code +is released at: https://github.com/OpenNLPLab/TransnormerLLM. + +
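As background for why linear attention changes the cost profile, the sketch below shows the generic regrouping such architectures exploit: with a positive feature map in place of softmax, (QK^T)V can be computed as Q(K^T V) in O(n d^2). The ReLU feature map and shapes are illustrative; this is not TransNormerLLM's Lightning Attention kernel, LRPE, or decay scheme.

```python
import torch

def linear_attention(q, k, v, eps=1e-6):
    """Non-causal linear attention: softmax is dropped, so K^T V is built once."""
    q, k = torch.relu(q) + eps, torch.relu(k) + eps                 # simple positive feature map
    kv = torch.einsum("bnd,bne->bde", k, v)                         # K^T V: (batch, d_k, d_v)
    z = 1.0 / (torch.einsum("bnd,bd->bn", q, k.sum(dim=1)) + eps)   # per-position normalizer
    return torch.einsum("bnd,bde,bn->bne", q, kv, z)

out = linear_attention(torch.randn(2, 16, 64), torch.randn(2, 16, 64), torch.randn(2, 16, 64))
print(out.shape)   # torch.Size([2, 16, 64])
```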
+
+ comment: Technical Report. Yiran Zhong is the corresponding author. Zhen Qin, + Dong Li, Weigao Sun, Weixuan Sun, Xuyang Shen contribute equally to this + paper. Code is released at: https://github.com/OpenNLPLab/TransnormerLLM +
+
+
+
+
+ + ♻ ☆ KnowledgeNavigator: Leveraging Large Language Models for Enhanced + Reasoning over Knowledge Graph + + +
+ Large language models (LLMs) have achieved outstanding performance on various +downstream tasks with their powerful natural language understanding and zero-shot +capabilities, but LLMs still suffer from knowledge limitations. Especially in +scenarios that require long logical chains or complex reasoning, the +hallucination and knowledge limitations of LLMs limit their performance in question +answering (QA). In this paper, we propose a novel framework, KnowledgeNavigator, +to address these challenges by efficiently and accurately retrieving external +knowledge from a knowledge graph and using it as a key factor to enhance LLM +reasoning. Specifically, KnowledgeNavigator first mines and enhances the +potential constraints of the given question to guide the reasoning. Then it +retrieves and filters external knowledge that supports answering through +iterative reasoning on the knowledge graph with the guidance of the LLM and the +question. Finally, KnowledgeNavigator constructs the structured knowledge into +effective prompts that are friendly to the LLM to aid its reasoning. We evaluate +KnowledgeNavigator on multiple public KGQA benchmarks; the experiments show that the +framework has great effectiveness and generalization, outperforming previous +knowledge-graph-enhanced LLM methods and performing comparably to fully supervised +models. + +
+
+
+
+
+ + ♻ ☆ How Abilities in Large Language Models are Affected by Supervised + Fine-tuning Data Composition + + +
+ Large language models (LLMs) with enormous numbers of pre-training tokens and parameters +develop diverse abilities, including math reasoning, code generation, and +instruction following. These abilities are further enhanced by supervised +fine-tuning (SFT). While the open-source community has explored ad-hoc SFT for +enhancing individual capabilities, proprietary LLMs exhibit versatility across +various skills. Therefore, understanding the facilitation of multiple abilities +via SFT is paramount. In this study, we specifically focus on the interplay +of data composition between mathematical reasoning, code generation, and +general human-aligning abilities during SFT. We propose four intriguing +research questions to explore the association between model performance and +various factors including data amount, composition ratio, model size, and SFT +strategies. Our experiments reveal that distinct capabilities scale differently +and larger models generally show superior performance with the same amount of data. +Mathematical reasoning and code generation consistently improve with increasing +data amount, whereas general abilities plateau after roughly a thousand +samples. Moreover, we observe that data composition appears to enhance various +abilities under limited data conditions, yet can lead to performance conflicts +when data is plentiful. Our findings also suggest that the amount of composition +data influences performance more than the composition ratio. In our analysis of SFT +strategies, we find that sequentially learning multiple skills risks +catastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT) +strategy offers a promising solution to learn multiple abilities with different +scaling patterns. + +
+
+
+
+
+ + ♻ ☆ Improving Text Embeddings with Large Language Models + + +
+ In this paper, we introduce a novel and simple method for obtaining +high-quality text embeddings using only synthetic data and less than 1k +training steps. Unlike existing methods that often depend on multi-stage +intermediate pre-training with billions of weakly-supervised text pairs, +followed by fine-tuning with a few labeled datasets, our method does not +require building complex training pipelines or relying on manually collected +datasets that are often constrained by task diversity and language coverage. We +leverage proprietary LLMs to generate diverse synthetic data for hundreds of +thousands of text embedding tasks across nearly 100 languages. We then +fine-tune open-source decoder-only LLMs on the synthetic data using standard +contrastive loss. Experiments demonstrate that our method achieves strong +performance on highly competitive text embedding benchmarks without using any +labeled data. Furthermore, when fine-tuned with a mixture of synthetic and +labeled data, our model sets new state-of-the-art results on the BEIR and MTEB +benchmarks. + +
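A minimal sketch of the standard in-batch contrastive objective used in this line of work, where the i-th passage in the batch is the positive for the i-th query; the temperature and embedding dimension are illustrative assumptions, and real training would typically also add mined hard negatives.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(query_emb, passage_emb, temperature=0.02):
    q = F.normalize(query_emb, dim=-1)
    p = F.normalize(passage_emb, dim=-1)
    logits = q @ p.T / temperature           # (batch, batch) cosine similarities
    labels = torch.arange(q.size(0))         # diagonal entries are the positives
    return F.cross_entropy(logits, labels)

loss = in_batch_contrastive_loss(torch.randn(8, 4096), torch.randn(8, 4096))
print(loss)
```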
+
+ comment: 20 pages, 15 tables +
+
+
+
+
+ + ♻ ☆ Better Explain Transformers by Illuminating Important Information + + +
+ Transformer-based models excel in various natural language processing (NLP) +tasks, attracting countless efforts to explain their inner workings. Prior +methods explain Transformers by focusing on the raw gradient and attention as +token attribution scores, where non-relevant information is often considered +during explanation computation, resulting in confusing results. In this work, +we propose highlighting the important information and eliminating irrelevant +information by a refined information flow on top of the layer-wise relevance +propagation (LRP) method. Specifically, we consider identifying syntactic and +positional heads as important attention heads and focus on the relevance +obtained from these important heads. Experimental results demonstrate that +irrelevant information does distort output attribution scores and then should +be masked during explanation computation. Compared to eight baselines on both +classification and question-answering datasets, our method consistently +outperforms with over 3\% to 33\% improvement on explanation metrics, providing +superior explanation performance. Our anonymous code repository is available +at: https://github.com/LinxinS97/Mask-LRP + +
+
+
+
+
+ + ♻ ☆ UniversalNER: Targeted Distillation from Large Language Models for Open + Named Entity Recognition ICLR 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable generalizability, +such as understanding arbitrary entities and relations. Instruction tuning has +proven effective for distilling LLMs into more cost-efficient models such as +Alpaca and Vicuna. Yet such student models still trail the original LLMs by +large margins in downstream applications. In this paper, we explore targeted +distillation with mission-focused instruction tuning to train student models +that can excel in a broad application class such as open information +extraction. Using named entity recognition (NER) as a case study, we show how +ChatGPT can be distilled into much smaller UniversalNER models for open NER. +For evaluation, we assemble the largest NER benchmark to date, comprising 43 +datasets across 9 diverse domains such as biomedicine, programming, social +media, law, and finance. Without using any direct supervision, UniversalNER attains +remarkable NER accuracy across tens of thousands of entity types, outperforming +general instruction-tuned models such as Alpaca and Vicuna by over 30 absolute +F1 points on average. With a tiny fraction of the parameters, UniversalNER not only +acquires ChatGPT's capability in recognizing arbitrary entity types, but also +outperforms its NER accuracy by 7-9 absolute F1 points on average. Remarkably, +UniversalNER even outperforms, by a large margin, state-of-the-art multi-task +instruction-tuned systems such as InstructUIE, which uses supervised NER +examples. We also conduct thorough ablation studies to assess the impact of +various components in our distillation approach. We release the distillation +recipe, data, and UniversalNER models to facilitate future research on targeted +distillation. + +
+
+ comment: Accepted at ICLR 2024. Project page: https://universal-ner.github.io/ +
+
+
+
+
+ + ♻ ☆ Chain-of-Table: Evolving Tables in the Reasoning Chain for Table + Understanding ICLR 2024 + + +
+ Table-based reasoning with large language models (LLMs) is a promising +direction to tackle many table understanding tasks, such as table-based +question answering and fact verification. Compared with generic reasoning, +table-based reasoning requires the extraction of underlying semantics from both +free-form questions and semi-structured tabular data. Chain-of-Thought and its +similar approaches incorporate the reasoning chain in the form of textual +context, but it is still an open question how to effectively leverage tabular +data in the reasoning chain. We propose the Chain-of-Table framework, where +tabular data is explicitly used in the reasoning chain as a proxy for +intermediate thoughts. Specifically, we guide LLMs using in-context learning to +iteratively generate operations and update the table to represent a tabular +reasoning chain. LLMs can therefore dynamically plan the next operation based +on the results of the previous ones. This continuous evolution of the table +forms a chain, showing the reasoning process for a given tabular problem. The +chain carries structured information of the intermediate results, enabling more +accurate and reliable predictions. Chain-of-Table achieves new state-of-the-art +performance on WikiTQ, FeTaQA, and TabFact benchmarks across multiple LLM +choices. + +
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 91 + +
+
+
+ + ☆ Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data + + +
+ This work presents Depth Anything, a highly practical solution for robust +monocular depth estimation. Without pursuing novel technical modules, we aim to +build a simple yet powerful foundation model dealing with any images under any +circumstances. To this end, we scale up the dataset by designing a data engine +to collect and automatically annotate large-scale unlabeled data (~62M), which +significantly enlarges the data coverage and thus is able to reduce the +generalization error. We investigate two simple yet effective strategies that +make data scaling-up promising. First, a more challenging optimization target +is created by leveraging data augmentation tools. It compels the model to +actively seek extra visual knowledge and acquire robust representations. +Second, an auxiliary supervision is developed to enforce the model to inherit +rich semantic priors from pre-trained encoders. We evaluate its zero-shot +capabilities extensively, including six public datasets and randomly captured +photos. It demonstrates impressive generalization ability. Further, through +fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs +are set. Our better depth model also results in a better depth-conditioned +ControlNet. Our models are released at +https://github.com/LiheYoung/Depth-Anything. + +
+
+ comment: Project page: https://depth-anything.github.io +
+
+
+
+
+ + ☆ Event detection from novel data sources: Leveraging satellite imagery + alongside GPS traces + + +
+ Rapid identification and response to breaking events, particularly those that +pose a threat to human life such as natural disasters or conflicts, is of +paramount importance. The prevalence of mobile devices and the ubiquity of +network connectivity has generated a massive amount of temporally- and +spatially-stamped data. Numerous studies have used mobile data to derive +individual human mobility patterns for various applications. Similarly, the +increasing number of orbital satellites has made it easier to gather +high-resolution images capturing a snapshot of a geographical area in sub-daily +temporal frequency. We propose a novel data fusion methodology integrating +satellite imagery with privacy-enhanced mobile data to augment the event +inference task, whether in real-time or historical. In the absence of boots on +the ground, mobile data is able to give an approximation of human mobility, +proximity to one another, and the built environment. On the other hand, +satellite imagery can provide visual information on physical changes to the +built and natural environment. The expected use cases for our methodology +include small-scale disaster detection (i.e., tornadoes, wildfires, and floods) +in rural regions, search and rescue operation augmentation for lost hikers in +remote wilderness areas, and identification of active conflict areas and +population displacement in war-torn states. Our implementation is open-source +on GitHub: https://github.com/ekinugurel/SatMobFusion. + +
+
+
+
+
+ + ☆ Synthesizing Moving People with 3D Control + + +
+ In this paper, we present a diffusion model-based framework for animating +people from a single image for a given target 3D motion sequence. Our approach +has two core components: a) learning priors about invisible parts of the human +body and clothing, and b) rendering novel body poses with proper clothing and +texture. For the first part, we learn an in-filling diffusion model to +hallucinate unseen parts of a person given a single image. We train this model +on texture map space, which makes it more sample-efficient since it is +invariant to pose and viewpoint. Second, we develop a diffusion-based rendering +pipeline, which is controlled by 3D human poses. This produces realistic +renderings of novel poses of the person, including clothing, hair, and +plausible in-filling of unseen regions. This disentangled approach allows our +method to generate a sequence of images that are faithful to the target motion +in the 3D pose and, to the input image in terms of visual similarity. In +addition to that, the 3D control allows various synthetic camera trajectories +to render a person. Our experiments show that our method is resilient in +generating prolonged motions and varied challenging and complex poses compared +to prior methods. Please check our website for more details: +https://boyiliee.github.io/3DHM.github.io/. + +
+
+
+
+
+ + ☆ SCENES: Subpixel Correspondence Estimation With Epipolar Supervision + + +
+ Extracting point correspondences from two or more views of a scene is a +fundamental computer vision problem with particular importance for relative +camera pose estimation and structure-from-motion. Existing local feature +matching approaches, trained with correspondence supervision on large-scale +datasets, obtain highly-accurate matches on the test sets. However, they do not +generalise well to new datasets with different characteristics to those they +were trained on, unlike classic feature extractors. Instead, they require +finetuning, which assumes that ground-truth correspondences or ground-truth +camera poses and 3D structure are available. We relax this assumption by +removing the requirement of 3D structure, e.g., depth maps or point clouds, and +only require camera pose information, which can be obtained from odometry. We +do so by replacing correspondence losses with epipolar losses, which encourage +putative matches to lie on the associated epipolar line. While weaker than +correspondence supervision, we observe that this cue is sufficient for +finetuning existing models on new data. We then further relax the assumption of +known camera poses by using pose estimates in a novel bootstrapping approach. +We evaluate on highly challenging datasets, including an indoor drone dataset +and an outdoor smartphone camera dataset, and obtain state-of-the-art results +without strong supervision. + +
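A plausible form of the epipolar supervision is sketched below in PyTorch: the fundamental matrix is assembled from the (possibly odometry-derived) relative pose and the intrinsics, and putative matches are penalized by their Sampson epipolar distance; the exact loss used in the paper may differ.

```python
import torch

def skew(t: torch.Tensor) -> torch.Tensor:
    """3x3 cross-product matrix of a translation vector t of shape (3,)."""
    tx, ty, tz = t.unbind()
    zero = torch.zeros((), dtype=t.dtype)
    return torch.stack([
        torch.stack([zero, -tz, ty]),
        torch.stack([tz, zero, -tx]),
        torch.stack([-ty, tx, zero]),
    ])

def fundamental_from_pose(R, t, K1, K2):
    """F such that x2^T F x1 = 0, with x2 ~ R x1 + t in camera coordinates."""
    E = skew(t) @ R
    return torch.linalg.inv(K2).T @ E @ torch.linalg.inv(K1)

def epipolar_loss(pts1, pts2, F):
    """Sampson epipolar distance of putative matches; pts1, pts2: (N, 2) pixels."""
    ones = torch.ones(pts1.shape[0], 1, dtype=pts1.dtype)
    x1 = torch.cat([pts1, ones], dim=1)   # (N, 3) homogeneous
    x2 = torch.cat([pts2, ones], dim=1)
    Fx1 = x1 @ F.T                        # epipolar lines in image 2
    Ftx2 = x2 @ F                         # epipolar lines in image 1
    num = (x2 * Fx1).sum(dim=1) ** 2
    den = Fx1[:, 0]**2 + Fx1[:, 1]**2 + Ftx2[:, 0]**2 + Ftx2[:, 1]**2
    return (num / den.clamp(min=1e-9)).mean()
```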
+
+
+
+
+ + ☆ The Cadaver in the Machine: The Social Practices of Measurement and + Validation in Motion Capture Technology + + +
+ Motion capture systems, used across various domains, make body +representations concrete through technical processes. We argue that the +measurement of bodies and the validation of measurements for motion capture +systems can be understood as social practices. By analyzing the findings of a +systematic literature review (N=278) through the lens of social practice +theory, we show how these practices, and their varying attention to errors, +become ingrained in motion capture design and innovation over time. Moreover, +we show how contemporary motion capture systems perpetuate assumptions about +human bodies and their movements. We suggest that social practices of +measurement and validation are ubiquitous in the development of data- and +sensor-driven systems more broadly, and provide this work as a basis for +investigating hidden design assumptions and their potential negative +consequences in human-computer interaction. + +
+
+ comment: 34 pages, 9 figures. To appear in the 2024 ACM CHI Conference on + Human Factors in Computing Systems (CHI '24) +
+
+
+
+
+ + ☆ Motion Consistency Loss for Monocular Visual Odometry with + Attention-Based Deep Learning + + +
+ Deep learning algorithms have driven impressive progress in many complex +tasks. The loss function is a core component of deep learning techniques, +guiding the learning process of neural networks. This paper contributes by +introducing a motion consistency loss for deep learning-based visual odometry. +The motion consistency loss exploits repeated motions that appear +in consecutive, overlapping video clips. Experimental results show that our +approach improves the performance of a model on the KITTI odometry benchmark. + 
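A minimal sketch of one way such a consistency term could look, assuming two consecutive clips that share `overlap` frames and per-frame 6-DoF pose predictions; the actual formulation in the paper may differ.

```python
import torch

def motion_consistency_loss(poses_a: torch.Tensor, poses_b: torch.Tensor,
                            overlap: int) -> torch.Tensor:
    """poses_a / poses_b: per-frame 6-DoF predictions of shape (T, 6) for two
    consecutive clips that share `overlap` frames. The motions predicted for
    the shared frames should agree, regardless of which clip produced them."""
    shared_a = poses_a[-overlap:]   # last frames of the earlier clip
    shared_b = poses_b[:overlap]    # first frames of the later clip
    return torch.nn.functional.l1_loss(shared_a, shared_b)
```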
+
+
+
+
+ + ☆ Source-Free and Image-Only Unsupervised Domain Adaptation for Category + Level Object Pose Estimation ICLR 2024 + + +
+ We consider the problem of source-free unsupervised category-level pose +estimation from only RGB images to a target domain without any access to source +domain data or 3D annotations during adaptation. Collecting and annotating +real-world 3D data and corresponding images is laborious, expensive, yet +unavoidable process, since even 3D pose domain adaptation methods require 3D +data in the target domain. We introduce 3DUDA, a method capable of adapting to +a nuisance-ridden target domain without 3D or depth data. Our key insight stems +from the observation that specific object subparts remain stable across +out-of-domain (OOD) scenarios, enabling strategic utilization of these +invariant subcomponents for effective model updates. We represent object +categories as simple cuboid meshes, and harness a generative model of neural +feature activations modeled at each mesh vertex learnt using differential +rendering. We focus on individual locally robust mesh vertex features and +iteratively update them based on their proximity to corresponding features in +the target domain even when the global pose is not correct. Our model is then +trained in an EM fashion, alternating between updating the vertex features and +the feature extractor. We show that our method simulates fine-tuning on a +global pseudo-labeled dataset under mild assumptions, which converges to the +target domain asymptotically. Through extensive empirical validation, including +a complex extreme UDA setup which combines real nuisances, synthetic noise, and +occlusion, we demonstrate the potency of our simple approach in addressing the +domain shift challenge and significantly improving pose estimation accuracy. + +
+
+ comment: 36 pages, 9 figures, 50 tables; ICLR 2024 (Poster) +
+
+
+
+
+ + ☆ Understanding Video Transformers via Universal Concept Discovery + + +
+ This paper studies the problem of concept-based interpretability of +transformer representations for videos. Concretely, we seek to explain the +decision-making process of video transformers based on high-level, +spatiotemporal concepts that are automatically discovered. Prior research on +concept-based interpretability has concentrated solely on image-level tasks. +Comparatively, video models deal with the added temporal dimension, increasing +complexity and posing challenges in identifying dynamic concepts over time. In +this work, we systematically address these challenges by introducing the first +Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose +an efficient approach for unsupervised identification of units of video +transformer representations (concepts) and for ranking their importance to the +output of a model. The resulting concepts are highly interpretable, revealing +spatio-temporal reasoning mechanisms and object-centric representations in +unstructured video models. Performing this analysis jointly over a diverse set +of supervised and self-supervised representations, we discover that some of +these mechanisms are universal in video transformers. Finally, we demonstrate +that VTCD can be used to improve model performance for fine-grained tasks. + 
+
+
+
+
+ + ☆ ActAnywhere: Subject-Aware Video Background Generation + + +
+ Generating video background tailored to foreground subject motion is an +important problem for the movie industry and visual effects community. This +task involves synthesizing background that aligns with the motion and +appearance of the foreground subject, while also complying with the artist's +creative intention. We introduce ActAnywhere, a generative model that automates +this process, which traditionally requires tedious manual effort. Our model +leverages the power of large-scale video diffusion models, and is specifically +tailored for this task. ActAnywhere takes a sequence of foreground subject +segmentations as input and an image that describes the desired scene as +condition, to produce a coherent video with realistic foreground-background +interactions while adhering to the condition frame. We train our model on a +large-scale dataset of human-scene interaction videos. Extensive evaluations +demonstrate the superior performance of our model, significantly outperforming +baselines. Moreover, we show that ActAnywhere generalizes to diverse +out-of-distribution samples, including non-human subjects. Please visit our +project webpage at https://actanywhere.github.io. + 
+
+
+
+
+ + ☆ RAD-DINO: Exploring Scalable Medical Image Encoders Beyond Text + Supervision + + +
+ Language-supervised pre-training has proven to be a valuable method for +extracting semantically meaningful features from images, serving as a +foundational element in multimodal systems within the computer vision and +medical imaging domains. However, resulting features are limited by the +information contained within the text. This is particularly problematic in +medical imaging, where radiologists' written findings focus on specific +observations; a challenge compounded by the scarcity of paired imaging-text +data due to concerns over leakage of personal health information. In this work, +we fundamentally challenge the prevailing reliance on language supervision for +learning general purpose biomedical imaging encoders. We introduce RAD-DINO, a +biomedical image encoder pre-trained solely on unimodal biomedical imaging data +that obtains similar or greater performance than state-of-the-art biomedical +language supervised models on a diverse range of benchmarks. Specifically, the +quality of learned representations is evaluated on standard imaging tasks +(classification and semantic segmentation), and a vision-language alignment +task (text report generation from images). To further demonstrate the drawback +of language supervision, we show that features from RAD-DINO correlate with +other medical records (e.g., sex or age) better than language-supervised +models, which are generally not mentioned in radiology reports. Finally, we +conduct a series of ablations determining the factors in RAD-DINO's +performance; notably, we observe that RAD-DINO's downstream performance scales +well with the quantity and diversity of training data, demonstrating that +image-only supervision is a scalable approach for training a foundational +biomedical image encoder. + +
+
+
+
+
+ + ☆ Learning to Visually Connect Actions and their Effects + + +
+ In this work, we introduce the novel concept of visually Connecting Actions +and Their Effects (CATE) in video understanding. CATE can have applications in +areas like task planning and learning from demonstration. We propose different +CATE-based task formulations, such as action selection and action +specification, where video understanding models connect actions and effects at +semantic and fine-grained levels. We observe that different formulations +produce representations capturing intuitive action properties. We also design +various baseline models for action selection and action specification. Despite +the intuitive nature of the task, we observe that models struggle, and humans +outperform them by a large margin. The study aims to establish a foundation for +future efforts, showcasing the flexibility and versatility of connecting +actions and effects in video understanding, with the hope of inspiring advanced +formulations and models. + +
+
+
+
+
+ + ☆ Measuring the Impact of Scene Level Objects on Object Detection: Towards + Quantitative Explanations of Detection Decisions + + +
+ Although accuracy and other common metrics can provide a useful window into +the performance of an object detection model, they lack a deeper view of the +model's decision process. Regardless of the quality of the training data and +process, the features that an object detection model learns cannot be +guaranteed. A model may learn a relationship between certain background +context, i.e., scene level objects, and the presence of the labeled classes. +Furthermore, standard performance verification and metrics would not identify +this phenomenon. This paper presents a new black box explainability method for +additional verification of object detection models by finding the impact of +scene level objects on the identification of the objects within the image. By +comparing the accuracies of a model on test data with and without certain scene +level objects, the contributions of these objects to the model's performance +become clearer. The experiment presented here assesses the impact of +buildings and people in image context on the detection of emergency road +vehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the +presence of a scene level object indicates the model's reliance on that +object to make its detections. The results of this research provide a +quantitative explanation of the object detection model's decision process, +enabling a deeper understanding of the model's performance. + 
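The probe itself reduces to comparing accuracies over two test subsets; a small Python sketch with an assumed per-image interface:

```python
def scene_object_impact(results, has_scene_object):
    """results: per-image detection correctness (bool); has_scene_object:
    per-image flag, e.g. 'a building is visible in this image'. A large
    accuracy gap suggests the detector leans on that context object."""
    with_obj = [r for r, f in zip(results, has_scene_object) if f]
    without_obj = [r for r, f in zip(results, has_scene_object) if not f]
    acc = lambda xs: sum(xs) / max(len(xs), 1)
    return acc(with_obj) - acc(without_obj)
```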
+
+ comment: 9 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ Sat2Scene: 3D Urban Scene Generation from Satellite Images with + Diffusion + + +
+ Directly generating scenes from satellite imagery offers exciting +possibilities for integration into applications like games and map services. +However, challenges arise from significant view changes and scene scale. +Previous efforts mainly focused on image or video generation, lacking +exploration into the adaptability of scene generation for arbitrary views. +Existing 3D generation works either operate at the object level or are +difficult to utilize the geometry obtained from satellite imagery. To overcome +these limitations, we propose a novel architecture for direct 3D scene +generation by introducing diffusion models into 3D sparse representations and +combining them with neural rendering techniques. Specifically, our approach +generates texture colors at the point level for a given geometry using a 3D +diffusion model first, which is then transformed into a scene representation in +a feed-forward manner. The representation can be utilized to render arbitrary +views which would excel in both single-frame quality and inter-frame +consistency. Experiments in two city-scale datasets show that our model +demonstrates proficiency in generating photo-realistic street-view image +sequences and cross-view urban scenes from satellite imagery. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Determination of efficiency indicators of the stand for intelligent + control of manual operations in industrial production + + +
+ Systems of intelligent control of manual operations in industrial production +are being implemented in many industries nowadays. Such systems use +high-resolution cameras and computer vision algorithms to automatically track +the operator's manipulations and prevent technological errors in the assembly +process. At the same time, compliance with safety regulations in the workspace +is monitored. As a result, the defect rate of manufactured products and the +number of accidents during the manual assembly of any device are decreased. +Before implementing an intelligent control system in real production, it is +necessary to estimate its efficiency. To do so, experiments were carried out on +the stand for manual operation control systems. This paper +proposes a methodology for calculating the efficiency indicators. The +mathematical approach is based on calculating the IoU between the real and +predicted time intervals of assembly stages. The results show high +precision in tracking the validity of manual assembly and are independent of +the duration of the assembly process. + 
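The core quantity, the IoU between a ground-truth and a predicted time interval of an assembly stage, can be computed as in this short sketch (the aggregation across stages and operators is omitted):

```python
def interval_iou(real: tuple, pred: tuple) -> float:
    """IoU of two closed time intervals (start, end), e.g. in seconds."""
    inter = max(0.0, min(real[1], pred[1]) - max(real[0], pred[0]))
    union = (real[1] - real[0]) + (pred[1] - pred[0]) - inter
    return inter / union if union > 0 else 0.0

# A stage detected from 12.0 s to 20.5 s against a ground truth of 11.0-21.0 s:
print(interval_iou((11.0, 21.0), (12.0, 20.5)))  # 0.85
```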
+
+
+
+
+ + ☆ NN-VVC: Versatile Video Coding boosted by self-supervisedly learned + image coding for machines + + +
+ The recent progress in artificial intelligence has led to an ever-increasing +usage of images and videos by machine analysis algorithms, mainly neural +networks. Nonetheless, compression, storage and transmission of media have +traditionally been designed considering human beings as the viewers of the +content. Recent research on image and video coding for machine analysis has +progressed mainly in two almost orthogonal directions. The first is represented +by end-to-end (E2E) learned codecs which, while offering high performance on +image coding, are not yet on par with state-of-the-art conventional video +codecs and lack interoperability. The second direction considers using the +Versatile Video Coding (VVC) standard or any other conventional video codec +(CVC) together with pre- and post-processing operations targeting machine +analysis. While the CVC-based methods benefit from interoperability and broad +hardware and software support, the machine task performance is often lower than +the desired level, particularly at low bitrates. This paper proposes a hybrid +codec for machines called NN-VVC, which combines the advantages of an +E2E-learned image codec and a CVC to achieve high performance in both image and +video coding for machines. Our experiments show that the proposed system +achieved up to -43.20% and -26.8% Bjøntegaard Delta rate reduction over VVC +for image and video data, respectively, when evaluated on multiple different +datasets and machine vision tasks. To the best of our knowledge, this is the +first research paper showing a hybrid video codec that outperforms VVC on +multiple datasets and multiple machine vision tasks. + 
+
+ comment: ISM 2023 Best paper award winner version +
+
+
+
+
+ + ☆ HiCD: Change Detection in Quality-Varied Images via Hierarchical + Correlation Distillation + + +
+ Advanced change detection techniques primarily target image pairs of equal +and high quality. However, variations in imaging conditions and platforms +frequently lead to image pairs with distinct qualities: one image being +high-quality, while the other being low-quality. These disparities in image +quality present significant challenges for understanding image pairs +semantically and extracting change features, ultimately resulting in a notable +decline in performance. To tackle this challenge, we introduce an innovative +training strategy grounded in knowledge distillation. The core idea revolves +around leveraging task knowledge acquired from high-quality image pairs to +guide the model's learning process when dealing with image pairs that exhibit +differences in quality. Additionally, we develop a hierarchical correlation +distillation approach (involving self-correlation, cross-correlation, and +global correlation). This approach compels the student model to replicate the +correlations inherent in the teacher model, rather than focusing solely on +individual features. This ensures effective knowledge transfer while +maintaining the student model's training flexibility. + +
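Of the three correlation levels, the self-correlation term is the easiest to sketch; below is a hedged PyTorch approximation in which the student matches the teacher's spatial cosine-similarity structure rather than its raw features. The cross- and global-correlation terms, and any loss weighting, are omitted.

```python
import torch
import torch.nn.functional as F

def self_correlation(feat: torch.Tensor) -> torch.Tensor:
    """feat: (B, C, H, W) -> pairwise cosine similarity between all spatial
    positions, shape (B, H*W, H*W)."""
    f = feat.flatten(2).transpose(1, 2)     # (B, HW, C)
    f = F.normalize(f, dim=-1)
    return f @ f.transpose(1, 2)

def correlation_distillation_loss(student_feat, teacher_feat):
    """The student mimics the teacher's correlation structure instead of its
    raw features, which leaves the student freer in its own feature space."""
    return F.mse_loss(self_correlation(student_feat),
                      self_correlation(teacher_feat).detach())
```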
+
+ comment: accepted by TGRS +
+
+
+
+
+ + ☆ Character Recognition in Byzantine Seals with Deep Neural Networks + + +
+ Seals are small coin-shaped artifacts, mostly made of lead, held with strings +to seal letters. This work presents the first attempt towards automatic reading +of text on Byzantine seal images. Byzantine seals are generally decorated with +iconography on the obverse side and Greek text on the reverse side. Text may +include the sender's name, position in the Byzantine aristocracy, and elements +of prayers. Both text and iconography are precious literary sources that await +electronic exploitation, so the development of computerized systems for +interpreting seal images is of paramount importance. This work's contribution +is hence a deep, two-stage character reading pipeline for transcribing +Byzantine seal images. A first deep convolutional neural network (CNN) detects +characters in the seal (character localization). A second convolutional network +reads the localized characters (character classification). Finally, a +diplomatic transcription of the seal is provided by post-processing the two +network outputs. We provide an experimental evaluation of each CNN in isolation +and of both CNNs in combination. All performances are evaluated by +cross-validation. Character localization achieves a mean average precision +(mAP@0.5) greater than 0.9. Classification of characters cropped from ground +truth bounding boxes achieves Top-1 accuracy greater than 0.92. End-to-end +evaluation shows the effectiveness of the proposed approach when compared to +the state of the art for similar tasks. + 
+
+
+
+
+ + ☆ Bridging the gap between image coding for machines and humans + + +
+ Image coding for machines (ICM) aims at reducing the bitrate required to +represent an image while minimizing the drop in machine vision analysis +accuracy. In many use cases, such as surveillance, it is also important that +the visual quality is not drastically deteriorated by the compression process. +Recent works on using neural network (NN) based ICM codecs have shown +significant coding gains against traditional methods; however, the decompressed +images, especially at low bitrates, often contain checkerboard artifacts. We +propose an effective decoder finetuning scheme based on adversarial training to +significantly enhance the visual quality of ICM codecs, while preserving the +machine analysis accuracy, without adding extra bitcost or parameters at the +inference phase. The results show complete removal of the checkerboard +artifacts at the negligible cost of -1.6% relative change in task performance +score. In the cases where some amount of artifacts is tolerable, such as when +machine consumption is the primary target, this technique can enhance both +pixel-fidelity and feature-fidelity scores without losing task performance. + +
+
+
+
+
+ + ☆ Removal and Selection: Improving RGB-Infrared Object Detection via + Coarse-to-Fine Fusion + + +
+ Object detection in visible (RGB) and infrared (IR) images has been widely +applied in recent years. Leveraging the complementary characteristics of RGB +and IR images, the object detector provides reliable and robust object +localization from day to night. Existing fusion strategies directly inject RGB +and IR images into convolution neural networks, leading to inferior detection +performance. Since the RGB and IR features have modality-specific noise, these +strategies will worsen the fused features along with the propagation. Inspired +by the mechanism of human brain processing multimodal information, this work +introduces a new coarse-to-fine perspective to purify and fuse two modality +features. Specifically, following this perspective, we design a Redundant +Spectrum Removal module to coarsely remove interfering information within each +modality and a Dynamic Feature Selection module to finely select the desired +features for feature fusion. To verify the effectiveness of the coarse-to-fine +fusion strategy, we construct a new object detector called Removal and +Selection Detector (RSDet). Extensive experiments on three RGB-IR object +detection datasets verify the superior performance of our method. + +
+
+ comment: 9pages, 7figures +
+
+
+
+
+ + ☆ Tool-LMM: A Large Multi-Modal Model for Tool Agent Learning + + +
+ Recently, the astonishing performance of large language models (LLMs) in +natural language comprehension and generation tasks has triggered extensive +exploration of using them as central controllers to build agent systems. +Multiple studies focus on bridging the LLMs to external tools to extend the +application scenarios. However, current LLMs' tool-use ability +is limited to a single text query, which may result in ambiguity in +understanding the users' real intentions. LLMs are expected to eliminate that +ambiguity by perceiving visually or auditorily grounded instructions. +Therefore, in this paper, we propose Tool-LMM, a system incorporating +open-source LLMs and multi-modal encoders so that the learned LLM can be +aware of multi-modal input instructions and then select the function-matched +tool correctly. To facilitate the evaluation of the model's capability, we +collect a dataset of multi-modal input tools from +HuggingFace. Another important feature of our dataset is that it also +contains multiple potential choices for the same instruction due to the +existence of identical and synonymous functions, which provides more +potential solutions for the same query. The experiments reveal that our LMM is +capable of recommending appropriate tools for multi-modal instructions. Code +and data are available at https://github.com/Tool-LMM/Tool-LMM. + 
+
+ comment: 21 pages, 9 figures, 10 tables +
+
+
+
+
+ + ☆ Q&A Prompts: Discovering Rich Visual Clues through Mining + Question-Answer Prompts for VQA requiring Diverse World Knowledge + + +
+ With the breakthrough of multi-modal large language models, answering complex +visual questions that demand advanced reasoning abilities and world knowledge +has become a much more important testbed for developing AI models than ever. +However, equipping AI models with robust cross-modality reasoning ability +remains challenging since the cognition scheme of humans has not been +understood systematically. In this paper, we believe that if we can collect +visual clues in the given image as much as possible, we will recognize the +image more accurately, understand the question better, recall relevant +knowledge more easily, and finally reason out the answer. We discover these +rich visual clues by mining question-answer pairs in images and sending them +into multi-modal large language models as prompts. We call the proposed method +Q&A Prompts. Specifically, we first use the image-answer pairs and the +corresponding questions in the training set as inputs and outputs to train a +visual question generation model. Then, we use an image tagging model to +identify various instances and send packaged image-tag pairs into the visual +question generation model to generate relevant questions with the extracted +image tags as answers. Finally, we encode these generated question-answer pairs +as prompts with a visual-aware prompting module and send them into pre-trained +multi-modal large language models to reason out the final answers. Experimental +results show that, compared with state-of-the-art methods, our Q&A Prompts +achieves substantial improvements on the challenging visual question answering +datasets requiring reasoning over diverse world knowledge, such as OK-VQA and +A-OKVQA. + +
+
+
+
+
+ + ☆ Weakly Supervised Gaussian Contrastive Grounding with Large Multimodal + Models for Video Question Answering + + +
+ Video Question Answering (VideoQA) aims to answer natural language questions +based on the information observed in videos. Despite the recent success of +Large Multimodal Models (LMMs) in image-language understanding and reasoning, +they deal with VideoQA insufficiently by simply taking uniformly sampled frames +as visual inputs, which ignores question-relevant visual clues. Moreover, there +are no human annotations for question-critical timestamps in existing VideoQA +datasets. In light of this, we propose a novel weakly supervised framework to +enforce the LMMs to reason out the answers with question-critical moments as +visual inputs. Specifically, we fuse the question and answer pairs as event +descriptions to find multiple keyframes as target moments, which will be +pseudo-labels. With these pseudo-labels as additionally weak supervision, we +devise a lightweight Gaussian-based Contrastive Grounding (GCG) module. GCG +learns multiple Gaussian functions to characterize the temporal structure of +the video, and sample question-critical frames as positive moments to be the +visual inputs of LMMs. Extensive experiments on several VideoQA benchmarks +verify the effectiveness of our framework, and we achieve substantial +improvements compared to previous state-of-the-art methods. + +
+
+
+
+
+ + ☆ Dense 3D Reconstruction Through Lidar: A Comparative Study on Ex-vivo + Porcine Tissue + + +
+ New sensing technologies and more advanced processing algorithms are +transforming computer-integrated surgery. While researchers are actively +investigating depth sensing and 3D reconstruction for vision-based surgical +assistance, it remains difficult to achieve real-time, accurate, and robust 3D +representations of the abdominal cavity for minimally invasive surgery. Thus, +this work uses quantitative testing on fresh ex-vivo porcine tissue to +thoroughly characterize the quality with which a 3D laser-based time-of-flight +sensor (lidar) can perform anatomical surface reconstruction. Ground-truth +surface shapes are captured with a commercial laser scanner, and the resulting +signed error fields are analyzed using rigorous statistical tools. When +compared to modern learning-based stereo matching from endoscopic images, +time-of-flight sensing demonstrates higher precision, lower processing delay, +higher frame rate, and superior robustness against sensor distance and poor +illumination. Furthermore, we report on the potential negative effect of +near-infrared light penetration on the accuracy of lidar measurements across +different tissue samples, identifying a significant measured depth offset for +muscle in contrast to fat and liver. Our findings highlight the potential of +lidar for intraoperative 3D perception and point toward new methods that +combine complementary time-of-flight and spectral imaging. + +
+
+
+
+
+ + ☆ MixNet: Towards Effective and Efficient UHD Low-Light Image Enhancement + + +
+ With the continuous advancement of imaging devices, the prevalence of +Ultra-High-Definition (UHD) images is rising. Although many image restoration +methods have achieved promising results, they are not directly applicable to +UHD images on devices with limited computational resources due to the +inherently high computational complexity of UHD images. In this paper, we focus +on the task of low-light image enhancement (LLIE) and propose a novel LLIE +method called MixNet, which is designed explicitly for UHD images. To capture +the long-range dependency of features without introducing excessive +computational complexity, we present the Global Feature Modulation Layer +(GFML). GFML associates features from different views by permuting the feature +maps, enabling efficient modeling of long-range dependency. In addition, we +also design the Local Feature Modulation Layer (LFML) and Feed-forward Layer +(FFL) to capture local features and transform features into a compact +representation. This way, our MixNet achieves effective LLIE with few model +parameters and low computational complexity. We conducted extensive experiments +on both synthetic and real-world datasets, and the comprehensive results +demonstrate that our proposed method surpasses the performance of current +state-of-the-art methods. The code will be available at +\url{https://github.com/zzr-idam/MixNet}. + +
+
+
+
+
+ + ☆ BadODD: Bangladeshi Autonomous Driving Object Detection Dataset + + +
+ We propose a comprehensive dataset for object detection in diverse driving +environments across 9 districts in Bangladesh. The dataset, collected +exclusively from smartphone cameras, provided a realistic representation of +real-world scenarios, including day and night conditions. Most existing +datasets lack suitable classes for autonomous navigation on Bangladeshi roads, +making it challenging for researchers to develop models that can handle the +intricacies of road scenarios. To address this issue, the authors proposed a +new set of classes based on characteristics rather than local vehicle names. +The dataset aims to encourage the development of models that can handle the +unique challenges of Bangladeshi road scenarios for the effective deployment of +autonomous vehicles. The dataset did not consist of any online images to +simulate real-world conditions faced by autonomous vehicles. The classification +of vehicles is challenging because of the diverse range of vehicles on +Bangladeshi roads, including those not found elsewhere in the world. The +proposed classification system is scalable and can accommodate future vehicles, +making it a valuable resource for researchers in the autonomous vehicle sector. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ☆ A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification: + Models, Data Sets and Challenges + + +
+ Vehicle re-identification (ReID) endeavors to associate vehicle images +collected from a distributed network of cameras spanning diverse traffic +environments. This task assumes paramount importance within the spectrum of +vehicle-centric technologies, playing a pivotal role in deploying Intelligent +Transportation Systems (ITS) and advancing smart city initiatives. Rapid +advancements in deep learning have significantly propelled the evolution of +vehicle ReID technologies in recent years. Consequently, undertaking a +comprehensive survey of methodologies centered on deep learning for vehicle +re-identification has become imperative and inescapable. This paper extensively +explores deep learning techniques applied to vehicle ReID. It outlines the +categorization of these methods, encompassing supervised and unsupervised +approaches, delves into existing research within these categories, introduces +datasets and evaluation criteria, and delineates forthcoming challenges and +potential research directions. This comprehensive assessment examines the +landscape of deep learning in vehicle ReID and establishes a foundation and +starting point for future works. It aims to serve as a complete reference by +highlighting challenges and emerging trends, fostering advancements and +applications in vehicle ReID utilizing deep learning models. + +
+
+
+
+
+ + ☆ A comprehensive study on fidelity metrics for XAI + + +
+ The use of eXplainable Artificial Intelligence (XAI) systems has introduced a +set of challenges that need resolution. Herein, we focus on how to correctly +select an XAI method, an open question within the field. The inherent +difficulty of this task is due to the lack of a ground truth. Several authors +have proposed metrics to approximate the fidelity of different XAI methods. +These metrics lack verification and exhibit concerning disagreements. In this +study, we propose a novel methodology to verify fidelity metrics, using a +well-known transparent model, namely a decision tree. This model allowed us to +obtain explanations with perfect fidelity. Our proposal constitutes the first +objective benchmark for these metrics, facilitating a comparison of existing +proposals and surpassing existing methods. We applied our benchmark to assess +the existing fidelity metrics in two different experiments, each using public +datasets comprising 52,000 images. The images from these datasets had a size of +128 by 128 pixels and were synthetic data that simplified the training process. +All metric values indicated a lack of fidelity, with the best one showing a 30% +deviation from the values expected for a perfect explanation. Our +experimentation led us to conclude that the current fidelity metrics are not +reliable enough to be used in real scenarios. From this finding, we deem it +necessary to develop new metrics that avoid the detected problems, and we +recommend using our proposal as a benchmark within the scientific +community to address these limitations. + 
+
+
+
+
+ + ☆ Towards Universal Unsupervised Anomaly Detection in Medical Imaging + + +
+ The increasing complexity of medical imaging data underscores the need for +advanced anomaly detection methods to automatically identify diverse +pathologies. Current methods face challenges in capturing the broad spectrum of +anomalies, often limiting their use to specific lesion types in brain scans. To +address this challenge, we introduce a novel unsupervised approach, termed +\textit{Reversed Auto-Encoders (RA)}, designed to create realistic +pseudo-healthy reconstructions that enable the detection of a wider range of +pathologies. We evaluate the proposed method across various imaging modalities, +including magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray, +and chest X-ray, and demonstrate superior performance in detecting anomalies +compared to existing state-of-the-art methods. Our unsupervised anomaly +detection approach may enhance diagnostic accuracy in medical imaging by +identifying a broader range of unknown pathologies. Our code is publicly +available at: \url{https://github.com/ci-ber/RA}. + +
+
+
+
+
+ + ☆ Polytopic Autoencoders with Smooth Clustering for Reduced-order + Modelling of Flows + + +
+ With the advancement of neural networks, there has been a notable increase, +both in terms of quantity and variety, in research publications concerning the +application of autoencoders to reduced-order models. We propose a polytopic +autoencoder architecture that includes a lightweight nonlinear encoder, a +convex combination decoder, and a smooth clustering network. Supported by +several proofs, the model architecture ensures that all reconstructed states +lie within a polytope, accompanied by a metric indicating the quality of the +constructed polytopes, referred to as polytope error. Additionally, it offers a +minimal number of convex coordinates for polytopic linear-parameter varying +systems while achieving acceptable reconstruction errors compared to proper +orthogonal decomposition (POD). To validate our proposed model, we conduct +simulations involving two flow scenarios with the incompressible Navier-Stokes +equation. Numerical results demonstrate the guaranteed properties of the model, +low reconstruction errors compared to POD, and the improvement in error using a +clustering network. + +
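The defining property, reconstructions confined to a polytope, follows directly from a convex-combination decoder; a minimal PyTorch sketch (the lightweight encoder and the smooth clustering network are left out):

```python
import torch
import torch.nn as nn

class ConvexCombinationDecoder(nn.Module):
    """Maps a low-dimensional code to convex coordinates over `n_vertices`
    learned vertices; every output is a convex combination of the vertices and
    therefore lies inside their polytope by construction."""
    def __init__(self, code_dim: int, n_vertices: int, state_dim: int):
        super().__init__()
        self.to_logits = nn.Linear(code_dim, n_vertices)
        self.vertices = nn.Parameter(torch.randn(n_vertices, state_dim))

    def forward(self, code: torch.Tensor) -> torch.Tensor:
        weights = torch.softmax(self.to_logits(code), dim=-1)  # non-negative, sums to 1
        return weights @ self.vertices                          # (B, state_dim)
```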
+
+ comment: 28 pages, 18 figures +
+
+
+
+
+ + ☆ M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics + Prediction from Histopathology Images + + +
+ The advancement of Spatial Transcriptomics (ST) has facilitated the +spatially-aware profiling of gene expressions based on histopathology images. +Although ST data offers valuable insights into the micro-environment of tumors, +its acquisition cost remains expensive. Therefore, directly predicting the ST +expressions from digital pathology images is desired. Current methods usually +adopt existing regression backbones for this task, which ignore the inherent +multi-scale hierarchical data structure of digital pathology images. To address +this limit, we propose M2ORT, a many-to-one regression Transformer that can +accommodate the hierarchical structure of the pathology images through a +decoupled multi-scale feature extractor. Different from traditional models that +are trained with one-to-one image-label pairs, M2ORT accepts multiple pathology +images of different magnifications at a time to jointly predict the gene +expressions at their corresponding common ST spot, aiming at learning a +many-to-one relationship through training. We have tested M2ORT on three public +ST datasets and the experimental results show that M2ORT can achieve +state-of-the-art performance with fewer parameters and floating-point +operations (FLOPs). The code is available at: +https://github.com/Dootmaan/M2ORT/. + +
+
+
+
+
+ + ☆ DGL: Dynamic Global-Local Prompt Tuning for Text-Video Retrieval AAAI2024 + + +
+ Text-video retrieval is a critical multi-modal task to find the most relevant +video for a text query. Although pretrained models like CLIP have demonstrated +impressive potential in this area, the rising cost of fully finetuning these +models due to increasing model size continues to pose a problem. To address +this challenge, prompt tuning has emerged as an alternative. However, existing +works still face two problems when adapting pretrained image-text models to +downstream video-text tasks: (1) The visual encoder could only encode +frame-level features and failed to extract global-level general video +information. (2) Equipping the visual and text encoder with separated prompts +failed to mitigate the visual-text modality gap. To this end, we propose DGL, a +cross-modal Dynamic prompt tuning method with Global-Local video attention. In +contrast to previous prompt tuning methods, we employ the shared latent space +to generate local-level text and frame prompts that encourage inter-modal +interaction. Furthermore, we propose modeling video in a global-local attention +mechanism to capture global video information from the perspective of prompt +tuning. Extensive experiments reveal that when only 0.67% parameters are tuned, +our cross-modal prompt tuning strategy DGL outperforms or is comparable to +fully finetuning methods on MSR-VTT, VATEX, LSMDC, and ActivityNet datasets. +Code will be available at https://github.com/knightyxp/DGL + +
+
+ comment: AAAI2024, Code will be available at https://github.com/knightyxp/DGL +
+
+
+
+
+ + ☆ 3D Shape Completion on Unseen Categories:A Weakly-supervised Approach + + +
+ 3D shapes captured by scanning devices are often incomplete due to occlusion. +3D shape completion methods have been explored to tackle this limitation. +However, most of these methods are only trained and tested on a subset of +categories, resulting in poor generalization to unseen categories. In this +paper, we introduce a novel weakly-supervised framework to reconstruct the +complete shapes from unseen categories. We first propose an end-to-end +prior-assisted shape learning network that leverages data from the seen +categories to infer a coarse shape. Specifically, we construct a prior bank +consisting of representative shapes from the seen categories. Then, we design a +multi-scale pattern correlation module for learning the complete shape of the +input by analyzing the correlation between local patterns within the input and +the priors at various scales. In addition, we propose a self-supervised shape +refinement model to further refine the coarse shape. Considering the shape +variability of 3D objects across categories, we construct a category-specific +prior bank to facilitate shape refinement. Then, we devise a voxel-based +partial matching loss and leverage the partial scans to drive the refinement +process. Extensive experimental results show that our approach is superior to +state-of-the-art methods by a large margin. + +
+
+ comment: 13 pages,8 figures +
+
+
+
+
+ + ☆ Dream360: Diverse and Immersive Outdoor Virtual Scene Creation via + Transformer-Based 360 Image Outpainting + + +
+ 360 images, with a field-of-view (FoV) of 180x360, provide immersive and +realistic environments for emerging virtual reality (VR) applications, such as +virtual tourism, where users desire to create diverse panoramic scenes from a +narrow FoV photo they take from a viewpoint via portable devices. It thus +brings us to a technical challenge: `How to allow the users to freely create +diverse and immersive virtual scenes from a narrow FoV image with a specified +viewport?' To this end, we propose a transformer-based 360 image outpainting +framework called Dream360, which can generate diverse, high-fidelity, and +high-resolution panoramas from user-selected viewports, considering the +spherical properties of 360 images. Compared with existing methods, e.g., [3], +which primarily focus on inputs with rectangular masks and central locations +while overlooking the spherical property of 360 images, our Dream360 offers +higher outpainting flexibility and fidelity based on the spherical +representation. Dream360 comprises two key learning stages: (I) codebook-based +panorama outpainting via Spherical-VQGAN (S-VQGAN), and (II) frequency-aware +refinement with a novel frequency-aware consistency loss. Specifically, S-VQGAN +learns a sphere-specific codebook from spherical harmonic (SH) values, +providing a better representation of spherical data distribution for scene +modeling. The frequency-aware refinement matches the resolution and further +improves the semantic consistency and visual fidelity of the generated results. +Our Dream360 achieves significantly lower Frechet Inception Distance (FID) +scores and better visual fidelity than existing methods. We also conducted a +user study involving 15 participants to interactively evaluate the quality of +the generated results in VR, demonstrating the flexibility and superiority of +our Dream360 framework. + +
+
+ comment: 11 pages, accepted to IEEE VR 2024 +
+
+
+
+
+ + ☆ MAEDiff: Masked Autoencoder-enhanced Diffusion Models for Unsupervised + Anomaly Detection in Brain Images + + +
+ Unsupervised anomaly detection has gained significant attention in the field +of medical imaging due to its capability of relieving the costly pixel-level +annotation. To achieve this, modern approaches usually utilize generative +models to produce healthy references of the diseased images and then identify +the abnormalities by comparing the healthy references and the original diseased +images. Recently, diffusion models have exhibited promising potential for +unsupervised anomaly detection in medical images for their good mode coverage +and high sample quality. However, the intrinsic characteristics of the medical +images, e.g. the low contrast, and the intricate anatomical structure of the +human body make the reconstruction challenging. Besides, the global information +of medical images often remain underutilized. To address these two issues, we +propose a novel Masked Autoencoder-enhanced Diffusion Model (MAEDiff) for +unsupervised anomaly detection in brain images. The MAEDiff involves a +hierarchical patch partition. It generates healthy images by overlapping +upper-level patches and implements a mechanism based on the masked autoencoders +operating on the sub-level patches to enhance the condition on the unnoised +regions. Extensive experiments on data of tumors and multiple sclerosis lesions +demonstrate the effectiveness of our method. + +
+
+
+
+
+ + ☆ 360ORB-SLAM: A Visual SLAM System for Panoramic Images with Depth + Completion Network + + +
+ To enhance the performance and effect of AR/VR applications and visual +assistance and inspection systems, visual simultaneous localization and mapping +(vSLAM) is a fundamental task in computer vision and robotics. However, +traditional vSLAM systems are limited by the camera's narrow field-of-view, +resulting in challenges such as sparse feature distribution and lack of dense +depth information. To overcome these limitations, this paper proposes a +360ORB-SLAM system for panoramic images that combines with a depth completion +network. The system extracts feature points from the panoramic image, utilizes +a panoramic triangulation module to generate sparse depth information, and +employs a depth completion network to obtain a dense panoramic depth map. +Experimental results on our novel panoramic dataset constructed based on Carla +demonstrate that the proposed method achieves superior scale accuracy compared +to existing monocular SLAM methods and effectively addresses the challenges of +feature association and scale ambiguity. The integration of the depth +completion network enhances system stability and mitigates the impact of +dynamic elements on SLAM performance. + +
+
+ comment: 6 pages, 9 figures +
+
+
+
+
+ + ☆ Symbol as Points: Panoptic Symbol Spotting via Point-based + Representation ICLR 2024 + + +
+ This work studies the problem of panoptic symbol spotting, which is to spot +and parse both countable object instances (windows, doors, tables, etc.) and +uncountable stuff (wall, railing, etc.) from computer-aided design (CAD) +drawings. Existing methods typically involve either rasterizing the vector +graphics into images and using image-based methods for symbol spotting, or +directly building graphs and using graph neural networks for symbol +recognition. In this paper, we take a different approach, which treats graphic +primitives as a set of 2D points that are locally connected and use point cloud +segmentation methods to tackle it. Specifically, we utilize a point transformer +to extract the primitive features and append a mask2former-like spotting head +to predict the final output. To better use the local connection information of +primitives and enhance their discriminability, we further propose the attention +with connection module (ACM) and contrastive connection learning scheme (CCL). +Finally, we propose a KNN interpolation mechanism for the mask attention module +of the spotting head to better handle primitive mask downsampling, which is +primitive-level in contrast to pixel-level for the image. Our approach, named +SymPoint, is simple yet effective, outperforming recent state-of-the-art method +GAT-CADNet by an absolute increase of 9.6% PQ and 10.4% RQ on the FloorPlanCAD +dataset. The source code and models will be available at +https://github.com/nicehuster/SymPoint. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ I-SplitEE: Image classification in Split Computing DNNs with Early Exits + + +
+ The recent advances in Deep Neural Networks (DNNs) stem from their +exceptional performance across various domains. However, their inherent large +size hinders deploying these networks on resource-constrained devices like +edge, mobile, and IoT platforms. Strategies have emerged, from partial cloud +computation offloading (split computing) to integrating early exits within DNN +layers. Our work presents an innovative unified approach merging early exits +and split computing. We determine the 'splitting layer', the optimal depth in +the DNN for edge device computations, and whether to infer on edge device or be +offloaded to the cloud for inference considering accuracy, computational +efficiency, and communication costs. Also, Image classification faces diverse +environmental distortions, influenced by factors like time of day, lighting, +and weather. To adapt to these distortions, we introduce I-SplitEE, an online +unsupervised algorithm ideal for scenarios lacking ground truths and with +sequential data. Experimental validation using Caltech-256 and Cifar-10 +datasets subjected to varied distortions showcases I-SplitEE's ability to +reduce costs by a minimum of 55% with marginal performance degradation of at +most 5%. + +
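A simplified, assumed-form sketch of the splitting decision: each candidate exit layer is scored by its on-device compute cost, the cost of offloading whenever the exit is not confident enough, and a penalty for low confidence; the actual online algorithm in the paper adapts this trade-off over time without ground truths.

```python
def choose_split_layer(confidences, edge_cost, offload_cost,
                       threshold=0.8, mu=1.0):
    """confidences[l]: running estimate of exit-l confidence on recent inputs;
    edge_cost[l]: compute cost of running layers up to l on the device;
    offload_cost: cost of shipping intermediates to the cloud.
    Inference exits on the device when the exit is confident enough, otherwise
    the remainder is offloaded, so the expected cost is a simple mixture."""
    best_layer, best_cost = None, float("inf")
    for l, conf in enumerate(confidences):
        p_exit = float(conf >= threshold)        # crude plug-in estimate
        expected = edge_cost[l] + (1 - p_exit) * offload_cost + mu * (1 - conf)
        if expected < best_cost:
            best_layer, best_cost = l, expected
    return best_layer
```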
+
+ comment: To appear in proceedings of IEEE International Conference on + Communications 2024 +
+
+
+
+
+ + ☆ Learning Position-Aware Implicit Neural Network for Real-World Face + Inpainting + + +
+ Face inpainting requires the model to have a precise global understanding of +the facial position structure. Benefiting from the powerful capabilities of +deep learning backbones, recent works in face inpainting have achieved decent +performance in the ideal setting (square images at 512px). However, existing +methods often produce a visually unpleasant result, especially in the +position-sensitive details (e.g., eyes and nose), when directly applied to +arbitrary-shaped images in real-world scenarios. These visually unpleasant +position-sensitive details indicate the shortcomings of existing methods in +terms of position information processing capability. In this paper, we propose +an Implicit Neural Inpainting Network (IN^2) to handle arbitrary-shaped face +images in real-world scenarios by explicitly modeling position information. +Specifically, a downsampling encoder is proposed to reduce information loss +while obtaining the global semantic feature. A neighbor hybrid attention block +is proposed with a hybrid attention mechanism to improve the facial +understanding ability of the model without restricting the shape of the input. +Finally, an implicit neural pyramid decoder is introduced to explicitly model +position information and bridge the gap between low-resolution features and +high-resolution output. Extensive experiments demonstrate the superiority of +the proposed method in the real-world face inpainting task. + 
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ NWPU-MOC: A Benchmark for Fine-grained Multi-category Object Counting in + Aerial Images + + +
+ Object counting is a hot topic in computer vision, which aims to estimate the +number of objects in a given image. However, most methods only count objects of +a single category for an image, which cannot be applied to scenes that need to +count objects with multiple categories simultaneously, especially in aerial +scenes. To this end, this paper introduces a Multi-category Object Counting +(MOC) task to estimate the numbers of different objects (cars, buildings, +ships, etc.) in an aerial image. Considering the absence of a dataset for this +task, a large-scale Dataset (NWPU-MOC) is collected, consisting of 3,416 scenes +with a resolution of 1024 $\times$ 1024 pixels, and well-annotated using 14 +fine-grained object categories. Besides, each scene contains RGB and Near +Infrared (NIR) images, of which the NIR spectrum can provide richer +characterization information compared with only the RGB spectrum. Based on +NWPU-MOC, the paper presents a multi-spectrum, multi-category object counting +framework, which employs a dual-attention module to fuse the features of RGB +and NIR and subsequently regress multi-channel density maps corresponding to +each object category. In addition, to modeling the dependency between different +channels in the density map with each object category, a spatial contrast loss +is designed as a penalty for overlapping predictions at the same spatial +position. Experimental results demonstrate that the proposed method achieves +state-of-the-art performance compared with some mainstream counting algorithms. +The dataset, code and models are publicly available at +https://github.com/lyongo/NWPU-MOC. + +
+
+
+
+
+ + ☆ Mementos: A Comprehensive Benchmark for Multimodal Large Language Model + Reasoning over Image Sequences + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated proficiency in +handling a variety of visual-language tasks. However, current MLLM benchmarks +are predominantly designed to evaluate reasoning based on static information +about a single image, and the ability of modern MLLMs to extrapolate from image +sequences, which is essential for understanding our ever-changing world, has +been less investigated. To address this challenge, this paper introduces +Mementos, a new benchmark designed to assess MLLMs' sequential image reasoning +abilities. Mementos features 4,761 diverse image sequences with varying +lengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning +performance. Through a careful evaluation of nine recent MLLMs on Mementos, +including GPT-4V and Gemini, we find that they struggle to accurately describe +dynamic information about given image sequences, often leading to +hallucinations/misrepresentations of objects and their corresponding behaviors. +Our quantitative analysis and case studies identify three key factors impacting +MLLMs' sequential image reasoning: the correlation between object and +behavioral hallucinations, the influence of cooccurring behaviors, and the +compounding impact of behavioral hallucinations. Our dataset is available at +https://github.com/umd-huang-lab/Mementos. + +
+
+ comment: 27 pages, 23 figures +
+
+
+
+
+ + ☆ On mitigating stability-plasticity dilemma in CLIP-guided image morphing + via geodesic distillation loss + + +
+ Large-scale language-vision pre-training models, such as CLIP, have achieved +remarkable text-guided image morphing results by leveraging several +unconditional generative models. However, existing CLIP-guided image morphing +methods encounter difficulties when morphing photorealistic images. +Specifically, existing guidance fails to provide detailed explanations of the +morphing regions within the image, leading to misguidance. In this paper, we +observed that such misguidance could be effectively mitigated by simply using a +proper regularization loss. Our approach comprises two key components: 1) a +geodesic cosine similarity loss that minimizes inter-modality features (i.e., +image and text) on a projected subspace of CLIP space, and 2) a latent +regularization loss that minimizes intra-modality features (i.e., image and +image) on the image manifold. By replacing the na\"ive directional CLIP loss in +a drop-in replacement manner, our method achieves superior morphing results on +both images and videos for various benchmarks, including CLIP-inversion. + +
+
+
+
+
+ + ☆ Focaler-IoU: More Focused Intersection over Union Loss + + +
+ Bounding box regression plays a crucial role in the field of object +detection, and the positioning accuracy of object detection largely depends on +the loss function of bounding box regression. Existing research improves +regression performance by utilizing the geometric relationship between bounding +boxes, while ignoring the impact of the distribution of difficult and easy samples on +bounding box regression. In this article, we analyzed the impact of the distribution of difficult +and easy samples on regression results, and then proposed +Focaler-IoU, which can improve detector performance in different detection +tasks by focusing on different regression samples. Finally, comparative +experiments were conducted using existing advanced detectors and regression +methods for different detection tasks, and the detection performance was +further improved by using the method proposed in this paper. Code is available +at \url{https://github.com/malagoutou/Focaler-IoU}. + +
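+ A minimal sketch of the interval re-mapping idea behind Focaler-IoU (not the
+ authors' released code; the thresholds d and u and the linear form below are
+ illustrative assumptions):
+
+   import torch
+
+   def focaler_iou_loss(iou: torch.Tensor, d: float = 0.0, u: float = 0.95) -> torch.Tensor:
+       # Linearly re-map IoU onto the interval [d, u] so that training focuses
+       # on a chosen difficulty range of regression samples, then use 1 - IoU.
+       iou_focaler = ((iou - d) / (u - d)).clamp(min=0.0, max=1.0)
+       return (1.0 - iou_focaler).mean()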
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2312.17663 +
+
+
+
+
+ + ☆ Exploring Color Invariance through Image-Level Ensemble Learning + + +
+ In the field of computer vision, the persistent presence of color bias, +resulting from fluctuations in real-world lighting and camera conditions, +presents a substantial challenge to the robustness of models. This issue is +particularly pronounced in complex wide-area surveillance scenarios, such as +person re-identification and industrial dust segmentation, where models often +experience a decline in performance due to overfitting on color information +during training, given the presence of environmental variations. Consequently, +there is a need to effectively adapt models to cope with the complexities of +camera conditions. To address this challenge, this study introduces a learning +strategy named Random Color Erasing, which draws inspiration from ensemble +learning. This strategy selectively erases partial or complete color +information in the training data without disrupting the original image +structure, thereby achieving a balanced weighting of color features and other +features within the neural network. This approach mitigates the risk of +overfitting and enhances the model's ability to handle color variation, thus +improving its overall robustness. The approach we propose serves as an ensemble +learning strategy, characterized by robust interpretability. A comprehensive +analysis of this methodology is presented in this paper. Across various tasks +such as person re-identification and semantic segmentation, our approach +consistently improves strong baseline methods. Notably, in comparison to +existing methods that prioritize color robustness, our strategy significantly +enhances performance in cross-domain scenarios. The code is available at +\url{https://github.com/layumi/Person\_reID\_baseline\_pytorch/blob/master/random\_erasing.py} +or \url{https://github.com/finger-monkey/Data-Augmentation}. + +
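+ A minimal sketch of the kind of transform Random Color Erasing describes
+ (assumed form; the authors' implementation in the linked repositories may
+ differ, e.g., in how partial erasing is parameterized):
+
+   import torch
+
+   def random_color_erasing(img: torch.Tensor, p: float = 0.5) -> torch.Tensor:
+       # img: (3, H, W). With probability p, replace part or all of the color
+       # information by the grayscale version, keeping the spatial structure.
+       if torch.rand(1).item() > p:
+           return img
+       gray = (0.299 * img[0] + 0.587 * img[1] + 0.114 * img[2]).unsqueeze(0)
+       alpha = torch.rand(3, 1, 1)   # 1.0 erases a channel's color completely
+       return alpha * gray + (1.0 - alpha) * img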
+
+
+
+
+ + ☆ GMC-IQA: Exploiting Global-correlation and Mean-opinion Consistency for + No-reference Image Quality Assessment + + +
+ Due to the subjective nature of image quality assessment (IQA), assessing +which image has better quality among a sequence of images is more reliable than +assigning an absolute mean opinion score for an image. Thus, IQA models are +evaluated by global correlation consistency (GCC) metrics like PLCC and SROCC, +rather than mean opinion consistency (MOC) metrics like MAE and MSE. However, +most existing methods adopt MOC metrics to define their loss functions, due to +the infeasible computation of GCC metrics during training. In this work, we +construct a novel loss function and network to exploit Global-correlation and +Mean-opinion Consistency, forming a GMC-IQA framework. Specifically, we propose +a novel GCC loss by defining a pairwise preference-based rank estimation to +solve the non-differentiable problem of SROCC and introducing a queue mechanism +to reserve previous data to approximate the global results of the whole data. +Moreover, we propose a mean-opinion network, which integrates diverse opinion +features to alleviate the randomness of weight learning and enhance the model +robustness. Experiments indicate that our method outperforms SOTA methods on +multiple authentic datasets with higher accuracy and generalization. We also +adapt the proposed loss to various networks, which brings better performance +and more stable training. + +
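+ A minimal sketch of a pairwise preference-based surrogate for rank
+ correlation of the kind described above (the exact GCC loss is the authors';
+ the sigmoid/BCE form below is our assumption, and the paper's queue of
+ previous predictions is omitted):
+
+   import torch
+   import torch.nn.functional as F
+
+   def pairwise_rank_loss(pred: torch.Tensor, mos: torch.Tensor, tau: float = 0.1) -> torch.Tensor:
+       # pred, mos: (N,) predicted scores and mean opinion scores.
+       diff_pred = pred.unsqueeze(0) - pred.unsqueeze(1)        # (N, N)
+       diff_mos = mos.unsqueeze(0) - mos.unsqueeze(1)
+       target = (diff_mos > 0).float()                          # preference labels
+       prob = torch.sigmoid(diff_pred / tau)                    # soft predicted preference
+       mask = (diff_mos.abs() > 1e-6).float()                   # ignore tied pairs
+       bce = F.binary_cross_entropy(prob, target, reduction="none")
+       return (bce * mask).sum() / mask.sum().clamp(min=1.0)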
+
+
+
+
+ + ☆ Enhancing medical vision-language contrastive learning via + inter-matching relation modelling + + +
+ Medical image representations can be learned through medical vision-language +contrastive learning (mVLCL) where medical imaging reports are used as weak +supervision through image-text alignment. These learned image representations +can be transferred to and benefit various downstream medical vision tasks such +as disease classification and segmentation. Recent mVLCL methods attempt to +align image sub-regions and the report keywords as local-matchings. However, +these methods aggregate all local-matchings via simple pooling operations while +ignoring the inherent relations between them. These methods therefore fail to +reason between local-matchings that are semantically related, e.g., +local-matchings that correspond to the disease word and the location word +(semantic-relations), and also fail to differentiate such clinically important +local-matchings from others that correspond to less meaningful words, e.g., +conjunction words (importance-relations). Hence, we propose a mVLCL method that +models the inter-matching relations between local-matchings via a +relation-enhanced contrastive learning framework (RECLF). In RECLF, we +introduce a semantic-relation reasoning module (SRM) and an importance-relation +reasoning module (IRM) to enable more fine-grained report supervision for image +representation learning. We evaluated our method using four public benchmark +datasets on four downstream tasks, including segmentation, zero-shot +classification, supervised classification, and cross-modal retrieval. Our +results demonstrated the superiority of our RECLF over the state-of-the-art +mVLCL methods with consistent improvements across single-modal and cross-modal +tasks. These results suggest that our RECLF, by modelling the inter-matching +relations, can learn improved medical image representations with better +generalization capabilities. + +
+
+ comment: 11 pages, 5 figures. Under review +
+
+
+
+
+ + ☆ CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short + Video Search Scenarios + + +
+ Vision-Language Models pre-trained on large-scale image-text datasets have +shown superior performance in downstream tasks such as image retrieval. Most of +the images for pre-training are presented in the form of open domain +common-sense visual elements. In contrast, video covers in short video search +scenarios are presented as user-originated content that provides important +visual summaries of videos. In addition, a portion of the video covers come +with manually designed cover texts that provide semantic complements. In order +to fill in the gaps in short video cover data, we establish the first +large-scale cover-text benchmark for Chinese short video search scenarios. +Specifically, we release two large-scale datasets CBVS-5M/10M to provide short +video covers, and the manual fine-labeling dataset CBVS-20K to provide real +user queries, which serves as an image-text benchmark test in the Chinese short +video search field. To integrate the semantics of cover text in the case of a +missing modality, we propose UniCLIP where cover texts play a guiding role +during training, but are not relied upon at inference time. Extensive evaluation +on CBVS-20K demonstrates the excellent performance of our proposal. UniCLIP has +been deployed to Tencent's online video search systems with hundreds of +millions of visits and achieved significant gains. The complete dataset, code +and checkpoints will be available upon release. + +
+
+
+
+
+ + ☆ LDReg: Local Dimensionality Regularized Self-Supervised Learning ICLR 2024 + + +
+ Representations learned via self-supervised learning (SSL) can be susceptible +to dimensional collapse, where the learned representation subspace is of +extremely low dimensionality and thus fails to represent the full data +distribution and modalities. Dimensional collapse, also known as the +"underfilling" phenomenon, is one of the major causes of degraded performance on +downstream tasks. Previous work has investigated the dimensional collapse +problem of SSL at a global level. In this paper, we demonstrate that +representations can span a high-dimensional space globally, but collapse +locally. To address this, we propose a method called $\textit{local +dimensionality regularization (LDReg)}$. Our formulation is based on the +derivation of the Fisher-Rao metric to compare and optimize local distance +distributions at an asymptotically small radius for each data point. By +increasing the local intrinsic dimensionality, we demonstrate through a range +of experiments that LDReg improves the representation quality of SSL. The +results also show that LDReg can regularize dimensionality at both local and +global levels. + +
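+ For orientation, local intrinsic dimensionality can be estimated per sample
+ from nearest-neighbour distances; a standard maximum-likelihood estimator is
+ sketched below (a generic estimator, not the paper's exact Fisher-Rao-based
+ objective, which builds on this kind of quantity):
+
+   import torch
+
+   def lid_mle(features: torch.Tensor, k: int = 20) -> torch.Tensor:
+       # features: (N, D) batch of representations; returns (N,) LID estimates.
+       dists = torch.cdist(features, features)
+       knn, _ = dists.topk(k + 1, largest=False)        # includes self-distance 0
+       knn = knn[:, 1:].clamp(min=1e-12)                # drop the self-distance
+       r_max = knn[:, -1:]
+       log_ratio = torch.log(knn / r_max).mean(dim=1).clamp(max=-1e-12)
+       return -1.0 / log_ratio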
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Learning to Robustly Reconstruct Low-light Dynamic Scenes from Spike + Streams + + +
+ As a neuromorphic sensor with high temporal resolution, spike camera can +generate continuous binary spike streams to capture per-pixel light intensity. +We can use reconstruction methods to restore scene details in high-speed +scenarios. However, due to limited information in spike streams, low-light +scenes are difficult to effectively reconstruct. In this paper, we propose a +bidirectional recurrent-based reconstruction framework, including a +Light-Robust Representation (LR-Rep) and a fusion module, to better handle such +extreme conditions. LR-Rep is designed to aggregate temporal information in +spike streams, and a fusion module is utilized to extract temporal features. +Additionally, we have developed a reconstruction benchmark for high-speed +low-light scenes. Light sources in the scenes are carefully aligned to +real-world conditions. Experimental results demonstrate the superiority of our +method, which also generalizes well to real spike streams. Related codes and +proposed datasets will be released after publication. + +
+
+
+
+
+ + ☆ Path Choice Matters for Clear Attribution in Path Methods ICLR 2024 + + +
+ Rigorousness and clarity are both essential for interpretations of DNNs to +engender human trust. Path methods are commonly employed to generate rigorous +attributions that satisfy three axioms. However, the meaning of attributions +remains ambiguous due to distinct path choices. To address the ambiguity, we +introduce the \textbf{Concentration Principle}, which centrally allocates high +attributions to indispensable features, thereby endowing attributions with aesthetics and +sparsity. We then present \textbf{SAMP}, a model-agnostic interpreter, which +efficiently searches the near-optimal path from a pre-defined set of +manipulation paths. Moreover, we propose the infinitesimal constraint (IC) and +momentum strategy (MS) to improve the rigorousness and optimality. +Visualizations show that SAMP can precisely reveal DNNs by pinpointing salient +image pixels. We also perform quantitative experiments and observe that our +method significantly outperforms its counterparts. Code: +https://github.com/zbr17/SAMP. + +
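+ For context, path methods generalize Integrated Gradients, which attributes
+ along a straight line from a baseline to the input; a minimal sketch follows
+ (this is the classical method the paper builds on, not SAMP's searched path):
+
+   import torch
+
+   def integrated_gradients(model, x, baseline, steps: int = 50):
+       # x, baseline: tensors of identical shape, e.g. (C, H, W).
+       alphas = torch.linspace(0.0, 1.0, steps).view(-1, *([1] * x.dim()))
+       points = (baseline + alphas * (x - baseline)).detach().requires_grad_(True)
+       out = model(points).sum()   # for classification, select the target logit first
+       grads = torch.autograd.grad(out, points)[0]
+       return (x - baseline) * grads.mean(dim=0)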
+
+ comment: ICLR 2024 accepted +
+
+
+
+
+ + ♻ ☆ NeRF Revisited: Fixing Quadrature Instability in Volume Rendering + + +
+ Neural radiance fields (NeRF) rely on volume rendering to synthesize novel +views. Volume rendering requires evaluating an integral along each ray, which +is numerically approximated with a finite sum that corresponds to the exact +integral along the ray under piecewise constant volume density. As a +consequence, the rendered result is unstable w.r.t. the choice of samples along +the ray, a phenomenon that we dub quadrature instability. We propose a +mathematically principled solution by reformulating the sample-based rendering +equation so that it corresponds to the exact integral under piecewise linear +volume density. This simultaneously resolves multiple issues: conflicts between +samples along different rays, imprecise hierarchical sampling, and +non-differentiability of quantiles of ray termination distances w.r.t. model +parameters. We demonstrate several benefits over the classical sample-based +rendering equation, such as sharper textures, better geometric reconstruction, +and stronger depth supervision. Our proposed formulation can also be used as +a drop-in replacement for the volume rendering equation of existing NeRF-based +methods. Our project page can be found at pl-nerf.github.io. + +
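+ For reference, the conventional finite sum referred to above is the standard
+ piecewise-constant quadrature used in NeRF,
+ $\hat{C}(\mathbf{r}) = \sum_{i=1}^{N} T_i \left(1 - e^{-\sigma_i \delta_i}\right) \mathbf{c}_i$,
+ with $T_i = \exp\big(-\sum_{j<i} \sigma_j \delta_j\big)$ and $\delta_i = t_{i+1} - t_i$,
+ where $\sigma_i$ and $\mathbf{c}_i$ are the density and color at sample $i$.
+ The paper's reformulation replaces this with the exact integral under
+ piecewise-linear density; its closed form is not reproduced here.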
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ GBSD: Generative Bokeh with Stage Diffusion ICASSP + + +
+ The bokeh effect is an artistic technique that blurs out-of-focus areas in a +photograph and has gained interest due to recent developments in text-to-image +synthesis and the ubiquity of smart-phone cameras and photo-sharing apps. Prior +work on rendering bokeh effects has focused on post hoc image manipulation to +produce similar blurring effects in existing photographs using classical +computer graphics or neural rendering techniques, but either suffers from depth +discontinuity artifacts or is restricted to reproducing bokeh effects that are +present in the training data. More recent diffusion-based models can synthesize +images with an artistic style, but either require the generation of +high-dimensional masks, expensive fine-tuning, or affect global image +characteristics. In this paper, we present GBSD, the first generative +text-to-image model that synthesizes photorealistic images with a bokeh style. +Motivated by how image synthesis occurs progressively in diffusion models, our +approach combines latent diffusion models with a 2-stage conditioning algorithm +to render bokeh effects on semantically defined objects. Since we can focus the +effect on objects, this semantic bokeh effect is more versatile than classical +rendering techniques. We evaluate GBSD both quantitatively and qualitatively +and demonstrate its ability to be applied in both text-to-image and +image-to-image settings. + +
+
+ comment: Short Version is accepted by International Conference on Acoustics, + Speech, and Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ♻ ☆ Smooth and Stepwise Self-Distillation for Object Detection ICIP + + +
+ Distilling the structured information captured in feature maps has +contributed to improved results for object detection tasks, but requires +careful selection of baseline architectures and substantial pre-training. +Self-distillation addresses these limitations and has recently achieved +state-of-the-art performance for object detection despite making several +simplifying architectural assumptions. Building on this work, we propose Smooth +and Stepwise Self-Distillation (SSSD) for object detection. Our SSSD +architecture forms an implicit teacher from object labels and a feature pyramid +network backbone to distill label-annotated feature maps using Jensen-Shannon +distance, which is smoother than distillation losses used in prior work. We +additionally add a distillation coefficient that is adaptively configured based +on the learning rate. We extensively benchmark SSSD against a baseline and two +state-of-the-art object detector architectures on the COCO dataset by varying +the coefficients and backbone and detector networks. We demonstrate that SSSD +achieves higher average precision in most experimental settings, is robust to a +wide range of coefficients, and benefits from our stepwise distillation +procedure. + +
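+ A minimal sketch of a Jensen-Shannon distillation term between feature maps
+ (the spatial softmax normalization is our assumption; SSSD's implicit teacher
+ construction and adaptive coefficient are not reproduced):
+
+   import torch
+
+   def jensen_shannon_distill(student_feat, teacher_feat, eps: float = 1e-8):
+       # student_feat, teacher_feat: (B, C, H, W) feature maps of equal shape.
+       p = student_feat.flatten(2).softmax(dim=-1)
+       q = teacher_feat.flatten(2).softmax(dim=-1)
+       m = 0.5 * (p + q)
+       kl_pm = (p * ((p + eps).log() - (m + eps).log())).sum(dim=-1)
+       kl_qm = (q * ((q + eps).log() - (m + eps).log())).sum(dim=-1)
+       return (0.5 * (kl_pm + kl_qm)).mean()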
+
+ comment: Accepted by International Conference on Image Processing (ICIP) 2023 +
+
+
+
+
+ + ♻ ☆ IPR-NeRF: Ownership Verification meets Neural Radiance Field + + +
+ Neural Radiance Field (NeRF) models have gained significant attention in the +computer vision community in the recent past, achieving state-of-the-art visual +quality and producing impressive demonstrations. Since then, technopreneurs have +sought to leverage NeRF models into a profitable business. Therefore, NeRF +models make it worth the risk for plagiarizers to illegally copy, +re-distribute, or misuse those models. This paper proposes a comprehensive +intellectual property (IP) protection framework for the NeRF model in both +black-box and white-box settings, namely IPR-NeRF. In the black-box setting, a +diffusion-based solution is introduced to embed and extract the watermark via a +two-stage optimization process. In the white-box setting, a designated digital +signature is embedded into the weights of the NeRF model by adopting the sign +loss objective. Our extensive experiments demonstrate that not only does our +approach maintain the fidelity (\ie, the rendering quality) of IPR-NeRF models, +but it is also robust against both ambiguity and removal attacks compared to +prior art. + +
+
+ comment: Error on the paper +
+
+
+
+
+ + ♻ ☆ AUPIMO: Redefining Visual Anomaly Detection Benchmarks with High Speed + and Low Tolerance SP + + +
+ Recent advances in visual anomaly detection research have seen AUROC and +AUPRO scores on public benchmark datasets such as MVTec and VisA converge +towards perfect recall, giving the impression that these benchmarks are +near-solved. However, high AUROC and AUPRO scores do not always reflect +qualitative performance, which limits the validity of these metrics in +real-world applications. We argue that the artificial ceiling imposed by the +lack of an adequate evaluation metric restrains progression of the field, and +it is crucial that we revisit the evaluation metrics used to rate our +algorithms. In response, we introduce Per-IMage Overlap (PIMO), a novel metric +that addresses the shortcomings of AUROC and AUPRO. PIMO retains the +recall-based nature of the existing metrics but introduces two distinctions: +the assignment of curves (and respective area under the curve) is per-image, +and its X-axis relies solely on normal images. Measuring recall per image +simplifies instance score indexing and is more robust to noisy annotations. As +we show, it also accelerates computation and enables the usage of statistical +tests to compare models. By imposing low tolerance for false positives on +normal images, PIMO provides an enhanced model validation procedure and +highlights performance variations across datasets. Our experiments demonstrate +that PIMO offers practical advantages and nuanced performance insights that +redefine anomaly detection benchmarks -- notably challenging the perception +that MVTec AD and VisA datasets have been solved by contemporary models. +Available on GitHub: https://github.com/jpcbertoldo/aupimo. + +
+
+ comment: This research has been conducted during Google Summer of Code 2023 + (GSoC 2023) at OpenVINO (Intel). GSoC 2023 page: + https://summerofcode.withgoogle.com/archive/2023/projects/SPMopugd +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of Multimodal Image-Text Models under + Distribution Shift + + +
+ Multimodal image-text models have shown remarkable performance in the past +few years. However, evaluating robustness against distribution shifts is +crucial before adopting them in real-world applications. In this work, we +investigate the robustness of 12 popular open-sourced image-text models under +common perturbations on five tasks (image-text retrieval, visual reasoning, +visual entailment, image captioning, and text-to-image generation). In +particular, we propose several new multimodal robustness benchmarks by applying +17 image perturbation and 16 text perturbation techniques on top of existing +datasets. We observe that multimodal models are not robust to image and text +perturbations, especially to image perturbations. Among the tested perturbation +methods, character-level perturbations constitute the most severe distribution +shift for text, and zoom blur is the most severe shift for image data. We also +introduce two new robustness metrics (\textbf{MMI} for MultiModal Impact score +and \textbf{MOR} for Missing Object Rate) for proper evaluations of multimodal +models. We hope our extensive study sheds light on new directions for the +development of robust multimodal models. More details can be found on the +project webpage: \url{https://MMRobustness.github.io}. + +
+
+ comment: Accepted by Journal of Data-centric Machine Learning Research (DMLR) + 2024 +
+
+
+
+
+ + ♻ ☆ ClawCraneNet: Leveraging Object-level Relation for Text-based Video + Segmentation + + +
+ Text-based video segmentation is a challenging task that segments out the +objects referred to by natural language in videos. It essentially requires semantic +comprehension and fine-grained video understanding. Existing methods introduce +language representation into segmentation models in a bottom-up manner, which +merely conducts vision-language interaction within local receptive fields of +ConvNets. We argue that such interaction is not fulfilled since the model can +barely construct region-level relationships given partial observations, which +is contrary to the description logic of natural language/referring expressions. +In fact, people usually describe a target object using relations with other +objects, which may not be easily understood without seeing the whole video. To +address the issue, we introduce a novel top-down approach by imitating how we +humans segment an object with language guidance. We first figure out all +candidate objects in videos and then choose the referred one by parsing +relations among those high-level objects. Three kinds of object-level relations +are investigated for precise relationship understanding, i.e., positional +relation, text-guided semantic relation, and temporal relation. Extensive +experiments on A2D Sentences and J-HMDB Sentences show our method outperforms +state-of-the-art methods by a large margin. Qualitative results also show our +results are more explainable. + +
+
+ comment: Extended version published in + https://ieeexplore.ieee.org/abstract/document/10083244 +
+
+
+
+
+ + ♻ ☆ PoseScript: Linking 3D Human Poses and Natural Language ECCV 2022 + + +
+ Natural language plays a critical role in many computer vision applications, +such as image captioning, visual question answering, and cross-modal retrieval, +to provide fine-grained semantic information. Unfortunately, while human pose +is key to human understanding, current 3D human pose datasets lack detailed +language descriptions. To address this issue, we have introduced the PoseScript +dataset. This dataset pairs more than six thousand 3D human poses from AMASS +with rich human-annotated descriptions of the body parts and their spatial +relationships. Additionally, to increase the size of the dataset to a scale +that is compatible with data-hungry learning algorithms, we have proposed an +elaborate captioning process that generates automatic synthetic descriptions in +natural language from given 3D keypoints. This process extracts low-level pose +information, known as "posecodes", using a set of simple but generic rules on +the 3D keypoints. These posecodes are then combined into higher level textual +descriptions using syntactic rules. With automatic annotations, the amount of +available data significantly scales up (100k), making it possible to +effectively pretrain deep models for finetuning on human captions. To showcase +the potential of annotated poses, we present three multi-modal learning tasks +that utilize the PoseScript dataset. Firstly, we develop a pipeline that maps +3D poses and textual descriptions into a joint embedding space, allowing for +cross-modal retrieval of relevant poses from large-scale datasets. Secondly, we +establish a baseline for a text-conditioned model generating 3D poses. Thirdly, +we present a learned process for generating pose descriptions. These +applications demonstrate the versatility and usefulness of annotated poses in +various tasks and pave the way for future research in the field. + +
+
+ comment: Extended version of the ECCV 2022 paper +
+
+
+
+
+ + ♻ ☆ Rethinking Cross-modal Interaction from a Top-down Perspective for + Referring Video Object Segmentation + + +
+ Referring video object segmentation (RVOS) aims to segment video objects with +the guidance of natural language reference. Previous methods typically tackle +RVOS through directly grounding linguistic reference over the image lattice. +Such bottom-up strategy fails to explore object-level cues, easily leading to +inferior results. In this work, we instead put forward a two-stage, top-down +RVOS solution. First, an exhaustive set of object tracklets is constructed by +propagating object masks detected from several sampled frames to the entire +video. Second, a Transformer-based tracklet-language grounding module is +proposed, which models instance-level visual relations and cross-modal +interactions simultaneously and efficiently. Our model ranks first place on +CVPR2021 Referring Youtube-VOS challenge. + +
+
+ comment: Champion solution in YouTube-VOS 2021 Track 3. Extended version + published in https://ieeexplore.ieee.org/abstract/document/10083244 +
+
+
+
+
+ + ♻ ☆ IM-IAD: Industrial Image Anomaly Detection Benchmark in Manufacturing + + +
+ Image anomaly detection (IAD) is an emerging and vital computer vision task +in industrial manufacturing (IM). Recently, many advanced algorithms have been +reported, but their performance deviates considerably with various IM settings. +We realize that the lack of a uniform IM benchmark is hindering the development +and usage of IAD methods in real-world applications. In addition, it is +difficult for researchers to analyze IAD algorithms without a uniform +benchmark. To solve this problem, we propose a uniform IM benchmark, for the +first time, to assess how well these algorithms perform, which includes various +levels of supervision (unsupervised versus fully supervised), learning +paradigms (few-shot, continual and noisy label), and efficiency (memory usage +and inference speed). Then, we construct a comprehensive image anomaly +detection benchmark (IM-IAD), which includes 19 algorithms on seven major +datasets with a uniform setting. Extensive experiments (17,017 total) on IM-IAD +provide in-depth insights into IAD algorithm redesign or selection. Moreover, +the proposed IM-IAD benchmark challenges existing algorithms and suggests +future research directions. To foster reproducibility and accessibility, the +source code of IM-IAD is uploaded on the website, +https://github.com/M-3LAB/IM-IAD. + +
+
+
+
+
+ + ♻ ☆ Matcher: Segment Anything with One Shot Using All-Purpose Feature + Matching ICLR2024 + + +
+ Powered by large-scale pre-training, vision foundation models exhibit +significant potential in open-world image understanding. However, unlike large +language models that excel at directly tackling various language tasks, vision +foundation models require a task-specific model structure followed by +fine-tuning on specific tasks. In this work, we present Matcher, a novel +perception paradigm that utilizes off-the-shelf vision foundation models to +address various perception tasks. Matcher can segment anything by using an +in-context example without training. Additionally, we design three effective +components within the Matcher framework to collaborate with these foundation +models and unleash their full potential in diverse perception tasks. Matcher +demonstrates impressive generalization performance across various segmentation +tasks, all without training. For example, it achieves 52.7% mIoU on COCO-20$^i$ +with one example, surpassing the state-of-the-art specialist model by 1.6%. In +addition, Matcher achieves 33.0% mIoU on the proposed LVIS-92$^i$ for one-shot +semantic segmentation, outperforming the state-of-the-art generalist model by +14.4%. Our visualization results further showcase the open-world generality and +flexibility of Matcher when applied to images in the wild. Our code can be +found at https://github.com/aim-uofa/Matcher. + +
+
+ comment: Accepted to ICLR2024 +
+
+
+
+
+ + ♻ ☆ Local-Global Context Aware Transformer for Language-Guided Video + Segmentation + + +
+ We explore the task of language-guided video segmentation (LVS). Previous +algorithms mostly adopt 3D CNNs to learn video representation, struggling to +capture long-term context and easily suffering from visual-linguistic +misalignment. In light of this, we present Locater (local-global context aware +Transformer), which augments the Transformer architecture with a finite memory +so as to query the entire video with the language expression in an efficient +manner. The memory is designed to involve two components -- one for +persistently preserving global video content, and one for dynamically gathering +local temporal context and segmentation history. Based on the memorized +local-global context and the particular content of each frame, Locater +holistically and flexibly comprehends the expression as an adaptive query +vector for each frame. The vector is used to query the corresponding frame for +mask generation. The memory also allows Locater to process videos with linear +time complexity and constant size memory, while Transformer-style +self-attention computation scales quadratically with sequence length. To +thoroughly examine the visual grounding capability of LVS models, we contribute +a new LVS dataset, A2D-S+, which is built upon A2D-S dataset but poses +increased challenges in disambiguating among similar objects. Experiments on +three LVS datasets and our A2D-S+ show that Locater outperforms previous +state-of-the-arts. Further, we won the 1st place in the Referring Video Object +Segmentation Track of the 3rd Large-scale Video Object Segmentation Challenge, +where Locater served as the foundation for the winning solution. Our code and +dataset are available at: https://github.com/leonnnop/Locater + +
+
+ comment: Accepted by TPAMI. Code, data: https://github.com/leonnnop/Locater +
+
+
+
+
+ + ♻ ☆ Data-Driven Modelling for Harmonic Current Emission in Low-Voltage Grid + Using MCReSANet with Interpretability Analysis + + +
+ Even though the use of power electronics (PE) loads offers enhanced electrical +energy conversion efficiency and control, they remain the primary sources of +harmonics in grids. When diverse loads are connected in the distribution +system, their interactions complicate establishing analytical models for the +relationship between harmonic voltages and currents. To solve this, our paper +presents a data-driven model using MCReSANet to construct the highly nonlinear +mapping between harmonic voltage and current. Two datasets from PCCs in Finland and +Germany are utilized, which demonstrate that MCReSANet is capable of +establishing accurate nonlinear mappings, even in the presence of various +network characteristics for the selected Finnish and German datasets. The model +built by MCReSANet can improve the MAE by 10% and 14% compared to the CNN, and +by 8% and 17% compared to the MLP for both Finnish and German datasets, also +showing much lower model uncertainty than others. This is a crucial +prerequisite for more precise SHAP value-based feature importance analysis, +which is the model interpretability method used in this paper. The +results of the feature importance analysis show the detailed relationships between +each order of harmonic voltage and current in the distribution system. There is +an interactive impact on each order of harmonic current, but some orders of +harmonic voltages have a dominant influence on harmonic current emissions: +positive sequence and zero sequence harmonics have the dominant importance in +the Finnish and German networks, respectively, which conforms to the pattern of +connected load types in the two selected Finnish and German datasets. This paper +enhances the potential for understanding and predicting harmonic current +emissions by diverse PE loads in distribution systems, which is beneficial to +more effective management for optimizing power quality in diverse grid +environments. + +
+
+
+
+
+ + ♻ ☆ ConstScene: Dataset and Model for Advancing Robust Semantic Segmentation + in Construction Environments + + +
+ The increasing demand for autonomous machines in construction environments +necessitates the development of robust object detection algorithms that can +perform effectively across various weather and environmental conditions. This +paper introduces a new semantic segmentation dataset specifically tailored for +construction sites, taking into account the diverse challenges posed by adverse +weather and environmental conditions. The dataset is designed to enhance the +training and evaluation of object detection models, fostering their +adaptability and reliability in real-world construction applications. Our +dataset comprises annotated images captured under a wide range of different +weather conditions, including but not limited to sunny days, rainy periods, +foggy atmospheres, and low-light situations. Additionally, environmental +factors such as the existence of dirt/mud on the camera lens are integrated +into the dataset through actual captures and synthetic generation to simulate +the complex conditions prevalent in construction sites. We also generate +synthetic images of the annotations including precise semantic segmentation +masks for various objects commonly found in construction environments, such as +wheel loader machines, personnel, cars, and structural elements. To demonstrate +the dataset's utility, we evaluate state-of-the-art object detection algorithms +on our proposed benchmark. The results highlight the dataset's success in +adversarial training models across diverse conditions, showcasing its efficacy +compared to existing datasets that lack such environmental variability. + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ EZ-CLIP: Efficient Zeroshot Video Action Recognition + + +
+ Recent advancements in large-scale pre-training of visual-language models on +paired image-text data have demonstrated impressive generalization capabilities +for zero-shot tasks. Building on this success, efforts have been made to adapt +these image-based visual-language models, such as CLIP, for videos, extending +their zero-shot capabilities to the video domain. While these adaptations have +shown promising results, they come at a significant computational cost and +struggle with effectively modeling the crucial temporal aspects inherent to the +video domain. In this study, we present EZ-CLIP, a simple and efficient +adaptation of CLIP that addresses these challenges. EZ-CLIP leverages temporal +visual prompting for seamless temporal adaptation, requiring no fundamental +alterations to the core CLIP architecture while preserving its remarkable +generalization abilities. Moreover, we introduce a novel learning objective +that guides the temporal visual prompts to focus on capturing motion, thereby +enhancing its learning capabilities from video data. We conducted extensive +experiments on five different benchmark datasets, thoroughly evaluating EZ-CLIP +for zero-shot learning and base-to-novel video action recognition, and also +demonstrating its potential for few-shot generalization. Impressively, with a +mere 5.2 million learnable parameters (as opposed to the 71.1 million in the +prior best model), EZ-CLIP can be efficiently trained on a single GPU, +outperforming existing approaches in several evaluations. + +
+
+
+
+
+ + ♻ ☆ Semantic Lens: Instance-Centric Semantic Alignment for Video + Super-Resolution AAAI 2024 + + +
+ As a critical clue of video super-resolution (VSR), inter-frame alignment +significantly impacts overall performance. However, accurate pixel-level +alignment is a challenging task due to the intricate motion interweaving in the +video. In response to this issue, we introduce a novel paradigm for VSR named +Semantic Lens, predicated on semantic priors drawn from degraded videos. +Specifically, video is modeled as instances, events, and scenes via a Semantic +Extractor. Those semantics assist the Pixel Enhancer in understanding the +recovered contents and generating more realistic visual results. The distilled +global semantics embody the scene information of each frame, while the +instance-specific semantics assemble the spatial-temporal contexts related to +each instance. Furthermore, we devise a Semantics-Powered Attention +Cross-Embedding (SPACE) block to bridge the pixel-level features with semantic +knowledge, composed of a Global Perspective Shifter (GPS) and an +Instance-Specific Semantic Embedding Encoder (ISEE). Concretely, the GPS module +generates pairs of affine transformation parameters for pixel-level feature +modulation conditioned on global semantics. After that, the ISEE module +harnesses the attention mechanism to align the adjacent frames in the +instance-centric semantic space. In addition, we incorporate a simple yet +effective pre-alignment module to alleviate the difficulty of model training. +Extensive experiments demonstrate the superiority of our model over existing +state-of-the-art VSR methods. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Divide and not forget: Ensemble of selectively trained experts in + Continual Learning ICLR 2024 + + +
+ Class-incremental learning is becoming more popular as it helps models widen +their applicability while not forgetting what they already know. A trend in +this area is to use a mixture-of-expert technique, where different models work +together to solve the task. However, the experts are usually trained all at +once using whole task data, which makes them all prone to forgetting and +increasing computational burden. To address this limitation, we introduce a +novel approach named SEED. SEED selects only one, the most optimal expert for a +considered task, and uses data from this task to fine-tune only this expert. +For this purpose, each expert represents each class with a Gaussian +distribution, and the optimal expert is selected based on the similarity of +those distributions. Consequently, SEED increases diversity and heterogeneity +within the experts while maintaining the high stability of this ensemble +method. The extensive experiments demonstrate that SEED achieves +state-of-the-art performance in exemplar-free settings across various +scenarios, showing the potential of expert diversification through data in +continual learning. + +
+
+ comment: Accepted for ICLR 2024 (main track), code is available at: + https://github.com/grypesc/SEED +
+
+
+
+
+ + ♻ ☆ Towards domain-invariant Self-Supervised Learning with Batch Styles + Standardization ICLR 2024 + + +
+ In Self-Supervised Learning (SSL), models are typically pretrained, +fine-tuned, and evaluated on the same domains. However, they tend to perform +poorly when evaluated on unseen domains, a challenge that Unsupervised Domain +Generalization (UDG) seeks to address. Current UDG methods rely on domain +labels, which are often challenging to collect, and domain-specific +architectures that lack scalability when confronted with numerous domains, +making the current methodology impractical and rigid. Inspired by +contrastive-based UDG methods that mitigate spurious correlations by +restricting comparisons to examples from the same domain, we hypothesize that +eliminating style variability within a batch could provide a more convenient +and flexible way to reduce spurious correlations without requiring domain +labels. To verify this hypothesis, we introduce Batch Styles Standardization +(BSS), a relatively simple yet powerful Fourier-based method to standardize the +style of images in a batch specifically designed for integration with SSL +methods to tackle UDG. Combining BSS with existing SSL methods offers serious +advantages over prior UDG methods: (1) It eliminates the need for domain labels +or domain-specific network components to enhance domain-invariance in SSL +representations, and (2) offers flexibility as BSS can be seamlessly integrated +with diverse contrastive-based but also non-contrastive-based SSL methods. +Experiments on several UDG datasets demonstrate that it significantly improves +downstream task performances on unseen domains, often outperforming or rivaling +with UDG methods. Finally, this work clarifies the underlying mechanisms +contributing to BSS's effectiveness in improving domain-invariance in SSL +representations and performances on unseen domain. + +
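+ A minimal sketch of one plausible reading of Batch Styles Standardization:
+ keep each image's Fourier phase (content) and share a batch-wide amplitude
+ (style). The averaging scheme below is an assumption; the paper's BSS may
+ standardize amplitudes differently:
+
+   import torch
+
+   def batch_styles_standardization(images: torch.Tensor) -> torch.Tensor:
+       # images: (B, C, H, W) real-valued batch.
+       fft = torch.fft.fft2(images, dim=(-2, -1))
+       amplitude, phase = fft.abs(), fft.angle()
+       shared_amplitude = amplitude.mean(dim=0, keepdim=True)   # batch-wide style
+       out = torch.fft.ifft2(torch.polar(shared_amplitude.expand_as(phase), phase), dim=(-2, -1))
+       return out.real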
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Hierarchical Masked 3D Diffusion Model for Video Outpainting ACM MM 2023 + + +
+ Video outpainting aims to adequately complete missing areas at the edges of +video frames. Compared to image outpainting, it presents an additional +challenge as the model should maintain the temporal consistency of the filled +area. In this paper, we introduce a masked 3D diffusion model for video +outpainting. We use the technique of mask modeling to train the 3D diffusion +model. This allows us to use multiple guide frames to connect the results of +multiple video clip inferences, thus ensuring temporal consistency and reducing +jitter between adjacent frames. Meanwhile, we extract the global frames of the +video as prompts and guide the model to obtain information other than the +current video clip using cross-attention. We also introduce a hybrid +coarse-to-fine inference pipeline to alleviate the artifact accumulation +problem. The existing coarse-to-fine pipeline only uses the infilling strategy, +which brings degradation because the time interval of the sparse frames is too +large. Our pipeline benefits from bidirectional learning of the mask modeling +and thus can employ a hybrid strategy of infilling and interpolation when +generating sparse frames. Experiments show that our method achieves +state-of-the-art results in video outpainting tasks. More results and codes are +provided at our https://fanfanda.github.io/M3DDM/. + +
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Diffusion Model is Secretly a Training-free Open Vocabulary Semantic + Segmenter + + +
+ Pre-trained text-image discriminative models, such as CLIP, have been +explored for open-vocabulary semantic segmentation with unsatisfactory results +due to the loss of crucial localization information and awareness of object +shapes. Recently, there has been a growing interest in expanding the +application of generative models from generation tasks to semantic +segmentation. These approaches utilize generative models either for generating +annotated data or extracting features to facilitate semantic segmentation. This +typically involves generating a considerable amount of synthetic data or +requiring additional mask annotations. To this end, we uncover the potential of +generative text-to-image diffusion models (e.g., Stable Diffusion) as highly +efficient open-vocabulary semantic segmenters, and introduce a novel +training-free approach named DiffSegmenter. The insight is that to generate +realistic objects that are semantically faithful to the input text, both the +complete object shapes and the corresponding semantics are implicitly learned +by diffusion models. We discover that the object shapes are characterized by +the self-attention maps while the semantics are indicated through the +cross-attention maps produced by the denoising U-Net, forming the basis of our +segmentation results. Additionally, we carefully design effective textual +prompts and a category filtering mechanism to further enhance the segmentation +results. Extensive experiments on three benchmark datasets show that the +proposed DiffSegmenter achieves impressive results for open-vocabulary semantic +segmentation. + +
+
+
+
+
+ + ♻ ☆ Domain Generalization with Vital Phase Augmentation AAAI-24 + + +
+ Deep neural networks have shown remarkable performance in image +classification. However, their performance significantly deteriorates with +corrupted input data. Domain generalization methods have been proposed to train +robust models against out-of-distribution data. Data augmentation in the +frequency domain is one of such approaches that enable a model to learn phase +features to establish domain-invariant representations. This approach changes +the amplitudes of the input data while preserving the phases. However, using +fixed phases leads to susceptibility to phase fluctuations because amplitudes +and phase fluctuations commonly occur in out-of-distribution. In this study, to +address this problem, we introduce an approach using finite variation of the +phases of input data rather than maintaining fixed phases. Based on the +assumption that the degree of domain-invariant features varies for each phase, +we propose a method to distinguish phases based on this degree. In addition, we +propose a method called vital phase augmentation (VIPAug) that applies the +variation to the phases differently according to the degree of domain-invariant +features of given phases. The model depends more on the vital phases that +contain more domain-invariant features for attaining robustness to amplitude +and phase fluctuations. We present experimental evaluations of our proposed +approach, which exhibited improved performance for both clean and corrupted +data. VIPAug achieved SOTA performance on the benchmark CIFAR-10 and CIFAR-100 +datasets, as well as near-SOTA performance on the ImageNet-100 and ImageNet +datasets. Our code is available at https://github.com/excitedkid/vipaug. + +
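+ A minimal sketch of the underlying operation, phase variation with fixed
+ amplitudes (the Gaussian noise scale is an assumption; VIPAug additionally
+ weights the variation by how vital each phase is, which is omitted here):
+
+   import torch
+
+   def phase_perturbation(image: torch.Tensor, sigma: float = 0.1) -> torch.Tensor:
+       # image: (C, H, W). Perturb Fourier phases, keep amplitudes unchanged.
+       fft = torch.fft.fft2(image, dim=(-2, -1))
+       amplitude, phase = fft.abs(), fft.angle()
+       noisy_phase = phase + sigma * torch.randn_like(phase)
+       out = torch.fft.ifft2(torch.polar(amplitude, noisy_phase), dim=(-2, -1))
+       return out.real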
+
+ comment: Accepted by AAAI-24 +
+
+
+
+
+ + ♻ ☆ Learning from History: Task-agnostic Model Contrastive Learning for + Image Restoration AAAI + + +
+ Contrastive learning has emerged as a prevailing paradigm for high-level +vision tasks, which, by introducing properly negative samples, has also been +exploited for low-level vision tasks to achieve a compact optimization space to +account for their ill-posed nature. However, existing methods rely on manually +predefined and task-oriented negatives, which often exhibit pronounced +task-specific biases. To address this challenge, our paper introduces an +innovative method termed 'learning from history', which dynamically generates +negative samples from the target model itself. Our approach, named Model +Contrastive paradigm for Image Restoration (MCIR), rejuvenates latency models +as negative models, making it compatible with diverse image restoration tasks. +We propose the Self-Prior guided Negative loss (SPN) to enable it. This +approach significantly enhances existing models when retrained with the +proposed model contrastive paradigm. The results show significant improvements +in image restoration across various tasks and architectures. For example, +models retrained with SPN outperform the original FFANet and DehazeFormer by +3.41 dB and 0.57 dB on the RESIDE indoor dataset for image dehazing. Similarly, +they achieve notable improvements of 0.47 dB on SPA-Data over IDT for image +deraining and 0.12 dB on Manga109 for a 4x scale super-resolution over +lightweight SwinIR, respectively. Code and retrained models are available at +https://github.com/Aitical/MCIR. + +
+
+ comment: Camera Ready Version. Accepted to The 38th Annual AAAI Conference on + Artificial Intelligence (AAAI 2024) +
+
+
+
+
+ + ♻ ☆ Bias-Conflict Sample Synthesis and Adversarial Removal Debias Strategy + for Temporal Sentence Grounding in Video AAAI 2024 + + +
+ Temporal Sentence Grounding in Video (TSGV) is troubled by dataset bias +issue, which is caused by the uneven temporal distribution of the target +moments for samples with similar semantic components in input videos or query +texts. Existing methods resort to utilizing prior knowledge about bias to +artificially break this uneven distribution, which only removes a limited +amount of significant language biases. In this work, we propose the +bias-conflict sample synthesis and adversarial removal debias strategy +(BSSARD), which dynamically generates bias-conflict samples by explicitly +leveraging potentially spurious correlations between single-modality features +and the temporal position of the target moments. Through adversarial training, +its bias generators continuously introduce biases and generate bias-conflict +samples to deceive its grounding model. Meanwhile, the grounding model +continuously eliminates the introduced biases, which requires it to model +multi-modality alignment information. BSSARD will cover most kinds of coupling +relationships and disrupt language and visual biases simultaneously. Extensive +experiments on Charades-CD and ActivityNet-CD demonstrate the promising +debiasing capability of BSSARD. Source codes are available at +https://github.com/qzhb/BSSARD. + +
+
+ comment: accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Wasserstein Distance-based Expansion of Low-Density Latent Regions for + Unknown Class Detection + + +
+ This paper addresses the significant challenge in open-set object detection +(OSOD): the tendency of state-of-the-art detectors to erroneously classify +unknown objects as known categories with high confidence. We present a novel +approach that effectively identifies unknown objects by distinguishing between +high and low-density regions in latent space. Our method builds upon the +Open-Det (OD) framework, introducing two new elements to the loss function. +These elements enhance the known embedding space's clustering and expand the +unknown space's low-density regions. The first addition is the Class +Wasserstein Anchor (CWA), a new function that refines the classification +boundaries. The second is a spectral normalisation step, improving the +robustness of the model. Together, these augmentations to the existing +Contrastive Feature Learner (CFL) and Unknown Probability Learner (UPL) loss +functions significantly improve OSOD performance. Our proposed OpenDet-CWA +(OD-CWA) method demonstrates: a) a reduction in open-set errors by +approximately 17%-22%, b) an enhancement in novelty detection capability by +1.5%-16%, and c) a decrease in the wilderness index by 2%-20% across various +open-set scenarios. These results represent a substantial advancement in the +field, showcasing the potential of our approach in managing the complexities of +open-set object detection. + +
+
+ comment: 8 Full length pages, followed by 2 supplementary pages, total of 9 + Figures +
+
+
+
+
+ + ♻ ☆ Hierarchical Compositional Representations for Few-shot Action + Recognition + + +
+ Recently action recognition has received more and more attention for its +comprehensive and practical applications in intelligent surveillance and +human-computer interaction. However, few-shot action recognition has not been +well explored and remains challenging because of data scarcity. In this paper, +we propose a novel hierarchical compositional representations (HCR) learning +approach for few-shot action recognition. Specifically, we divide a complicated +action into several sub-actions by carefully designed hierarchical clustering +and further decompose the sub-actions into more fine-grained spatially +attentional sub-actions (SAS-actions). Although there exist large differences +between base classes and novel classes, they can share similar patterns in +sub-actions or SAS-actions. Furthermore, we adopt the Earth Mover's Distance in +the transportation problem to measure the similarity between video samples in +terms of sub-action representations. It computes the optimal matching flows +between sub-actions as distance metric, which is favorable for comparing +fine-grained patterns. Extensive experiments show our method achieves the +state-of-the-art results on HMDB51, UCF101 and Kinetics datasets. + +
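+ A minimal sketch of the Earth Mover's Distance comparison between two videos
+ represented as sets of sub-action embeddings (uniform sub-action weights and
+ a cosine cost are assumptions; the POT library call is standard):
+
+   import numpy as np
+   import ot   # Python Optimal Transport (POT)
+
+   def subaction_emd(feats_a: np.ndarray, feats_b: np.ndarray) -> float:
+       # feats_a: (m, d), feats_b: (n, d) sub-action representations.
+       na = np.linalg.norm(feats_a, axis=1, keepdims=True)
+       nb = np.linalg.norm(feats_b, axis=1, keepdims=True)
+       cost = 1.0 - (feats_a @ feats_b.T) / (na * nb.T + 1e-8)
+       wa = np.full(len(feats_a), 1.0 / len(feats_a))
+       wb = np.full(len(feats_b), 1.0 / len(feats_b))
+       return float(ot.emd2(wa, wb, cost))   # optimal matching cost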
+
+ comment: Accepted by Computer Vision and Image Understanding +
+
+
+
+
+ + ♻ ☆ Skeleton-Guided Instance Separation for Fine-Grained Segmentation in + Microscopy + + +
+ One of the fundamental challenges in microscopy (MS) image analysis is +instance segmentation (IS), particularly when segmenting cluster regions where +multiple objects of varying sizes and shapes may be connected or even +overlapped in arbitrary orientations. Existing IS methods usually fail in +handling such scenarios, as they rely on coarse instance representations such +as keypoints and horizontal bounding boxes (h-bboxes). In this paper, we +propose a novel one-stage framework named A2B-IS to address this challenge and +enhance the accuracy of IS in MS images. Our approach represents each instance +with a pixel-level mask map and a rotated bounding box (r-bbox). Unlike +two-stage methods that use box proposals for segmentations, our method +decouples mask and box predictions, enabling simultaneous processing to +streamline the model pipeline. Additionally, we introduce a Gaussian skeleton +map to aid the IS task in two key ways: (1) It guides anchor placement, +reducing computational costs while improving the model's capacity to learn +RoI-aware features by filtering out noise from background regions. (2) It +ensures accurate isolation of densely packed instances by rectifying erroneous +box predictions near instance boundaries. To further enhance the performance, +we integrate two modules into the framework: (1) An Atrous Attention Block +(A2B) designed to extract high-resolution feature maps with fine-grained +multiscale information, and (2) A Semi-Supervised Learning (SSL) strategy that +leverages both labeled and unlabeled images for model training. Our method has +been thoroughly validated on two large-scale MS datasets, demonstrating its +superiority over most state-of-the-art approaches. + +
+
+
+
+
+ + ♻ ☆ Diagnosis Of Takotsubo Syndrome By Robust Feature Selection From The + Complex Latent Space Of DL-based Segmentation Network + + +
+ Researchers have shown significant correlations between segmented objects in +various medical imaging modalities and disease-related pathologies. Several +studies showed that using hand-crafted features for disease prediction neglects +the immense possibility to use latent features from deep learning (DL) models, +which may reduce the overall accuracy of differential diagnosis. However, +directly using classification or segmentation models on medical images to learn latent +features forgoes robust feature selection and may lead to overfitting. To fill +this gap, we propose a novel feature selection technique using the latent space +of a segmentation model that can aid diagnosis. We evaluated our method in +differentiating a rare cardiac disease, Takotsubo Syndrome (TTS), from ST +elevation myocardial infarction (STEMI) using echocardiogram videos (echo). TTS +can mimic the clinical features of STEMI in echo and is extremely hard to distinguish. +Our approach shows promising results in the differential diagnosis of TTS with 82% +diagnostic accuracy, beating the previous state-of-the-art (SOTA) approach. +Moreover, the robust feature selection technique using the LASSO algorithm shows +great potential in reducing redundant features and creates a robust +pipeline for short- and long-term disease prognoses in the downstream analysis. + +
+
+ comment: 5 pages, 3 figures, conference +
+
+
+
+
+ + ♻ ☆ Motion-Zero: Zero-Shot Moving Object Control Framework for + Diffusion-Based Video Generation + + +
+ Recent large-scale pre-trained diffusion models have demonstrated a powerful
+generative ability to produce high-quality videos from detailed text
+descriptions. However, exerting control over the motion of objects in videos
+generated by any video diffusion model is a challenging problem. In this
+paper, we propose a novel zero-shot moving object trajectory control
+framework, Motion-Zero, to enable a bounding-box-trajectory-controlled
+text-to-video diffusion model. To this end, an initial noise prior module is
+designed to provide a position-based prior that improves the stability of the
+moving object's appearance and the accuracy of its position. In addition,
+based on the attention map of the U-Net, spatial constraints are directly
+applied to the denoising process of diffusion models, which further ensures
+the positional and spatial consistency of moving objects during inference.
+Furthermore, temporal consistency is guaranteed with a proposed shift temporal
+attention mechanism. Our method can be flexibly applied to various
+state-of-the-art video diffusion models without any training. Extensive
+experiments demonstrate that our proposed method can control the motion
+trajectories of objects and generate high-quality videos.
+
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Fast graph-based denoising for point cloud color information ICASSP 2024 + + +
+ Point clouds are utilized in various 3D applications such as cross-reality
+(XR) and realistic 3D displays. In some applications, e.g., live streaming
+using a 3D point cloud, real-time point cloud denoising methods are required
+to enhance the visual quality. However, conventional high-precision denoising
+methods cannot be executed in real time for large-scale point clouds owing to
+the complexity of graph construction with K nearest neighbors and noise level
+estimation. This paper proposes a fast graph-based denoising (FGBD) method for
+large-scale point clouds. First, high-speed graph construction is achieved by
+scanning a point cloud in various directions and searching adjacent
+neighborhoods on the scanning lines. Second, we propose a fast noise level
+estimation method using eigenvalues of the covariance matrix on a graph.
+Finally, we also propose a new low-cost filter selection method that enhances
+denoising accuracy to compensate for the degradation caused by the
+acceleration algorithms. In our experiments, we succeeded in reducing the
+processing time dramatically while maintaining accuracy relative to
+conventional denoising methods. Denoising was performed at 30 fps, with frames
+containing approximately 1 million points.
+
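The noise-level step can be pictured with local covariance eigenvalues: on a roughly planar neighborhood, the smallest eigenvalue of the 3x3 covariance measures the variance off the local surface. A minimal numpy sketch, assuming simple k-nearest neighborhoods rather than the paper's scan-line graph:

```python
import numpy as np

def estimate_noise_level(points, neighbor_idx):
    """points: (N, 3); neighbor_idx: one array of neighbor indices per point."""
    residual_vars = []
    for nbrs in neighbor_idx:
        patch = points[nbrs]
        cov = np.cov(patch.T)                 # 3x3 covariance of the local neighborhood
        eigvals = np.linalg.eigvalsh(cov)     # ascending order
        residual_vars.append(eigvals[0])      # smallest eigenvalue ~ variance off the surface
    return float(np.sqrt(np.median(residual_vars)))   # robust noise std estimate

# Toy usage: a noisy plane with naive nearest-neighbor neighborhoods.
rng = np.random.default_rng(0)
pts = np.c_[rng.uniform(size=(200, 2)), 0.01 * rng.normal(size=200)]
nbrs = [np.argsort(((pts - p) ** 2).sum(1))[:16] for p in pts]
print(estimate_noise_level(pts, nbrs))
```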
+
+ comment: Published in the proceeding of 2024 IEEE International Conference on + Acoustics, Speech and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ♻ ☆ Progressive Text-to-Image Diffusion with Soft Latent Direction + + +
+ In spite of the rapidly evolving landscape of text-to-image generation, the
+synthesis and manipulation of multiple entities while adhering to specific
+relational constraints pose enduring challenges. This paper introduces an
+innovative progressive synthesis and editing operation that systematically
+incorporates entities into the target image, ensuring their adherence to
+spatial and relational constraints at each sequential step. Our key insight
+stems from the observation that while a pre-trained text-to-image diffusion
+model adeptly handles one or two entities, it often falters when dealing with
+a greater number. To address this limitation, we propose harnessing the
+capabilities of a Large Language Model (LLM) to decompose intricate and
+protracted text descriptions into coherent directives adhering to stringent
+formats. To facilitate the execution of directives involving distinct semantic
+operations, namely insertion, editing, and erasing, we formulate the Stimulus,
+Response, and Fusion (SRF) framework. Within this framework, latent regions
+are gently stimulated in alignment with each operation, followed by the fusion
+of the responsive latent components to achieve cohesive entity manipulation.
+Our proposed framework yields notable advancements in object synthesis,
+particularly when confronted with intricate and lengthy textual inputs.
+Consequently, it establishes a new benchmark for text-to-image generation
+tasks, further elevating the field's performance standards.
+
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ Cross-Modality Perturbation Synergy Attack for Person Re-identification + + +
+ In recent years, there has been significant research focusing on addressing +security concerns in single-modal person re-identification (ReID) systems that +are based on RGB images. However, the safety of cross-modality scenarios, which +are more commonly encountered in practical applications involving images +captured by infrared cameras, has not received adequate attention. The main +challenge in cross-modality ReID lies in effectively dealing with visual +differences between different modalities. For instance, infrared images are +typically grayscale, unlike visible images that contain color information. +Existing attack methods have primarily focused on the characteristics of the +visible image modality, overlooking the features of other modalities and the +variations in data distribution among different modalities. This oversight can +potentially undermine the effectiveness of these methods in image retrieval +across diverse modalities. This study represents the first exploration into the +security of cross-modality ReID models and proposes a universal perturbation +attack specifically designed for cross-modality ReID. This attack optimizes +perturbations by leveraging gradients from diverse modality data, thereby +disrupting the discriminator and reinforcing the differences between +modalities. We conducted experiments on two widely used cross-modality +datasets, namely RegDB and SYSU, which not only demonstrated the effectiveness +of our method but also provided insights for future enhancements in the +robustness of cross-modality ReID systems. + +
+
+
+
+
+ + ♻ ☆ Diffusion-based Data Augmentation for Nuclei Image Segmentation MICCAI 2023 + + +
+ Nuclei segmentation is a fundamental but challenging task in the quantitative +analysis of histopathology images. Although fully-supervised deep +learning-based methods have made significant progress, a large number of +labeled images are required to achieve great segmentation performance. +Considering that manually labeling all nuclei instances for a dataset is +inefficient, obtaining a large-scale human-annotated dataset is time-consuming +and labor-intensive. Therefore, augmenting a dataset with only a few labeled +images to improve the segmentation performance is of significant research and +application value. In this paper, we introduce the first diffusion-based +augmentation method for nuclei segmentation. The idea is to synthesize a large +number of labeled images to facilitate training the segmentation model. To +achieve this, we propose a two-step strategy. In the first step, we train an +unconditional diffusion model to synthesize the Nuclei Structure that is +defined as the representation of pixel-level semantic and distance transform. +Each synthetic nuclei structure will serve as a constraint on histopathology +image synthesis and is further post-processed to be an instance map. In the +second step, we train a conditioned diffusion model to synthesize +histopathology images based on nuclei structures. The synthetic histopathology +images paired with synthetic instance maps will be added to the real dataset +for training the segmentation model. The experimental results show that by +augmenting 10% labeled real dataset with synthetic samples, one can achieve +comparable segmentation results with the fully-supervised baseline. The code is +released in: https://github.com/lhaof/Nudiff + +
+
+ comment: MICCAI 2023, released code: https://github.com/lhaof/Nudiff +
+
+
+
+
+ + ♻ ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct
+approaches: learning-based methods, relying on robust deep neural networks,
+and optimization-based methods, applying complex mathematical transformations
+to warp images accordingly. Both paradigms offer advantages and disadvantages,
+and in this work we seek to combine their respective strengths into a single
+streamlined framework, using the outputs of the learning-based method as
+initial parameters for optimization while prioritizing computational power for
+the image pairs that incur the greatest loss. Our investigations showed
+improvements of up to 1.6% on test data while maintaining the same inference
+time, and a substantial 1.0 percentage point gain in deformation field
+smoothness.
+
+
+
+
+
+ + ♻ ☆ Exploiting Multiple Sequence Lengths in Fast End to End Training for + Image Captioning + + +
+ We introduce a method called the Expansion mechanism that processes the
+input unconstrained by the number of elements in the sequence. By doing so,
+the model can learn more effectively compared to traditional attention-based
+approaches. To support this claim, we design a novel architecture,
+ExpansionNet v2, which achieved strong results on the MS COCO 2014 Image
+Captioning challenge and state-of-the-art performance in its category, with a
+score of 143.7 CIDEr-D on the offline test split, 140.8 CIDEr-D on the online
+evaluation server and 72.9 AllCIDEr on the nocaps validation set.
+Additionally, we introduce an end-to-end training algorithm that is up to 2.8
+times faster than established alternatives. Source code is available at:
+https://github.com/jchenghu/ExpansionNet_v2
+
+
+
+
+
+ + ♻ ☆ VIPTR: A Vision Permutable Extractor for Fast and Efficient Scene Text + Recognition + + +
+ Scene Text Recognition (STR) is a challenging task that involves recognizing +text within images of natural scenes. Although current state-of-the-art models +for STR exhibit high performance, they typically suffer from low inference +efficiency due to their reliance on hybrid architectures comprised of visual +encoders and sequence decoders. In this work, we propose the VIsion Permutable +extractor for fast and efficient scene Text Recognition (VIPTR), which achieves +an impressive balance between high performance and rapid inference speeds in +the domain of STR. Specifically, VIPTR leverages a visual-semantic extractor +with a pyramid structure, characterized by multiple self-attention layers, +while eschewing the traditional sequence decoder. This design choice results in +a lightweight and efficient model capable of handling inputs of varying sizes. +Extensive experimental results on various standard datasets for both Chinese +and English scene text recognition validate the superiority of VIPTR. Notably, +the VIPTR-T (Tiny) variant delivers highly competitive accuracy on par with +other lightweight models and achieves SOTA inference speeds. Meanwhile, the +VIPTR-L (Large) variant attains greater recognition accuracy, while maintaining +a low parameter count and favorable inference speed. Our proposed method +provides a compelling solution for the STR challenge, which blends high +accuracy with efficiency and greatly benefits real-world applications requiring +fast and reliable text recognition. The code is publicly available at +https://github.com/cxfyxl/VIPTR. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2205.00159 by other authors +
+
+
+
+
+ + ♻ ☆ WaterHE-NeRF: Water-ray Tracing Neural Radiance Fields for Underwater + Scene Reconstruction + + +
+ Neural Radiance Field (NeRF) technology demonstrates immense potential in
+novel viewpoint synthesis tasks due to its physics-based volumetric rendering
+process, which is particularly promising in underwater scenes. Addressing the
+limitations of existing underwater NeRF methods in handling light attenuation
+caused by the water medium and the lack of real Ground Truth (GT) supervision,
+this study proposes WaterHE-NeRF. We develop a new water-ray tracing field
+based on Retinex theory that precisely encodes color, density, and illuminance
+attenuation in three-dimensional space. WaterHE-NeRF, through its illuminance
+attenuation mechanism, generates both degraded and clear multi-view images and
+optimizes image restoration by combining a reconstruction loss with the
+Wasserstein distance. Additionally, the use of histogram equalization (HE) as
+pseudo-GT enhances the network's accuracy in preserving original details and
+color distribution. Extensive experiments on real underwater datasets and
+synthetic datasets validate the effectiveness of WaterHE-NeRF. Our code will
+be made publicly available.
+
+
+
+
+
+ + ♻ ☆ DynPoint: Dynamic Neural Point For View Synthesis + + +
+ The introduction of neural radiance fields has greatly improved the
+effectiveness of view synthesis for monocular videos. However, existing
+algorithms face difficulties when dealing with uncontrolled or lengthy
+scenarios, and require extensive training time specific to each new scenario.
+To tackle these limitations, we propose DynPoint, an algorithm designed to
+facilitate the rapid synthesis of novel views for unconstrained monocular
+videos. Rather than encoding the entirety of the scenario information into a
+latent representation, DynPoint concentrates on predicting the explicit 3D
+correspondence between neighboring frames to realize information aggregation.
+Specifically, this correspondence prediction is achieved through the
+estimation of consistent depth and scene flow information across frames.
+Subsequently, the acquired correspondence is utilized to aggregate information
+from multiple reference frames to a target frame by constructing hierarchical
+neural point clouds. The resulting framework enables swift and accurate view
+synthesis for desired views of target frames. The experimental results
+demonstrate that our proposed method accelerates training considerably,
+typically by an order of magnitude, while yielding outcomes comparable to
+prior approaches. Furthermore, our method exhibits strong robustness in
+handling long-duration videos without learning a canonical representation of
+video content.
+
+
+
+
+
+ + ♻ ☆ On the Adversarial Robustness of Camera-based 3D Object Detection + + +
+ In recent years, camera-based 3D object detection has gained widespread +attention for its ability to achieve high performance with low computational +cost. However, the robustness of these methods to adversarial attacks has not +been thoroughly examined, especially when considering their deployment in +safety-critical domains like autonomous driving. In this study, we conduct the +first comprehensive investigation of the robustness of leading camera-based 3D +object detection approaches under various adversarial conditions. We +systematically analyze the resilience of these models under two attack +settings: white-box and black-box; focusing on two primary objectives: +classification and localization. Additionally, we delve into two types of +adversarial attack techniques: pixel-based and patch-based. Our experiments +yield four interesting findings: (a) bird's-eye-view-based representations +exhibit stronger robustness against localization attacks; (b) +depth-estimation-free approaches have the potential to show stronger +robustness; (c) accurate depth estimation effectively improves robustness for +depth-estimation-based methods; (d) incorporating multi-frame benign inputs can +effectively mitigate adversarial attacks. We hope our findings can steer the +development of future camera-based object detection models with enhanced +adversarial robustness. + +
+
+ comment: Transactions on Machine Learning Research, 2024. ISSN 2835-8856 +
+
+
+
+
+ + ♻ ☆ IA2U: A Transfer Plugin with Multi-Prior for In-Air Model to Underwater + + +
+ In underwater environments, variations in suspended particle concentration
+and turbidity cause severe image degradation, posing significant challenges to
+image enhancement (IE) and object detection (OD) tasks. Currently, in-air
+image enhancement and detection methods have made notable progress, but their
+application in underwater conditions is limited due to the complexity and
+variability of these environments. Fine-tuning in-air models avoids the high
+overhead of building an underwater model from scratch and can draw on a much
+larger body of reference work. To address these issues, we design a transfer
+plugin with multiple priors for converting in-air models to underwater
+applications, named IA2U. IA2U enables efficient application in underwater
+scenarios, thereby improving performance in underwater IE and OD. IA2U
+integrates three types of underwater priors: the water type prior, which
+characterizes the degree of image degradation, such as color and visibility;
+the degradation prior, focusing on differences in details and textures; and
+the sample prior, considering the environmental conditions at the time of
+capture and the characteristics of the photographed object. Utilizing a
+Transformer-like structure, IA2U employs these priors as query conditions and
+a joint task loss function to achieve hierarchical enhancement of task-level
+underwater image features, thereby accounting for the requirements of the two
+different tasks, IE and OD. Experimental results show that IA2U combined with
+an in-air model can achieve superior performance in underwater image
+enhancement and object detection tasks. The code will be made publicly
+available.
+
+
+
+
+
+ + ♻ ☆ DGNet: Dynamic Gradient-guided Network with Noise Suppression for + Underwater Image Enhancement + + +
+ Underwater image enhancement (UIE) is a challenging task due to the complex +degradation caused by underwater environments. To solve this issue, previous +methods often idealize the degradation process, and neglect the impact of +medium noise and object motion on the distribution of image features, limiting +the generalization and adaptability of the model. Previous methods use the +reference gradient that is constructed from original images and synthetic +ground-truth images. This may cause the network performance to be influenced by +some low-quality training data. Our approach utilizes predicted images to +dynamically update pseudo-labels, adding a dynamic gradient to optimize the +network's gradient space. This process improves image quality and avoids local +optima. Moreover, we propose a Feature Restoration and Reconstruction module +(FRR) based on a Channel Combination Inference (CCI) strategy and a Frequency +Domain Smoothing module (FRS). These modules decouple other degradation +features while reducing the impact of various types of noise on network +performance. Experiments on multiple public datasets demonstrate the +superiority of our method over existing state-of-the-art approaches, especially +in achieving performance milestones: PSNR of 25.6dB and SSIM of 0.93 on the +UIEB dataset. Its efficiency in terms of parameter size and inference time +further attests to its broad practicality. The code will be made publicly +available. + +
+
+
+
+
+ + ♻ ☆ OTS: A One-shot Learning Approach for Text Spotting in Historical + Manuscripts + + +
+ In the field of historical manuscript research, scholars frequently
+encounter novel symbols in ancient texts, investing considerable effort in
+their identification and documentation. Although some object detection methods
+have achieved impressive performance, they primarily excel at detecting
+categories included in training datasets, often failing to recognize novel
+symbols without retraining. To overcome this limitation, we propose a novel
+One-shot learning-based Text Spotting (OTS) approach that accurately and
+reliably spots novel characters with just one annotated support sample.
+Drawing inspiration from cognitive research, we introduce a spatial alignment
+module that finds, focuses on, and learns the most discriminative spatial
+regions in the query image based on one support image. In particular, since
+the low-resource spotting task often faces the problem of example imbalance,
+we propose a novel loss function called torus loss, which makes the embedding
+space of the distance metric more discriminative. Our approach is highly
+efficient and requires only a few training samples while exhibiting a
+remarkable ability to handle novel characters and symbols. To enhance dataset
+diversity, we create a new manuscript dataset containing the ancient Dongba
+hieroglyphics (DBH), a script associated with China and developed by the
+ancestors of the Naxi minority. We conduct experiments on the publicly
+available DBH, EGY, VML-HD, TKH, and NC datasets. The experimental results
+demonstrate that OTS outperforms the state-of-the-art methods in one-shot text
+spotting. Overall, our proposed method offers promising applications for text
+spotting in historical manuscripts.
+
+
+
+
+
+ + ♻ ☆ Diffusion Model with Perceptual Loss + + +
+ Diffusion models trained with mean squared error loss tend to generate +unrealistic samples. Current state-of-the-art models rely on classifier-free +guidance to improve sample quality, yet its surprising effectiveness is not +fully understood. In this paper, we show that the effectiveness of +classifier-free guidance partly originates from it being a form of implicit +perceptual guidance. As a result, we can directly incorporate perceptual loss +in diffusion training to improve sample quality. Since the score matching +objective used in diffusion training strongly resembles the denoising +autoencoder objective used in unsupervised training of perceptual networks, the +diffusion model itself is a perceptual network and can be used to generate +meaningful perceptual loss. We propose a novel self-perceptual objective that +results in diffusion models capable of generating more realistic samples. For +conditional generation, our method only improves sample quality without +entanglement with the conditional input and therefore does not sacrifice sample +diversity. Our method can also improve sample quality for unconditional +generation, which was not possible with classifier-free guidance before. + +
+
+
+
+
+
+
+
+ + Information Retrieval 18 + +
+
+
+ + ☆ Using LLMs to discover emerging coded antisemitic hate-speech emergence + in extremist social media + + +
+ Online hate speech proliferation has created a difficult problem for social +media platforms. A particular challenge relates to the use of coded language by +groups interested in both creating a sense of belonging for its users and +evading detection. Coded language evolves quickly and its use varies over time. +This paper proposes a methodology for detecting emerging coded hate-laden +terminology. The methodology is tested in the context of online antisemitic +discourse. The approach considers posts scraped from social media platforms, +often used by extremist users. The posts are scraped using seed expressions +related to previously known discourse of hatred towards Jews. The method begins +by identifying the expressions most representative of each post and calculating +their frequency in the whole corpus. It filters out grammatically incoherent +expressions as well as previously encountered ones so as to focus on emergent +well-formed terminology. This is followed by an assessment of semantic +similarity to known antisemitic terminology using a fine-tuned large language +model, and subsequent filtering out of the expressions that are too distant +from known expressions of hatred. Emergent antisemitic expressions containing +terms clearly relating to Jewish topics are then removed to return only coded +expressions of hatred. + +
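As an illustration of the last two filtering stages only, candidate expressions can be scored against known hateful terminology by embedding similarity and then stripped of expressions that explicitly name the targeted topic. The embedding model name, the 0.5 threshold, and all example strings below are placeholders; the paper relies on its own fine-tuned large language model and real scraped data.

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")             # stand-in for the fine-tuned model
known_terms = ["example known slur", "example hateful phrase"]   # placeholders, not real data
candidates = ["candidate expression one", "candidate expression two"]
explicit_markers = ["jew", "jewish"]                          # expressions naming the topic directly

known_emb = model.encode(known_terms, convert_to_tensor=True)
cand_emb = model.encode(candidates, convert_to_tensor=True)
sims = util.cos_sim(cand_emb, known_emb).max(dim=1).values    # closeness to any known term

# Keep expressions that are semantically close to known hatred but do not name the topic outright.
coded = [c for c, s in zip(candidates, sims)
         if s > 0.5 and not any(m in c.lower() for m in explicit_markers)]
print(coded)
```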
+
+ comment: 9 pages, 4 figures, 2 algorithms, 3 tables +
+
+
+
+
+ + ☆ Dynamic Q&A of Clinical Documents with Large Language Models + + +
+ Electronic health records (EHRs) house crucial patient data in clinical +notes. As these notes grow in volume and complexity, manual extraction becomes +challenging. This work introduces a natural language interface using large +language models (LLMs) for dynamic question-answering on clinical notes. Our +chatbot, powered by Langchain and transformer-based LLMs, allows users to query +in natural language, receiving relevant answers from clinical notes. +Experiments, utilizing various embedding models and advanced LLMs, show Wizard +Vicuna's superior accuracy, albeit with high compute demands. Model +optimization, including weight quantization, improves latency by approximately +48 times. Promising results indicate potential, yet challenges such as model +hallucinations and limited diverse medical case evaluations remain. Addressing +these gaps is crucial for unlocking the value in clinical notes and advancing +AI-driven clinical decision-making. + +
+
+ comment: 8 pages, 4 figures +
+
+
+
+
+ + ☆ Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and + unfairness in dyadic regression models + + +
+ Dyadic regression models, which predict real-valued outcomes for pairs of +entities, are fundamental in many domains (e.g. predicting the rating of a user +to a product in Recommender Systems) and promising and under exploration in +many others (e.g. approximating the adequate dosage of a drug for a patient in +personalized pharmacology). In this work, we demonstrate that non-uniformity in +the observed value distributions of individual entities leads to severely +biased predictions in state-of-the-art models, skewing predictions towards the +average of observed past values for the entity and providing worse-than-random +predictive power in eccentric yet equally important cases. We show that the +usage of global error metrics like Root Mean Squared Error (RMSE) and Mean +Absolute Error (MAE) is insufficient to capture this phenomenon, which we name +eccentricity bias, and we introduce Eccentricity-Area Under the Curve (EAUC) as +a new complementary metric that can quantify it in all studied models and +datasets. We also prove the adequateness of EAUC by using naive de-biasing +corrections to demonstrate that a lower model bias correlates with a lower EAUC +and vice-versa. This work contributes a bias-aware evaluation of dyadic +regression models to avoid potential unfairness and risks in critical +real-world applications of such systems. + +
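The exact definition of EAUC is given in the paper; the following is a hedged sketch of the general idea, an area under an error-versus-eccentricity curve, with the binning and normalisation chosen purely for illustration.

```python
import numpy as np

def eauc_sketch(y_true, y_pred, entity_means, n_bins=20):
    """Area under an error-vs-eccentricity curve (illustrative definition only)."""
    eccentricity = np.abs(y_true - entity_means)        # distance from the entity's usual value
    error = np.abs(y_true - y_pred)
    order = np.argsort(eccentricity)
    ecc, err = eccentricity[order], error[order]
    bins = np.array_split(np.arange(len(ecc)), n_bins)  # equal-count eccentricity bins
    x = np.array([ecc[b].mean() for b in bins])
    y = np.array([err[b].mean() for b in bins])
    x = (x - x.min()) / (x.max() - x.min() + 1e-12)      # normalise eccentricity to [0, 1]
    return float(np.sum(0.5 * (y[1:] + y[:-1]) * np.diff(x)))   # trapezoidal area

rng = np.random.default_rng(0)
entity_means = rng.uniform(1, 5, size=1000)              # per-entity average of past observed values
y_true = np.clip(entity_means + rng.normal(scale=1.0, size=1000), 1, 5)
y_pred = entity_means                                    # a biased model that predicts the entity mean
print(eauc_sketch(y_true, y_pred, entity_means))
```

A model that always predicts the entity mean, as above, looks reasonable under RMSE or MAE yet accumulates large errors in the eccentric bins, which is exactly what the curve exposes.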
+
+
+
+
+ + ☆ Automatic Construction of Multi-faceted User Profiles using Text + Clustering and its Application to Expert Recommendation and Filtering + Problems + + +
+ In the information age we are living in today, not only are we interested in +accessing multimedia objects such as documents, videos, etc. but also in +searching for professional experts, people or celebrities, possibly for +professional needs or just for fun. Information access systems need to be able +to extract and exploit various sources of information (usually in text format) +about such individuals, and to represent them in a suitable way usually in the +form of a profile. In this article, we tackle the problems of profile-based +expert recommendation and document filtering from a machine learning +perspective by clustering expert textual sources to build profiles and capture +the different hidden topics in which the experts are interested. The experts +will then be represented by means of multi-faceted profiles. Our experiments +show that this is a valid technique to improve the performance of expert +finding and document filtering. + +
+
+
+
+
+ + ☆ LDA-based Term Profiles for Expert Finding in a Political Setting + + +
+ A common task in many political institutions (e.g. a Parliament) is to find
+politicians who are experts in a particular field. In order to tackle this
+problem, the first step is to obtain politician profiles, which include their
+interests and can be automatically learned from their speeches. As a
+politician may have various areas of expertise, one alternative is to use a
+set of subprofiles, each of which covers a different subject. In this study,
+we propose a novel approach for this task that uses latent Dirichlet
+allocation (LDA) to determine the main underlying topics of each political
+speech and to distribute the related terms among the different topic-based
+subprofiles. With this objective, we propose the use of fifteen distance and
+similarity measures to automatically determine the optimal number of topics
+discussed in a document, and demonstrate that every measure converges to one
+of five strategies: Euclidean, Dice, Sorensen, Cosine and Overlap. Our
+experimental results showed that the scores of the different accuracy metrics
+for the proposed strategies tended to be higher than those of the baselines
+for expert recommendation tasks, and that using an appropriate number of
+topics proved relevant.
+
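A minimal sketch of topic-based subprofile construction with scikit-learn; the toy speeches, the number of topics and the top-term cut-off are illustrative assumptions, not the authors' pipeline.

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

speeches = [
    "the health budget for hospitals and primary care",
    "new investment in rural roads and public transport",
    "school funding teachers and classroom resources",
]

vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(speeches)
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(X)

terms = vectorizer.get_feature_names_out()
for k, topic in enumerate(lda.components_):
    # Each subprofile keeps the terms most strongly associated with one latent topic.
    top = [terms[i] for i in topic.argsort()[::-1][:5]]
    print(f"subprofile {k}: {top}")
```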
+
+
+
+
+ + ☆ Publication venue recommendation using profiles based on clustering + + +
+ In this paper we study the venue recommendation problem in order to help +researchers to identify a journal or conference to submit a given paper. A +common approach to tackle this problem is to build profiles defining the scope +of each venue. Then, these profiles are compared against the target paper. In +our approach we will study how clustering techniques can be used to construct +topic-based profiles and use an Information Retrieval based approach to obtain +the final recommendations. Additionally, we will explore how the use of +authorship, representing a complementary piece of information, helps to improve +the recommendations. + +
+
+
+
+
+ + ☆ Use of topical and temporal profiles and their hybridisation for + content-based recommendation + + +
+ In the context of content-based recommender systems, the aim of this paper is +to determine how better profiles can be built and how these affect the +recommendation process based on the incorporation of temporality, i.e. the +inclusion of time in the recommendation process, and topicality, i.e. the +representation of texts associated with users and items using topics and their +combination. The main contribution of the paper is to present two different +ways of hybridising these two dimensions and to evaluate and compare them with +other alternatives. + +
+
+
+
+
+ + ☆ Understanding Biases in ChatGPT-based Recommender Systems: Provider + Fairness, Temporal Stability, and Recency + + +
+ This study explores the nuanced capabilities and inherent biases of
+Recommender Systems using Large Language Models (RecLLMs), with a focus on
+ChatGPT-based systems. It examines the contrasting behaviors of generative
+models and traditional collaborative filtering (CF) models in movie
+recommendations. The research primarily investigates prompt design strategies
+and their impact on various aspects of recommendation quality, including
+accuracy, provider fairness, diversity, stability, genre dominance, and
+temporal freshness (recency).
+ Our experimental analysis reveals that the introduction of specific 'system
+roles' and 'prompt strategies' in RecLLMs significantly influences their
+performance. For instance, role-based prompts enhance fairness and diversity
+in recommendations, mitigating popularity bias. We find that while GPT-based
+models do not always match the performance of CF baselines, they exhibit a
+unique tendency to recommend newer and more diverse movie genres. Notably,
+GPT-based models tend to recommend more recent films, particularly those
+released post-2000, and show a preference for genres such as 'Drama', 'Comedy'
+and 'Romance' (compared to 'Action' and 'Adventure' for the CF models),
+presumably because the RecLLMs are trained on varied data sets, which allows
+them to capture recent trends and discussions more effectively than CF models.
+Interestingly, our results demonstrate that the 'Simple' and 'Chain of Thought
+(COT)' paradigms yield the highest accuracy. These findings imply the
+potential of combining these strategies with scenarios that favor more recent
+content, thereby offering a more balanced and up-to-date recommendation
+experience. This study contributes significantly to the understanding of
+emerging RecLLMs, particularly in the context of harms and biases within these
+systems.
+
+
+
+
+
+ + ☆ Generative Dense Retrieval: Memory Can Be a Burden EACL 2024 + + +
+ Generative Retrieval (GR), autoregressively decoding relevant document +identifiers given a query, has been shown to perform well under the setting of +small-scale corpora. By memorizing the document corpus with model parameters, +GR implicitly achieves deep interaction between query and document. However, +such a memorizing mechanism faces three drawbacks: (1) Poor memory accuracy for +fine-grained features of documents; (2) Memory confusion gets worse as the +corpus size increases; (3) Huge memory update costs for new documents. To +alleviate these problems, we propose the Generative Dense Retrieval (GDR) +paradigm. Specifically, GDR first uses the limited memory volume to achieve +inter-cluster matching from query to relevant document clusters. +Memorizing-free matching mechanism from Dense Retrieval (DR) is then introduced +to conduct fine-grained intra-cluster matching from clusters to relevant +documents. The coarse-to-fine process maximizes the advantages of GR's deep +interaction and DR's scalability. Besides, we design a cluster identifier +constructing strategy to facilitate corpus memory and a cluster-adaptive +negative sampling strategy to enhance the intra-cluster mapping ability. +Empirical results show that GDR obtains an average of 3.0 R@100 improvement on +NQ dataset under multiple settings and has better scalability. + +
+
+ comment: EACL 2024 main +
+
+
+
+
+ + ☆ Enhancing Scalability in Recommender Systems through Lottery Ticket + Hypothesis and Knowledge Distillation-based Neural Network Pruning + + +
+ This study introduces an innovative approach aimed at the efficient pruning
+of neural networks, with a particular focus on their deployment on edge
+devices. Our method integrates the Lottery Ticket Hypothesis (LTH) with the
+Knowledge Distillation (KD) framework, resulting in the formulation of three
+distinct pruning models. These models have been developed to address the
+scalability issue in recommender systems, where the complexity of deep
+learning models has hindered their practical deployment. With judicious
+application of the pruning techniques, we effectively curtail power
+consumption and model size without compromising accuracy. Empirical evaluation
+has been performed using two real-world datasets from diverse domains against
+two baselines. Our approaches yielded a GPU computation-power reduction of up
+to 66.67%. Notably, our study contributes to the field of recommender systems
+by pioneering the application of LTH and KD.
+
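A compressed sketch of how lottery-ticket style magnitude pruning can be combined with a distillation loss, shown on a small stand-in MLP; the architecture, sparsity level, temperature and loss weighting are assumptions rather than the three pruning models proposed in the paper.

```python
import torch
import torch.nn.functional as F

def make_mlp():
    return torch.nn.Sequential(torch.nn.Linear(32, 128), torch.nn.ReLU(), torch.nn.Linear(128, 10))

def magnitude_masks(model, sparsity=0.5):
    """Keep only the largest-magnitude weights in each weight matrix."""
    masks = {}
    for name, p in model.named_parameters():
        if p.dim() > 1:
            k = int(sparsity * p.numel())
            thresh = p.abs().flatten().kthvalue(k).values
            masks[name] = (p.abs() > thresh).float()
    return masks

def train(model, x, y, masks=None, teacher=None, steps=200):
    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
    for _ in range(steps):
        opt.zero_grad()
        logits = model(x)
        loss = F.cross_entropy(logits, y)
        if teacher is not None:                           # add the distillation term
            t_logits = teacher(x).detach()
            loss = loss + 4.0 * F.kl_div(F.log_softmax(logits / 2, dim=-1),
                                         F.softmax(t_logits / 2, dim=-1),
                                         reduction="batchmean")
        loss.backward()
        opt.step()
        if masks:
            with torch.no_grad():                         # keep pruned weights at zero
                for name, p in model.named_parameters():
                    if name in masks:
                        p.mul_(masks[name])

dense = make_mlp()
init_state = {k: v.clone() for k, v in dense.state_dict().items()}
x, y = torch.randn(256, 32), torch.randint(0, 10, (256,))

train(dense, x, y)                                # 1) train the dense network (it becomes the teacher)
masks = magnitude_masks(dense, sparsity=0.5)      # 2) winning-ticket mask from trained magnitudes
student = make_mlp()
student.load_state_dict(init_state)               # 3) rewind the ticket to the original initialization
train(student, x, y, masks=masks, teacher=dense)  # 4) retrain the sparse ticket with distillation
```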
+
+ comment: Accepted in WITS 2023 as a workshop paper +
+
+
+
+
+ + ☆ On the selection of the correct number of terms for profile + construction: theoretical and empirical analysis + + +
+ In this paper, we examine the problem of building a user profile from a set
+of documents. This profile will consist of a subset of the most representative
+terms in the documents that best represent user preferences or interests.
+Inspired by discrete concentration theory, we have conducted an axiomatic
+study of seven properties that a selection function should fulfill: the
+minimum and maximum uncertainty principles, invariance to adding zeros,
+invariance to scale transformations, the principle of nominal increase, the
+transfer principle and the richest-get-richer inequality. We also present a
+novel selection function based on the use of similarity metrics, more
+specifically the cosine measure commonly used in information retrieval, and
+demonstrate that it verifies six of the properties in addition to a weaker
+variant of the transfer principle, thereby representing a good selection
+approach. The theoretical study was complemented with an empirical study
+comparing the performance of different selection criteria (weighted and
+unweighted) using real data in a parliamentary setting. In this study, we
+analyze the performance of the different functions focusing on the two main
+factors affecting the selection process: profile size (number of terms) and
+weight distribution. These profiles are then used in a document filtering task
+to show that our similarity-based approach performs well in terms not only of
+recommendation accuracy but also of efficiency (we obtain smaller profiles and
+consequently faster recommendations).
+
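One way to picture a cosine-based selection function is to keep the smallest prefix of top-weighted terms whose truncated profile remains cosine-similar to the full weighted profile. The prefix strategy and the 0.95 threshold below are illustrative assumptions, not the exact function studied in the paper.

```python
import numpy as np

def select_terms(weights, terms, threshold=0.95):
    """Keep the fewest top-weighted terms whose truncated profile stays close to the full one."""
    order = np.argsort(weights)[::-1]
    w = np.asarray(weights, dtype=float)[order]
    full_norm = np.linalg.norm(w)
    kept = np.zeros_like(w)
    for k in range(1, len(w) + 1):
        kept[k - 1] = w[k - 1]
        cos = kept @ w / (np.linalg.norm(kept) * full_norm)   # cosine of truncated vs full profile
        if cos >= threshold:
            return [terms[i] for i in order[:k]]
    return list(terms)

weights = [0.40, 0.25, 0.15, 0.10, 0.05, 0.03, 0.02]
terms = ["budget", "health", "education", "tax", "roads", "sport", "misc"]
print(select_terms(weights, terms))    # -> the three heaviest terms for this toy distribution
```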
+
+
+
+
+ + ☆ Positive unlabeled learning for building recommender systems in a + parliamentary setting + + +
+ Our goal is to learn about the political interests and preferences of
+Members of Parliament by mining their parliamentary activity, in order to
+develop a recommendation/filtering system that, given a stream of documents to
+be distributed among them, is able to decide which documents each Member of
+Parliament should receive. We propose to use positive unlabeled learning to
+tackle this problem, because we only have information about relevant documents
+(each Member of Parliament's own interventions in the debates) but not about
+irrelevant documents, so we cannot use standard binary classifiers trained
+with positive and negative examples. We have also developed a new algorithm of
+this type, which compares favourably with: a) the baseline approach assuming
+that all the interventions of other Members of Parliament are irrelevant, b)
+another well-known positive unlabeled learning method and c) an approach based
+on information retrieval methods that matches document and legislator
+representations. The experiments have been carried out with data from the
+regional Andalusian Parliament in Spain.
+
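For reference, baseline (a) above already gives a runnable starting point: treat a Member's own interventions as positives and everyone else's as pseudo-negatives. The sketch below shows only that naive baseline with placeholder texts; the positive unlabeled algorithm proposed in the paper is more involved.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

own_interventions = ["speech about regional health funding",
                     "question on hospital waiting lists"]
other_interventions = ["debate on road infrastructure",
                       "statement about tourism policy"]

texts = own_interventions + other_interventions
labels = [1] * len(own_interventions) + [0] * len(other_interventions)  # unlabeled treated as negative

clf = make_pipeline(TfidfVectorizer(), LogisticRegression()).fit(texts, labels)
# Relevance score used to decide whether to route an incoming document to this Member.
print(clf.predict_proba(["new proposal on hospital budgets"])[:, 1])
```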
+
+
+
+
+ + ☆ AI Revolution on Chat Bot: Evidence from a Randomized Controlled + Experiment + + +
+ In recent years, generative AI has undergone major advancements, +demonstrating significant promise in augmenting human productivity. Notably, +large language models (LLM), with ChatGPT-4 as an example, have drawn +considerable attention. Numerous articles have examined the impact of LLM-based +tools on human productivity in lab settings and designed tasks or in +observational studies. Despite recent advances, field experiments applying +LLM-based tools in realistic settings are limited. This paper presents the +findings of a field randomized controlled trial assessing the effectiveness of +LLM-based tools in providing unmonitored support services for information +retrieval. + +
+
+
+
+
+ + ♻ ☆ Context-Driven Interactive Query Simulations Based on Generative Large + Language Models ECIR 2024 + + +
+ Simulating user interactions enables a more user-oriented evaluation of +information retrieval (IR) systems. While user simulations are cost-efficient +and reproducible, many approaches often lack fidelity regarding real user +behavior. Most notably, current user models neglect the user's context, which +is the primary driver of perceived relevance and the interactions with the +search results. To this end, this work introduces the simulation of +context-driven query reformulations. The proposed query generation methods +build upon recent Large Language Model (LLM) approaches and consider the user's +context throughout the simulation of a search session. Compared to simple +context-free query generation approaches, these methods show better +effectiveness and allow the simulation of more efficient IR sessions. +Similarly, our evaluations consider more interaction context than current +session-based measures and reveal interesting complementary insights in +addition to the established evaluation protocols. We conclude with directions +for future work and provide an entirely open experimental setup. + +
+
+ comment: Accepted at ECIR 2024 (Full Paper) +
+
+
+
+
+ + ♻ ☆ Large Language Models for Information Retrieval: A Survey + + +
+ As a primary means of information acquisition, information retrieval (IR) +systems, such as search engines, have integrated themselves into our daily +lives. These systems also serve as components of dialogue, question-answering, +and recommender systems. The trajectory of IR has evolved dynamically from its +origins in term-based methods to its integration with advanced neural models. +While the neural models excel at capturing complex contextual signals and +semantic nuances, thereby reshaping the IR landscape, they still face +challenges such as data scarcity, interpretability, and the generation of +contextually plausible yet potentially inaccurate responses. This evolution +requires a combination of both traditional methods (such as term-based sparse +retrieval methods with rapid response) and modern neural architectures (such as +language models with powerful language understanding capacity). Meanwhile, the +emergence of large language models (LLMs), typified by ChatGPT and GPT-4, has +revolutionized natural language processing due to their remarkable language +understanding, generation, generalization, and reasoning abilities. +Consequently, recent research has sought to leverage LLMs to improve IR +systems. Given the rapid evolution of this research trajectory, it is necessary +to consolidate existing methodologies and provide nuanced insights through a +comprehensive overview. In this survey, we delve into the confluence of LLMs +and IR systems, including crucial aspects such as query rewriters, retrievers, +rerankers, and readers. Additionally, we explore promising directions, such as +search agents, within this expanding field. + +
+
+ comment: updated to version 2 +
+
+
+
+
+ + ♻ ☆ A Survey on Cross-Domain Sequential Recommendation + + +
+ Cross-domain sequential recommendation (CDSR) shifts the modeling of user +preferences from flat to stereoscopic by integrating and learning interaction +information from multiple domains at different granularities (ranging from +inter-sequence to intra-sequence and from single-domain to cross-domain). In +this survey, we first define the CDSR problem using a four-dimensional tensor +and then analyze its multi-type input representations under multidirectional +dimensionality reductions. Following that, we provide a systematic overview +from both macro and micro views. From a macro view, we abstract the multi-level +fusion structures of various models across domains and discuss their bridges +for fusion. From a micro view, focusing on the existing models, we specifically +discuss the basic technologies and then explain the auxiliary learning +technologies. Finally, we exhibit the available public datasets and the +representative experimental results as well as provide some insights into +future directions for research in CDSR. + +
+
+
+
+
+ + ♻ ☆ Source Code Clone Detection Using Unsupervised Similarity Measures + + +
+ Assessing similarity in source code has gained significant attention in
+recent years due to its importance in software engineering tasks such as
+clone detection and code search and recommendation. This work presents a
+comparative analysis of unsupervised similarity measures for source code clone
+detection. The goal is to review the current state-of-the-art techniques,
+their strengths, and their weaknesses. To do that, we compile the existing
+unsupervised strategies and evaluate their performance on a benchmark dataset
+to guide software engineers in selecting appropriate methods for their
+specific use cases. The source code of this study is available at
+https://github.com/jorge-martinez-gil/codesim
+
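One of the simplest unsupervised measures of this kind is token-level TF-IDF with cosine similarity between two fragments; the crude tokeniser and the 0.7 decision threshold below are assumptions for illustration, not the benchmark configuration used in the study.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

snippet_a = "def add_items(xs):\n    total = 0\n    for x in xs:\n        total += x\n    return total"
snippet_b = "def sum_list(values):\n    acc = 0\n    for v in values:\n        acc = acc + v\n    return acc"

vec = TfidfVectorizer(token_pattern=r"[A-Za-z_]+")     # crude identifier/keyword tokenizer
tfidf = vec.fit_transform([snippet_a, snippet_b])
score = cosine_similarity(tfidf[0], tfidf[1])[0, 0]
print(f"similarity = {score:.2f}, clone candidate: {score > 0.7}")
```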
+
+ comment: Accepted for publication as Full Paper in the Software Quality Days + 2024, Vienna, Austria +
+
+
+
+
+ + ♻ ☆ Improving Text Embeddings with Large Language Models + + +
+ In this paper, we introduce a novel and simple method for obtaining +high-quality text embeddings using only synthetic data and less than 1k +training steps. Unlike existing methods that often depend on multi-stage +intermediate pre-training with billions of weakly-supervised text pairs, +followed by fine-tuning with a few labeled datasets, our method does not +require building complex training pipelines or relying on manually collected +datasets that are often constrained by task diversity and language coverage. We +leverage proprietary LLMs to generate diverse synthetic data for hundreds of +thousands of text embedding tasks across nearly 100 languages. We then +fine-tune open-source decoder-only LLMs on the synthetic data using standard +contrastive loss. Experiments demonstrate that our method achieves strong +performance on highly competitive text embedding benchmarks without using any +labeled data. Furthermore, when fine-tuned with a mixture of synthetic and +labeled data, our model sets new state-of-the-art results on the BEIR and MTEB +benchmarks. + +
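The "standard contrastive loss" mentioned above is typically an in-batch InfoNCE objective; below is a condensed PyTorch sketch where the pooling, temperature and batch construction are assumptions, not the paper's exact recipe.

```python
import torch
import torch.nn.functional as F

def info_nce(query_emb, doc_emb, temperature=0.02):
    """query_emb, doc_emb: (B, D) embeddings where doc_emb[i] is the positive for query i."""
    q = F.normalize(query_emb, dim=-1)
    d = F.normalize(doc_emb, dim=-1)
    logits = q @ d.T / temperature                  # other in-batch documents act as negatives
    targets = torch.arange(q.size(0), device=q.device)
    return F.cross_entropy(logits, targets)

# Toy usage with random embeddings standing in for pooled LLM hidden states.
q = torch.randn(8, 768, requires_grad=True)
d = torch.randn(8, 768, requires_grad=True)
loss = info_nce(q, d)
loss.backward()
print(float(loss))
```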
+
+ comment: 20 pages, 15 tables +
+
+
+
+
+
+
+
+ + Machine Learning 115 + +
+
+
+ + ☆ SCENES: Subpixel Correspondence Estimation With Epipolar Supervision + + +
+ Extracting point correspondences from two or more views of a scene is a +fundamental computer vision problem with particular importance for relative +camera pose estimation and structure-from-motion. Existing local feature +matching approaches, trained with correspondence supervision on large-scale +datasets, obtain highly-accurate matches on the test sets. However, they do not +generalise well to new datasets with different characteristics to those they +were trained on, unlike classic feature extractors. Instead, they require +finetuning, which assumes that ground-truth correspondences or ground-truth +camera poses and 3D structure are available. We relax this assumption by +removing the requirement of 3D structure, e.g., depth maps or point clouds, and +only require camera pose information, which can be obtained from odometry. We +do so by replacing correspondence losses with epipolar losses, which encourage +putative matches to lie on the associated epipolar line. While weaker than +correspondence supervision, we observe that this cue is sufficient for +finetuning existing models on new data. We then further relax the assumption of +known camera poses by using pose estimates in a novel bootstrapping approach. +We evaluate on highly challenging datasets, including an indoor drone dataset +and an outdoor smartphone camera dataset, and obtain state-of-the-art results +without strong supervision. + +
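The epipolar loss can be pictured as the distance of each putative match to the epipolar line induced by the known relative pose. The fundamental matrix and matches below are random placeholders, and the paper's actual loss may differ in weighting and robustification; the geometry itself is standard.

```python
import numpy as np

def epipolar_distance(x1, x2, F):
    """x1, x2: (N, 2) putative matches in pixels; F: 3x3 fundamental matrix."""
    h1 = np.c_[x1, np.ones(len(x1))]
    h2 = np.c_[x2, np.ones(len(x2))]
    lines = h1 @ F.T                                   # epipolar lines in image 2 (F @ x1 per point)
    num = np.abs(np.sum(h2 * lines, axis=1))           # |x2^T F x1|
    den = np.sqrt(lines[:, 0] ** 2 + lines[:, 1] ** 2)
    return num / den                                   # point-to-line distance in pixels

# Toy usage with an arbitrary (assumed) fundamental matrix and random matches.
rng = np.random.default_rng(0)
F_mat = rng.normal(size=(3, 3))
matches1, matches2 = rng.uniform(0, 640, (10, 2)), rng.uniform(0, 480, (10, 2))
print(epipolar_distance(matches1, matches2, F_mat).mean())   # mean residual used as a training signal
```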
+
+
+
+
+ + ☆ Applications of flow models to the generation of correlated lattice QCD + ensembles + + +
+ Machine-learned normalizing flows can be used in the context of lattice +quantum field theory to generate statistically correlated ensembles of lattice +gauge fields at different action parameters. This work demonstrates how these +correlations can be exploited for variance reduction in the computation of +observables. Three different proof-of-concept applications are demonstrated +using a novel residual flow architecture: continuum limits of gauge theories, +the mass dependence of QCD observables, and hadronic matrix elements based on +the Feynman-Hellmann approach. In all three cases, it is shown that statistical +uncertainties are significantly reduced when machine-learned flows are +incorporated as compared with the same calculations performed with uncorrelated +ensembles or direct reweighting. + +
+
+ comment: 11 pages, 2 tables, 5 figures +
+
+
+
+
+ + ☆ Pruning for Protection: Increasing Jailbreak Resistance in Aligned LLMs + Without Fine-Tuning + + +
+ Large Language Models (LLMs) are vulnerable to `Jailbreaking' prompts, a type +of attack that can coax these models into generating harmful and illegal +content. In this paper, we show that pruning up to 20% of LLM parameters +markedly increases their resistance to such attacks without additional training +and without sacrificing their performance in standard benchmarks. Intriguingly, +we discovered that the enhanced safety observed post-pruning correlates to the +initial safety training level of the model, hinting that the effect of pruning +could be more general and may hold for other LLM behaviors beyond safety. +Additionally, we introduce a curated dataset of 225 harmful tasks across five +categories, inserted into ten different Jailbreaking prompts, showing that +pruning aids LLMs in concentrating attention on task-relevant tokens in +jailbreaking prompts. Lastly, our experiments reveal that the prominent chat +models, such as LLaMA-2 Chat, Vicuna, and Mistral Instruct exhibit high +susceptibility to jailbreaking attacks, with some categories achieving nearly +70-100% success rate. These insights underline the potential of pruning as a +generalizable approach for improving LLM safety, reliability, and potentially +other desired behaviors. + +
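As a minimal stand-in for pruning 20% of a model's parameters (the abstract does not name a specific pruning method, so unstructured magnitude pruning is only an assumption here), removing the smallest-magnitude weights per linear layer looks as follows.

```python
import torch

def prune_by_magnitude(model, fraction=0.2):
    """Zero out the smallest `fraction` of weights in every linear layer."""
    for module in model.modules():
        if isinstance(module, torch.nn.Linear):
            w = module.weight.data
            k = int(fraction * w.numel())
            if k == 0:
                continue
            thresh = w.abs().flatten().kthvalue(k).values   # k-th smallest magnitude
            w.mul_((w.abs() > thresh).float())              # keep only weights above the threshold

# Toy usage on a small stand-in network; for an LLM the same idea applies per linear layer.
toy = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 8))
prune_by_magnitude(toy, fraction=0.2)
linear_layers = [m for m in toy.modules() if isinstance(m, torch.nn.Linear)]
sparsity = sum((m.weight == 0).float().mean().item() for m in linear_layers) / len(linear_layers)
print(f"average linear-layer sparsity: {sparsity:.2f}")
```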
+
+
+
+
+ + ☆ Ensembler: Combating model inversion attacks using model ensemble during + collaborative inference + + +
+ Deep learning models have exhibited remarkable performance across various
+domains. Nevertheless, burgeoning model sizes compel edge devices to offload a
+significant portion of the inference process to the cloud. While this practice
+offers numerous advantages, it also raises critical concerns regarding user
+data privacy. In scenarios where the cloud server's trustworthiness is in
+question, the need for a practical and adaptable method to safeguard data
+privacy becomes imperative. In this paper, we introduce Ensembler, an
+extensible framework designed to substantially increase the difficulty of
+conducting model inversion attacks for adversarial parties. Ensembler
+leverages model ensembling on the adversarial server, running in parallel with
+existing approaches that introduce perturbations to sensitive data during
+collaborative inference. Our experiments demonstrate that when combined with
+even basic Gaussian noise, Ensembler can effectively shield images from
+reconstruction attacks, achieving recognition levels that fall below human
+performance in some strict settings and significantly outperforming baseline
+methods lacking the Ensembler framework.
+
+
+ comment: in submission +
+
+
+
+
+ + ☆ Using LLMs to discover emerging coded antisemitic hate-speech emergence + in extremist social media + + +
+ Online hate speech proliferation has created a difficult problem for social +media platforms. A particular challenge relates to the use of coded language by +groups interested in both creating a sense of belonging for its users and +evading detection. Coded language evolves quickly and its use varies over time. +This paper proposes a methodology for detecting emerging coded hate-laden +terminology. The methodology is tested in the context of online antisemitic +discourse. The approach considers posts scraped from social media platforms, +often used by extremist users. The posts are scraped using seed expressions +related to previously known discourse of hatred towards Jews. The method begins +by identifying the expressions most representative of each post and calculating +their frequency in the whole corpus. It filters out grammatically incoherent +expressions as well as previously encountered ones so as to focus on emergent +well-formed terminology. This is followed by an assessment of semantic +similarity to known antisemitic terminology using a fine-tuned large language +model, and subsequent filtering out of the expressions that are too distant +from known expressions of hatred. Emergent antisemitic expressions containing +terms clearly relating to Jewish topics are then removed to return only coded +expressions of hatred. + +
+
+ comment: 9 pages, 4 figures, 2 algorithms, 3 tables +
+
+
+
+
+ + ☆ Understanding Video Transformers via Universal Concept Discovery + + +
+ This paper studies the problem of concept-based interpretability of
+transformer representations for videos. Concretely, we seek to explain the
+decision-making process of video transformers based on high-level,
+spatiotemporal concepts that are automatically discovered. Prior research on
+concept-based interpretability has concentrated solely on image-level tasks.
+Comparatively, video models deal with the added temporal dimension, increasing
+complexity and posing challenges in identifying dynamic concepts over time. In
+this work, we systematically address these challenges by introducing the first
+Video Transformer Concept Discovery (VTCD) algorithm. To this end, we propose
+an efficient approach for the unsupervised identification of units of video
+transformer representations (concepts) and for ranking their importance to the
+output of a model. The resulting concepts are highly interpretable, revealing
+spatio-temporal reasoning mechanisms and object-centric representations in
+unstructured video models. Performing this analysis jointly over a diverse set
+of supervised and self-supervised representations, we discover that some of
+these mechanisms are universal in video transformers. Finally, we demonstrate
+that VTCD can be used to improve model performance for fine-grained tasks.
+
+
+
+
+
+ + ☆ A survey on recent advances in named entity recognition + + +
+ Named Entity Recognition seeks to extract substrings within a text that name +real-world objects and to determine their type (for example, whether they refer +to persons or organizations). In this survey, we first present an overview of +recent popular approaches, but we also look at graph- and transformer- based +methods including Large Language Models (LLMs) that have not had much coverage +in other surveys. Second, we focus on methods designed for datasets with scarce +annotations. Third, we evaluate the performance of the main NER implementations +on a variety of datasets with differing characteristics (as regards their +domain, their size, and their number of classes). We thus provide a deep +comparison of algorithms that are never considered together. Our experiments +shed some light on how the characteristics of datasets affect the behavior of +the methods that we compare. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Optimisation in Neurosymbolic Learning Systems + + +
+ Neurosymbolic AI aims to integrate deep learning with symbolic AI. This +integration has many promises, such as decreasing the amount of data required +to train a neural network, improving the explainability and interpretability of +answers given by models and verifying the correctness of trained systems. We +study neurosymbolic learning, where we have both data and background knowledge +expressed using symbolic languages. How do we connect the symbolic and neural +components to communicate this knowledge? One option is fuzzy reasoning, which +studies degrees of truth. For example, being tall is not a binary concept. +Instead, probabilistic reasoning studies the probability that something is true +or will happen. Our first research question studies how different forms of +fuzzy reasoning combine with learning. We find surprising results like a +connection to the Raven paradox stating we confirm "ravens are black" when we +observe a green apple. In this study, we did not use the background knowledge +when we deployed our models after training. In our second research question, we +studied how to use background knowledge in deployed models. We developed a new +neural network layer based on fuzzy reasoning. Probabilistic reasoning is a +natural fit for neural networks, which we usually train to be probabilistic. +However, they are expensive to compute and do not scale well to large tasks. In +our third research question, we study how to connect probabilistic reasoning +with neural networks by sampling to estimate averages, while in the final +research question, we study scaling probabilistic neurosymbolic learning to +much larger problems than before. Our insight is to train a neural network with +synthetic data to predict the result of probabilistic reasoning. + +
+
+ comment: PhD dissertation +
+
+
+
+
+ + ☆ Co-Pilot for Health: Personalized Algorithmic AI Nudging to Improve + Health Outcomes + + +
+ The ability to automatically shape the health behaviors of large
+populations, across wearable types and disease conditions at scale, has
+tremendous potential to improve global health outcomes. We designed and
+implemented an AI-driven platform for digital algorithmic nudging, enabled by
+a Graph Neural Network (GNN) based recommendation system and granular health
+behavior data from wearable fitness devices. Here we describe the efficacy
+results of this platform and its capabilities for personalized and contextual
+nudging of $n=84,764$ individuals over a 12-week period in Singapore. We
+statistically validated that participants in the target group who received
+such AI-optimized daily nudges increased daily physical activity such as step
+count by 6.17% ($p = 3.09\times10^{-4}$) and weekly minutes of Moderate to
+Vigorous Physical Activity (MVPA) by 7.61% ($p = 1.16\times10^{-2}$), compared
+to matched participants in the control group who did not receive any nudges.
+Further, such nudges were very well received, with 13.1% of the nudges sent
+being opened (open rate) and 11.7% of the opened nudges rated useful, compared
+to 1.9% rated as not useful, thereby demonstrating a significant improvement
+in population-level engagement metrics.
+
+
+ comment: 19 pages, 2 figures +
+
+
+
+
+ + ☆ Simulation Based Bayesian Optimization + + +
+ Bayesian Optimization (BO) is a powerful method for optimizing black-box +functions by combining prior knowledge with ongoing function evaluations. BO +constructs a probabilistic surrogate model of the objective function given the +covariates, which is in turn used to inform the selection of future evaluation +points through an acquisition function. For smooth continuous search spaces, +Gaussian Processes (GPs) are commonly used as the surrogate model as they offer +analytical access to posterior predictive distributions, thus facilitating the +computation and optimization of acquisition functions. However, in complex +scenarios involving optimizations over categorical or mixed covariate spaces, +GPs may not be ideal. + This paper introduces Simulation Based Bayesian Optimization (SBBO) as a +novel approach to optimizing acquisition functions that only requires +\emph{sampling-based} access to posterior predictive distributions. SBBO allows +the use of surrogate probabilistic models tailored for combinatorial spaces +with discrete variables. Any Bayesian model in which posterior inference is +carried out through Markov chain Monte Carlo can be selected as the surrogate +model in SBBO. In applications involving combinatorial optimization, we +demonstrate empirically the effectiveness of SBBO method using various choices +of surrogate models. + +
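The abstract describes the key requirement, sampling-based access to the posterior predictive, but not a concrete surrogate or acquisition rule. The following is therefore only a sketch of that idea under assumed choices: a Bayesian linear surrogate over a binary (combinatorial) search space and a Thompson-sampling-style acquisition, where a single posterior sample of the surrogate stands in for an analytically optimized acquisition function. The objective, dimensions, and hyperparameters are toy placeholders.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy combinatorial search space: all binary strings of length 8.
candidates = np.array([[int(b) for b in f"{i:08b}"] for i in range(256)], dtype=float)

def objective(x):
    # Hidden black-box function (unknown to the optimizer), evaluated with noise.
    return x @ np.array([3.0, -2.0, 1.0, 0.0, 2.0, -1.0, 0.5, 1.5]) + rng.normal(0, 0.1)

X_obs, y_obs = [], []
x0 = candidates[rng.integers(len(candidates))]
X_obs.append(x0); y_obs.append(objective(x0))

for step in range(30):
    X, y = np.array(X_obs), np.array(y_obs)
    # Bayesian linear surrogate: posterior over weights w ~ N(mu, Sigma).
    alpha, sigma2 = 1.0, 0.1 ** 2
    Sigma = np.linalg.inv(np.eye(X.shape[1]) / alpha + X.T @ X / sigma2)
    mu = Sigma @ X.T @ y / sigma2
    # "Sampling-based" acquisition: draw one posterior sample of the weights
    # (Thompson sampling) and pick the candidate maximising the sampled prediction.
    w = rng.multivariate_normal(mu, Sigma)
    x_next = candidates[np.argmax(candidates @ w)]
    X_obs.append(x_next); y_obs.append(objective(x_next))

print("best observed value:", round(max(y_obs), 3))
```

In the paper's setting the posterior samples would instead come from MCMC over a surrogate tailored to discrete covariates; the outer loop structure stays the same.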
+
+
+
+
+ + ☆ Neglected Hessian component explains mysteries in Sharpness + regularization + + +
+ Recent work has shown that methods like SAM which either explicitly or +implicitly penalize second order information can improve generalization in deep +learning. Seemingly similar methods like weight noise and gradient penalties +often fail to provide such benefits. We show that these differences can be +explained by the structure of the Hessian of the loss. First, we show that a +common decomposition of the Hessian can be quantitatively interpreted as +separating the feature exploitation from feature exploration. The feature +exploration, which can be described by the Nonlinear Modeling Error matrix +(NME), is commonly neglected in the literature since it vanishes at +interpolation. Our work shows that the NME is in fact important as it can +explain why gradient penalties are sensitive to the choice of activation +function. Using this insight we design interventions to improve performance. We +also provide evidence that challenges the long held equivalence of weight noise +and gradient penalties. This equivalence relies on the assumption that the NME +can be ignored, which we find does not hold for modern networks since they +involve significant feature learning. We find that regularizing feature +exploitation but not feature exploration yields performance similar to gradient +penalties. + +
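The "common decomposition of the Hessian" referred to here is presumably the usual Gauss-Newton split of the loss Hessian; writing it out makes the role of the Nonlinear Modeling Error (NME) term concrete. The notation below (network f(x; θ), loss ℓ) is ours, not the paper's.

```latex
% Decomposition of the loss Hessian for a model f(x;\theta) and loss \ell(f, y).
% The first (Gauss-Newton) term uses only first derivatives of f ("feature exploitation");
% the second term, the Nonlinear Modeling Error (NME), involves the curvature of f itself
% ("feature exploration") and is weighted by the residual \partial\ell/\partial f_k,
% so it vanishes once the training data are interpolated.
\nabla^2_\theta \, \ell\big(f(x;\theta), y\big)
  = \underbrace{J_f^\top \, \nabla^2_f \ell \, J_f}_{\text{Gauss--Newton term}}
  + \underbrace{\sum_{k} \frac{\partial \ell}{\partial f_k} \, \nabla^2_\theta f_k}_{\text{NME}},
\qquad J_f = \frac{\partial f}{\partial \theta}.
```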
+
+
+
+
+ + ☆ Learning to Visually Connect Actions and their Effects + + +
+ In this work, we introduce the novel concept of visually Connecting Actions +and Their Effects (CATE) in video understanding. CATE can have applications in +areas like task planning and learning from demonstration. We propose different +CATE-based task formulations, such as action selection and action +specification, where video understanding models connect actions and effects at +semantic and fine-grained levels. We observe that different formulations +produce representations capturing intuitive action properties. We also design +various baseline models for action selection and action specification. Despite +the intuitive nature of the task, we observe that models struggle, and humans +outperform them by a large margin. The study aims to establish a foundation for +future efforts, showcasing the flexibility and versatility of connecting +actions and effects in video understanding, with the hope of inspiring advanced +formulations and models. + +
+
+
+
+
+ + ☆ Estimation of AMOC transition probabilities using a machine learning + based rare-event algorithm + + +
+ The Atlantic Meridional Overturning Circulation (AMOC) is an important
+component of the global climate, known to be a tipping element, as it could
+collapse under global warming. The main objective of this study is to compute
+the probability that the AMOC collapses within a specified time window, using a
+rare-event algorithm called Trajectory-Adaptive Multilevel Splitting (TAMS).
+However, the efficiency and accuracy of TAMS depend on the choice of the score
+function. Although the definition of the optimal score function, called the
+"committor function", is known, it is impossible in general to compute it a
+priori. Here, we combine TAMS with a Next-Generation Reservoir Computing
+technique that estimates the committor function from the data generated by the
+rare-event algorithm. We test this technique in a stochastic box model of the
+AMOC for which two types of transition exist, the so-called F(ast)-transitions
+and S(low)-transitions. Results for the F-transitions compare favorably with
+those in the literature where a physically-informed score function was used. We
+show that coupling a rare-event algorithm with machine learning allows for a
+correct estimation of transition probabilities, transition times, and even
+transition paths for a wide range of model parameters. We then extend these
+results to the more difficult problem of S-transitions in the same model. In
+both cases of F- and S-transitions, we also show how the Next-Generation
+Reservoir Computing technique can be interpreted to retrieve an analytical
+estimate of the committor function.
+
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ Novel Representation Learning Technique using Graphs for Performance + Analytics ICML + + +
+ The performance analytics domain in High Performance Computing (HPC) uses
+tabular data to solve regression problems, such as predicting the execution
+time. Existing Machine Learning (ML) techniques leverage the correlations among
+features in tabular datasets, but do not leverage the relationships between
+samples directly. Moreover, since high-quality embeddings from raw features
+improve the fidelity of the downstream predictive models, existing methods rely
+on extensive feature engineering and pre-processing steps, costing time and
+manual effort. To fill these two gaps, we propose a novel idea of transforming
+tabular performance data into graphs to leverage the advancement of Graph
+Neural Network-based (GNN) techniques in capturing complex relationships
+between features and samples. In contrast to other ML application domains, such
+as social networks, the graph is not given; instead, we need to build it. To
+address this gap, we propose graph-building methods where nodes represent
+samples, and the edges are automatically inferred iteratively based on the
+similarity between the features in the samples. We evaluate the effectiveness
+of the generated embeddings from GNNs based on how well they make even a simple
+feed-forward neural network perform for regression tasks compared to other
+state-of-the-art representation learning techniques. Our evaluation
+demonstrates that even with up to 25% random missing values for each dataset,
+our method outperforms commonly used graph and Deep Neural Network (DNN)-based
+approaches, achieving up to 61.67% and 78.56% improvements in MSE loss over the
+DNN baseline for the HPC dataset and the Machine Learning datasets,
+respectively.
+
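The abstract only states that nodes are samples and that edges are inferred iteratively from feature similarity, without giving the rule, so here is a minimal, non-iterative sketch of that starting point: a k-nearest-neighbour graph over the rows of a tabular dataset, in the edge-index format GNN libraries typically expect. The value of k and the feature matrix are placeholders.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def tabular_to_graph(X, k=5):
    """Turn a tabular dataset (n_samples x n_features) into an edge list where
    each sample is connected to its k most similar samples in feature space."""
    nn = NearestNeighbors(n_neighbors=k + 1).fit(X)   # +1: each point is its own neighbour
    _, idx = nn.kneighbors(X)
    edges = [(i, j) for i, row in enumerate(idx) for j in row[1:]]
    return np.array(edges).T                          # shape (2, n_edges), GNN-style edge index

X = np.random.rand(100, 16)                           # stand-in for HPC performance features
edge_index = tabular_to_graph(X, k=5)
print(edge_index.shape)                               # (2, 500)
```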
+
+ comment: This paper has been accepted at 22nd International Conference on + Machine Learning and Applications (ICMLA2023) +
+
+
+
+
+ + ☆ Deep Reinforcement Learning Empowered Activity-Aware Dynamic Health + Monitoring Systems + + +
+ In smart healthcare, health monitoring utilizes diverse tools and
+technologies to analyze patients' real-time biosignal data, enabling immediate
+actions and interventions. Existing monitoring approaches were designed on the
+premise that medical devices track several health metrics concurrently,
+tailored to their designated functional scope. This means that they report all
+relevant health values within that scope, which can result in excess resource
+use and the gathering of extraneous data due to monitoring irrelevant health
+metrics. In this context, we propose the Dynamic Activity-Aware Health
+Monitoring strategy (DActAHM), a novel framework based on Deep Reinforcement
+Learning (DRL) and the SlowFast model that strikes a balance between optimal
+monitoring performance and cost efficiency by ensuring precise monitoring based
+on users' activities. Specifically, with the SlowFast model, DActAHM
+efficiently identifies individual activities and captures these results for
+enhanced processing. Subsequently, DActAHM refines health metric monitoring in
+response to the identified activity by incorporating a DRL framework. Extensive
+experiments comparing DActAHM against three state-of-the-art approaches
+demonstrate that it achieves a 27.3% higher gain than the best-performing
+baseline, which fixes monitoring actions over the timeline.
+
+
+
+
+
+ + ☆ Early alignment in two-layer networks training is a two-edged sword + + +
+ Training neural networks with first order optimisation methods is at the core
+of the empirical success of deep learning. The scale of initialisation is a
+crucial factor, as small initialisations are generally associated with a
+feature learning regime, for which gradient descent is implicitly biased
+towards simple solutions. This work provides a general and quantitative
+description of the early alignment phase, originally introduced by Maennel et
+al. (2018). For small initialisations and networks with one hidden ReLU layer,
+the early stage of the training dynamics leads to an alignment of the neurons
+towards key directions. This alignment induces a sparse representation of the
+network, which is directly related to the implicit bias of gradient flow at
+convergence. This sparsity-inducing alignment, however, comes at the expense of
+difficulties in minimising the training objective: we also provide a simple
+data example for which overparameterised networks fail to converge towards
+global minima and only converge to a spurious stationary point instead.
+
+
+
+
+
+ + ☆ Measuring the Impact of Scene Level Objects on Object Detection: Towards + Quantitative Explanations of Detection Decisions + + +
+ Although accuracy and other common metrics can provide a useful window into
+the performance of an object detection model, they lack a deeper view of the
+model's decision process. Regardless of the quality of the training data and
+process, the features that an object detection model learns cannot be
+guaranteed. A model may learn a relationship between certain background
+context, i.e., scene level objects, and the presence of the labeled classes.
+Furthermore, standard performance verification and metrics would not identify
+this phenomenon. This paper presents a new black box explainability method for
+additional verification of object detection models by finding the impact of
+scene level objects on the identification of the objects within the image. By
+comparing the accuracies of a model on test data with and without certain scene
+level objects, the contributions of these objects to the model's performance
+become clearer. The experiment presented here will assess the impact of
+buildings and people in image context on the detection of emergency road
+vehicles by a fine-tuned YOLOv8 model. A large increase in accuracy in the
+presence of a scene level object will indicate the model's reliance on that
+object to make its detections. The results of this research provide a
+quantitative explanation of the object detection model's decision process,
+enabling a deeper understanding of the model's performance.
+
+
+ comment: 9 pages, 4 figures, 1 table +
+
+
+
+
+ + ☆ Medusa: Simple LLM Inference Acceleration Framework with Multiple + Decoding Heads + + +
+ The inference process in Large Language Models (LLMs) is often limited due to +the absence of parallelism in the auto-regressive decoding process, resulting +in most operations being restricted by the memory bandwidth of accelerators. +While methods such as speculative decoding have been suggested to address this +issue, their implementation is impeded by the challenges associated with +acquiring and maintaining a separate draft model. In this paper, we present +Medusa, an efficient method that augments LLM inference by adding extra +decoding heads to predict multiple subsequent tokens in parallel. Using a +tree-based attention mechanism, Medusa constructs multiple candidate +continuations and verifies them simultaneously in each decoding step. By +leveraging parallel processing, Medusa introduces only minimal overhead in +terms of single-step latency while substantially reducing the number of +decoding steps required. + We present two levels of fine-tuning procedures for Medusa to meet the needs +of different use cases: Medusa-1: Medusa is directly fine-tuned on top of a +frozen backbone LLM, enabling lossless inference acceleration. Medusa-2: Medusa +is fine-tuned together with the backbone LLM, enabling better prediction +accuracy of Medusa heads and higher speedup but needing a special training +recipe that preserves the backbone model's capabilities. + Moreover, we propose several extensions that improve or expand the utility of +Medusa, including a self-distillation to handle situations where no training +data is available and a typical acceptance scheme to boost the acceptance rate +while maintaining generation quality. We evaluate Medusa on models of various +sizes and training procedures. Our experiments demonstrate that Medusa-1 can +achieve over 2.2x speedup without compromising generation quality, while +Medusa-2 further improves the speedup to 2.3-3.6x. + +
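As a rough illustration of the core mechanism, extra decoding heads that predict several subsequent tokens from the backbone's last hidden state, here is a minimal sketch. It omits the tree-based attention and the verification/acceptance step described in the abstract, and the head architecture (a residual block feeding an LM-style projection) is an assumption; the actual implementation is in the linked repository.

```python
import torch
import torch.nn as nn

class MedusaStyleHeads(nn.Module):
    """Illustrative only: K small heads that each predict the token k steps ahead
    from the backbone's last hidden state."""
    def __init__(self, hidden_size, vocab_size, num_heads=4):
        super().__init__()
        self.proj = nn.ModuleList(nn.Linear(hidden_size, hidden_size) for _ in range(num_heads))
        self.lm = nn.ModuleList(nn.Linear(hidden_size, vocab_size, bias=False) for _ in range(num_heads))

    def forward(self, h):                  # h: (batch, hidden) last hidden state
        logits = []
        for proj, lm in zip(self.proj, self.lm):
            logits.append(lm(h + torch.relu(proj(h))))   # head k -> logits for token t+k+1
        return torch.stack(logits, dim=1)  # (batch, num_heads, vocab)

heads = MedusaStyleHeads(hidden_size=512, vocab_size=32000)
h = torch.randn(2, 512)
candidate_logits = heads(h)
print(candidate_logits.shape)              # torch.Size([2, 4, 32000])
```

At generation time the backbone's own head proposes the next token while these heads propose drafts for the following positions; the candidate continuations are then verified together in a single decoding step.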
+
+ comment: The code for this implementation is available at + https://github.com/FasterDecoding/Medusa +
+
+
+
+
+ + ☆ Starlit: Privacy-Preserving Federated Learning to Enhance Financial + Fraud Detection + + +
+ Federated Learning (FL) is a data-minimization approach enabling +collaborative model training across diverse clients with local data, avoiding +direct data exchange. However, state-of-the-art FL solutions to identify +fraudulent financial transactions exhibit a subset of the following +limitations. They (1) lack a formal security definition and proof, (2) assume +prior freezing of suspicious customers' accounts by financial institutions +(limiting the solutions' adoption), (3) scale poorly, involving either $O(n^2)$ +computationally expensive modular exponentiation (where $n$ is the total number +of financial institutions) or highly inefficient fully homomorphic encryption, +(4) assume the parties have already completed the identity alignment phase, +hence excluding it from the implementation, performance evaluation, and +security analysis, and (5) struggle to resist clients' dropouts. This work +introduces Starlit, a novel scalable privacy-preserving FL mechanism that +overcomes these limitations. It has various applications, such as enhancing +financial fraud detection, mitigating terrorism, and enhancing digital health. +We implemented Starlit and conducted a thorough performance analysis using +synthetic data from a key player in global financial transactions. The +evaluation indicates Starlit's scalability, efficiency, and accuracy. + +
+
+
+
+
+ + ☆ Data Augmentation for Traffic Classification + + +
+ Data Augmentation (DA) -- enriching training data by adding synthetic samples
+-- is a technique widely adopted in Computer Vision (CV) and Natural Language
+Processing (NLP) tasks to improve model performance. Yet, DA has struggled to
+gain traction in networking contexts, particularly in Traffic Classification
+(TC) tasks. In this work, we fill this gap by benchmarking 18 augmentation
+functions applied to 3 TC datasets using packet time series as input
+representation and considering a variety of training conditions. Our results
+show that (i) DA can reap benefits previously unexplored, with (ii)
+augmentations acting on time series sequence order and masking being better
+suited for TC, and (iii) simple latent space analysis can provide hints about
+why augmentations have positive or negative effects.
+
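The 18 augmentation functions themselves are not listed in the abstract, so the snippet below only sketches the two families it calls out as most effective for TC, masking and sequence-order perturbation, applied to a toy packet-size time series. Window sizes and masking rates are arbitrary placeholder values.

```python
import numpy as np

rng = np.random.default_rng(42)

def mask_augment(series, p=0.1):
    """Randomly zero out a fraction p of the packet-level values."""
    out = series.copy()
    out[rng.random(len(out)) < p] = 0.0
    return out

def order_augment(series, window=5):
    """Shuffle packets locally within small windows, perturbing sequence order
    while keeping the coarse temporal structure."""
    out = series.copy()
    for start in range(0, len(out), window):
        rng.shuffle(out[start:start + window])   # shuffles the view in place
    return out

pkt_sizes = rng.integers(60, 1500, size=20).astype(float)   # toy packet-size time series
print(mask_augment(pkt_sizes))
print(order_augment(pkt_sizes))
```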
+
+ comment: to appear at Passive and Active Measurements (PAM), 2024 +
+
+
+
+
+ + ☆ BoolGebra: Attributed Graph-learning for Boolean Algebraic Manipulation DATE 2024 + + +
+ Boolean algebraic manipulation is at the core of logic synthesis in the
+Electronic Design Automation (EDA) design flow. Existing methods struggle to
+fully exploit optimization opportunities, and often suffer from an explosive
+search space and limited scalability and efficiency. This work presents
+BoolGebra, a novel attributed graph-learning approach for Boolean algebraic
+manipulation that aims to improve fundamental logic synthesis. BoolGebra
+incorporates Graph Neural Networks (GNNs) and takes initial feature embeddings
+from both structural and functional information as inputs. A fully connected
+neural network is employed as the predictor for direct optimization result
+predictions, significantly reducing the search space and efficiently locating
+the optimization space. The experiments involve training the BoolGebra model
+and performing design-specific and cross-design inferences with the trained
+model, where BoolGebra demonstrates generalizability for cross-design inference
+and its potential to scale from small, simple training datasets to large,
+complex inference datasets. Finally, BoolGebra is integrated with the existing
+synthesis tool ABC to perform end-to-end logic minimization evaluation against
+SOTA baselines.
+
+
+ comment: DATE 2024 extended version. arXiv admin note: text overlap with + arXiv:2310.07846 +
+
+
+
+
+ + ☆ A Systematic Evaluation of Euclidean Alignment with Deep Learning for + EEG Decoding + + +
+ Electroencephalography (EEG) signals are frequently used for various +Brain-Computer Interface (BCI) tasks. While Deep Learning (DL) techniques have +shown promising results, they are hindered by the substantial data +requirements. By leveraging data from multiple subjects, transfer learning +enables more effective training of DL models. A technique that is gaining +popularity is Euclidean Alignment (EA) due to its ease of use, low +computational complexity, and compatibility with Deep Learning models. However, +few studies evaluate its impact on the training performance of shared and +individual DL models. In this work, we systematically evaluate the effect of EA +combined with DL for decoding BCI signals. We used EA to train shared models +with data from multiple subjects and evaluated its transferability to new +subjects. Our experimental results show that it improves decoding in the target +subject by 4.33% and decreases convergence time by more than 70%. We also +trained individual models for each subject to use as a majority-voting ensemble +classifier. In this scenario, using EA improved the 3-model ensemble accuracy +by 3.7%. However, when compared to the shared model with EA, the ensemble +accuracy was 3.62% lower. + +
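Euclidean Alignment itself is a short, well-documented procedure: each subject's trials are whitened by the inverse square root of that subject's mean spatial covariance matrix, so that aligned data from different subjects share a common (identity) reference. The sketch below assumes trials shaped (n_trials, n_channels, n_samples) and toy random data; it is meant only to make the transformation concrete, not to reproduce the paper's training pipeline.

```python
import numpy as np

def euclidean_alignment(trials):
    """Align EEG trials by whitening with the inverse square root of the
    subject's mean spatial covariance matrix (Euclidean Alignment)."""
    covs = np.array([t @ t.T / t.shape[1] for t in trials])   # per-trial covariance
    R = covs.mean(axis=0)                                     # reference matrix
    vals, vecs = np.linalg.eigh(R)
    R_inv_sqrt = vecs @ np.diag(1.0 / np.sqrt(vals)) @ vecs.T
    return np.array([R_inv_sqrt @ t for t in trials])

trials = np.random.randn(50, 22, 256)          # 50 trials, 22 channels, 256 samples
aligned = euclidean_alignment(trials)
mean_cov = np.mean([t @ t.T / t.shape[1] for t in aligned], axis=0)
print(np.allclose(mean_cov, np.eye(22), atol=1e-6))   # mean covariance becomes identity
```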
+
+ comment: 14 pages and 10 figures +
+
+
+
+
+ + ☆ Empowering Aggregators with Practical Data-Driven Tools: Harnessing + Aggregated and Disaggregated Flexibility for Demand Response + + +
+ This study explores the crucial interplay between aggregators and building
+occupants in activating flexibility through Demand Response (DR) programs, with
+a keen focus on achieving robust decarbonization and fortifying the resilience
+of the energy system amidst the uncertainties presented by Renewable Energy
+Sources (RES). Firstly, it introduces a methodology for optimizing aggregated
+flexibility provision strategies in environments with limited data, utilizing
+Discrete Fourier Transformation (DFT) and clustering techniques to identify
+building occupants' activity patterns. Secondly, the study assesses the
+disaggregated flexibility provision of Heating, Ventilation and Air
+Conditioning (HVAC) systems during DR events, employing machine learning and
+optimization techniques for precise, device-level analysis. The first approach
+offers a non-intrusive pathway for aggregators to provide flexibility services
+in environments where a single smart meter covers the whole building's
+consumption, while the second approach carefully considers building occupants'
+thermal comfort profiles and maximizes flexibility when dedicated smart meters
+for the HVAC systems exist. Through the application of data-driven techniques
+and encompassing case studies from both industrial and residential buildings,
+this paper not only unveils pivotal opportunities for aggregators in the
+balancing and emerging flexibility markets but also successfully develops
+end-to-end practical tools for aggregators. Furthermore, the efficacy of this
+tool is validated through detailed case studies, substantiating its operational
+capability and contributing to the evolution of a resilient and efficient
+energy system.
+
+
+
+
+
+ + ☆ Real-Time Zero-Day Intrusion Detection System for Automotive Controller + Area Network on FPGAs + + +
+ Increasing automation in vehicles enabled by increased connectivity to the +outside world has exposed vulnerabilities in previously siloed automotive +networks like controller area networks (CAN). Attributes of CAN such as +broadcast-based communication among electronic control units (ECUs) that +lowered deployment costs are now being exploited to carry out active injection +attacks like denial of service (DoS), fuzzing, and spoofing attacks. Research +literature has proposed multiple supervised machine learning models deployed as +Intrusion detection systems (IDSs) to detect such malicious activity; however, +these are largely limited to identifying previously known attack vectors. With +the ever-increasing complexity of active injection attacks, detecting zero-day +(novel) attacks in these networks in real-time (to prevent propagation) becomes +a problem of particular interest. This paper presents an +unsupervised-learning-based convolutional autoencoder architecture for +detecting zero-day attacks, which is trained only on benign (attack-free) CAN +messages. We quantise the model using Vitis-AI tools from AMD/Xilinx targeting +a resource-constrained Zynq Ultrascale platform as our IDS-ECU system for +integration. The proposed model successfully achieves equal or higher +classification accuracy (> 99.5%) on unseen DoS, fuzzing, and spoofing attacks +from a publicly available attack dataset when compared to the state-of-the-art +unsupervised learning-based IDSs. Additionally, by cleverly overlapping IDS +operation on a window of CAN messages with the reception, the model is able to +meet line-rate detection (0.43 ms per window) of high-speed CAN, which when +coupled with the low energy consumption per inference, makes this architecture +ideally suited for detecting zero-day attacks on critical CAN networks. + +
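The abstract describes the recipe at a high level, an unsupervised convolutional autoencoder trained only on benign CAN message windows with anomalies flagged by reconstruction error, but not the exact architecture, window encoding, or quantisation flow. The sketch below is a generic PyTorch rendering of that recipe with made-up window dimensions and a simple mean-plus-three-sigma threshold; it says nothing about the FPGA deployment.

```python
import torch
import torch.nn as nn

class ConvAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        # Input: a window of CAN messages encoded as a 1 x 16 x 16 "image".
        self.enc = nn.Sequential(nn.Conv2d(1, 8, 3, stride=2, padding=1), nn.ReLU(),
                                 nn.Conv2d(8, 16, 3, stride=2, padding=1), nn.ReLU())
        self.dec = nn.Sequential(nn.ConvTranspose2d(16, 8, 2, stride=2), nn.ReLU(),
                                 nn.ConvTranspose2d(8, 1, 2, stride=2), nn.Sigmoid())

    def forward(self, x):
        return self.dec(self.enc(x))

model = ConvAutoencoder()
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
benign = torch.rand(256, 1, 16, 16)                 # stand-in for benign CAN windows

for _ in range(5):                                  # train on benign traffic only
    loss = nn.functional.mse_loss(model(benign), benign)
    opt.zero_grad(); loss.backward(); opt.step()

with torch.no_grad():
    errs = ((model(benign) - benign) ** 2).mean(dim=(1, 2, 3))
    threshold = errs.mean() + 3 * errs.std()        # calibrated on benign data only
    test = torch.rand(8, 1, 16, 16)
    flags = ((model(test) - test) ** 2).mean(dim=(1, 2, 3)) > threshold
print(flags)                                        # True = flagged as (possible zero-day) anomaly
```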
+
+ comment: 8 pages, 6 figures, 7 tables +
+
+
+
+
+ + ☆ Generative Model for Constructing Reaction Path from Initial to Final + States + + +
+ Mapping out reaction pathways and their corresponding activation barriers is
+a significant aspect of molecular simulation. Given their inherent complexity
+and nonlinearity, even generating an initial guess of these paths remains a
+challenging problem. Presented in this paper is an innovative approach that
+utilizes neural networks to generate initial guesses for these reaction
+pathways. The proposed method is initiated by inputting the coordinates of the
+initial state, followed by progressive alterations to its structure. This
+iterative process culminates in the generation of an approximate representation
+of the reaction path and the coordinates of the final state. The application of
+this method extends to complex reaction pathways, as illustrated by organic
+reactions. Training was executed on the Transition1x dataset, an organic
+reaction pathway dataset. The results revealed the generation of reactions that
+bore substantial similarities to the corresponding test data. The method's
+flexibility allows for reactions to be generated either to conform to
+predetermined conditions or in a randomized manner.
+
+
+
+
+
+ + ☆ Classification with neural networks with quadratic decision functions + + +
+ Neural networks with quadratic decision functions have been introduced as
+alternatives to standard neural networks with affine linear ones. They are
+advantageous when the objects to be identified are of compact basic geometries
+like circles, ellipses, etc. In this paper we investigate the use of such
+ansatz functions for classification. In particular, we test and compare the
+algorithm on the MNIST dataset for classification of handwritten digits and for
+classification of subspecies. We also show that the implementation can be based
+on the neural network structures in the software packages TensorFlow and Keras,
+respectively.
+
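The abstract does not specify how the quadratic units are parameterized, so the layer below is only one plausible Keras realization: each unit computes x^T A x + w^T x + b instead of the affine-linear w^T x + b of a standard Dense layer. The layer name, the per-unit full matrix A, and the small MNIST-style model around it are illustrative assumptions.

```python
import tensorflow as tf

class QuadraticDense(tf.keras.layers.Layer):
    """Each unit computes x^T A x + w^T x + b (a quadratic decision function)."""
    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        d = int(input_shape[-1])
        self.A = self.add_weight(shape=(self.units, d, d), initializer="glorot_uniform", name="A")
        self.w = self.add_weight(shape=(d, self.units), initializer="glorot_uniform", name="w")
        self.b = self.add_weight(shape=(self.units,), initializer="zeros", name="b")

    def call(self, x):
        quad = tf.einsum("bi,uij,bj->bu", x, self.A, x)    # x^T A_u x for every unit u
        return quad + x @ self.w + self.b

model = tf.keras.Sequential([
    tf.keras.Input(shape=(28, 28)),
    tf.keras.layers.Flatten(),
    QuadraticDense(32), tf.keras.layers.ReLU(),
    tf.keras.layers.Dense(10, activation="softmax"),
])
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()
```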
+
+
+
+
+ + ☆ Safe Offline Reinforcement Learning with Feasibility-Guided Diffusion + Model ICLR 2024 + + +
+ Safe offline RL is a promising way to bypass risky online interactions
+towards safe policy learning. Most existing methods only enforce soft
+constraints, i.e., constraining expected safety violations below predetermined
+thresholds. This can lead to potentially unsafe outcomes and is thus
+unacceptable in safety-critical scenarios. An alternative is to enforce the
+hard constraint of zero violation. However, this can be challenging in the
+offline setting, as it needs to strike the right balance among three highly
+intricate and correlated aspects: safety constraint satisfaction, reward
+maximization, and behavior regularization imposed by offline datasets.
+Interestingly, we discover that via reachability analysis of safe-control
+theory, the hard safety constraint can be equivalently translated to
+identifying the largest feasible region given the offline dataset. This
+seamlessly converts the original three-part problem into a
+feasibility-dependent objective, i.e., maximizing reward value within the
+feasible region while minimizing safety risks in the infeasible region.
+Inspired by these findings, we propose FISOR (FeasIbility-guided Safe Offline
+RL), which allows safety constraint adherence, reward maximization, and offline
+policy learning to be realized via three decoupled processes, while offering
+strong safety performance and stability. In FISOR, the optimal policy for the
+translated optimization problem can be derived in a special form of weighted
+behavior cloning. Thus, we propose a novel energy-guided diffusion model that
+does not require training a complicated time-dependent classifier to extract
+the policy, greatly simplifying the training. We compare FISOR against
+baselines on the DSRL benchmark for safe offline RL. Evaluation results show
+that FISOR is the only method that can guarantee safety satisfaction in all
+tasks, while achieving top returns in most tasks.
+
+
+ comment: ICLR 2024, 30pages, 11 figures +
+
+
+
+
+ + ☆ Beyond RMSE and MAE: Introducing EAUC to unmask hidden bias and + unfairness in dyadic regression models + + +
+ Dyadic regression models, which predict real-valued outcomes for pairs of
+entities, are fundamental in many domains (e.g. predicting the rating a user
+gives to a product in Recommender Systems) and promising, though still
+under-explored, in many others (e.g. approximating the adequate dosage of a
+drug for a patient in personalized pharmacology). In this work, we demonstrate
+that non-uniformity in the observed value distributions of individual entities
+leads to severely biased predictions in state-of-the-art models, skewing
+predictions towards the average of observed past values for the entity and
+providing worse-than-random predictive power in eccentric yet equally important
+cases. We show that the usage of global error metrics like Root Mean Squared
+Error (RMSE) and Mean Absolute Error (MAE) is insufficient to capture this
+phenomenon, which we name eccentricity bias, and we introduce Eccentricity-Area
+Under the Curve (EAUC) as a new complementary metric that can quantify it in
+all studied models and datasets. We also prove the adequacy of EAUC by using
+naive de-biasing corrections to demonstrate that a lower model bias correlates
+with a lower EAUC and vice-versa. This work contributes a bias-aware evaluation
+of dyadic regression models to avoid potential unfairness and risks in critical
+real-world applications of such systems.
+
+
+
+
+
+ + ☆ A Lightweight Multi-Attack CAN Intrusion Detection System on Hybrid + FPGAs + + +
+ Rising connectivity in vehicles is enabling new capabilities like connected
+autonomous driving and advanced driver assistance systems (ADAS) for improving
+the safety and reliability of next-generation vehicles. This increased access
+to in-vehicle functions compromises critical capabilities that use legacy
+in-vehicle networks like Controller Area Network (CAN), which has no inherent
+security or authentication mechanism. Intrusion detection and mitigation
+approaches, particularly using machine learning models, have shown promising
+results in detecting multiple attack vectors in CAN through their ability to
+generalise to new vectors. However, most deployments require dedicated
+computing units like GPUs to perform line-rate detection, consuming much higher
+power. In this paper, we present a lightweight multi-attack quantised machine
+learning model that is deployed using Xilinx's Deep Learning Processing Unit IP
+on a Zynq Ultrascale+ (XCZU3EG) FPGA, which is trained and validated using the
+public CAN Intrusion Detection dataset. The quantised model detects denial of
+service and fuzzing attacks with an accuracy of above 99% and a false positive
+rate of 0.07%, which are comparable to the state-of-the-art techniques in the
+literature. The Intrusion Detection System (IDS) execution consumes just 2.0 W
+with software tasks running on the ECU and achieves a 25% reduction in
+per-message processing latency over the state-of-the-art implementations. This
+deployment allows the ECU function to coexist with the IDS with minimal changes
+to the tasks, making it ideal for real-time IDS in in-vehicle systems.
+
+
+ comment: 5 pages, 2 figures, 6 tables +
+
+
+
+
+ + ☆ Manipulating Sparse Double Descent + + +
+ This paper investigates the double descent phenomenon in two-layer neural +networks, focusing on the role of L1 regularization and representation +dimensions. It explores an alternative double descent phenomenon, named sparse +double descent. The study emphasizes the complex relationship between model +complexity, sparsity, and generalization, and suggests further research into +more diverse models and datasets. The findings contribute to a deeper +understanding of neural network training and optimization. + +
+
+
+
+
+ + ☆ Towards End-to-End GPS Localization with Neural Pseudorange Correction + + +
+ Pseudorange errors are the root cause of localization inaccuracy in GPS. +Previous data-driven methods regress and eliminate pseudorange errors using +handcrafted intermediate labels. Unlike them, we propose an end-to-end GPS +localization framework, E2E-PrNet, to train a neural network for pseudorange +correction (PrNet) directly using the final task loss calculated with the +ground truth of GPS receiver states. The gradients of the loss with respect to +learnable parameters are backpropagated through a differentiable nonlinear +least squares optimizer to PrNet. The feasibility is verified with GPS data +collected by Android phones, showing that E2E-PrNet outperforms the +state-of-the-art end-to-end GPS localization methods. + +
+
+
+
+
+ + ☆ Deep Learning-based Embedded Intrusion Detection System for Automotive + CAN + + +
+ Rising complexity of in-vehicle electronics is enabling new capabilities like
+autonomous driving and active safety. However, rising automation also increases
+the risk of security threats, which is compounded by the lack of built-in
+security measures in legacy networks like CAN, allowing attackers to observe,
+tamper with, and modify information shared over such broadcast networks.
+Various intrusion detection approaches have been proposed to detect and tackle
+such threats, with machine learning models proving highly effective. However,
+deploying machine learning models requires high processing power through
+high-end processors or GPUs to run them close to line rate. In this paper, we
+propose a hybrid FPGA-based ECU approach that can transparently integrate IDS
+functionality through a dedicated off-the-shelf hardware accelerator that
+implements a deep-CNN intrusion detection model. Our results show that the
+proposed approach provides an average accuracy of over 99% across multiple
+attack datasets with 0.64% false detection rates while consuming 94% less
+energy and achieving a 51.8% reduction in per-message processing latency when
+compared to IDS implementations on GPUs.
+
+
+ comment: 5 pages, 1 figure, 8 tables +
+
+
+
+
+ + ☆ FIMBA: Evaluating the Robustness of AI in Genomics via Feature + Importance Adversarial Attacks + + +
+ With the steady rise of the use of AI in bio-technical applications and the
+widespread adoption of genomics sequencing, an increasing number of AI-based
+algorithms and tools are entering the research and production stage, affecting
+critical decision-making streams like drug discovery and clinical outcomes.
+This paper demonstrates the vulnerability of AI models often utilized in
+downstream tasks on recognized public genomics datasets. We undermine model
+robustness by deploying an attack that focuses on input transformation while
+mimicking the real data and confusing the model decision-making, ultimately
+yielding a pronounced deterioration in model performance. Further, we enhance
+our approach by generating poisoned data using a variational autoencoder-based
+model. Our empirical findings unequivocally demonstrate a decline in model
+performance, underscored by diminished accuracy and an upswing in false
+positives and false negatives. Furthermore, we analyze the resulting
+adversarial samples via spectral analysis, yielding conclusions for
+countermeasures against such attacks.
+
+
+ comment: 15 pages, core code available at: + https://github.com/HeorhiiS/fimba-attack +
+
+
+
+
+ + ☆ Attentive Fusion: A Transformer-based Approach to Multimodal Hate Speech + Detection + + +
+ With the recent surge and exponential growth of social media usage,
+scrutinizing social media content for the presence of any hateful content is of
+utmost importance. Researchers have been diligently working for the past decade
+on distinguishing between content that promotes hatred and content that does
+not. Traditionally, the main focus has been on analyzing textual content.
+However, recent research attempts have also commenced on the identification of
+audio-based content. Nevertheless, studies have shown that relying solely on
+audio or text-based content may be ineffective, as a recent upsurge indicates
+that individuals often employ sarcasm in their speech and writing. To overcome
+these challenges, we present an approach to identify whether a speech promotes
+hate or not utilizing both audio and textual representations. Our methodology
+is based on the Transformer framework that incorporates both audio and text
+sampling, accompanied by our own layer called "Attentive Fusion". The results
+of our study surpassed previous state-of-the-art techniques, achieving an
+impressive macro F1 score of 0.927 on the Test Set.
+
+
+ comment: Accepted in 20th International Conference on Natural Language + Processing (ICON) +
+
+
+
+
+ + ☆ AutoChunk: Automated Activation Chunk for Memory-Efficient Long Sequence + Inference ICLR 2024 + + +
+ Large deep learning models have achieved impressive performance across a
+range of applications. However, their large memory requirements, including
+parameter memory and activation memory, have become a significant challenge for
+their practical serving. While existing methods mainly address parameter
+memory, the importance of activation memory has been overlooked. Especially for
+long input sequences, activation memory is expected to experience significant
+exponential growth as the length of sequences increases. In this work, we
+propose AutoChunk, an automatic and adaptive compiler system that efficiently
+reduces activation memory for long sequence inference by chunk strategies. The
+proposed system generates chunk plans by optimizing through multiple stages. In
+each stage, the chunk search pass explores all possible chunk candidates and
+the chunk selection pass identifies the optimal one. At runtime, AutoChunk
+employs code generation to automatically apply chunk strategies. The
+experiments demonstrate that AutoChunk can reduce over 80% of activation memory
+while maintaining speed loss within 10%, extend max sequence length by 3.2x to
+11.7x, and outperform state-of-the-art methods by a large margin.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Area Modeling using Stay Information for Large-Scale Users and Analysis + for Influence of COVID-19 + + +
+ Understanding how people use areas in a city can be valuable information in
+a wide range of fields, from marketing to urban planning. Area usage is subject
+to change over time due to various events including seasonal shifts and
+pandemics. Before the spread of smartphones, this data had been collected
+through questionnaire surveys. However, this is not a sustainable approach in
+terms of time to results and cost. There are many existing studies on area
+modeling, which characterize an area with some kind of information, using Point
+of Interest (POI) or inter-area movement data. However, since POI data are
+statically tied to space and inter-area movement data ignore the behavior of
+people within an area, existing methods are not sufficient for capturing
+changes in area usage. In this paper, we propose a novel area modeling method
+named Area2Vec, inspired by Word2Vec, which models areas based on people's
+location data. This method is based on the discovery that an area can be
+characterized by its usage through people's stay information in the area, and
+it can reflect dynamically changing behavior within an area in the modeling
+results. We validated Area2Vec by performing a functional classification of
+areas in a district of Japan. The results show that Area2Vec can be used for
+general area analysis. We also investigated area usage changes due to COVID-19
+in two districts in Japan. We found that COVID-19 made people refrain from
+unnecessary outings, such as visiting entertainment areas.
+
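The abstract says areas are embedded from people's stay information, "inspired by Word2Vec", but does not define what plays the role of words and sentences. One plausible reading, sketched below with the gensim Word2Vec implementation, treats each user-day as a "sentence" of discretised stay tokens (area id, time of day, duration bucket), so that areas used in similar ways get nearby embeddings. The tokenisation scheme and the tiny toy corpus are assumptions for illustration only, not the paper's actual formulation.

```python
from gensim.models import Word2Vec

# Each "sentence" is one user-day: a sequence of stay tokens "area_timeofday_duration".
user_days = [
    ["A12_morning_short", "B03_noon_long", "A12_evening_short"],
    ["B03_morning_long", "C07_noon_short", "A12_evening_long"],
    ["C07_morning_short", "C07_noon_long", "B03_evening_short"],
]

model = Word2Vec(sentences=user_days, vector_size=32, window=2,
                 min_count=1, sg=1, epochs=50)

# Stay tokens (and, by extension, areas) that occur in similar usage contexts
# end up close together in the embedding space.
print(model.wv.most_similar("A12_evening_short", topn=3))
```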
+
+ comment: This paper is an English translation of the paper published in the + Transactions of the Information Processing Society of Japan + (http://doi.org/10.20729/00213190) +
+
+
+
+
+ + ☆ Empowering HWNs with Efficient Data Labeling: A Clustered Federated + Semi-Supervised Learning Approach + + +
+ Clustered Federated Multitask Learning (CFL) has gained considerable +attention as an effective strategy for overcoming statistical challenges, +particularly when dealing with non independent and identically distributed (non +IID) data across multiple users. However, much of the existing research on CFL +operates under the unrealistic premise that devices have access to accurate +ground truth labels. This assumption becomes especially problematic in +hierarchical wireless networks (HWNs), where edge networks contain a large +amount of unlabeled data, resulting in slower convergence rates and increased +processing times, particularly when dealing with two layers of model +aggregation. To address these issues, we introduce a novel framework, Clustered +Federated Semi-Supervised Learning (CFSL), designed for more realistic HWN +scenarios. Our approach leverages a best-performing specialized model +algorithm, wherein each device is assigned a specialized model that is highly +adept at generating accurate pseudo-labels for unlabeled data, even when the +data stems from diverse environments. We validate the efficacy of CFSL through +extensive experiments, comparing it with existing methods highlighted in recent +literature. Our numerical results demonstrate that CFSL significantly improves +upon key metrics such as testing accuracy, labeling accuracy, and labeling +latency under varying proportions of labeled and unlabeled data while also +accommodating the non-IID nature of the data and the unique characteristics of +wireless edge networks. + +
+
+ comment: Accepted for IEEE Wireless Communications and Networking Conference + (WCNC) 2024 +
+
+
+
+
+ + ☆ A Comprehensive Survey on Deep-Learning-based Vehicle Re-Identification: + Models, Data Sets and Challenges + + +
+ Vehicle re-identification (ReID) endeavors to associate vehicle images +collected from a distributed network of cameras spanning diverse traffic +environments. This task assumes paramount importance within the spectrum of +vehicle-centric technologies, playing a pivotal role in deploying Intelligent +Transportation Systems (ITS) and advancing smart city initiatives. Rapid +advancements in deep learning have significantly propelled the evolution of +vehicle ReID technologies in recent years. Consequently, undertaking a +comprehensive survey of methodologies centered on deep learning for vehicle +re-identification has become imperative and inescapable. This paper extensively +explores deep learning techniques applied to vehicle ReID. It outlines the +categorization of these methods, encompassing supervised and unsupervised +approaches, delves into existing research within these categories, introduces +datasets and evaluation criteria, and delineates forthcoming challenges and +potential research directions. This comprehensive assessment examines the +landscape of deep learning in vehicle ReID and establishes a foundation and +starting point for future works. It aims to serve as a complete reference by +highlighting challenges and emerging trends, fostering advancements and +applications in vehicle ReID utilizing deep learning models. + +
+
+
+
+
+ + ☆ Towards Universal Unsupervised Anomaly Detection in Medical Imaging + + +
+ The increasing complexity of medical imaging data underscores the need for +advanced anomaly detection methods to automatically identify diverse +pathologies. Current methods face challenges in capturing the broad spectrum of +anomalies, often limiting their use to specific lesion types in brain scans. To +address this challenge, we introduce a novel unsupervised approach, termed +\textit{Reversed Auto-Encoders (RA)}, designed to create realistic +pseudo-healthy reconstructions that enable the detection of a wider range of +pathologies. We evaluate the proposed method across various imaging modalities, +including magnetic resonance imaging (MRI) of the brain, pediatric wrist X-ray, +and chest X-ray, and demonstrate superior performance in detecting anomalies +compared to existing state-of-the-art methods. Our unsupervised anomaly +detection approach may enhance diagnostic accuracy in medical imaging by +identifying a broader range of unknown pathologies. Our code is publicly +available at: \url{https://github.com/ci-ber/RA}. + +
+
+
+
+
+ + ☆ Interventional Fairness on Partially Known Causal Graphs: A Constrained + Optimization Approach ICLR24 + + +
+ Fair machine learning aims to prevent discrimination against individuals or +sub-populations based on sensitive attributes such as gender and race. In +recent years, causal inference methods have been increasingly used in fair +machine learning to measure unfairness by causal effects. However, current +methods assume that the true causal graph is given, which is often not true in +real-world applications. To address this limitation, this paper proposes a +framework for achieving causal fairness based on the notion of interventions +when the true causal graph is partially known. The proposed approach involves +modeling fair prediction using a Partially Directed Acyclic Graph (PDAG), +specifically, a class of causal DAGs that can be learned from observational +data combined with domain knowledge. The PDAG is used to measure causal +fairness, and a constrained optimization problem is formulated to balance +between fairness and accuracy. Results on both simulated and real-world +datasets demonstrate the effectiveness of this method. + +
+
+ comment: Accepted to ICLR24 +
+
+
+
+
+ + ☆ Polytopic Autoencoders with Smooth Clustering for Reduced-order + Modelling of Flows + + +
+ With the advancement of neural networks, there has been a notable increase, +both in terms of quantity and variety, in research publications concerning the +application of autoencoders to reduced-order models. We propose a polytopic +autoencoder architecture that includes a lightweight nonlinear encoder, a +convex combination decoder, and a smooth clustering network. Supported by +several proofs, the model architecture ensures that all reconstructed states +lie within a polytope, accompanied by a metric indicating the quality of the +constructed polytopes, referred to as polytope error. Additionally, it offers a +minimal number of convex coordinates for polytopic linear-parameter varying +systems while achieving acceptable reconstruction errors compared to proper +orthogonal decomposition (POD). To validate our proposed model, we conduct +simulations involving two flow scenarios with the incompressible Navier-Stokes +equation. Numerical results demonstrate the guaranteed properties of the model, +low reconstruction errors compared to POD, and the improvement in error using a +clustering network. + +
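The defining ingredient described in the abstract is the convex-combination decoder: the encoder produces convex coordinates (non-negative and summing to one) and the reconstruction is a convex combination of learned vertices, so every reconstructed state lies inside the polytope those vertices span. The sketch below shows just that mechanism in PyTorch; the paper's lightweight nonlinear encoder and smooth clustering network are not reproduced, and the layer sizes are placeholders.

```python
import torch
import torch.nn as nn

class PolytopicAutoencoder(nn.Module):
    def __init__(self, state_dim, n_vertices):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(state_dim, 64), nn.Tanh(),
                                     nn.Linear(64, n_vertices))
        # Learnable vertices of the polytope, one column per vertex.
        self.vertices = nn.Parameter(torch.randn(state_dim, n_vertices))

    def forward(self, x):
        alpha = torch.softmax(self.encoder(x), dim=-1)   # convex coordinates: alpha >= 0, sum = 1
        x_hat = alpha @ self.vertices.T                  # convex combination of the vertices
        return x_hat, alpha

ae = PolytopicAutoencoder(state_dim=128, n_vertices=10)
x = torch.randn(4, 128)
x_hat, alpha = ae(x)
print(x_hat.shape, alpha.sum(dim=-1))                    # reconstructions lie inside the polytope
```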
+
+ comment: 28 pages, 18 figures +
+
+
+
+
+ + ☆ ZnTrack -- Data as Code + + +
+ The past decade has seen tremendous breakthroughs in computation and there is +no indication that this will slow any time soon. Machine learning, large-scale +computing resources, and increased industry focus have resulted in rising +investments in computer-driven solutions for data management, simulations, and +model generation. However, with this growth in computation has come an even +larger expansion of data and with it, complexity in data storage, sharing, and +tracking. In this work, we introduce ZnTrack, a Python-driven data versioning +tool. ZnTrack builds upon established version control systems to provide a +user-friendly and easy-to-use interface for tracking parameters in experiments, +designing workflows, and storing and sharing data. From this ability to reduce +large datasets to a simple Python script emerges the concept of Data as Code, a +core component of the work presented here and an undoubtedly important concept +as the age of computation continues to evolve. ZnTrack offers an open-source, +FAIR data compatible Python package to enable users to harness these concepts +of the future. + +
+
+ comment: 22 pages, 10 figures, 2MB PDF +
+
+
+
+
+ + ☆ Adversarially Robust Signed Graph Contrastive Learning from Balance + Augmentation + + +
+ Signed graphs consist of edges and signs, which can be separated into +structural information and balance-related information, respectively. Existing +signed graph neural networks (SGNNs) typically rely on balance-related +information to generate embeddings. Nevertheless, the emergence of recent +adversarial attacks has had a detrimental impact on the balance-related +information. Similar to how structure learning can restore unsigned graphs, +balance learning can be applied to signed graphs by improving the balance +degree of the poisoned graph. However, this approach encounters the challenge +"Irreversibility of Balance-related Information" - while the balance degree +improves, the restored edges may not be the ones originally affected by +attacks, resulting in poor defense effectiveness. To address this challenge, we +propose a robust SGNN framework called Balance Augmented-Signed Graph +Contrastive Learning (BA-SGCL), which combines Graph Contrastive Learning +principles with balance augmentation techniques. Experimental results +demonstrate that BA-SGCL not only enhances robustness against existing +adversarial attacks but also achieves superior performance on link sign +prediction task across various datasets. + +
+
+
+
+
+ + ☆ PuriDefense: Randomized Local Implicit Adversarial Purification for + Defending Black-box Query-based Attacks + + +
+ Black-box query-based attacks constitute significant threats to Machine +Learning as a Service (MLaaS) systems since they can generate adversarial +examples without accessing the target model's architecture and parameters. +Traditional defense mechanisms, such as adversarial training, gradient masking, +and input transformations, either impose substantial computational costs or +compromise the test accuracy of non-adversarial inputs. To address these +challenges, we propose an efficient defense mechanism, PuriDefense, that +employs random patch-wise purifications with an ensemble of lightweight +purification models at a low level of inference cost. These models leverage the +local implicit function and rebuild the natural image manifold. Our theoretical +analysis suggests that this approach slows down the convergence of query-based +attacks by incorporating randomness into purifications. Extensive experiments +on CIFAR-10 and ImageNet validate the effectiveness of our proposed +purifier-based defense mechanism, demonstrating significant improvements in +robustness against query-based attacks. + +
+
+
+
+
+ + ☆ Robust Multi-Modal Density Estimation + + +
+ Development of multi-modal, probabilistic prediction models has led to a
+need for comprehensive evaluation metrics. While several metrics can
+characterize the accuracy of machine-learned models (e.g., negative
+log-likelihood, Jensen-Shannon divergence), these metrics typically operate on
+probability densities. Applying them to purely sample-based prediction models
+thus requires that the underlying density function is estimated. However,
+common methods such as kernel density estimation (KDE) have been demonstrated
+to lack robustness, while more complex methods have not been evaluated in
+multi-modal estimation problems. In this paper, we present ROME (RObust
+Multi-modal density Estimator), a non-parametric approach for density
+estimation which addresses the challenge of estimating multi-modal, non-normal,
+and highly correlated distributions. ROME utilizes clustering to segment a
+multi-modal set of samples into multiple uni-modal ones and then combines
+simple KDE estimates obtained for individual clusters into a single multi-modal
+estimate. We compared our approach to state-of-the-art methods for density
+estimation as well as ablations of ROME, showing that it not only outperforms
+established methods but is also more robust to a variety of distributions. Our
+results demonstrate that ROME can overcome the issues of over-fitting and
+over-smoothing exhibited by other estimators, promising a more robust
+evaluation of probabilistic machine learning models.
+
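The abstract gives the overall recipe, cluster the samples, fit a simple KDE per cluster, and recombine, but not the specific clustering algorithm, bandwidth rule, or weighting. The following is therefore a generic sketch of that recipe with assumed choices (KMeans, a fixed bandwidth, mixture weights proportional to cluster size), not ROME itself.

```python
import numpy as np
from scipy.special import logsumexp
from sklearn.cluster import KMeans
from sklearn.neighbors import KernelDensity

def fit_multimodal_kde(samples, n_clusters=2, bandwidth=0.3):
    """Cluster the samples, fit one KDE per cluster, and return a function that
    evaluates the log-density of the resulting mixture (weights = cluster sizes)."""
    labels = KMeans(n_clusters=n_clusters, n_init=10, random_state=0).fit_predict(samples)
    kdes, log_w = [], []
    for c in range(n_clusters):
        members = samples[labels == c]
        kdes.append(KernelDensity(bandwidth=bandwidth).fit(members))
        log_w.append(np.log(len(members) / len(samples)))
    def log_density(x):
        per_cluster = np.stack([lw + kde.score_samples(x) for lw, kde in zip(log_w, kdes)])
        return logsumexp(per_cluster, axis=0)
    return log_density

# Two well-separated modes that a single global KDE bandwidth handles poorly.
rng = np.random.default_rng(1)
samples = np.vstack([rng.normal(-4, 0.5, size=(500, 2)), rng.normal(4, 0.5, size=(500, 2))])
log_p = fit_multimodal_kde(samples, n_clusters=2)
print(log_p(np.array([[-4.0, -4.0], [0.0, 0.0], [4.0, 4.0]])))
```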
+
+
+
+
+ + ☆ OrchMoE: Efficient Multi-Adapter Learning with Task-Skill Synergy + + +
+ We advance the field of Parameter-Efficient Fine-Tuning (PEFT) with our novel +multi-adapter method, OrchMoE, which capitalizes on modular skill architecture +for enhanced forward transfer in neural networks. Unlike prior models that +depend on explicit task identification inputs, OrchMoE automatically discerns +task categories, streamlining the learning process. This is achieved through an +integrated mechanism comprising an Automatic Task Classification module and a +Task-Skill Allocation module, which collectively deduce task-specific +classifications and tailor skill allocation matrices. Our extensive evaluations +on the 'Super Natural Instructions' dataset, featuring 1,600 diverse +instructional tasks, indicate that OrchMoE substantially outperforms comparable +multi-adapter baselines in terms of both performance and sample utilization +efficiency, all while operating within the same parameter constraints. These +findings suggest that OrchMoE offers a significant leap forward in multi-task +learning efficiency. + +
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Unified View Imputation and Feature Selection Learning for Incomplete + Multi-view Data + + +
+ Although multi-view unsupervised feature selection (MUFS) is an effective +technology for reducing dimensionality in machine learning, existing methods +cannot directly deal with incomplete multi-view data where some samples are +missing in certain views. These methods should first apply predetermined values +to impute missing data, then perform feature selection on the complete dataset. +Separating imputation and feature selection processes fails to capitalize on +the potential synergy where local structural information gleaned from feature +selection could guide the imputation, thereby improving the feature selection +performance in turn. Additionally, previous methods only focus on leveraging +samples' local structure information, while ignoring the intrinsic locality of +the feature space. To tackle these problems, a novel MUFS method, called +UNified view Imputation and Feature selectIon lEaRning (UNIFIER), is proposed. +UNIFIER explores the local structure of multi-view data by adaptively learning +similarity-induced graphs from both the sample and feature spaces. Then, +UNIFIER dynamically recovers the missing views, guided by the sample and +feature similarity graphs during the feature selection procedure. Furthermore, +the half-quadratic minimization technique is used to automatically weight +different instances, alleviating the impact of outliers and unreliable restored +data. Comprehensive experimental results demonstrate that UNIFIER outperforms +other state-of-the-art methods. + +
+
+
+
+
+ + ☆ PhoGAD: Graph-based Anomaly Behavior Detection with Persistent Homology + Optimization WSDM 2024 + + +
+ A multitude of toxic online behaviors, ranging from network attacks to +anonymous traffic and spam, have severely disrupted the smooth operation of +networks. Due to the inherent sender-receiver nature of network behaviors, +graph-based frameworks are commonly used for detecting anomalous behaviors. +However, in real-world scenarios, the boundary between normal and anomalous +behaviors tends to be ambiguous. The local heterophily of graphs interferes +with the detection, and existing methods based on nodes or edges introduce +unwanted noise into representation results, thereby impacting the effectiveness +of detection. To address these issues, we propose PhoGAD, a graph-based anomaly +detection framework. PhoGAD leverages persistent homology optimization to +clarify behavioral boundaries. Building upon this, the weights of adjacent +edges are designed to mitigate the effects of local heterophily. Subsequently, +to tackle the noise problem, we conduct a formal analysis and propose a +disentangled representation-based explicit embedding method, ultimately +achieving anomaly behavior detection. Experiments on intrusion, traffic, and +spam datasets verify that PhoGAD has surpassed the performance of +state-of-the-art (SOTA) frameworks in detection efficacy. Notably, PhoGAD +demonstrates robust detection even with diminished anomaly proportions, +highlighting its applicability to real-world scenarios. The analysis of +persistent homology demonstrates its effectiveness in capturing the topological +structure formed by normal edge features. Additionally, ablation experiments +validate the effectiveness of the innovative mechanisms integrated within +PhoGAD. + +
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ☆ I-SplitEE: Image classification in Split Computing DNNs with Early Exits + + +
+ The recent advances in Deep Neural Networks (DNNs) stem from their
+exceptional performance across various domains. However, their inherent large
+size hinders deploying these networks on resource-constrained devices like
+edge, mobile, and IoT platforms. Strategies have emerged, from partial cloud
+computation offloading (split computing) to integrating early exits within DNN
+layers. Our work presents an innovative unified approach merging early exits
+and split computing. We determine the 'splitting layer', the optimal depth in
+the DNN for edge device computations, and whether to perform inference on the
+edge device or offload it to the cloud, considering accuracy, computational
+efficiency, and communication costs. Moreover, image classification faces
+diverse environmental distortions, influenced by factors like time of day,
+lighting, and weather. To adapt to these distortions, we introduce I-SplitEE,
+an online unsupervised algorithm ideal for scenarios lacking ground truths and
+with sequential data. Experimental validation using Caltech-256 and Cifar-10
+datasets subjected to varied distortions showcases I-SplitEE's ability to
+reduce costs by a minimum of 55% with marginal performance degradation of at
+most 5%.
+
+
+ comment: To appear in proceedings of IEEE International Conference on + Communications 2024 +
+
+
+
+
+ + ☆ The "Colonial Impulse" of Natural Language Processing: An Audit of + Bengali Sentiment Analysis Tools and Their Identity-based Biases + + +
+ While colonization has sociohistorically impacted people's identities across +various dimensions, those colonial values and biases continue to be perpetuated +by sociotechnical systems. One category of sociotechnical systems--sentiment +analysis tools--can also perpetuate colonial values and bias, yet less +attention has been paid to how such tools may be complicit in perpetuating +coloniality, although they are often used to guide various practices (e.g., +content moderation). In this paper, we explore potential bias in sentiment +analysis tools in the context of Bengali communities that have experienced and +continue to experience the impacts of colonialism. Drawing on identity +categories most impacted by colonialism amongst local Bengali communities, we +focused our analytic attention on gender, religion, and nationality. We +conducted an algorithmic audit of all sentiment analysis tools for Bengali, +available on the Python package index (PyPI) and GitHub. Despite similar +semantic content and structure, our analyses showed that in addition to +inconsistencies in output from different tools, Bengali sentiment analysis +tools exhibit bias between different identity categories and respond +differently to different ways of identity expression. Connecting our findings +with colonially shaped sociocultural structures of Bengali communities, we +discuss the implications of downstream bias of sentiment analysis tools. + +
+
+
+
+
+ + ☆ Mementos: A Comprehensive Benchmark for Multimodal Large Language Model + Reasoning over Image Sequences + + +
+ Multimodal Large Language Models (MLLMs) have demonstrated proficiency in +handling a variety of visual-language tasks. However, current MLLM benchmarks +are predominantly designed to evaluate reasoning based on static information +about a single image, and the ability of modern MLLMs to extrapolate from image +sequences, which is essential for understanding our ever-changing world, has +been less investigated. To address this challenge, this paper introduces +Mementos, a new benchmark designed to assess MLLMs' sequential image reasoning +abilities. Mementos features 4,761 diverse image sequences with varying +lengths. We also employ a GPT-4 assisted method to evaluate MLLM reasoning +performance. Through a careful evaluation of nine recent MLLMs on Mementos, +including GPT-4V and Gemini, we find that they struggle to accurately describe +dynamic information about given image sequences, often leading to +hallucinations/misrepresentations of objects and their corresponding behaviors. +Our quantitative analysis and case studies identify three key factors impacting +MLLMs' sequential image reasoning: the correlation between object and +behavioral hallucinations, the influence of cooccurring behaviors, and the +compounding impact of behavioral hallucinations. Our dataset is available at +https://github.com/umd-huang-lab/Mementos. + +
+
+ comment: 27 pages, 23 figures +
+
+
+
+
+ + ☆ FARe: Fault-Aware GNN Training on ReRAM-based PIM Accelerators DATE + + +
+ Resistive random-access memory (ReRAM)-based processing-in-memory (PIM)
+architecture is an attractive solution for training Graph Neural Networks
+(GNNs) on edge platforms. However, the immature fabrication process and
+limited write endurance of ReRAMs make them prone to hardware faults, thereby
+limiting their widespread adoption for GNN training. Further, the existing
+fault-tolerant solutions prove inadequate for effectively training GNNs in
+the presence of faults. In this paper, we propose a fault-aware framework
+referred to as FARe that mitigates the effect of faults during GNN training.
+FARe outperforms existing approaches in terms of both accuracy and timing
+overhead. Experimental results demonstrate that the FARe framework can
+restore GNN test accuracy by 47.6% on faulty ReRAM hardware with a ~1% timing
+overhead compared to the fault-free counterpart.
+
+
+
+ comment: This paper has been accepted to the conference DATE (Design, + Automation and Test in Europe) - 2024 +
+
+
+
+
+ + ☆ Spatial-temporal Forecasting for Regions without Observations EDBT2024 + + +
+ Spatial-temporal forecasting plays an important role in many real-world +applications, such as traffic forecasting, air pollutant forecasting, +crowd-flow forecasting, and so on. State-of-the-art spatial-temporal +forecasting models take data-driven approaches and rely heavily on data +availability. Such models suffer from accuracy issues when data is incomplete, +which is common in reality due to the heavy costs of deploying and maintaining +sensors for data collection. A few recent studies attempted to address the +issue of incomplete data. They typically assume some data availability in a +region of interest either for a short period or at a few locations. In this +paper, we further study spatial-temporal forecasting for a region of interest +without any historical observations, to address scenarios such as unbalanced +region development, progressive deployment of sensors or lack of open data. We +propose a model named STSM for the task. The model takes a contrastive +learning-based approach to learn spatial-temporal patterns from adjacent +regions that have recorded data. Our key insight is to learn from the locations +that resemble those in the region of interest, and we propose a selective +masking strategy to enable the learning. As a result, our model outperforms +adapted state-of-the-art models, reducing errors consistently over both traffic +and air pollutant forecasting tasks. The source code is available at +https://github.com/suzy0223/STSM. + +
+
+ comment: Accepted by EDBT2024 +
+
+
+
+
+ + ☆ Episodic Reinforcement Learning with Expanded State-reward Space AAMAS'24 + + +
+ Empowered by deep neural networks, deep reinforcement learning (DRL) has
+demonstrated tremendous empirical successes in various domains, including
+games, health care, and autonomous driving. Despite these advancements, DRL
+remains data-inefficient, as effective policies demand vast numbers of
+environmental samples. Recently, episodic control (EC)-based model-free DRL
+methods have improved sample efficiency by recalling past experiences from
+episodic memory. However, existing EC-based methods suffer from a potential
+misalignment between the state and reward spaces because they neglect the
+information-rich states retrieved from memory, which can cause inaccurate
+value estimation and degraded policy performance. To tackle this issue, we
+introduce an efficient EC-based DRL framework with an expanded state-reward
+space, where both the expanded states used as input and the expanded rewards
+used in training contain historical and current information. Specifically,
+we reuse the historical states retrieved by EC as part of the input states
+and integrate the retrieved MC-returns into the immediate reward of each
+interactive transition. As a result, our method simultaneously makes full
+use of the retrieved information and improves the evaluation of state values
+through a Temporal Difference (TD) loss. Empirical results on challenging
+Box2d and Mujoco tasks demonstrate the superiority of our method over a
+recent sibling method and common baselines. Further, we also verify our
+method's effectiveness in alleviating Q-value overestimation through
+additional Q-value comparison experiments.
+
+
+
+ comment: Accepted at AAMAS'24 +
+
+
+
+
+ + ☆ A match made in consistency heaven: when large language models meet + evolutionary algorithms + + +
+ Pre-trained large language models (LLMs) have powerful capabilities for +generating creative natural text. Evolutionary algorithms (EAs) can discover +diverse solutions to complex real-world problems. Motivated by the common +collective and directionality of text sequence generation and evolution, this +paper illustrates the strong consistency of LLMs and EAs, which includes +multiple one-to-one key characteristics: token embedding and genotype-phenotype +mapping, position encoding and fitness shaping, position embedding and +selection, attention and crossover, feed-forward neural network and mutation, +model training and parameter update, and multi-task learning and +multi-objective optimization. Based on this consistency perspective, existing +coupling studies are analyzed, including evolutionary fine-tuning and +LLM-enhanced EAs. Leveraging these insights, we outline a fundamental roadmap +for future research in coupling LLMs and EAs, while highlighting key challenges +along the way. The consistency not only reveals the evolution mechanism behind +LLMs but also facilitates the development of evolved artificial agents that +approach or surpass biological organisms. + +
+
+ comment: A perspective article under review +
+
+
+
+
+ + ☆ Causal Layering via Conditional Entropy + + +
+ Causal discovery aims to recover information about an unobserved causal graph +from the observable data it generates. Layerings are orderings of the variables +which place causes before effects. In this paper, we provide ways to recover +layerings of a graph by accessing the data via a conditional entropy oracle, +when distributions are discrete. Our algorithms work by repeatedly removing +sources or sinks from the graph. Under appropriate assumptions and +conditioning, we can separate the sources or sinks from the remainder of the +nodes by comparing their conditional entropy to the unconditional entropy of +their noise. Our algorithms are provably correct and run in worst-case +quadratic time. The main assumptions are faithfulness and injective noise, and +either known noise entropies or weakly monotonically increasing noise entropies +along directed paths. In addition, we require one of either a very mild +extension of faithfulness, or strictly monotonically increasing noise +entropies, or expanding noise injectivity to include an additional single +argument in the structural functions. + +
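The algorithms above assume access to a conditional entropy oracle. A minimal plug-in estimator for that primitive on discrete data might look as follows; this sketches only the oracle, not the paper's source/sink peeling logic or its assumptions.

```python
# Plug-in estimate of the conditional entropy oracle on discrete data.
# This sketches only the oracle primitive, not the peeling algorithm.
import numpy as np
from collections import Counter

def entropy(x):
    counts = np.array(list(Counter(x.tolist()).values()), dtype=float)
    p = counts / counts.sum()
    return float(-(p * np.log2(p)).sum())

def conditional_entropy(x, z):
    """H(X | Z) for 1-D discrete x and an (n, k) discrete conditioning matrix z."""
    if z.shape[1] == 0:
        return entropy(x)
    keys = [tuple(row) for row in z]
    total, n = 0.0, len(x)
    for key in set(keys):
        idx = np.array([i for i, k in enumerate(keys) if k == key])
        total += len(idx) / n * entropy(x[idx])
    return total

rng = np.random.default_rng(0)
z = rng.integers(0, 2, size=(1000, 1))
noise = (rng.random(1000) < 0.2).astype(int)
x = (z[:, 0] + noise) % 2                      # X is a noisy copy of Z
print(conditional_entropy(x, z), entropy(x))   # H(X|Z) ~ H(noise) < H(X)
```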
+
+
+
+
+ + ☆ Generalization Error Guaranteed Auto-Encoder-Based Nonlinear Model + Reduction for Operator Learning + + +
+ Many physical processes in science and engineering are naturally represented +by operators between infinite-dimensional function spaces. The problem of +operator learning, in this context, seeks to extract these physical processes +from empirical data, which is challenging due to the infinite or high +dimensionality of data. An integral component in addressing this challenge is +model reduction, which reduces both the data dimensionality and problem size. +In this paper, we utilize low-dimensional nonlinear structures in model +reduction by investigating Auto-Encoder-based Neural Network (AENet). AENet +first learns the latent variables of the input data and then learns the +transformation from these latent variables to corresponding output data. Our +numerical experiments validate the ability of AENet to accurately learn the +solution operator of nonlinear partial differential equations. Furthermore, we +establish a mathematical and statistical estimation theory that analyzes the +generalization error of AENet. Our theoretical framework shows that the sample +complexity of training AENet is intricately tied to the intrinsic dimension of +the modeled process, while also demonstrating the remarkable resilience of +AENet to noise. + +
+
+
+
+
+ + ☆ Budgeted Online Model Selection and Fine-Tuning via Federated Learning + + +
+ Online model selection involves selecting a model from a set of candidate
+models 'on the fly' to perform prediction on a stream of data. The choice of
+candidate models hence has a crucial impact on performance. Although
+employing a larger set of candidate models naturally leads to more
+flexibility in model selection, this may be infeasible in cases where
+prediction tasks are performed on edge devices with limited memory. Faced
+with this challenge, the present paper proposes an online federated model
+selection framework where a group of learners (clients) interacts with a
+server with sufficient memory such that the server stores all candidate
+models. However, each client only chooses to store a subset of models that
+fits into its memory and performs its own prediction task using one of the
+stored models. Furthermore, employing the proposed algorithm, clients and the
+server collaborate to fine-tune models to adapt them to a non-stationary
+environment. Theoretical analysis proves that the proposed algorithm enjoys
+sub-linear regret with respect to the best model in hindsight. Experiments on
+real datasets demonstrate the effectiveness of the proposed algorithm.
+
+
+
+ comment: Accepted by Transactions on Machine Learning Research (TMLR) +
+
+
+
+
+ + ☆ LDReg: Local Dimensionality Regularized Self-Supervised Learning ICLR 2024 + + +
+ Representations learned via self-supervised learning (SSL) can be
+susceptible to dimensional collapse, where the learned representation
+subspace is of extremely low dimensionality and thus fails to represent the
+full data distribution and modalities. Dimensional collapse, also known as
+the "underfilling" phenomenon, is one of the major causes of degraded
+performance on downstream tasks. Previous work has investigated the
+dimensional collapse problem of SSL at a global level. In this paper, we
+demonstrate that representations can span a high-dimensional space globally
+but collapse locally. To address this, we propose a method called
+$\textit{local dimensionality regularization (LDReg)}$. Our formulation is
+based on the derivation of the Fisher-Rao metric to compare and optimize
+local distance distributions at an asymptotically small radius for each data
+point. By increasing the local intrinsic dimensionality, we demonstrate
+through a range of experiments that LDReg improves the representation
+quality of SSL. The results also show that LDReg can regularize
+dimensionality at both local and global levels.
+
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Learning Backdoors for Mixed Integer Programs with Contrastive Learning + + +
+ Many real-world problems can be efficiently modeled as Mixed Integer Programs +(MIPs) and solved with the Branch-and-Bound method. Prior work has shown the +existence of MIP backdoors, small sets of variables such that prioritizing +branching on them when possible leads to faster running times. However, finding +high-quality backdoors that improve running times remains an open question. +Previous work learns to estimate the relative solver speed of randomly sampled +backdoors through ranking and then decide whether to use it. In this paper, we +utilize the Monte-Carlo tree search method to collect backdoors for training, +rather than relying on random sampling, and adapt a contrastive learning +framework to train a Graph Attention Network model to predict backdoors. Our +method, evaluated on four common MIP problem domains, demonstrates performance +improvements over both Gurobi and previous models. + +
+
+
+
+
+ + ☆ Critical Data Size of Language Models from a Grokking Perspective + + +
+ We explore the critical data size in language models, a threshold that
+marks a fundamental shift from quick memorization to slow generalization. We
+formalize the phase transition under the grokking configuration into the
+Data Efficiency Hypothesis and identify data insufficiency, sufficiency, and
+surplus regimes in the training dynamics of language models. We develop a
+grokking configuration that stably reproduces grokking on simple language
+models by rescaling the initialization and weight decay. We show that
+generalization occurs only when language models reach a critical size. We
+analyze grokking from both sample-wise and model-wise perspectives,
+verifying the proposed data efficiency hypothesis. Our experiments reveal
+smoother phase transitions occurring at the critical dataset size for
+language datasets. As the model size increases, this critical point also
+becomes larger, indicating that larger models require more data. Our results
+deepen the understanding of language model training, offering a novel
+perspective on the role of data in the learning mechanism of language models.
+
+
+
+
+
+
+ + ☆ Ultra-lightweight Neural Differential DSP Vocoder For High Quality + Speech Synthesis ICASSP 2024 + + +
+ Neural vocoders model the raw audio waveform and synthesize high-quality
+audio, but even the highly efficient ones, like MB-MelGAN and LPCNet, fail
+to run in real time on a low-end device like a smartglass. A pure digital
+signal processing (DSP) based vocoder can be implemented via lightweight
+fast Fourier transforms (FFT) and is therefore an order of magnitude faster
+than any neural vocoder. A DSP vocoder, however, often yields lower audio
+quality because it consumes over-smoothed acoustic model predictions that
+only approximately represent the vocal tract. In this paper, we propose an
+ultra-lightweight differential DSP (DDSP) vocoder that uses an acoustic
+model jointly optimized with a DSP vocoder, and learns without an extracted
+spectral feature for the vocal tract. The model achieves audio quality
+comparable to neural vocoders with a high average MOS of 4.36 while
+remaining as efficient as a DSP vocoder. Our C++ implementation, without any
+hardware-specific optimization, runs at 15 MFLOPS, surpasses MB-MelGAN by
+340 times in terms of FLOPS, and achieves a vocoder-only RTF of 0.003 and an
+overall RTF of 0.044 while running single-threaded on a 2GHz Intel Xeon CPU.
+
+
+
+ comment: Accepted for ICASSP 2024 +
+
+
+
+
+ + ☆ Contrastive Unlearning: A Contrastive Approach to Machine Unlearning + + +
+ Machine unlearning aims to eliminate the influence of a subset of training +samples (i.e., unlearning samples) from a trained model. Effectively and +efficiently removing the unlearning samples without negatively impacting the +overall model performance is still challenging. In this paper, we propose a +contrastive unlearning framework, leveraging the concept of representation +learning for more effective unlearning. It removes the influence of unlearning +samples by contrasting their embeddings against the remaining samples so that +they are pushed away from their original classes and pulled toward other +classes. By directly optimizing the representation space, it effectively +removes the influence of unlearning samples while maintaining the +representations learned from the remaining samples. Experiments on a variety of +datasets and models on both class unlearning and sample unlearning showed that +contrastive unlearning achieves the best unlearning effects and efficiency with +the lowest performance loss compared with the state-of-the-art algorithms. + +
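A rough sketch of what such a contrastive unlearning objective could look like is given below, assuming an encoder that produces embeddings for the forget (unlearning) and retained batches. Treating retained samples of other classes as targets to pull toward, and implicitly pushing away from the original class, follows the idea in the abstract; the paper's exact loss may differ.

```python
# Sketch of a contrastive unlearning loss, assuming an encoder that maps
# inputs to embeddings. Forget-sample embeddings are pulled toward retained
# samples of *other* classes and pushed away from their original class.
import torch
import torch.nn.functional as F

def contrastive_unlearning_loss(z_forget, y_forget, z_retain, y_retain, tau=0.1):
    z_f = F.normalize(z_forget, dim=1)                 # (Bf, d)
    z_r = F.normalize(z_retain, dim=1)                 # (Br, d)
    sim = z_f @ z_r.t() / tau                          # cosine similarities
    other_class = (y_forget[:, None] != y_retain[None, :]).float()
    log_prob = sim - torch.logsumexp(sim, dim=1, keepdim=True)
    # maximize probability mass on other-class retained samples
    loss = -(other_class * log_prob).sum(dim=1) / other_class.sum(dim=1).clamp(min=1.0)
    return loss.mean()

# Toy usage with random embeddings and labels.
z_f, y_f = torch.randn(4, 16), torch.tensor([0, 0, 1, 2])
z_r, y_r = torch.randn(32, 16), torch.randint(0, 5, (32,))
print(contrastive_unlearning_loss(z_f, y_f, z_r, y_r))
```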
+
+
+
+
+ + ☆ Learning-assisted Stochastic Capacity Expansion Planning: A Bayesian + Optimization Approach + + +
+ Solving large-scale capacity expansion problems (CEPs) is central to +cost-effective decarbonization of regional-scale energy systems. To ensure the +intended outcomes of CEPs, modeling uncertainty due to weather-dependent +variable renewable energy (VRE) supply and energy demand becomes crucially +important. However, the resulting stochastic optimization models are often less +computationally tractable than their deterministic counterparts. Here, we +propose a learning-assisted approximate solution method to tractably solve +two-stage stochastic CEPs. Our method identifies low-cost planning decisions by +constructing and solving a sequence of tractable temporally aggregated +surrogate problems. We adopt a Bayesian optimization approach to searching the +space of time series aggregation hyperparameters and compute approximate +solutions that minimize costs on a validation set of supply-demand projections. +Importantly, we evaluate solved planning outcomes on a held-out set of test +projections. We apply our approach to generation and transmission expansion +planning for a joint power-gas system spanning New England. We show that our +approach yields an estimated cost savings of up to 3.8% in comparison to +benchmark time series aggregation approaches. + +
+
+
+
+
+ + ☆ Investigating Training Strategies and Model Robustness of Low-Rank + Adaptation for Language Modeling in Speech Recognition + + +
+ The use of low-rank adaptation (LoRA) with frozen pretrained language
+models (PLMs) has become increasingly popular as a mainstream,
+resource-efficient modeling approach for memory-constrained hardware. In
+this study, we first explore how to enhance model performance by introducing
+various LoRA training strategies, achieving relative word error rate
+reductions of 3.50% on the public Librispeech dataset and of 3.67% on an
+internal dataset in the messaging domain. To further characterize the
+stability of LoRA-based second-pass speech recognition models, we examine
+robustness against input perturbations. These perturbations are rooted in
+homophone replacements and a novel metric called N-best Perturbation-based
+Rescoring Robustness (NPRR), both designed to measure the relative
+degradation in the performance of rescoring models. Our experimental results
+indicate that while advanced variants of LoRA, such as dynamic
+rank-allocated LoRA, lead to performance degradation under $1$-best
+perturbation, they alleviate the degradation under $N$-best perturbation.
+This finding is in comparison to fully-tuned models and vanilla LoRA tuning
+baselines, suggesting that a comprehensive selection is needed when using
+LoRA-based adaptation for compute-cost savings and robust language modeling.
+
+
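For background, the sketch below shows the basic LoRA construction such studies build on: a frozen pretrained linear layer plus a trainable low-rank update. The rank, scaling, and placement are illustrative choices, not the specific training strategies evaluated in the paper.

```python
# Background sketch of a LoRA-augmented linear layer: the pretrained weight
# is frozen and only the low-rank update B @ A is trained. Rank and scaling
# here are illustrative, not the paper's configuration.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():      # freeze the pretrained weights
            p.requires_grad = False
        self.lora_A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.lora_B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scaling = alpha / rank

    def forward(self, x):
        return self.base(x) + (x @ self.lora_A.t() @ self.lora_B.t()) * self.scaling

layer = LoRALinear(nn.Linear(768, 768), rank=8)
out = layer(torch.randn(2, 10, 768))
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(out.shape, trainable)
```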
+
+
+
+
+ + ☆ Large Language Models are Efficient Learners of Noise-Robust Speech + Recognition ICLR 2024 + + +
+ Recent advances in large language models (LLMs) have promoted generative
+error correction (GER) for automatic speech recognition (ASR), which
+leverages the rich linguistic knowledge and powerful reasoning ability of
+LLMs to improve recognition results. The latest work proposes a GER
+benchmark with the HyPoradise dataset to learn the mapping from ASR N-best
+hypotheses to ground-truth transcription by efficient LLM finetuning, which
+shows great effectiveness but lacks specificity on noise-robust ASR. In this
+work, we extend the benchmark to noisy conditions and investigate if we can
+teach LLMs to perform denoising for GER just as robust ASR does, where one
+solution is introducing noise information as a conditioner into the LLM.
+However, directly incorporating noise embeddings from the audio encoder
+could harm LLM tuning due to the cross-modality gap. To this end, we propose
+to extract a language-space noise embedding from the N-best list to
+represent the noise conditions of source speech, which can promote the
+denoising process in GER. Furthermore, in order to enhance its
+representation ability of audio noise, we design a knowledge distillation
+(KD) approach via mutual information estimation to distill the real noise
+information in audio embeddings to our language embedding. Experiments on
+various latest LLMs demonstrate that our approach achieves a new
+breakthrough with up to 53.9% correction improvement in terms of word error
+rate with limited training data. Analysis shows that our language-space
+noise embedding can well represent the noise conditions of source speech,
+under which off-the-shelf LLMs show a strong ability for language-space
+denoising.
+
+
+
+ comment: Accepted to ICLR 2024, Spotlight top 5%, 24 pages. This work will be + open sourced at: https://github.com/YUCHEN005/RobustGER under MIT license +
+
+
+
+
+ + ☆ Path Choice Matters for Clear Attribution in Path Methods ICLR 2024 + + +
+ Rigorousness and clarity are both essential for interpretations of DNNs to
+engender human trust. Path methods are commonly employed to generate
+rigorous attributions that satisfy three axioms. However, the meaning of
+attributions remains ambiguous due to distinct path choices. To address the
+ambiguity, we introduce the Concentration Principle, which centrally
+allocates high attributions to indispensable features, thereby endowing the
+attributions with aesthetics and sparsity. We then present SAMP, a
+model-agnostic interpreter, which efficiently searches for the near-optimal
+path from a pre-defined set of manipulation paths. Moreover, we propose the
+infinitesimal constraint (IC) and momentum strategy (MS) to improve
+rigorousness and optimality. Visualizations show that SAMP can precisely
+reveal DNNs by pinpointing salient image pixels. We also perform
+quantitative experiments and observe that our method significantly
+outperforms its counterparts. Code:
+https://github.com/zbr17/SAMP.
+
+
+
+ comment: ICLR 2024 accepted +
+
+
+
+
+ + ☆ A2Q+: Improving Accumulator-Aware Weight Quantization + + +
+ Quantization techniques commonly reduce the inference costs of neural +networks by restricting the precision of weights and activations. Recent +studies show that also reducing the precision of the accumulator can further +improve hardware efficiency at the risk of numerical overflow, which introduces +arithmetic errors that can degrade model accuracy. To avoid numerical overflow +while maintaining accuracy, recent work proposed accumulator-aware quantization +(A2Q), a quantization-aware training method that constrains model weights +during training to safely use a target accumulator bit width during inference. +Although this shows promise, we demonstrate that A2Q relies on an overly +restrictive constraint and a sub-optimal weight initialization strategy that +each introduce superfluous quantization error. To address these shortcomings, +we introduce: (1) an improved bound that alleviates accumulator constraints +without compromising overflow avoidance; and (2) a new strategy for +initializing quantized weights from pre-trained floating-point checkpoints. We +combine these contributions with weight normalization to introduce A2Q+. We +support our analysis with experiments that show A2Q+ significantly improves the +trade-off between accumulator bit width and model accuracy and characterize new +trade-offs that arise as a consequence of accumulator constraints. + +
+
+
+
+
+ + ♻ ☆ Optimal Sets and Solution Paths of ReLU Networks + + +
+ We develop an analytical framework to characterize the set of optimal ReLU +neural networks by reformulating the non-convex training problem as a convex +program. We show that the global optima of the convex parameterization are +given by a polyhedral set and then extend this characterization to the optimal +set of the non-convex training objective. Since all stationary points of the +ReLU training problem can be represented as optima of sub-sampled convex +programs, our work provides a general expression for all critical points of the +non-convex objective. We then leverage our results to provide an optimal +pruning algorithm for computing minimal networks, establish conditions for the +regularization path of ReLU networks to be continuous, and develop sensitivity +results for minimal ReLU networks. + +
+
+ comment: Minor updates and corrections to clarify the role of merge/split + symmetries in formation of ReLU optimal set and add missing sufficient + conditions for all minimal models to have the same cardinality +
+
+
+
+
+ + ♻ ☆ LLMCarbon: Modeling the end-to-end Carbon Footprint of Large Language + Models + + +
+ The carbon footprint associated with large language models (LLMs) is a
+significant concern, encompassing emissions from their training, inference,
+experimentation, and storage processes, including operational and embodied
+carbon emissions. An essential aspect is accurately estimating the carbon
+impact of emerging LLMs even before their training, which heavily relies on
+GPU usage. Existing studies have reported the carbon footprint of LLM
+training, but only one tool, mlco2, can predict the carbon footprint of new
+neural networks prior to physical training. However, mlco2 has several
+serious limitations. It cannot extend its estimation to dense or
+mixture-of-experts (MoE) LLMs, disregards critical architectural parameters,
+focuses solely on GPUs, and cannot model embodied carbon footprints.
+Addressing these gaps, we introduce LLMCarbon, an end-to-end carbon
+footprint projection model designed for both dense and MoE LLMs. Compared to
+mlco2, LLMCarbon significantly enhances the accuracy of carbon footprint
+estimations for various LLMs. The source code is released at
+https://github.com/SotaroKaneda/MLCarbon.
+
+
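For intuition, a back-of-the-envelope operational-carbon estimate of the kind such projection tools refine is sketched below. This is not the paper's model (which also covers embodied carbon, MoE architectures, and hardware parameters); every constant here is a placeholder.

```python
# Back-of-the-envelope operational carbon estimate: GPU-hours x device power
# x datacenter PUE x grid carbon intensity. All constants are placeholders.
def operational_co2_kg(gpu_hours, gpu_power_kw=0.3, pue=1.2, kgco2_per_kwh=0.4):
    energy_kwh = gpu_hours * gpu_power_kw * pue
    return energy_kwh * kgco2_per_kwh

# e.g. a hypothetical run of 1,000 GPUs for 30 days
print(f"{operational_co2_kg(1000 * 24 * 30):,.0f} kg CO2e")
```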
+
+ comment: 15 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Choreographer: Learning and Adapting Skills in Imagination ICLR 2023 + + +
+ Unsupervised skill learning aims to learn a rich repertoire of behaviors +without external supervision, providing artificial agents with the ability to +control and influence the environment. However, without appropriate knowledge +and exploration, skills may provide control only over a restricted area of the +environment, limiting their applicability. Furthermore, it is unclear how to +leverage the learned skill behaviors for adapting to downstream tasks in a +data-efficient manner. We present Choreographer, a model-based agent that +exploits its world model to learn and adapt skills in imagination. Our method +decouples the exploration and skill learning processes, being able to discover +skills in the latent state space of the model. During adaptation, the agent +uses a meta-controller to evaluate and adapt the learned skills efficiently by +deploying them in parallel in imagination. Choreographer is able to learn +skills both from offline data, and by collecting data simultaneously with an +exploration policy. The skills can be used to effectively adapt to downstream +tasks, as we show in the URL benchmark, where we outperform previous approaches +from both pixels and states inputs. The learned skills also explore the +environment thoroughly, finding sparse rewards more frequently, as shown in +goal-reaching tasks from the DMC Suite and Meta-World. Website and code: +https://skillchoreographer.github.io/ + +
+
+ comment: Accepted at ICLR 2023 (notable top 25%) +
+
+
+
+
+ + ♻ ☆ Towards Robust Offline Reinforcement Learning under Diverse Data + Corruption ICLR 2024 + + +
+ Offline reinforcement learning (RL) presents a promising approach for +learning reinforced policies from offline datasets without the need for costly +or unsafe interactions with the environment. However, datasets collected by +humans in real-world environments are often noisy and may even be maliciously +corrupted, which can significantly degrade the performance of offline RL. In +this work, we first investigate the performance of current offline RL +algorithms under comprehensive data corruption, including states, actions, +rewards, and dynamics. Our extensive experiments reveal that implicit +Q-learning (IQL) demonstrates remarkable resilience to data corruption among +various offline RL algorithms. Furthermore, we conduct both empirical and +theoretical analyses to understand IQL's robust performance, identifying its +supervised policy learning scheme as the key factor. Despite its relative +robustness, IQL still suffers from heavy-tail targets of Q functions under +dynamics corruption. To tackle this challenge, we draw inspiration from robust +statistics to employ the Huber loss to handle the heavy-tailedness and utilize +quantile estimators to balance penalization for corrupted data and learning +stability. By incorporating these simple yet effective modifications into IQL, +we propose a more robust offline RL approach named Robust IQL (RIQL). Extensive +experiments demonstrate that RIQL exhibits highly robust performance when +subjected to diverse data corruption scenarios. + +
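The Huber-loss ingredient mentioned above can be illustrated in a few lines: replacing the squared TD error with a Huber loss penalizes heavy-tailed or corrupted Q targets linearly rather than quadratically. The delta value is an arbitrary illustrative choice; RIQL additionally uses quantile estimators and other modifications.

```python
# Squared vs. Huber TD error on synthetic, heavy-tailed Q targets.
# delta=1.0 is an arbitrary illustrative choice.
import torch
import torch.nn.functional as F

q_pred = torch.randn(256)                 # critic estimates Q(s, a)
q_target = torch.randn(256) * 5.0         # possibly corrupted targets
print(float(F.mse_loss(q_pred, q_target)),
      float(F.huber_loss(q_pred, q_target, delta=1.0)))
```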
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Algorithmic Assistance with Recommendation-Dependent Preferences + + +
+ When an algorithm provides risk assessments, we typically think of them as +helpful inputs to human decisions, such as when risk scores are presented to +judges or doctors. However, a decision-maker may not only react to the +information provided by the algorithm. The decision-maker may also view the +algorithmic recommendation as a default action, making it costly for them to +deviate, such as when a judge is reluctant to overrule a high-risk assessment +for a defendant or a doctor fears the consequences of deviating from +recommended procedures. To address such unintended consequences of algorithmic +assistance, we propose a principal-agent model of joint human-machine +decision-making. Within this model, we consider the effect and design of +algorithmic recommendations when they affect choices not just by shifting +beliefs, but also by altering preferences. We motivate this assumption from +institutional factors, such as a desire to avoid audits, as well as from +well-established models in behavioral science that predict loss aversion +relative to a reference point, which here is set by the algorithm. We show that +recommendation-dependent preferences create inefficiencies where the +decision-maker is overly responsive to the recommendation. As a potential +remedy, we discuss algorithms that strategically withhold recommendations, and +show how they can improve the quality of final decisions. + +
+
+
+
+
+ + ♻ ☆ Towards Quantum Graph Neural Networks: An Ego-Graph Learning Approach + + +
+ Quantum machine learning is a fast-emerging field that aims to tackle machine +learning using quantum algorithms and quantum computing. Due to the lack of +physical qubits and an effective means to map real-world data from Euclidean +space to Hilbert space, most of these methods focus on quantum analogies or +process simulations rather than devising concrete architectures based on +qubits. In this paper, we propose a novel hybrid quantum-classical algorithm +for graph-structured data, which we refer to as the Ego-graph based Quantum +Graph Neural Network (egoQGNN). egoQGNN implements the GNN theoretical +framework using the tensor product and unity matrix representation, which +greatly reduces the number of model parameters required. When controlled by a +classical computer, egoQGNN can accommodate arbitrarily sized graphs by +processing ego-graphs from the input graph using a modestly-sized quantum +device. The architecture is based on a novel mapping from real-world data to +Hilbert space. This mapping maintains the distance relations present in the +data and reduces information loss. Experimental results show that the proposed +method outperforms competitive state-of-the-art models with only 1.68\% +parameters compared to those models. + +
+
+
+
+
+ + ♻ ☆ Solution of the Probabilistic Lambert Problem: Connections with Optimal + Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs + + +
+ Lambert's problem concerns transferring a spacecraft from a given initial
+to a given terminal position within a prescribed flight time via velocity
+control subject to a gravitational force field. We consider a probabilistic
+variant of the Lambert problem where the knowledge of the endpoint
+constraints in position vectors is replaced by the knowledge of their
+respective joint probability density functions. We show that the Lambert
+problem with endpoint joint probability density constraints is a generalized
+optimal mass transport (OMT) problem, thereby connecting this classical
+astrodynamics problem with a burgeoning area of research in modern
+stochastic control and stochastic machine learning. This newfound connection
+allows us to rigorously establish the existence and uniqueness of the
+solution for the probabilistic Lambert problem. The same connection also
+helps to numerically solve the probabilistic Lambert problem via diffusion
+regularization, i.e., by leveraging the further connection of the OMT with
+the Schrödinger bridge problem (SBP). This also shows that the probabilistic
+Lambert problem with additive dynamic process noise is in fact a generalized
+SBP, and can be solved numerically using the so-called Schrödinger factors,
+as we do in this work. We explain how the resulting analysis leads to
+solving a boundary-coupled system of reaction-diffusion PDEs where the
+nonlinear gravitational potential appears as the reaction rate. We propose
+novel algorithms for the same, and present illustrative numerical results.
+Our analysis and the algorithmic framework are nonparametric, i.e., we make
+neither statistical (e.g., Gaussian, first few moments, mixture or
+exponential family, finite dimensionality of the sufficient statistic) nor
+dynamical (e.g., Taylor series) approximations.
+
+
+
+
+
+
+ + ♻ ☆ Have it your way: Individualized Privacy Assignment for DP-SGD NeurIPS'2024 + + +
+ When training a machine learning model with differential privacy, one sets a +privacy budget. This budget represents a maximal privacy violation that any +user is willing to face by contributing their data to the training set. We +argue that this approach is limited because different users may have different +privacy expectations. Thus, setting a uniform privacy budget across all points +may be overly conservative for some users or, conversely, not sufficiently +protective for others. In this paper, we capture these preferences through +individualized privacy budgets. To demonstrate their practicality, we introduce +a variant of Differentially Private Stochastic Gradient Descent (DP-SGD) which +supports such individualized budgets. DP-SGD is the canonical approach to +training models with differential privacy. We modify its data sampling and +gradient noising mechanisms to arrive at our approach, which we call +Individualized DP-SGD (IDP-SGD). Because IDP-SGD provides privacy guarantees +tailored to the preferences of individual users and their data points, we find +it empirically improves privacy-utility trade-offs. + +
+
+ comment: Published at NeurIPS'2024 +
+
+
+
+
+ + ♻ ☆ Group-level Brain Decoding with Deep Learning + + +
+ Decoding brain imaging data are gaining popularity, with applications in
+brain-computer interfaces and the study of neural representations. Decoding
+is typically subject-specific and does not generalise well over subjects,
+due to high amounts of between subject variability. Techniques that overcome
+this will not only provide richer neuroscientific insights but also make it
+possible for group-level models to out-perform subject-specific models.
+Here, we propose a method that uses subject embedding, analogous to word
+embedding in natural language processing, to learn and exploit the structure
+in between-subject variability as part of a decoding model, our adaptation
+of the WaveNet architecture for classification. We apply this to
+magnetoencephalography data, where 15 subjects viewed 118 different images,
+with 30 examples per image; to classify images using the entire 1 s window
+following image presentation. We show that the combination of deep learning
+and subject embedding is crucial to closing the performance gap between
+subject- and group-level decoding models. Importantly, group models
+outperform subject models on low-accuracy subjects (although slightly impair
+high-accuracy subjects) and can be helpful for initialising subject models.
+While we have not generally found group-level models to perform better than
+subject-level models, the performance of group modelling is expected to be
+even higher with bigger datasets. In order to provide physiological
+interpretation at the group level, we make use of permutation feature
+importance. This provides insights into the spatiotemporal and spectral
+information encoded in the models. All code is available on GitHub
+(https://github.com/ricsinaruto/MEG-group-decode).
+
+
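The subject-embedding idea can be sketched as follows: a learned per-subject vector is concatenated with the neural features so a single group-level model can absorb between-subject variability. The paper's actual model is a WaveNet-style classifier; the MLP, the 306-channel feature dimension, and the hyperparameters below are only illustrative (15 subjects and 118 classes follow the abstract).

```python
# Illustrative subject-embedding classifier: the per-subject vector is
# concatenated with the input features before classification.
import torch
import torch.nn as nn

class GroupDecoder(nn.Module):
    def __init__(self, n_subjects, n_features, n_classes, emb_dim=16, hidden=128):
        super().__init__()
        self.subject_emb = nn.Embedding(n_subjects, emb_dim)
        self.net = nn.Sequential(
            nn.Linear(n_features + emb_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, n_classes),
        )

    def forward(self, x, subject_id):
        s = self.subject_emb(subject_id)              # (B, emb_dim)
        return self.net(torch.cat([x, s], dim=1))     # (B, n_classes)

model = GroupDecoder(n_subjects=15, n_features=306, n_classes=118)
logits = model(torch.randn(8, 306), torch.randint(0, 15, (8,)))
print(logits.shape)
```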
+
+ comment: Published in Human Brain Mapping +
+
+
+
+
+ + ♻ ☆ Salted Inference: Enhancing Privacy while Maintaining Efficiency of + Split Inference in Mobile Computing + + +
+ In split inference, a deep neural network (DNN) is partitioned to run the +early part of the DNN at the edge and the later part of the DNN in the cloud. +This meets two key requirements for on-device machine learning: input privacy +and computation efficiency. Still, an open question in split inference is +output privacy, given that the outputs of the DNN are observable in the cloud. +While encrypted computing can protect output privacy too, homomorphic +encryption requires substantial computation and communication resources from +both edge and cloud devices. In this paper, we introduce Salted DNNs: a novel +approach that enables clients at the edge, who run the early part of the DNN, +to control the semantic interpretation of the DNN's outputs at inference time. +Our proposed Salted DNNs maintain classification accuracy and computation +efficiency very close to the standard DNN counterparts. Experimental +evaluations conducted on both images and wearable sensor data demonstrate that +Salted DNNs attain classification accuracy very close to standard DNNs, +particularly when the Salted Layer is positioned within the early part to meet +the requirements of split inference. Our approach is general and can be applied +to various types of DNNs. As a benchmark for future studies, we open-source our +code. + +
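Conceptually, "salting" lets the client control how the cloud-side outputs are interpreted. The toy sketch below uses a salt-derived secret permutation of the class indices, so raw logits observed in the cloud are not directly meaningful; the paper's Salted Layer is integrated into the network itself, so this only illustrates the output-remapping idea.

```python
# Toy illustration of output "salting": the client derives a secret class
# permutation from its salt, the model is trained/queried with labels
# remapped by that permutation, and only the client can invert it.
import numpy as np

def salted_permutation(salt: int, n_classes: int) -> np.ndarray:
    return np.random.default_rng(salt).permutation(n_classes)

salt, n_classes = 12345, 10
perm = salted_permutation(salt, n_classes)   # known only to the client
inv = np.argsort(perm)                       # inverse mapping

cloud_logits = np.random.randn(n_classes)    # what the cloud observes
salted_pred = int(cloud_logits.argmax())     # meaningless without the salt
true_class = int(inv[salted_pred])           # client-side interpretation
print(perm, salted_pred, true_class)
```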
+
+ comment: To be appeared in the 25th International Workshop on Mobile Computing + Systems and Applications (HotMobile 2024) +
+
+
+
+
+ + ♻ ☆ Explaining dark matter halo density profiles with neural networks + + +
+ We use explainable neural networks to connect the evolutionary history of +dark matter halos with their density profiles. The network captures independent +factors of variation in the density profiles within a low-dimensional +representation, which we physically interpret using mutual information. Without +any prior knowledge of the halos' evolution, the network recovers the known +relation between the early time assembly and the inner profile, and discovers +that the profile beyond the virial radius is described by a single parameter +capturing the most recent mass accretion rate. The results illustrate the +potential for machine-assisted scientific discovery in complicated +astrophysical datasets. + +
+
+ comment: 7 pages, 5 figures. Minor changes to match version accepted for + publication in PRL +
+
+
+
+
+ + ♻ ☆ A Fast, Performant, Secure Distributed Training Framework For Large + Language Model ICASSP 2024 + + +
+ The distributed (federated) LLM is an important method for co-training the +domain-specific LLM using siloed data. However, maliciously stealing model +parameters and data from the server or client side has become an urgent problem +to be solved. In this paper, we propose a secure distributed LLM based on model +slicing. In this case, we deploy the Trusted Execution Environment (TEE) on +both the client and server side, and put the fine-tuned structure (LoRA or +embedding of P-tuning v2) into the TEE. Then, secure communication is executed +in the TEE and general environments through lightweight encryption. In order to +further reduce the equipment cost as well as increase the model performance and +accuracy, we propose a split fine-tuning scheme. In particular, we split the +LLM by layers and place the latter layers in a server-side TEE (the client does +not need a TEE). We then combine the proposed Sparsification Parameter +Fine-tuning (SPF) with the LoRA part to improve the accuracy of the downstream +task. Numerous experiments have shown that our method guarantees accuracy while +maintaining security. + +
+
+ comment: Accepted by ICASSP 2024 (Federated LLM) +
+
+
+
+
+ + ♻ ☆ TemperatureGAN: Generative Modeling of Regional Atmospheric Temperatures + + +
+ Stochastic generators are useful for estimating climate impacts on various +sectors. Projecting climate risk in various sectors, e.g. energy systems, +requires generators that are accurate (statistical resemblance to +ground-truth), reliable (do not produce erroneous examples), and efficient. +Leveraging data from the North American Land Data Assimilation System, we +introduce TemperatureGAN, a Generative Adversarial Network conditioned on +months, locations, and time periods, to generate 2m above ground atmospheric +temperatures at an hourly resolution. We propose evaluation methods and metrics +to measure the quality of generated samples. We show that TemperatureGAN +produces high-fidelity examples with good spatial representation and temporal +dynamics consistent with known diurnal cycles. + +
+
+
+
+
+ + ♻ ☆ Let's do the time-warp-attend: Learning topological invariants of + dynamical systems + + +
+ Dynamical systems across the sciences, from electrical circuits to ecological +networks, undergo qualitative and often catastrophic changes in behavior, +called bifurcations, when their underlying parameters cross a threshold. +Existing methods predict oncoming catastrophes in individual systems but are +primarily time-series-based and struggle both to categorize qualitative +dynamical regimes across diverse systems and to generalize to real data. To +address this challenge, we propose a data-driven, physically-informed +deep-learning framework for classifying dynamical regimes and characterizing +bifurcation boundaries based on the extraction of topologically invariant +features. We focus on the paradigmatic case of the supercritical Hopf +bifurcation, which is used to model periodic dynamics across a wide range of +applications. Our convolutional attention method is trained with data +augmentations that encourage the learning of topological invariants which can +be used to detect bifurcation boundaries in unseen systems and to design models +of biological systems like oscillatory gene regulatory networks. We further +demonstrate our method's use in analyzing real data by recovering distinct +proliferation and differentiation dynamics along pancreatic endocrinogenesis +trajectory in gene expression space based on single-cell data. Our method +provides valuable insights into the qualitative, long-term behavior of a wide +range of dynamical systems, and can detect bifurcations or catastrophic +transitions in large-scale physical and biological systems. + +
+
+
+
+
+ + ♻ ☆ $α$-divergence Improves the Entropy Production Estimation via + Machine Learning + + +
+ Recent years have seen a surge of interest in the algorithmic estimation of +stochastic entropy production (EP) from trajectory data via machine learning. A +crucial element of such algorithms is the identification of a loss function +whose minimization guarantees the accurate EP estimation. In this study, we +show that there exists a host of loss functions, namely those implementing a +variational representation of the $\alpha$-divergence, which can be used for +the EP estimation. By fixing $\alpha$ to a value between $-1$ and $0$, the +$\alpha$-NEEP (Neural Estimator for Entropy Production) exhibits a much more +robust performance against strong nonequilibrium driving or slow dynamics, +which adversely affects the existing method based on the Kullback-Leibler +divergence ($\alpha = 0$). In particular, the choice of $\alpha = -0.5$ tends +to yield the optimal results. To corroborate our findings, we present an +exactly solvable simplification of the EP estimation problem, whose loss +function landscape and stochastic properties give deeper intuition into the +robustness of the $\alpha$-NEEP. + +
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ A Foundation Graph Model NeurIPS 2023 + + +
+ The principal benefit of unsupervised graph representation learning is that a +pre-trained model can be fine-tuned where data or labels are scarce. Existing +approaches are domain specific, maintaining consistent node and edge attributes +across the pre-training and target datasets. This precludes transfer to other +domains. A model capable of positive transfer on arbitrary tasks and domains +would represent the first foundation graph model. + In this work we use adversarial contrastive learning to present FoToM, a +graph pre-training method based on node and edge feature exclusion. We use +FoToM to pre-train models over multiple graph domains, producing the first +foundation graph models. We demonstrate positive transfer on evaluation +datasets from multiple domains, including domains not present in pre-training +data. On all datasets performance is at worst on-par and on 76% significantly +better than a supervised baseline ($P \leq 0.01$), with an 8 to 40% reduction +in error at 95% confidence. Contrary to other research, pre-training on a +dataset with the target domain excluded leads us to better performance than +pre-training on a dataset from only the target domain. The multi-domain model +at worst, matches, and on 56% of tasks, significantly outperforms single-domain +($P \leq 0.01$). These results include when node labels are used in evaluation, +where performance is consistently superior to single-domain or non-pre-trained +models. Notably, FoToM benefits scenarios in both large or scarce data regimes +for the target domains. + +
+
+ comment: Presented at the NeurIPS 2023 New Frontiers in Graph Learning + workshop +
+
+
+
+
+ + ♻ ☆ Privacy-Preserving Neural Graph Databases + + +
+ In the era of big data and rapidly evolving information systems, efficient
+and accurate data retrieval has become increasingly crucial. Neural graph
+databases (NGDBs) have emerged as a powerful paradigm that combines the
+strengths of graph databases (graph DBs) and neural networks to enable
+efficient storage, retrieval, and analysis of graph-structured data. The
+usage of neural embedding storage and complex neural logical query answering
+provides NGDBs with generalization ability. When the graph is incomplete, by
+extracting latent patterns and representations, neural graph databases can
+fill gaps in the graph structure, revealing hidden relationships and
+enabling accurate query answering. Nevertheless, this capability comes with
+inherent trade-offs, as it introduces additional privacy risks to the
+database. Malicious attackers can infer more sensitive information from the
+database using well-designed combinatorial queries: for example, by
+comparing the answer sets of where Turing Award winners born before 1950
+lived and where those born after 1940 lived, the living place of the Turing
+Award winner Hinton can likely be exposed, even though it may have been
+deleted from the training data due to privacy concerns. In this work,
+inspired by the privacy protection in graph embeddings, we propose a
+privacy-preserving neural graph database (P-NGDB) to alleviate the risks of
+privacy leakage in NGDBs. We introduce adversarial training techniques in
+the training stage to force the NGDBs to generate indistinguishable answers
+when queried with private information, enhancing the difficulty of inferring
+sensitive information through combinations of multiple innocuous queries.
+Extensive experimental results on three datasets show that P-NGDB can
+effectively protect private information in the graph database while
+delivering high-quality answers to public queries.
+
+
+
+
+
+
+ + ♻ ☆ Interplay between depth and width for interpolation in neural ODEs + + +
+ Neural ordinary differential equations (neural ODEs) have emerged as a +natural tool for supervised learning from a control perspective, yet a complete +understanding of their optimal architecture remains elusive. In this work, we +examine the interplay between their width $p$ and number of layer transitions +$L$ (effectively the depth $L+1$). Specifically, we assess the model +expressivity in terms of its capacity to interpolate either a finite dataset +$D$ comprising $N$ pairs of points or two probability measures in +$\mathbb{R}^d$ within a Wasserstein error margin $\varepsilon>0$. Our findings +reveal a balancing trade-off between $p$ and $L$, with $L$ scaling as +$O(1+N/p)$ for dataset interpolation, and +$L=O\left(1+(p\varepsilon^d)^{-1}\right)$ for measure interpolation. + In the autonomous case, where $L=0$, a separate study is required, which we +undertake focusing on dataset interpolation. We address the relaxed problem of +$\varepsilon$-approximate controllability and establish an error decay of +$\varepsilon\sim O(\log(p)p^{-1/d})$. This decay rate is a consequence of +applying a universal approximation theorem to a custom-built Lipschitz vector +field that interpolates $D$. In the high-dimensional setting, we further +demonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact +control. + +
+
+ comment: 16 pages, 10 figures, double column +
+
+
+
+
+ + ♻ ☆ Imitation Learning Inputting Image Feature to Each Layer of Neural + Network + + +
+ Imitation learning enables robots to learn and replicate human behavior from +training data. Recent advances in machine learning enable end-to-end learning +approaches that directly process high-dimensional observation data, such as +images. However, these approaches face a critical challenge when processing +data from multiple modalities, inadvertently ignoring data with a lower +correlation to the desired output, especially when using short sampling +periods. This paper presents a useful method to address this challenge, which +amplifies the influence of data with a relatively low correlation to the output +by inputting the data into each neural network layer. The proposed approach +effectively incorporates diverse data sources into the learning process. +Through experiments using a simple pick-and-place operation with raw images and +joint information as input, significant improvements in success rates are +demonstrated even when dealing with data from short sampling periods. + +
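A minimal sketch of the described architecture is shown below: the image feature vector is concatenated to the input of every fully connected layer rather than only the first, so the low-correlation modality keeps influencing deeper layers. Dimensions, depth, and activation choices are illustrative.

```python
# Illustrative policy network: the image feature is concatenated to the
# input of every layer, not just the first.
import torch
import torch.nn as nn

class PerLayerFeatureMLP(nn.Module):
    def __init__(self, joint_dim=7, img_dim=64, hidden=128, out_dim=7, n_layers=3):
        super().__init__()
        self.layers = nn.ModuleList()
        in_dim = joint_dim
        for _ in range(n_layers):
            self.layers.append(nn.Linear(in_dim + img_dim, hidden))
            in_dim = hidden
        self.head = nn.Linear(hidden + img_dim, out_dim)

    def forward(self, joints, img_feat):
        h = joints
        for layer in self.layers:
            h = torch.relu(layer(torch.cat([h, img_feat], dim=1)))
        return self.head(torch.cat([h, img_feat], dim=1))

net = PerLayerFeatureMLP()
print(net(torch.randn(4, 7), torch.randn(4, 64)).shape)   # -> (4, 7)
```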
+
+ comment: 6 pages, 4 figures, Accepted at AMC2024 +
+
+
+
+
+ + ♻ ☆ A ripple in time: a discontinuity in American history + + +
+ In this note we use the State of the Union Address (SOTU) dataset from
+Kaggle to make some surprising (and some not so surprising) observations
+pertaining to the general timeline of American history, and the character
+and nature of the addresses themselves. Our main approach is using vector
+embeddings, such as BERT (DistilBERT) and GPT-2.
+ While it is widely believed that BERT (and its variations) is most suitable
+for NLP classification tasks, we find that GPT-2 in conjunction with
+nonlinear dimension reduction methods such as UMAP provides better
+separation and stronger clustering. This makes GPT-2 + UMAP an interesting
+alternative. In our case, no model fine-tuning is required, and the
+pre-trained out-of-the-box GPT-2 model is enough.
+ We also used a fine-tuned DistilBERT model for classification, detecting
+which President delivered which address, with very good results (accuracy
+93%-95% depending on the run). An analogous task was performed to determine
+the year of writing, and we were able to pin it down to about 4 years (which
+is a single presidential term).
+ It is worth noting that SOTU addresses provide relatively small writing
+samples (about 8000 words on average, varying widely from under 2000 words
+to more than 20000), and that the number of authors is relatively large (we
+used the SOTU addresses of 42 US presidents). This shows that the techniques
+employed turn out to be rather efficient, while all the computations
+described in this note can be performed using a single GPU instance of
+Google Colab.
+ The accompanying code is available on GitHub.
+
+
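A minimal version of the GPT-2 + UMAP pipeline could look like the following: mean-pool GPT-2's final hidden states over tokens to embed each address, then project to 2-D with UMAP. It assumes the transformers and umap-learn packages; handling of addresses longer than GPT-2's context is simplified to plain truncation, and the dummy strings stand in for the real SOTU corpus.

```python
# Mean-pooled GPT-2 embeddings followed by UMAP; dummy strings stand in for
# the real SOTU texts, and long addresses are simply truncated here.
import torch
import umap
from transformers import GPT2Model, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2").eval()

def embed(texts):
    vecs = []
    for t in texts:
        ids = tokenizer(t, return_tensors="pt", truncation=True, max_length=1024)
        with torch.no_grad():
            hidden = model(**ids).last_hidden_state       # (1, seq, 768)
        vecs.append(hidden.mean(dim=1).squeeze(0))
    return torch.stack(vecs).numpy()

addresses = [f"My fellow citizens, this is address number {i}." for i in range(20)]
coords = umap.UMAP(n_components=2, n_neighbors=5, metric="cosine").fit_transform(embed(addresses))
print(coords.shape)   # (20, 2)
```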
+
+ comment: 7 pages, 8 figures; GitHub repository + https://github.com/sashakolpakov/ripple_in_time +
+
+
+
+
+ + ♻ ☆ EZ-CLIP: Efficient Zeroshot Video Action Recognition + + +
+ Recent advancements in large-scale pre-training of visual-language models
+on paired image-text data have demonstrated impressive generalization
+capabilities for zero-shot tasks. Building on this success, efforts have
+been made to adapt these image-based visual-language models, such as CLIP,
+for videos, extending their zero-shot capabilities to the video domain.
+While these adaptations have shown promising results, they come at a
+significant computational cost and struggle with effectively modeling the
+crucial temporal aspects inherent to the video domain. In this study, we
+present EZ-CLIP, a simple and efficient adaptation of CLIP that addresses
+these challenges. EZ-CLIP leverages temporal visual prompting for seamless
+temporal adaptation, requiring no fundamental alterations to the core CLIP
+architecture while preserving its remarkable generalization abilities.
+Moreover, we introduce a novel learning objective that guides the temporal
+visual prompts to focus on capturing motion, thereby enhancing its learning
+capabilities from video data. We conducted extensive experiments on five
+different benchmark datasets, thoroughly evaluating EZ-CLIP for zero-shot
+learning and base-to-novel video action recognition, and also demonstrating
+its potential for few-shot generalization. Impressively, with a mere 5.2
+million learnable parameters (as opposed to the 71.1 million in the prior
+best model), EZ-CLIP can be efficiently trained on a single GPU,
+outperforming existing approaches in several evaluations.
+
+
+
+
+
+
+ + ♻ ☆ LogLead -- Fast and Integrated Log Loader, Enhancer, and Anomaly + Detector + + +
+ This paper introduces LogLead, a tool designed for efficient log analysis
+benchmarking. LogLead combines three essential steps in log processing:
+loading, enhancing, and anomaly detection. The tool leverages Polars, a
+high-speed DataFrame library. We currently have Loaders for eight systems that
+are publicly available (HDFS, Hadoop, BGL, Thunderbird, Spirit, Liberty,
+TrainTicket, and GC Webshop). We have multiple enhancers with three parsers
+(Drain, Spell, LenMa), BERT embedding creation, and other log representation
+techniques like bag-of-words. LogLead integrates five supervised and four
+unsupervised machine learning algorithms from SKLearn for anomaly detection. By
+integrating diverse datasets, log representation methods, and anomaly detectors,
+LogLead facilitates comprehensive benchmarking in log analysis research. We
+show that log loading from raw files to dataframes is over 10x faster with
+LogLead compared to past solutions. We demonstrate roughly 2x improvement in
+Drain parsing speed by off-loading log message normalization to LogLead. Our
+brief benchmarking on HDFS indicates that log representations extending beyond
+the bag-of-words approach offer limited additional benefits. Tool URL:
+https://github.com/EvoTestOps/LogLead
+
+
+ comment: 2024 IEEE International Conference on Software Analysis, Evolution + and Reengineering (SANER) +
+
+
+
+
+ + ♻ ☆ Divide and not forget: Ensemble of selectively trained experts in + Continual Learning ICLR 2024 + + +
+ Class-incremental learning is becoming more popular as it helps models widen
+their applicability while not forgetting what they already know. A trend in
+this area is to use a mixture-of-experts technique, where different models work
+together to solve the task. However, the experts are usually trained all at
+once using whole task data, which makes them all prone to forgetting and
+increases the computational burden. To address this limitation, we introduce a
+novel approach named SEED. SEED selects only one expert, the most suitable one
+for the considered task, and uses data from this task to fine-tune only this expert.
+For this purpose, each expert represents each class with a Gaussian
+distribution, and the optimal expert is selected based on the similarity of
+those distributions. Consequently, SEED increases diversity and heterogeneity
+within the experts while maintaining the high stability of this ensemble
+method. Extensive experiments demonstrate that SEED achieves
+state-of-the-art performance in exemplar-free settings across various
+scenarios, showing the potential of expert diversification through data in
+continual learning.
+
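As a rough illustration of the selection step (comparing per-class Gaussians fitted in each expert's latent space), consider the sketch below. The concrete criterion used here, symmetric KL divergence between diagonal Gaussians with the expert chosen where the new classes are most separated, is an assumption made for this example and not necessarily the exact rule used in SEED.

```python
# Illustrative expert selection for a new task (assumes at least two classes).
import numpy as np

def diag_kl(mu0, var0, mu1, var1):
    """KL( N(mu0, diag(var0)) || N(mu1, diag(var1)) ) for diagonal Gaussians."""
    return 0.5 * np.sum(var0 / var1 + (mu1 - mu0) ** 2 / var1 - 1.0 + np.log(var1 / var0))

def select_expert(experts, xs, ys):
    """experts: list of frozen feature extractors f(x) -> 1-D np.ndarray."""
    ys = np.asarray(ys)
    scores = []
    for f in experts:
        feats = np.stack([f(x) for x in xs])
        stats = []
        for c in np.unique(ys):
            fc = feats[ys == c]
            stats.append((fc.mean(axis=0), fc.var(axis=0) + 1e-6))
        # Average symmetric KL over class pairs: higher = classes better separated.
        pair_kls = [diag_kl(*stats[i], *stats[j]) + diag_kl(*stats[j], *stats[i])
                    for i in range(len(stats)) for j in range(i + 1, len(stats))]
        scores.append(np.mean(pair_kls))
    return int(np.argmax(scores))  # fine-tune only this expert on the new task
```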
+
+ comment: Accepted for ICLR 2024 (main track), code is available at: + https://github.com/grypesc/SEED +
+
+
+
+
+ + ♻ ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graph plays a significant role in representing and analyzing complex +relationships in real-world applications such as citation networks, social +networks, and biological data. Recently, Large Language Models (LLMs), which +have achieved tremendous success in various domains, have also been leveraged +in graph-related tasks to surpass traditional Graph Neural Networks (GNNs) +based methods and yield state-of-the-art performance. In this survey, we first +present a comprehensive review and analysis of existing methods that integrate +LLMs with graphs. First of all, we propose a new taxonomy, which organizes +existing methods into three categories based on the role (i.e., enhancer, +predictor, and alignment component) played by LLMs in graph-related tasks. Then +we systematically survey the representative methods along the three categories +of the taxonomy. Finally, we discuss the remaining limitations of existing +studies and highlight promising avenues for future research. The relevant +papers are summarized and will be consistently updated at: +https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks. + +
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Statistical Test for Attention Map in Vision Transformer + + +
+ The Vision Transformer (ViT) demonstrates exceptional performance in various +computer vision tasks. Attention is crucial for ViT to capture complex +wide-ranging relationships among image patches, allowing the model to weigh the +importance of image patches and aiding our understanding of the decision-making +process. However, when utilizing the attention of ViT as evidence in +high-stakes decision-making tasks such as medical diagnostics, a challenge +arises due to the potential of attention mechanisms erroneously focusing on +irrelevant regions. In this study, we propose a statistical test for ViT's +attentions, enabling us to use the attentions as reliable quantitative evidence +indicators for ViT's decision-making with a rigorously controlled error rate. +Using the framework called selective inference, we quantify the statistical +significance of attentions in the form of p-values, which enables the +theoretically grounded quantification of the false positive detection +probability of attentions. We demonstrate the validity and the effectiveness of +the proposed method through numerical experiments and applications to brain +image diagnoses. + +
+
+ comment: 42 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ Input Convex Lipschitz RNN: A Fast and Robust Approach for Engineering + Tasks + + +
+ Computational efficiency and adversarial robustness are critical factors in +real-world engineering applications. Yet, conventional neural networks often +fall short in addressing both simultaneously, or even separately. Drawing +insights from natural physical systems and existing literature, it is known +that an input convex architecture enhances computational efficiency, while a +Lipschitz-constrained architecture bolsters adversarial robustness. By +leveraging the strengths of convexity and Lipschitz continuity, we develop a +novel network architecture, termed Input Convex Lipschitz Recurrent Neural +Networks. This model outperforms existing recurrent units across a spectrum of +engineering tasks in terms of computational efficiency and adversarial +robustness. These tasks encompass a benchmark MNIST image classification, +real-world solar irradiance prediction for Solar PV system planning at LHT +Holdings in Singapore, and real-time Model Predictive Control optimization for +a chemical reactor. + +
+
+
+
+
+ + ♻ ☆ How Abilities in Large Language Models are Affected by Supervised + Fine-tuning Data Composition + + +
+ Large language models (LLMs) with enormous pre-training tokens and parameters
+exhibit diverse abilities, including math reasoning, code generation, and
+instruction following. These abilities are further enhanced by supervised
+fine-tuning (SFT). While the open-source community has explored ad-hoc SFT for
+enhancing individual capabilities, proprietary LLMs exhibit versatility across
+various skills. Therefore, understanding the facilitation of multiple abilities
+via SFT is paramount. In this study, we specifically focus on the interplay
+of data composition between mathematical reasoning, code generation, and
+general human-aligning abilities during SFT. We propose four intriguing
+research questions to explore the association between model performance and
+various factors including data amount, composition ratio, model size, and SFT
+strategies. Our experiments reveal that distinct capabilities scale differently
+and larger models generally show superior performance with the same amount of data.
+Mathematical reasoning and code generation consistently improve with increasing
+data amount, whereas general abilities plateau after roughly a thousand
+samples. Moreover, we observe that data composition appears to enhance various
+abilities under limited data conditions, yet can lead to performance conflicts
+when data is plentiful. Our findings also suggest that the amount of composition
+data influences performance more than the composition ratio. In our analysis of SFT
+strategies, we find that sequentially learning multiple skills risks
+catastrophic forgetting. Our proposed Dual-stage Mixed Fine-tuning (DMT)
+strategy offers a promising solution to learn multiple abilities with different
+scaling patterns.
+
+
+
+
+
+ + ♻ ☆ Input Convex LSTM: A Convex Approach for Fast Lyapunov-Based Model + Predictive Control + + +
+ Leveraging Input Convex Neural Networks (ICNNs), ICNN-based Model Predictive +Control (MPC) successfully attains globally optimal solutions by upholding +convexity within the MPC framework. However, current ICNN architectures +encounter the issue of vanishing/exploding gradients, which limits their +ability to serve as deep neural networks for complex tasks. Additionally, the +current neural network-based MPC, including conventional neural network-based +MPC and ICNN-based MPC, faces slower convergence speed when compared to MPC +based on first-principles models. In this study, we leverage the principles of +ICNNs to propose a novel Input Convex LSTM for Lyapunov-based MPC, with the +specific goal of reducing convergence time and mitigating the +vanishing/exploding gradient problem while ensuring closed-loop stability. From +a simulation study of a nonlinear chemical reactor, we observed a mitigation of +vanishing/exploding gradient problem and a reduction in convergence time, with +a percentage decrease of 46.7%, 31.3%, and 20.2% compared to baseline plain +RNN, plain LSTM, and Input Convex Recurrent Neural Networks, respectively. + +
+
+ comment: Submitted to 6th Annual Learning for Dynamics & Control Conference + (L4DC 2024) +
+
+
+
+
+ + ♻ ☆ Towards Efficient and Certified Recovery from Poisoning Attacks in + Federated Learning + + +
+ Federated learning (FL) is vulnerable to poisoning attacks, where malicious +clients manipulate their updates to affect the global model. Although various +methods exist for detecting those clients in FL, identifying malicious clients +requires sufficient model updates, and hence by the time malicious clients are +detected, FL models have been already poisoned. Thus, a method is needed to +recover an accurate global model after malicious clients are identified. +Current recovery methods rely on (i) all historical information from +participating FL clients and (ii) the initial model unaffected by the malicious +clients, leading to a high demand for storage and computational resources. In +this paper, we show that highly effective recovery can still be achieved based +on (i) selective historical information rather than all historical information +and (ii) a historical model that has not been significantly affected by +malicious clients rather than the initial model. In this scenario, while +maintaining comparable recovery performance, we can accelerate the recovery +speed and decrease memory consumption. Following this concept, we introduce +Crab, an efficient and certified recovery method, which relies on selective +information storage and adaptive model rollback. Theoretically, we demonstrate +that the difference between the global model recovered by Crab and the one +recovered by train-from-scratch can be bounded under certain assumptions. Our +empirical evaluation, conducted across three datasets over multiple machine +learning models, and a variety of untargeted and targeted poisoning attacks +reveals that Crab is both accurate and efficient, and consistently outperforms +previous approaches in terms of both recovery speed and memory consumption. + +
+
+
+
+
+ + ♻ ☆ Knowledge from Large-Scale Protein Contact Prediction Models Can Be + Transferred to the Data-Scarce RNA Contact Prediction Task + + +
+ RNA, whose functionality is largely determined by its structure, plays an +important role in many biological activities. The prediction of pairwise +structural proximity between each nucleotide of an RNA sequence can +characterize the structural information of the RNA. Historically, this problem +has been tackled by machine learning models using expert-engineered features +and trained on scarce labeled datasets. Here, we find that the knowledge +learned by a protein-coevolution Transformer-based deep neural network can be +transferred to the RNA contact prediction task. As protein datasets are orders +of magnitude larger than those for RNA contact prediction, our findings and the +subsequent framework greatly reduce the data scarcity bottleneck. Experiments +confirm that RNA contact prediction through transfer learning using a publicly +available protein model is greatly improved. Our findings indicate that the +learned structural patterns of proteins can be transferred to RNAs, opening up +potential new avenues for research. + +
+
+ comment: The code is available at + https://github.com/yiren-jian/CoT-RNA-Transfer +
+
+
+
+
+ + ♻ ☆ Neural Spectral Methods: Self-supervised learning in the spectral domain ICLR + + +
+ We present Neural Spectral Methods, a technique to solve parametric Partial +Differential Equations (PDEs), grounded in classical spectral methods. Our +method uses orthogonal bases to learn PDE solutions as mappings between +spectral coefficients. In contrast to current machine learning approaches which +enforce PDE constraints by minimizing the numerical quadrature of the residuals +in the spatiotemporal domain, we leverage Parseval's identity and introduce a +new training strategy through a \textit{spectral loss}. Our spectral loss +enables more efficient differentiation through the neural network, and +substantially reduces training complexity. At inference time, the computational +cost of our method remains constant, regardless of the spatiotemporal +resolution of the domain. Our experimental results demonstrate that our method +significantly outperforms previous machine learning approaches in terms of +speed and accuracy by one to two orders of magnitude on multiple different +problems. When compared to numerical solvers of the same accuracy, our method +demonstrates a $10\times$ increase in performance speed. + +
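To make the idea of a spectral loss concrete, here is a toy sketch (not the authors' code) for a 1D Poisson problem u''(x) = f(x) with a Fourier basis: by Parseval's identity, the L2 norm of the PDE residual is proportional to the sum of squared residual coefficients, so the loss can be computed directly from predicted spectral coefficients without any spatial quadrature. The mode count and random inputs below are placeholders.

```python
# Toy spectral (Parseval) loss for u''(x) = f(x) on [0, 2*pi], Fourier basis e^{ikx}.
import torch

def spectral_residual_loss(u_hat: torch.Tensor, f_hat: torch.Tensor) -> torch.Tensor:
    """u_hat, f_hat: complex Fourier coefficients for modes k = 0..K-1."""
    k = torch.arange(u_hat.shape[-1], dtype=u_hat.real.dtype)
    residual_hat = -(k ** 2) * u_hat - f_hat   # coefficients of u'' - f
    # Parseval: ||u'' - f||^2 is proportional to the sum of |residual_hat|^2.
    return (residual_hat.abs() ** 2).sum()

K = 8
u_hat = torch.randn(K, dtype=torch.complex64, requires_grad=True)  # network output
f_hat = torch.randn(K, dtype=torch.complex64)                      # forcing term
loss = spectral_residual_loss(u_hat, f_hat)
loss.backward()  # gradients flow back to the predicted coefficients
```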
+
+ comment: Accepted to International Conference on Learning Representations + (ICLR) 2024 +
+
+
+
+
+ + ♻ ☆ Granular-ball computing: an efficient, robust, and interpretable + adaptive multi-granularity representation and computation method + + +
+ Human cognition operates on a "Global-first" cognitive mechanism,
+prioritizing information processing based on coarse-grained details. This
+mechanism inherently possesses an adaptive multi-granularity description
+capacity, resulting in computational traits such as efficiency, robustness, and
+interpretability. In contrast, most existing computational methods rely on
+analysis at the finest, single granularity, which makes them less efficient,
+robust, and interpretable, and which is an important reason for the current lack of
+interpretability in neural networks. Multi-granularity granular-ball computing
+employs granular-balls of varying sizes to adaptively represent and envelop the
+sample space, facilitating learning based on these granular-balls. Given that
+the number of coarse-grained "granular-balls" is smaller than the number of sample points,
+granular-ball computing proves more efficient. Moreover, the inherent
+coarse-grained nature of granular-balls reduces susceptibility to fine-grained
+sample disturbances, enhancing robustness. The multi-granularity construct of
+granular-balls generates topological structures and coarse-grained
+descriptions, naturally augmenting interpretability. Granular-ball computing
+has successfully ventured into diverse AI domains, fostering the development of
+innovative theoretical methods, including granular-ball classifiers, clustering
+techniques, neural networks, rough sets, and evolutionary computing. This has
+notably ameliorated the efficiency, noise robustness, and interpretability of
+traditional methods. Overall, granular-ball computing is a rare and innovative
+theoretical approach in AI that can adaptively and simultaneously enhance
+efficiency, robustness, and interpretability. This article delves into the main
+application landscapes for granular-ball computing, aiming to equip future
+researchers with references and insights to refine and expand this promising
+theory.
+
+
+
+
+
+ + ♻ ☆ Distribution Fitting for Combating Mode Collapse in Generative + Adversarial Networks + + +
+ Mode collapse is a significant unsolved issue of generative adversarial +networks. In this work, we examine the causes of mode collapse from a novel +perspective. Due to the nonuniform sampling in the training process, some +sub-distributions may be missed when sampling data. As a result, even when the +generated distribution differs from the real one, the GAN objective can still +achieve the minimum. To address the issue, we propose a global distribution +fitting (GDF) method with a penalty term to confine the generated data +distribution. When the generated distribution differs from the real one, GDF +will make the objective harder to reach the minimal value, while the original +global minimum is not changed. To deal with the circumstance when the overall +real data is unreachable, we also propose a local distribution fitting (LDF) +method. Experiments on several benchmarks demonstrate the effectiveness and +competitive performance of GDF and LDF. + +
+
+
+
+
+ + ♻ ☆ Convergence Analysis of Fractional Gradient Descent + + +
+ Fractional derivatives are a well-studied generalization of integer order +derivatives. Naturally, for optimization, it is of interest to understand the +convergence properties of gradient descent using fractional derivatives. +Convergence analysis of fractional gradient descent is currently limited both +in the methods analyzed and the settings analyzed. This paper aims to fill in +these gaps by analyzing variations of fractional gradient descent in smooth and +convex, smooth and strongly convex, and smooth and non-convex settings. First, +novel bounds will be established bridging fractional and integer derivatives. +Then, these bounds will be applied to the aforementioned settings to prove +linear convergence for smooth and strongly convex functions and $O(1/T)$ +convergence for smooth and convex functions. Additionally, we prove $O(1/T)$ +convergence for smooth and non-convex functions using an extended notion of +smoothness - H\"older smoothness - that is more natural for fractional +derivatives. Finally, empirical results will be presented on the potential +speed up of fractional gradient descent over standard gradient descent as well +as the challenges of predicting which will be faster in general. + +
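As a rough illustration of what a fractional gradient step can look like, the sketch below uses a common simplification from the fractional-optimization literature: the integer-order gradient is rescaled by |x − c|^(1−α)/Γ(2−α) for a reference point c and order α in (0, 1). This is an assumed variant for illustration only, not necessarily the exact updates analyzed in the paper.

```python
# Minimal fractional-gradient-descent sketch (Caputo-type first-order simplification).
import math
import numpy as np

def fractional_gd(grad_f, x0, c, alpha=0.8, lr=0.1, steps=100):
    x = np.asarray(x0, dtype=float)
    c = np.asarray(c, dtype=float)
    scale = 1.0 / math.gamma(2.0 - alpha)
    for _ in range(steps):
        # Fractional "gradient": integer gradient rescaled by |x - c|^(1 - alpha).
        g = grad_f(x) * scale * np.abs(x - c) ** (1.0 - alpha)
        x = x - lr * g
    return x

# Example: minimize f(x) = ||x||^2 (gradient 2x) with reference point c = 0.1.
x_star = fractional_gd(lambda x: 2 * x, x0=[1.0, -2.0], c=[0.1, 0.1])
```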
+
+ comment: 24 pages, 4 figures. Added additional results for smooth and convex + functions +
+
+
+
+
+ + ♻ ☆ BioBridge: Bridging Biomedical Foundation Models via Knowledge Graphs ICLR 2024 + + +
+ Foundation models (FMs) are able to leverage large volumes of unlabeled data
+to demonstrate superior performance across a wide range of tasks. However, FMs
+developed for biomedical domains have largely remained unimodal, i.e.,
+independently trained and used for tasks on protein sequences alone, small
+molecule structures alone, or clinical data alone. To overcome this limitation
+of biomedical FMs, we present BioBridge, a novel parameter-efficient learning
+framework, to bridge independently trained unimodal FMs to establish multimodal
+behavior. BioBridge achieves this by utilizing Knowledge Graphs (KG) to learn
+transformations between one unimodal FM and another without fine-tuning any
+underlying unimodal FMs. Our empirical results demonstrate that BioBridge can
+beat the best baseline KG embedding methods (on average by around 76.3%) in
+cross-modal retrieval tasks. We also find that BioBridge demonstrates
+out-of-domain generalization ability by extrapolating to unseen modalities or
+relations. Additionally, we show that BioBridge presents itself as a
+general-purpose retriever that can aid biomedical multimodal question answering
+as well as enhance the guided generation of novel drugs.
+
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct approaches:
+learning-based methods, relying on robust deep neural networks, and
+optimization-based methods, applying complex mathematical transformations to
+warp images accordingly. Both paradigms offer advantages and
+disadvantages, and, in this work, we seek to combine their respective strengths
+into a single streamlined framework, using the outputs of the learning-based
+method as initial parameters for optimization while prioritizing computational
+power for the image pairs that offer the greatest loss. Our investigations
+showed improvements of up to 1.6% on test data, while maintaining the same
+inference time, and a substantial 1.0 percentage point gain in deformation
+field smoothness.
+
+
+
+
+
+ + ♻ ☆ CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in + Variational AutoEncoder + + +
+ Symmetries of input and latent vectors have provided valuable insights for
+disentanglement learning in VAEs. However, only a few works were proposed as an
+unsupervised method, and even these works require known factor information in
+training data. We propose a novel method, Composite Factor-Aligned Symmetry
+Learning (CFASL), which is integrated into VAEs for learning symmetry-based
+disentanglement in unsupervised learning without any knowledge of the dataset
+factor information. CFASL incorporates three novel features for learning
+symmetry-based disentanglement: 1) injecting inductive bias to align latent
+vector dimensions to factor-aligned symmetries within an explicit learnable
+symmetry codebook; 2) learning a composite symmetry to express unknown factor
+changes between two random samples by learning factor-aligned symmetries within
+the codebook; and 3) inducing group equivariant encoder and decoder in training VAEs
+with the two conditions. In addition, we propose an extended evaluation metric
+for multi-factor changes in comparison to disentanglement evaluation in VAEs.
+In quantitative and in-depth qualitative analysis, CFASL demonstrates a
+significant improvement of disentanglement under single-factor and
+multi-factor change conditions compared to state-of-the-art methods.
+
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Utilizing synthetic training data for the supervised classification of + rat ultrasonic vocalizations + + +
+ Murine rodents generate ultrasonic vocalizations (USVs) with frequencies that +extend to around 120kHz. These calls are important in social behaviour, and so +their analysis can provide insights into the function of vocal communication, +and its dysfunction. The manual identification of USVs, and subsequent +classification into different subcategories is time consuming. Although machine +learning approaches for identification and classification can lead to enormous +efficiency gains, the time and effort required to generate training data can be +high, and the accuracy of current approaches can be problematic. Here we +compare the detection and classification performance of a trained human against +two convolutional neural networks (CNNs), DeepSqueak and VocalMat, on audio +containing rat USVs. Furthermore, we test the effect of inserting synthetic +USVs into the training data of the VocalMat CNN as a means of reducing the +workload associated with generating a training set. Our results indicate that +VocalMat outperformed the DeepSqueak CNN on measures of call identification, +and classification. Additionally, we found that the augmentation of training +data with synthetic images resulted in a further improvement in accuracy, such +that it was sufficiently close to human performance to allow for the use of +this software in laboratory conditions. + +
+
+ comment: 25 pages, 5 main figures, 2 tables +
+
+
+
+
+ + ♻ ☆ A Deep Neural Network Based Reverse Radio Spectrogram Search Algorithm + + +
+ Modern radio astronomy instruments generate vast amounts of data, and the
+increasingly challenging radio frequency interference (RFI) environment
+necessitates ever-more sophisticated RFI rejection algorithms. The "needle in a
+haystack" nature of searches for transients and technosignatures requires us to
+develop methods that can determine whether a signal of interest has unique
+properties, or is a part of some larger set of pernicious RFI. In the past,
+this vetting has required onerous manual inspection of very large numbers of
+signals. In this paper we present a fast and modular deep learning algorithm to
+search for lookalike signals of interest in radio spectrogram data. First, we
+trained a β-Variational Autoencoder on signals returned by an energy detection
+algorithm. We then adapted a positional embedding layer from the classical
+Transformer architecture to embed additional metadata, which we demonstrate
+using a frequency-based embedding. Next we used the encoder component of the
+β-Variational Autoencoder to extract features from small (~715 Hz wide, with a
+resolution of 2.79 Hz per frequency bin) windows in the radio spectrogram. We
+used our algorithm to conduct a search for a given query (encoded signal of
+interest) on a set of signals (encoded features of searched items) to produce
+the top candidates with similar features. We successfully demonstrate that the
+algorithm retrieves signals with similar appearance, given only the original
+radio spectrogram data. This algorithm can be used to improve the efficiency of
+vetting signals of interest in technosignature searches, but could also be
+applied to a wider variety of searches for "lookalike" signals in large
+astronomical datasets.
+
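The retrieval step, ranking encoded spectrogram windows by similarity to an encoded query, can be sketched generically as a cosine-similarity lookup. The encoder, latent dimension, and random stand-in vectors below are placeholders, not the authors' trained model.

```python
# Minimal sketch of the "reverse search": rank candidate windows by cosine
# similarity of their latent features to the latent features of a query signal.
import numpy as np

def cosine_top_k(query_vec: np.ndarray, bank: np.ndarray, k: int = 10) -> np.ndarray:
    """query_vec: (d,) latent vector; bank: (n, d) latent vectors of candidate windows."""
    q = query_vec / (np.linalg.norm(query_vec) + 1e-12)
    b = bank / (np.linalg.norm(bank, axis=1, keepdims=True) + 1e-12)
    sims = b @ q
    return np.argsort(-sims)[:k]   # indices of the k most similar windows

# Example with random stand-in latents (d = 32, 1000 candidate windows).
rng = np.random.default_rng(0)
bank = rng.normal(size=(1000, 32))
query = rng.normal(size=32)
top = cosine_top_k(query, bank, k=5)
```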
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Exploring Local Explanations of Nonlinear Models Using Animated Linear + Projections + + +
+ The increased predictive power of machine learning models comes at the cost
+of increased complexity and loss of interpretability, particularly in
+comparison to parametric statistical models. This trade-off has led to the
+emergence of eXplainable AI (XAI), which provides methods, such as local
+explanations (LEs) and local variable attributions (LVAs), to shed light on how
+a model uses predictors to arrive at a prediction. These provide a point
+estimate of the linear variable importance in the vicinity of a single
+observation. However, LVAs tend not to effectively handle association between
+predictors. To understand how the interaction between predictors affects the
+variable importance estimate, we can convert LVAs into linear projections and
+use the radial tour. This is also useful for learning how a model has made a
+mistake, the effect of outliers, or the clustering of observations. The
+approach is illustrated with examples from categorical (penguin species,
+chocolate types) and quantitative (soccer/football salaries, house prices)
+response models. The methods are implemented in the R package cheem, available
+on CRAN.
+
+
+
+
+
+ + ♻ ☆ Deep Efficient Private Neighbor Generation for Subgraph Federated + Learning SDM 2024 + + +
+ Behemoth graphs are often fragmented and separately stored by multiple data +owners as distributed subgraphs in many realistic applications. Without harming +data privacy, it is natural to consider the subgraph federated learning +(subgraph FL) scenario, where each local client holds a subgraph of the entire +global graph, to obtain globally generalized graph mining models. To overcome +the unique challenge of incomplete information propagation on local subgraphs +due to missing cross-subgraph neighbors, previous works resort to the +augmentation of local neighborhoods through the joint FL of missing neighbor +generators and GNNs. Yet their technical designs have profound limitations +regarding the utility, efficiency, and privacy goals of FL. In this work, we +propose FedDEP to comprehensively tackle these challenges in subgraph FL. +FedDEP consists of a series of novel technical designs: (1) Deep neighbor +generation through leveraging the GNN embeddings of potential missing +neighbors; (2) Efficient pseudo-FL for neighbor generation through embedding +prototyping; and (3) Privacy protection through noise-less +edge-local-differential-privacy. We analyze the correctness and efficiency of +FedDEP, and provide theoretical guarantees on its privacy. Empirical results on +four real-world datasets justify the clear benefits of proposed techniques. + +
+
+ comment: Accepted to SDM 2024 +
+
+
+
+
+ + ♻ ☆ Rethinking Dimensional Rationale in Graph Contrastive Learning from + Causal Perspective AAAI2024 + + +
+ Graph contrastive learning is a general learning paradigm excelling at
+capturing invariant information from diverse perturbations in graphs. Recent
+works focus on exploring the structural rationale from graphs, thereby
+increasing the discriminability of the invariant information. However, such
+methods may lead graph models to mis-learn with respect to the
+interpretability of graphs, and thus the learned noisy and task-agnostic
+information interferes with the prediction of graphs. To this end, with the
+purpose of exploring the intrinsic rationale of graphs, we accordingly propose
+to capture the dimensional rationale from graphs, which has not received
+sufficient attention in the literature. The conducted exploratory experiments
+attest to the feasibility of the aforementioned roadmap. To elucidate the
+innate mechanism behind the performance improvement arising from the
+dimensional rationale, we rethink the dimensional rationale in graph
+contrastive learning from a causal perspective and further formalize the
+causality among the variables in the pre-training stage to build the
+corresponding structural causal model. On the basis of the understanding of the
+structural causal model, we propose the dimensional rationale-aware graph
+contrastive learning approach, which introduces a learnable dimensional
+rationale acquiring network and a redundancy reduction constraint. The
+learnable dimensional rationale acquiring network is updated by leveraging a
+bi-level meta-learning technique, and the redundancy reduction constraint
+disentangles the redundant features through a decorrelation process during
+learning. Empirically, compared with state-of-the-art methods, our method can
+yield significant performance boosts on various benchmarks with respect to
+discriminability and transferability. The code implementation of our method is
+available at https://github.com/ByronJi/DRGCL.
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ♻ ☆ A Novel Maximum-Entropy-Driven Technique for Low-Rank Orthogonal + Nonnegative Matrix Factorization with $\ell_0$-Norm sparsity Constraint + + +
+ In data-driven control and machine learning, a common requirement involves
+breaking down large matrices into smaller, low-rank factors that possess
+specific levels of sparsity. This paper introduces an innovative solution to
+the orthogonal nonnegative matrix factorization (ONMF) problem. The objective
+is to approximate input data by using two low-rank nonnegative matrices,
+adhering to both orthogonality and $\ell_0$-norm sparsity constraints. The
+proposed maximum-entropy-principle-based framework ensures orthogonality and
+sparsity of features or the mixing matrix, while maintaining nonnegativity in
+both. Additionally, the methodology offers a quantitative determination of the
+``true'' number of underlying features, a crucial hyperparameter for ONMF.
+Experimental evaluation on synthetic and standard datasets highlights the
+method's superiority in terms of sparsity, orthogonality, and computational
+speed compared to existing approaches. Notably, the proposed method achieves
+comparable or improved reconstruction errors in line with the literature.
+
+
+
+
+
+ + ♻ ☆ Diffusion Model with Perceptual Loss + + +
+ Diffusion models trained with mean squared error loss tend to generate +unrealistic samples. Current state-of-the-art models rely on classifier-free +guidance to improve sample quality, yet its surprising effectiveness is not +fully understood. In this paper, we show that the effectiveness of +classifier-free guidance partly originates from it being a form of implicit +perceptual guidance. As a result, we can directly incorporate perceptual loss +in diffusion training to improve sample quality. Since the score matching +objective used in diffusion training strongly resembles the denoising +autoencoder objective used in unsupervised training of perceptual networks, the +diffusion model itself is a perceptual network and can be used to generate +meaningful perceptual loss. We propose a novel self-perceptual objective that +results in diffusion models capable of generating more realistic samples. For +conditional generation, our method only improves sample quality without +entanglement with the conditional input and therefore does not sacrifice sample +diversity. Our method can also improve sample quality for unconditional +generation, which was not possible with classifier-free guidance before. + +
+
+
+
+
+ + ♻ ☆ Folding Attention: Memory and Power Optimization for On-Device + Transformer-based Streaming Speech Recognition + + +
+ Transformer-based models excel in speech recognition. Existing efforts to +optimize Transformer inference, typically for long-context applications, center +on simplifying attention score calculations. However, streaming speech +recognition models usually process a limited number of tokens each time, making +attention score calculation less of a bottleneck. Instead, the bottleneck lies +in the linear projection layers of multi-head attention and feedforward +networks, constituting a substantial portion of the model size and contributing +significantly to computation, memory, and power usage. + To address this bottleneck, we propose folding attention, a technique +targeting these linear layers, significantly reducing model size and improving +memory and power efficiency. Experiments on on-device Transformer-based +streaming speech recognition models show that folding attention reduces model +size (and corresponding memory consumption) by up to 24% and power consumption +by up to 23%, all without compromising model accuracy or computation overhead. + +
+
+
+
+
+ + ♻ ☆ Hybrid Parameter Search and Dynamic Model Selection for Mixed-Variable + Bayesian Optimization + + +
+ This paper presents a new type of hybrid model for Bayesian optimization (BO) +adept at managing mixed variables, encompassing both quantitative (continuous +and integer) and qualitative (categorical) types. Our proposed new hybrid +models (named hybridM) merge the Monte Carlo Tree Search structure (MCTS) for +categorical variables with Gaussian Processes (GP) for continuous ones. hybridM +leverages the upper confidence bound tree search (UCTS) for MCTS strategy, +showcasing the tree architecture's integration into Bayesian optimization. Our +innovations, including dynamic online kernel selection in the surrogate +modeling phase and a unique UCTS search strategy, position our hybrid models as +an advancement in mixed-variable surrogate models. Numerical experiments +underscore the superiority of hybrid models, highlighting their potential in +Bayesian optimization. + +
+
+ comment: 33 pages, 8 Figures +
+
+
+
+
+ + ♻ ☆ Enhancing Speech Emotion Recognition Through Differentiable Architecture + Search + + +
+ Speech Emotion Recognition (SER) is a critical enabler of emotion-aware +communication in human-computer interactions. Recent advancements in Deep +Learning (DL) have substantially enhanced the performance of SER models through +increased model complexity. However, designing optimal DL architectures +requires prior experience and experimental evaluations. Encouragingly, Neural +Architecture Search (NAS) offers a promising avenue to determine an optimal DL +model automatically. In particular, Differentiable Architecture Search (DARTS) +is an efficient method of using NAS to search for optimised models. This paper +proposes a DARTS-optimised joint CNN and LSTM architecture, to improve SER +performance, where the literature informs the selection of CNN and LSTM +coupling to offer improved performance. While DARTS has previously been applied +to CNN and LSTM combinations, our approach introduces a novel mechanism, +particularly in selecting CNN operations using DARTS. In contrast to previous +studies, we refrain from imposing constraints on the order of the layers for +the CNN within the DARTS cell; instead, we allow DARTS to determine the optimal +layer order autonomously. Experimenting with the IEMOCAP and MSP-IMPROV +datasets, we demonstrate that our proposed methodology achieves significantly +higher SER accuracy than hand-engineering the CNN-LSTM configuration. It also +outperforms the best-reported SER results achieved using DARTS on CNN-LSTM. + +
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+
+
+
+ + Multimedia 2 + +
+
+
+ + ☆ M2ORT: Many-To-One Regression Transformer for Spatial Transcriptomics + Prediction from Histopathology Images + + +
+ The advancement of Spatial Transcriptomics (ST) has facilitated the +spatially-aware profiling of gene expressions based on histopathology images. +Although ST data offers valuable insights into the micro-environment of tumors, +its acquisition cost remains expensive. Therefore, directly predicting the ST +expressions from digital pathology images is desired. Current methods usually +adopt existing regression backbones for this task, which ignore the inherent +multi-scale hierarchical data structure of digital pathology images. To address +this limit, we propose M2ORT, a many-to-one regression Transformer that can +accommodate the hierarchical structure of the pathology images through a +decoupled multi-scale feature extractor. Different from traditional models that +are trained with one-to-one image-label pairs, M2ORT accepts multiple pathology +images of different magnifications at a time to jointly predict the gene +expressions at their corresponding common ST spot, aiming at learning a +many-to-one relationship through training. We have tested M2ORT on three public +ST datasets and the experimental results show that M2ORT can achieve +state-of-the-art performance with fewer parameters and floating-point +operations (FLOPs). The code is available at: +https://github.com/Dootmaan/M2ORT/. + +
+
+
+
+
+ + ☆ CBVS: A Large-Scale Chinese Image-Text Benchmark for Real-World Short + Video Search Scenarios + + +
+ Vision-Language Models pre-trained on large-scale image-text datasets have
+shown superior performance in downstream tasks such as image retrieval. Most of
+the images for pre-training are presented in the form of open domain
+common-sense visual elements. In contrast, video covers in short video search
+scenarios are presented as user-originated content that provides important
+visual summaries of videos. In addition, a portion of the video covers come
+with manually designed cover texts that provide semantic complements. In order
+to fill in the gaps in short video cover data, we establish the first
+large-scale cover-text benchmark for Chinese short video search scenarios.
+Specifically, we release two large-scale datasets CBVS-5M/10M to provide short
+video covers, and the manual fine-labeling dataset CBVS-20K to provide real
+user queries, which serves as an image-text benchmark test in the Chinese short
+video search field. To integrate the semantics of cover text in the case of
+missing modalities, we propose UniCLIP, in which cover texts play a guiding role
+during training but are not relied upon at inference. Extensive evaluation
+on CBVS-20K demonstrates the excellent performance of our proposal. UniCLIP has
+been deployed to Tencent's online video search systems with hundreds of
+millions of visits and achieved significant gains. The complete dataset, code
+and checkpoints will be available upon release.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 71 + +
+
+
+ + ☆ ChatQA: Building GPT-4 Level Conversational QA Models + + +
+ In this work, we introduce ChatQA, a family of conversational question +answering (QA) models, that obtain GPT-4 level accuracies. Specifically, we +propose a two-stage instruction tuning method that can significantly improve +the zero-shot conversational QA results from large language models (LLMs). To +handle retrieval in conversational QA, we fine-tune a dense retriever on a +multi-turn QA dataset, which provides comparable results to using the +state-of-the-art query rewriting model while largely reducing deployment cost. +Notably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10 +conversational QA datasets (54.14 vs. 53.90), without relying on any synthetic +data from OpenAI GPT models. + +
+
+
+
+
+ + ☆ MM-Interleaved: Interleaved Image-Text Generative Modeling via + Multi-modal Feature Synchronizer + + +
+ Developing generative models for interleaved image-text data has both +research and practical value. It requires models to understand the interleaved +sequences and subsequently generate images and text. However, existing attempts +are limited by the issue that the fixed number of visual tokens cannot +efficiently capture image details, which is particularly problematic in the +multi-image scenarios. To address this, this paper presents MM-Interleaved, an +end-to-end generative model for interleaved image-text data. It introduces a +multi-scale and multi-image feature synchronizer module, allowing direct access +to fine-grained image features in the previous context during the generation +process. MM-Interleaved is end-to-end pre-trained on both paired and +interleaved image-text corpora. It is further enhanced through a supervised +fine-tuning phase, wherein the model improves its ability to follow complex +multi-modal instructions. Experiments demonstrate the versatility of +MM-Interleaved in recognizing visual details following multi-modal instructions +and generating consistent images following both textual and visual conditions. +Code and models are available at +\url{https://github.com/OpenGVLab/MM-Interleaved}. + +
+
+ comment: 20 pages, 9 figures, 17 tables +
+
+
+
+
+ + ☆ Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through + Text Reconstruction EACL 2024 + + +
+ Fine-grained few-shot entity extraction in the chemical domain faces two +unique challenges. First, compared with entity extraction tasks in the general +domain, sentences from chemical papers usually contain more entities. Moreover, +entity extraction models usually have difficulty extracting entities of +long-tailed types. In this paper, we propose Chem-FINESE, a novel +sequence-to-sequence (seq2seq) based few-shot entity extraction approach, to +address these two challenges. Our Chem-FINESE has two components: a seq2seq +entity extractor to extract named entities from the input sentence and a +seq2seq self-validation module to reconstruct the original input sentence from +extracted entities. Inspired by the fact that a good entity extraction system +needs to extract entities faithfully, our new self-validation module leverages +entity extraction results to reconstruct the original input sentence. Besides, +we design a new contrastive loss to reduce excessive copying during the +extraction process. Finally, we release ChemNER+, a new fine-grained chemical +entity extraction dataset that is annotated by domain experts with the ChemNER +schema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets +show that our newly proposed framework has contributed up to 8.26% and 6.84% +absolute F1-score gains respectively. + +
+
+ comment: 16 pages. Accepted by Findings of the Association for Computational + Linguistics: EACL 2024. Code and resources are available at + https://github.com/EagleW/Chem-FINESE +
+
+
+
+
+ + ☆ Beyond Reference-Based Metrics: Analyzing Behaviors of Open LLMs on + Data-to-Text Generation + + +
+ We investigate to which extent open large language models (LLMs) can generate +coherent and relevant text from structured data. To prevent bias from +benchmarks leaked into LLM training data, we collect Quintd-1: an ad-hoc +benchmark for five data-to-text (D2T) generation tasks, consisting of +structured data records in standard formats gathered from public APIs. We +leverage reference-free evaluation metrics and LLMs' in-context learning +capabilities, allowing us to test the models with no human-written references. +Our evaluation focuses on annotating semantic accuracy errors on token-level, +combining human annotators and a metric based on GPT-4. Our systematic +examination of the models' behavior across domains and tasks suggests that +state-of-the-art open LLMs with 7B parameters can generate fluent and coherent +text from various standard data formats in zero-shot settings. However, we also +show that semantic accuracy of the outputs remains a major issue: on our +benchmark, 80% of outputs of open LLMs contain a semantic error according to +human annotators (91% according to GPT-4). Our code, data, and model outputs +are available at https://d2t-llm.github.io. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Spatial-Temporal Large Language Model for Traffic Prediction + + +
+ Traffic prediction, a critical component for intelligent transportation +systems, endeavors to foresee future traffic at specific locations using +historical data. Although existing traffic prediction models often emphasize +developing complex neural network structures, their accuracy has not seen +improvements accordingly. Recently, Large Language Models (LLMs) have shown +outstanding capabilities in time series analysis. Differing from existing +models, LLMs progress mainly through parameter expansion and extensive +pre-training while maintaining their fundamental structures. In this paper, we +propose a Spatial-Temporal Large Language Model (ST-LLM) for traffic +prediction. Specifically, ST-LLM redefines the timesteps at each location as +tokens and incorporates a spatial-temporal embedding module to learn the +spatial location and global temporal representations of tokens. Then these +representations are fused to provide each token with unified spatial and +temporal information. Furthermore, we propose a novel partially frozen +attention strategy of the LLM, which is designed to capture spatial-temporal +dependencies for traffic prediction. Comprehensive experiments on real traffic +datasets offer evidence that ST-LLM outperforms state-of-the-art models. +Notably, the ST-LLM also exhibits robust performance in both few-shot and +zero-shot prediction scenarios. + +
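A sketch of what a "partially frozen attention" setup can look like with a Hugging Face GPT-2 backbone: freeze the whole model, then re-enable gradients only for the attention sub-modules of the last few transformer blocks. The choice of backbone and of how many layers to unfreeze are assumptions for illustration, not the exact ST-LLM configuration.

```python
# Illustrative partially-frozen-attention setup on a GPT-2 backbone.
from transformers import GPT2Model

def partially_frozen_attention(model: GPT2Model, n_unfrozen: int = 2) -> GPT2Model:
    for p in model.parameters():
        p.requires_grad = False                  # freeze everything
    for block in model.h[-n_unfrozen:]:          # GPT-2 transformer blocks live in model.h
        for p in block.attn.parameters():        # unfreeze attention weights only
            p.requires_grad = True
    return model

model = partially_frozen_attention(GPT2Model.from_pretrained("gpt2"), n_unfrozen=2)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
# Only the unfrozen attention parameters (plus any task-specific heads and the
# spatial-temporal embedding module) would be updated during training.
```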
+
+
+
+
+ + ☆ Marrying Adapters and Mixup to Efficiently Enhance the Adversarial + Robustness of Pre-Trained Language Models for Text Classification + + +
+ Existing works show that augmenting training data of neural networks using
+both clean and adversarial examples can enhance their generalizability under
+adversarial attacks. However, this training approach often leads to performance
+degradation on clean inputs. Additionally, it requires frequent re-training of
+the entire model to account for new attack types, resulting in significant and
+costly computations. Such limitations make adversarial training mechanisms less
+practical, particularly for complex Pre-trained Language Models (PLMs) with
+millions or even billions of parameters. To overcome these challenges while
+still harnessing the theoretical benefits of adversarial training, this study
+combines two concepts: (1) adapters, which enable parameter-efficient
+fine-tuning, and (2) Mixup, which trains NNs via convex combinations of
+data pairs. Intuitively, we propose to fine-tune PLMs through convex
+combinations of pairs of fine-tuned adapters rather than data pairs, with one
+adapter trained on clean examples and the other on adversarial examples. Our
+experiments show that the proposed method achieves the best trade-off between
+training efficiency and predictive performance, both with and without attacks,
+compared to other baselines on a variety of downstream tasks.
+
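The central operation, mixing the outputs of two fine-tuned adapters (one for clean and one for adversarial data) with a convex coefficient, can be sketched as below. The bottleneck adapter shape, hidden size, and Beta-distributed mixing coefficient are illustrative assumptions, not the paper's exact configuration.

```python
# Minimal sketch: output = lam * adapter_clean(h) + (1 - lam) * adapter_adv(h).
import torch
import torch.nn as nn

class Bottleneck(nn.Module):
    def __init__(self, dim: int = 768, bottleneck: int = 64):
        super().__init__()
        self.down, self.up = nn.Linear(dim, bottleneck), nn.Linear(bottleneck, dim)

    def forward(self, h):
        return h + self.up(torch.relu(self.down(h)))  # residual bottleneck adapter

class MixedAdapters(nn.Module):
    def __init__(self, adapter_clean: Bottleneck, adapter_adv: Bottleneck):
        super().__init__()
        self.clean, self.adv = adapter_clean, adapter_adv

    def forward(self, h, lam: float):
        return lam * self.clean(h) + (1.0 - lam) * self.adv(h)

mix = MixedAdapters(Bottleneck(), Bottleneck())
h = torch.randn(4, 16, 768)                           # (batch, seq, hidden) from a frozen PLM
lam = torch.distributions.Beta(1.0, 1.0).sample().item()
out = mix(h, lam)                                     # fed to the next frozen PLM layer
```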
+
+ comment: 10 pages and 2 figures +
+
+
+
+
+ + ☆ Power in Numbers: Robust reading comprehension by finetuning with four + adversarial sentences per example + + +
+ Recent models have achieved human level performance on the Stanford Question +Answering Dataset when using F1 scores to evaluate the reading comprehension +task. Yet, teaching machines to comprehend text has not been solved in the +general case. By appending one adversarial sentence to the context paragraph, +past research has shown that the F1 scores from reading comprehension models +drop almost in half. In this paper, I replicate past adversarial research with +a new model, ELECTRA-Small, and demonstrate that the new model's F1 score drops +from 83.9% to 29.2%. To improve ELECTRA-Small's resistance to this attack, I +finetune the model on SQuAD v1.1 training examples with one to five adversarial +sentences appended to the context paragraph. Like past research, I find that +the finetuned model on one adversarial sentence does not generalize well across +evaluation datasets. However, when finetuned on four or five adversarial +sentences the model attains an F1 score of more than 70% on most evaluation +datasets with multiple appended and prepended adversarial sentences. The +results suggest that with enough examples we can make models robust to +adversarial attacks. + +
+
+
+
+
+ + ☆ Communication-Efficient Personalized Federated Learning for + Speech-to-Text Tasks ICASSP 2024 + + +
+ To protect privacy and meet legal regulations, federated learning (FL) has
+gained significant attention for training speech-to-text (S2T) systems,
+including automatic speech recognition (ASR) and speech translation (ST).
+However, the commonly used FL approach (i.e., \textsc{FedAvg}) in S2T tasks
+typically suffers from extensive communication overhead due to multi-round
+interactions based on the whole model and performance degradation caused by
+data heterogeneity among clients. To address these issues, we propose a
+personalized federated S2T framework that introduces \textsc{FedLoRA}, a
+lightweight LoRA module for client-side tuning and interaction with the server
+to minimize communication overhead, and \textsc{FedMem}, a global model
+equipped with a $k$-nearest-neighbor ($k$NN) classifier that captures
+client-specific distributional shifts to achieve personalization and overcome
+data heterogeneity. Extensive experiments based on Conformer and Whisper
+backbone models on CoVoST and GigaSpeech benchmarks show that our approach
+significantly reduces the communication overhead on all S2T tasks and
+effectively personalizes the global model to overcome data heterogeneity.
+
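The kNN-based personalization can be sketched as interpolating the global model's output distribution with a distribution built from the client's local datastore of (hidden state, label) pairs. The distance kernel, temperature, and interpolation weight below are assumed hyperparameters for illustration.

```python
# Sketch of a kNN-augmented classifier: p = lam * p_knn + (1 - lam) * p_model.
import numpy as np

def knn_interpolate(p_model, hidden, keys, labels, n_classes, k=8, temp=10.0, lam=0.5):
    """p_model: (n_classes,) global-model distribution; hidden: (d,) query hidden state;
    keys: (n, d) datastore hidden states; labels: (n,) datastore labels."""
    dists = np.linalg.norm(keys - hidden, axis=1)
    nearest = np.argsort(dists)[:k]
    weights = np.exp(-dists[nearest] / temp)      # closer neighbors get more weight
    p_knn = np.zeros(n_classes)
    for idx, w in zip(nearest, weights):
        p_knn[labels[idx]] += w
    p_knn /= p_knn.sum()
    return lam * p_knn + (1.0 - lam) * p_model    # personalized output distribution
```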
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ☆ Code Prompting Elicits Conditional Reasoning Abilities in Text+Code LLMs + + +
+ Reasoning is a fundamental component for achieving language understanding. +Among the multiple types of reasoning, conditional reasoning, the ability to +draw different conclusions depending on some condition, has been understudied +in large language models (LLMs). Recent prompting methods, such as chain of +thought, have significantly improved LLMs on reasoning tasks. Nevertheless, +there is still little understanding of what triggers reasoning abilities in +LLMs. We hypothesize that code prompts can trigger conditional reasoning in +LLMs trained on text and code. We propose a chain of prompts that transforms a +natural language problem into code and prompts the LLM with the generated code. +Our experiments find that code prompts exhibit a performance boost between 2.6 +and 7.7 points on GPT 3.5 across multiple datasets requiring conditional +reasoning. We then conduct experiments to discover how code prompts elicit +conditional reasoning abilities and through which features. We observe that +prompts need to contain natural language text accompanied by high-quality code +that closely represents the semantics of the instance text. Furthermore, we +show that code prompts are more efficient, requiring fewer demonstrations, and +that they trigger superior state tracking of variables or key entities. + +
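The prompt chain can be sketched as a two-step template: first ask the model to translate the natural-language problem into code-like pseudocode whose conditionals mirror the instance text, then prompt it with that code to answer the question. The template wording and the `llm` callable are illustrative assumptions, not the paper's exact prompts.

```python
# Illustrative two-step "code prompting" chain.
TO_CODE = (
    "Rewrite the following problem as Python-like code. Keep the original text "
    "as comments and express each condition as an if-statement.\n\nProblem:\n{problem}\n"
)
ANSWER_WITH_CODE = (
    "Here is a problem and its code representation.\n\nProblem:\n{problem}\n\n"
    "Code:\n{code}\n\nUsing the code to track the conditions, answer: {question}\n"
)

def code_prompting(llm, problem: str, question: str) -> str:
    """llm: any callable str -> str (e.g., a wrapper around a chat-completion API)."""
    code = llm(TO_CODE.format(problem=problem))
    return llm(ANSWER_WITH_CODE.format(problem=problem, code=code, question=question))
```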
+
+ comment: Code, prompt templates, prompts, and outputs are publicly available + at https://github.com/UKPLab/arxiv2024-conditional-reasoning-llms +
+
+
+
+
+ + ☆ Antonym vs Synonym Distinction using InterlaCed Encoder NETworks + (ICE-NET) + + +
+ Antonym vs synonym distinction is a core challenge in lexico-semantic
+analysis and automated lexical resource construction. These pairs share a
+similar distributional context, which makes it harder to distinguish them.
+Leading research in this regard attempts to capture the properties of the
+relation pairs, i.e., symmetry, transitivity, and trans-transitivity. However,
+the inability of existing research to appropriately model the relation-specific
+properties limits their end performance. In this paper, we propose InterlaCed
+Encoder NETworks (i.e., ICE-NET) for antonym vs synonym distinction, which aims
+to capture and model the relation-specific properties of antonym and
+synonym pairs in order to perform the classification task in a
+performance-enhanced manner. Experimental evaluation using the benchmark
+datasets shows that ICE-NET outperforms the existing research by a relative
+score of up to 1.8% in F1-measure. We release the code for ICE-NET at
+https://github.com/asif6827/ICENET.
+
+
+
+
+
+ + ☆ Large Language Models for Scientific Information Extraction: An + Empirical Study for Virology ACL + + +
+ In this paper, we champion the use of structured and semantic content +representation of discourse-based scholarly communication, inspired by tools +like Wikipedia infoboxes or structured Amazon product descriptions. These +representations provide users with a concise overview, aiding scientists in +navigating the dense academic landscape. Our novel automated approach leverages +the robust text generation capabilities of LLMs to produce structured scholarly +contribution summaries, offering both a practical solution and insights into +LLMs' emergent abilities. + For LLMs, the prime focus is on improving their general intelligence as +conversational agents. We argue that these models can also be applied +effectively in information extraction (IE), specifically in complex IE tasks +within terse domains like Science. This paradigm shift replaces the traditional +modular, pipelined machine learning approach with a simpler objective expressed +through instructions. Our results show that finetuned FLAN-T5 with 1000x fewer +parameters than the state-of-the-art GPT-davinci is competitive for the task. + +
+
+ comment: 8 pages, 6 figures, Accepted as Findings of the ACL: EACL 2024 +
+
+
+
+
+ + ☆ Evolutionary Computation in the Era of Large Language Model: Survey and + Roadmap + + +
+ Large Language Models (LLMs), built upon Transformer-based architectures with
+massive pretraining on diverse data, have not only revolutionized natural
+language processing but also extended their prowess to various domains, marking
+a significant stride towards artificial general intelligence. The interplay
+between LLMs and Evolutionary Algorithms (EAs), despite differing in objectives
+and methodologies, reveals intriguing parallels, especially in their shared
+optimization nature, black-box characteristics, and proficiency in handling
+complex problems. Meanwhile, EAs can not only provide an optimization framework
+for further enhancing LLMs under black-box settings but also empower LLMs
+with flexible global search and iterative mechanisms in applications. On the
+other hand, LLMs' abundant domain knowledge enables EAs to perform smarter
+searches, while their text processing capability assists in deploying EAs across
+various tasks. Based on their complementary advantages, this paper presents a
+comprehensive review and forward-looking roadmap, categorizing their mutual
+inspiration into LLM-enhanced evolutionary optimization and EA-enhanced LLM.
+Some integrated synergy methods are further introduced to exemplify the
+amalgamation of LLMs and EAs in various application scenarios, including neural
+architecture search, code generation, software engineering, and text
+generation. As the first comprehensive review specifically focused on EA
+research in the era of LLMs, this paper provides a foundational stepping stone
+for understanding and harnessing the collaborative potential of LLMs and EAs.
+By presenting a comprehensive review, categorization, and critical analysis, we
+contribute to the ongoing discourse on the cross-disciplinary study of these
+two powerful paradigms. The identified challenges and future directions offer
+guidance to unlock the full potential of this innovative collaboration.
+
+
+ comment: evolutionary algorithm (EA), large language model (LLM), optimization + problem, prompt optimization, architecture search, code generation +
+
+
+
+
+ + ☆ Framing Analysis of Health-Related Narratives: Conspiracy versus + Mainstream Media + + +
+ Understanding how online media frame issues is crucial due to their impact on +public opinion. Research on framing using natural language processing +techniques mainly focuses on specific content features in messages and neglects +their narrative elements. Also, the distinction between framing in different +sources remains an understudied problem. We address those issues and +investigate how the framing of health-related topics, such as COVID-19 and +other diseases, differs between conspiracy and mainstream websites. We +incorporate narrative information into the framing analysis by introducing a +novel frame extraction approach based on semantic graphs. We find that +health-related narratives in conspiracy media are predominantly framed in terms +of beliefs, while mainstream media tend to present them in terms of science. We +hope our work offers new ways for a more nuanced frame analysis. + +
+
+
+
+
+ + ☆ Self-Rewarding Language Models + + +
+ We posit that to achieve superhuman agents, future models require superhuman feedback in order to provide an adequate training signal. Current approaches commonly train reward models from human preferences, which may be bottlenecked by the human performance level; moreover, these separate frozen reward models cannot learn to improve during LLM training. In this work, we study Self-Rewarding Language Models, where the language model itself is used via LLM-as-a-Judge prompting to provide its own rewards during training. We show that during Iterative DPO training, not only does instruction-following ability improve, but so does the model's ability to provide high-quality rewards to itself. Fine-tuning Llama 2 70B on three iterations of our approach yields a model that outperforms many existing systems on the AlpacaEval 2.0 leaderboard, including Claude 2, Gemini Pro, and GPT-4 0613. While only a preliminary study, this work opens the door to the possibility of models that can continually improve in both axes.
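The structure of one self-rewarding iteration can be sketched as follows. The generation and judging functions below are toy stand-ins rather than a real LLM API, and the judge prompt wording is an assumption; in the actual method the same LLM both samples the candidates and scores them before a DPO-style update.

```python
# Schematic sketch of one Self-Rewarding iteration: the same model would both
# generate candidate responses and score them via an LLM-as-a-Judge prompt,
# yielding preference pairs for a subsequent DPO-style update.
# `generate` and `judge_score` are placeholders, not a real LLM interface.
import random

JUDGE_PROMPT = ("Review the response below and assign a score from 1 to 5 "
                "for helpfulness and instruction adherence.\n{response}")

def generate(prompt: str, n: int = 4) -> list:
    # Placeholder: a real system would sample n responses from the LLM.
    return [f"response {i} to: {prompt}" for i in range(n)]

def judge_score(response: str) -> float:
    # Placeholder: a real system would prompt the *same* LLM with
    # JUDGE_PROMPT and parse the numeric score it returns.
    return random.uniform(1.0, 5.0)

def build_preference_pairs(prompts):
    pairs = []
    for p in prompts:
        candidates = generate(p)
        ranked = sorted(candidates, key=judge_score, reverse=True)
        # Highest-scored response becomes "chosen", lowest "rejected".
        pairs.append({"prompt": p, "chosen": ranked[0], "rejected": ranked[-1]})
    return pairs

pairs = build_preference_pairs(["Explain photosynthesis simply."])
print(pairs[0]["chosen"], "|", pairs[0]["rejected"])
# These pairs would then feed a DPO training step before the next iteration.
```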
+
+
+
+
+ + ☆ R-Judge: Benchmarking Safety Risk Awareness for LLM Agents + + +
+ Large language models (LLMs) have exhibited great potential in autonomously completing tasks across real-world applications. Despite this, these LLM agents introduce unexpected safety risks when operating in interactive environments. Whereas most prior studies center on the safety of LLM-generated content, this work addresses the imperative need to benchmark the behavioral safety of LLM agents within diverse environments. We introduce R-Judge, a benchmark crafted to evaluate the proficiency of LLMs in judging safety risks given agent interaction records. R-Judge comprises 162 agent interaction records, encompassing 27 key risk scenarios among 7 application categories and 10 risk types. It incorporates human consensus on safety with annotated safety risk labels and high-quality risk descriptions. Utilizing R-Judge, we conduct a comprehensive evaluation of 8 prominent LLMs commonly employed as the backbone for agents. The best-performing model, GPT-4, achieves 72.29% in contrast to the human score of 89.38%, showing considerable room for enhancing the risk awareness of LLMs. Notably, leveraging risk descriptions as environment feedback significantly improves model performance, revealing the importance of salient safety risk feedback. Furthermore, we design an effective chain-of-safety-analysis technique to aid the judgment of safety risks and conduct an in-depth case study to facilitate future research. R-Judge is publicly available at https://github.com/Lordog/R-Judge.
+
+
+
+
+ + ☆ Gender Bias in Machine Translation and The Era of Large Language Models + + +
+ This chapter examines the role of Machine Translation in perpetuating gender +bias, highlighting the challenges posed by cross-linguistic settings and +statistical dependencies. A comprehensive overview of relevant existing work +related to gender bias in both conventional Neural Machine Translation +approaches and Generative Pretrained Transformer models employed as Machine +Translation systems is provided. Through an experiment using ChatGPT (based on +GPT-3.5) in an English-Italian translation context, we further assess ChatGPT's +current capacity to address gender bias. The findings emphasize the ongoing +need for advancements in mitigating bias in Machine Translation systems and +underscore the importance of fostering fairness and inclusivity in language +technologies. + +
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Towards Hierarchical Spoken Language Dysfluency Modeling EACL + + +
+ Speech dysfluency modeling is the bottleneck for both speech therapy and language learning. However, there is no AI solution to systematically tackle this problem. We first propose to define the concepts of dysfluent speech and dysfluent speech modeling. We then present the Hierarchical Unconstrained Dysfluency Modeling (H-UDM) approach, which addresses both dysfluency transcription and detection to eliminate the need for extensive manual annotation. Furthermore, we introduce a simulated dysfluent dataset called VCTK++ to enhance the capabilities of H-UDM in phonetic transcription. Our experimental results demonstrate the effectiveness and robustness of our proposed methods in both transcription and detection tasks.
+
+ comment: 2024 EACL Long (main conference). arXiv admin note: substantial text + overlap with arXiv:2312.12810 +
+
+
+
+
+ + ☆ Advancing Large Multi-modal Models with Explicit Chain-of-Reasoning and + Visual Question Generation + + +
+ The increasing demand for intelligent systems capable of interpreting and reasoning about visual content requires the development of Large Multi-Modal Models (LMMs) that are not only accurate but also have explicit reasoning capabilities. This paper presents a novel approach to imbue an LMM with the ability to conduct explicit reasoning based on visual content and textual instructions. We introduce a system that can ask a question to acquire necessary knowledge, thereby enhancing the robustness and explicability of the reasoning process. Our method comprises the development of a novel dataset generated by a Large Language Model (LLM), designed to promote chain-of-thought reasoning combined with a question-asking mechanism. We design an LMM with strong region-awareness capabilities to address the intricate requirements of image-text alignment. The model undergoes a three-stage training phase, starting with large-scale image-text alignment using large-scale datasets, followed by instruction tuning, and finally fine-tuning with a focus on chain-of-thought reasoning. The results demonstrate a stride toward a more robust, accurate, and interpretable LMM, capable of reasoning explicitly and seeking information proactively when confronted with ambiguous visual input.
+
+
+
+
+ + ☆ Distantly Supervised Morpho-Syntactic Model for Relation Extraction + + +
+ The task of Information Extraction (IE) involves automatically converting unstructured textual content into structured data. Most research in this field concentrates on extracting all facts or a specific set of relationships from documents. In this paper, we present a method for the extraction and categorisation of an unrestricted set of relationships from text. Our method relies on morpho-syntactic extraction patterns obtained by a distant supervision method, and creates Syntactic and Semantic Indices to extract and classify candidate graphs. We evaluate our approach on six datasets built on Wikidata and Wikipedia. The evaluation shows that our approach can achieve Precision scores of up to 0.85, but with lower Recall and F1 scores. Our approach makes it possible to quickly create rule-based systems for Information Extraction and to build annotated datasets for training machine-learning and deep-learning based classifiers.
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Gradable ChatGPT Translation Evaluation + + +
+ ChatGPT, as a language model based on large-scale pre-training, has exerted a +profound influence on the domain of machine translation. In ChatGPT, a "Prompt" +refers to a segment of text or instruction employed to steer the model towards +generating a specific category of response. The design of the translation +prompt emerges as a key aspect that can wield influence over factors such as +the style, precision and accuracy of the translation to a certain extent. +However, there is a lack of a common standard and methodology on how to design +and select a translation prompt. Accordingly, this paper proposes a generic +taxonomy, which defines gradable translation prompts in terms of expression +type, translation style, POS information and explicit statement, thus +facilitating the construction of prompts endowed with distinct attributes +tailored for various translation tasks. Specific experiments and cases are +selected to validate and illustrate the effectiveness of the method. + +
+
+ comment: Under review in the journal Procesamiento del Lenguaje Natural +
+
+
+
+
+ + ☆ Better Explain Transformers by Illuminating Important Information + + +
+ Transformer-based models excel in various natural language processing (NLP) tasks, attracting countless efforts to explain their inner workings. Prior methods explain Transformers by focusing on the raw gradient and attention as token attribution scores, where non-relevant information is often considered during explanation computation, resulting in confusing results. In this work, we propose highlighting the important information and eliminating irrelevant information via a refined information flow on top of the layer-wise relevance propagation (LRP) method. Specifically, we identify syntactic and positional heads as important attention heads and focus on the relevance obtained from these important heads. Experimental results demonstrate that irrelevant information does distort output attribution scores and should therefore be masked during explanation computation. Compared to eight baselines on both classification and question-answering datasets, our method consistently outperforms them, with improvements of 3% to 33% on explanation metrics, providing superior explanation performance. Our anonymous code repository is available at: https://github.com/LinxinS97/Mask-LRP
+
+
+
+
+ + ☆ Sketch-Guided Constrained Decoding for Boosting Blackbox Large Language + Models without Logit Access + + +
+ Constrained decoding, a technique for enforcing constraints on language model +outputs, offers a way to control text generation without retraining or +architectural modifications. Its application is, however, typically restricted +to models that give users access to next-token distributions (usually via +softmax logits), which poses a limitation with blackbox large language models +(LLMs). This paper introduces sketch-guided constrained decoding (SGCD), a +novel approach to constrained decoding for blackbox LLMs, which operates +without access to the logits of the blackbox LLM. SGCD utilizes a locally +hosted auxiliary model to refine the output of an unconstrained blackbox LLM, +effectively treating this initial output as a "sketch" for further elaboration. +This approach is complementary to traditional logit-based techniques and +enables the application of constrained decoding in settings where full model +transparency is unavailable. We demonstrate the efficacy of SGCD through +experiments in closed information extraction and constituency parsing, showing +how it enhances the utility and flexibility of blackbox LLMs for complex NLP +tasks. + +
+
+
+
+
+ + ☆ Meme-ingful Analysis: Enhanced Understanding of Cyberbullying in Memes + Through Multimodal Explanations EACL2024 + + +
+ Internet memes have gained significant influence in communicating political, psychological, and sociocultural ideas. While memes are often humorous, there has been a rise in the use of memes for trolling and cyberbullying. Although a wide variety of effective deep learning-based models have been developed for detecting offensive multimodal memes, only a few works have addressed the explainability aspect. Recent regulations, such as the "right to explanation" of the General Data Protection Regulation, have spurred research into developing interpretable models rather than focusing only on performance. Motivated by this, we introduce MultiBully-Ex, the first benchmark dataset for multimodal explanation of code-mixed cyberbullying memes. Here, both visual and textual modalities are highlighted to explain why a given meme is cyberbullying. A Contrastive Language-Image Pretraining (CLIP) projection-based multimodal shared-private multitask approach is proposed for visual and textual explanation of a meme. Experimental results demonstrate that training with multimodal explanations improves performance both in generating textual justifications and in more accurately identifying the visual evidence supporting a decision, with reliable performance improvements.
+
+ comment: EACL2024 +
+
+
+
+
+ + ☆ A Survey on Hardware Accelerators for Large Language Models + + +
+ Large Language Models (LLMs) have emerged as powerful tools for natural +language processing tasks, revolutionizing the field with their ability to +understand and generate human-like text. As the demand for more sophisticated +LLMs continues to grow, there is a pressing need to address the computational +challenges associated with their scale and complexity. This paper presents a +comprehensive survey on hardware accelerators designed to enhance the +performance and energy efficiency of Large Language Models. By examining a +diverse range of accelerators, including GPUs, FPGAs, and custom-designed +architectures, we explore the landscape of hardware solutions tailored to meet +the unique computational demands of LLMs. The survey encompasses an in-depth +analysis of architecture, performance metrics, and energy efficiency +considerations, providing valuable insights for researchers, engineers, and +decision-makers aiming to optimize the deployment of LLMs in real-world +applications. + +
+
+
+
+
+ + ☆ Attention-Based Recurrent Neural Network For Automatic Behavior Laying + Hen Recognition + + +
+ One of the interests of modern poultry farming is the vocalizations of laying hens, which contain very useful information on health and behavior. This information is used as health and well-being indicators that help breeders better monitor laying hens, which involves early detection of problems for rapid and more effective intervention. In this work, we focus on sound analysis for the recognition of the types of calls of laying hens, in order to propose a robust system for characterizing their behavior and enabling better monitoring. To do this, we first collected and annotated laying hen call signals, then designed an optimal acoustic characterization based on the combination of time- and frequency-domain features. We then used these features to build multi-label classification models based on recurrent neural networks, assigning a semantic class to each vocalization that characterizes the laying hen's behavior. Our model based on the combination of time- and frequency-domain features obtained the highest F1-score (F1 = 92.75), a gain of 17% over the models using only frequency-domain features and of 8% over the compared approaches from the literature.
+
+
+
+
+ + ☆ Evolutionary Multi-Objective Optimization of Large Language Model + Prompts for Balancing Sentiments + + +
+ The advent of large language models (LLMs) such as ChatGPT has attracted considerable attention in various domains due to their remarkable performance and versatility. As the use of these models continues to grow, the importance of effective prompt engineering has come to the fore. Prompt optimization emerges as a crucial challenge, as it has a direct impact on model performance and the extraction of relevant information. Recently, evolutionary algorithms (EAs) have shown promise in addressing this issue, paving the way for novel optimization strategies. In this work, we propose an evolutionary multi-objective (EMO) approach specifically tailored for prompt optimization, called EMO-Prompts, using sentiment analysis as a case study. We use sentiment analysis capabilities as our experimental targets. Our results demonstrate that EMO-Prompts effectively generates prompts capable of guiding the LLM to produce texts embodying two conflicting emotions simultaneously.
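A minimal sketch of such an evolutionary multi-objective loop over prompts is given below. The two objectives are stand-in scores counting emotion-related words; a real system like EMO-Prompts would instead score LLM outputs with sentiment classifiers, and the mutation operator shown is purely illustrative.

```python
# Toy sketch of evolutionary multi-objective prompt optimization in the
# spirit of EMO-Prompts. Objectives are stand-ins; real systems would score
# generated texts with sentiment models, not count words in the prompt.
import random

POS_WORDS = ["joyful", "hopeful", "warm"]
NEG_WORDS = ["fearful", "grim", "tense"]

def mutate(prompt: str) -> str:
    # Illustrative mutation: append one emotion cue word.
    return prompt + " " + random.choice(POS_WORDS + NEG_WORDS)

def objectives(prompt: str):
    # Stand-in objectives: strength of the push toward each emotion.
    pos = sum(w in prompt for w in POS_WORDS)
    neg = sum(w in prompt for w in NEG_WORDS)
    return pos, neg

def dominates(a, b):
    return all(x >= y for x, y in zip(a, b)) and any(x > y for x, y in zip(a, b))

population = ["Write a short story about a storm."]
for _ in range(20):                                  # evolutionary loop
    population.append(mutate(random.choice(population)))
    # Keep only the non-dominated (Pareto-optimal) prompts.
    population = [p for p in population
                  if not any(dominates(objectives(q), objectives(p))
                             for q in population if q != p)]

for p in population:
    print(objectives(p), p)
```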
+
+ comment: Accepted in EvoApps at EvoStar 2024 +
+
+
+
+
+ + ☆ MatSciRE: Leveraging Pointer Networks to Automate Entity and Relation + Extraction for Material Science Knowledge-base Construction + + +
+ Material science literature is a rich source of factual information about various categories of entities (like materials and compositions) and various relations between these entities, such as conductivity, voltage, etc. Automatically extracting this information to generate a material science knowledge base is a challenging task. In this paper, we propose MatSciRE (Material Science Relation Extractor), a Pointer Network-based encoder-decoder framework, to jointly extract entities and relations from material science articles as a triplet (entity1, relation, entity2). Specifically, we target battery materials and identify five relations to work on - conductivity, coulombic efficiency, capacity, voltage, and energy. Our proposed approach achieved a much better F1-score (0.771) than a previous attempt using ChemDataExtractor (0.716). The overall graphical framework of MatSciRE is shown in Fig 1. The material information is extracted from material science literature in the form of entity-relation triplets using MatSciRE.
+
+
+
+
+ + ☆ Simple and effective data augmentation for compositional generalization + + +
+ Compositional generalization, the ability to predict complex meanings from training on simpler sentences, poses challenges for powerful pretrained seq2seq models. In this paper, we show that data augmentation methods that sample MRs and backtranslate them can be effective for compositional generalization, but only if we sample from the right distribution. Remarkably, sampling from a uniform distribution performs almost as well as sampling from the test distribution, and greatly outperforms earlier methods that sampled from the training distribution. We further conduct experiments to investigate why this happens and where the benefit of such data augmentation methods comes from.
+
+
+
+
+ + ☆ All in How You Ask for It: Simple Black-Box Method for Jailbreak Attacks + + +
+ Large Language Models (LLMs) like ChatGPT face `jailbreak' challenges, where +safeguards are bypassed to produce ethically harmful prompts. This study +introduces a simple black-box method to effectively generate jailbreak prompts, +overcoming the limitations of high complexity and computational costs +associated with existing methods. The proposed technique iteratively rewrites +harmful prompts into non-harmful expressions using the target LLM itself, based +on the hypothesis that LLMs can directly sample safeguard-bypassing +expressions. Demonstrated through experiments with ChatGPT (GPT-3.5 and GPT-4) +and Gemini-Pro, this method achieved an attack success rate of over 80% within +an average of 5 iterations and remained effective despite model updates. The +jailbreak prompts generated were naturally-worded and concise, suggesting they +are less detectable. The results indicate that creating effective jailbreak +prompts is simpler than previously considered, and black-box jailbreak attacks +pose a more serious security threat. + +
+
+ comment: 11 pages, 3 figures, 2 tables +
+
+
+
+
+ + ☆ Instant Answering in E-Commerce Buyer-Seller Messaging ECIR 2024 + + +
+ E-commerce customers frequently seek detailed product information for purchase decisions, commonly contacting sellers directly with extended queries. This manual response requirement imposes additional costs and disrupts buyers' shopping experience, with response times ranging from hours to days. We seek to automate buyer inquiries to sellers in a leading e-commerce store using a domain-specific federated Question Answering (QA) system. The main challenge is adapting current QA systems, designed for single questions, to address detailed customer queries. We address this with a low-latency, sequence-to-sequence approach, MESSAGE-TO-QUESTION (M2Q). It reformulates buyer messages into succinct questions by identifying and extracting the most salient information from a message. Evaluation against baselines shows that M2Q yields relative increases of 757% in question understanding and 1,746% in answering rate from the federated QA system. Live deployment shows that automatic answering saves sellers from manually responding to millions of messages per year, and also accelerates customer purchase decisions by eliminating the need for buyers to wait for a reply.
+
+ comment: Accepted at ECIR 2024 +
+
+
+
+
+ + ☆ Leveraging Biases in Large Language Models: "bias-kNN'' for Effective + Few-Shot Learning ICASSP 2024 + + +
+ Large Language Models (LLMs) have shown significant promise in various +applications, including zero-shot and few-shot learning. However, their +performance can be hampered by inherent biases. Instead of traditionally sought +methods that aim to minimize or correct these biases, this study introduces a +novel methodology named ``bias-kNN''. This approach capitalizes on the biased +outputs, harnessing them as primary features for kNN and supplementing with +gold labels. Our comprehensive evaluations, spanning diverse domain text +classification datasets and different GPT-2 model sizes, indicate the +adaptability and efficacy of the ``bias-kNN'' method. Remarkably, this approach +not only outperforms conventional in-context learning in few-shot scenarios but +also demonstrates robustness across a spectrum of samples, templates and +verbalizers. This study, therefore, presents a unique perspective on harnessing +biases, transforming them into assets for enhanced model performance. + +
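To illustrate the bias-kNN idea described above, the sketch below uses an LLM's (possibly biased) label probabilities as features for a kNN classifier fitted on a handful of gold-labeled examples, rather than correcting those probabilities or using them directly. The probability function is a toy stand-in for real GPT-2 verbalizer scores, and the texts and labels are invented.

```python
# Minimal sketch of "bias-kNN": biased label probabilities from an LLM serve
# as *features* for a kNN classifier over a few gold-labeled examples.
# `label_probs` is a deterministic toy stand-in, not a real LM call.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def label_probs(text: str) -> np.ndarray:
    # Placeholder: a real system would score verbalizer tokens
    # (e.g. "positive"/"negative") with the language model.
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    p = rng.random(2)
    return p / p.sum()

support_texts = ["great movie", "loved it", "boring plot", "terrible acting"]
support_labels = [1, 1, 0, 0]                       # gold few-shot labels

X = np.stack([label_probs(t) for t in support_texts])
knn = KNeighborsClassifier(n_neighbors=3).fit(X, support_labels)

query = "an absolute delight"
print(knn.predict([label_probs(query)]))            # prediction via bias-kNN
```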
+
+ comment: Accepted by the 49th IEEE International Conference on Acoustics, + Speech, and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ Controllable Decontextualization of Yes/No Question and Answers into + Factual Statements ECIR 2024 + + +
+ Yes/No or polar questions represent one of the main linguistic question categories. They consist of a main interrogative clause, for which the answer is binary (assertion or negation). Polar questions and answers (PQA) represent a valuable knowledge resource present in many community and other curated QA sources, such as forums or e-commerce applications. Using answers to polar questions alone in other contexts is not trivial. Answers are contextualized, and presume that the interrogative question clause and any shared knowledge between the asker and answerer are provided. We address the problem of controllable rewriting of answers to polar questions into decontextualized and succinct factual statements. We propose a Transformer sequence-to-sequence model that utilizes soft constraints to ensure controllable rewriting, such that the output statement is semantically equivalent to its PQA input. Evaluation on three separate PQA datasets, using automated and human evaluation metrics, shows that our proposed approach achieves the best performance compared to existing baselines.
+
+ comment: Accepted at ECIR 2024 +
+
+
+
+
+ + ☆ On the Audio Hallucinations in Large Audio-Video Language Models + + +
+ Large audio-video language models can generate descriptions for both video and audio. However, they sometimes ignore audio content, producing audio descriptions solely reliant on visual information. This paper refers to this as audio hallucination and analyzes it in large audio-video language models. We gather 1,000 sentences by inquiring about audio information and annotate whether each contains a hallucination; if a sentence is hallucinated, we also categorize the type of hallucination. The results reveal that 332 sentences are hallucinated, with distinct trends observed in nouns and verbs for each hallucination type. Based on this, we tackle the task of audio hallucination classification using pre-trained audio-text models in both zero-shot and fine-tuning settings. Our experimental results reveal that the zero-shot models achieve higher performance (52.2% F1) than random (40.3%), while the fine-tuned models reach 87.9%, outperforming the zero-shot models.
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ A Comparative Study on Annotation Quality of Crowdsourcing and LLM via + Label Aggregation ICASSP 2024 + + +
+ Whether Large Language Models (LLMs) can outperform crowdsourcing on data annotation tasks has recently been attracting interest. Some works have examined this question by comparing the average performance of individual crowd workers and LLM workers on specific NLP tasks, collecting new datasets for the purpose. However, on the one hand, existing datasets for studying annotation quality in crowdsourcing have not yet been utilized in such evaluations, although they could provide reliable evaluations from a different viewpoint. On the other hand, the quality of aggregated labels is crucial because, when crowdsourcing is used, the labels eventually collected are those aggregated from multiple crowd labels for the same instances. Therefore, in this paper, we first investigate which existing crowdsourcing datasets can be used for a comparative study and create a benchmark. We then compare the quality of individual crowd labels and LLM labels, and evaluate the aggregated labels. In addition, we propose a Crowd-LLM hybrid label aggregation method and verify its performance. We find that adding LLM labels from good LLMs to existing crowdsourcing datasets can enhance the quality of the aggregated labels, which is also higher than the quality of the LLM labels themselves.
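A simple way to picture hybrid crowd-LLM aggregation is to treat the LLM label as one extra annotator and take a majority vote, as in the sketch below. The paper's actual aggregation method may weight or model annotators differently; the labels here are invented for illustration.

```python
# Illustration of hybrid crowd + LLM label aggregation by majority vote:
# the LLM label is treated as one additional annotator. The paper's
# Crowd-LLM method may differ; this is only a sketch of the idea.
from collections import Counter

def aggregate(crowd_labels, llm_label=None):
    votes = list(crowd_labels)
    if llm_label is not None:
        votes.append(llm_label)            # LLM acts as an extra worker
    return Counter(votes).most_common(1)[0][0]   # majority label

crowd = ["entailment", "neutral", "entailment"]
print(aggregate(crowd))                            # crowd-only aggregation
print(aggregate(crowd, llm_label="neutral"))       # crowd + LLM aggregation
```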
+
+ comment: Accepted in ICASSP 2024 +
+
+
+
+
+ + ☆ Resolving Regular Polysemy in Named Entities + + +
+ Word sense disambiguation primarily addresses the lexical ambiguity of common words based on a predefined sense inventory. Conversely, proper names are usually considered to denote an ad-hoc real-world referent. Once the reference is decided, the ambiguity is purportedly resolved. However, proper names also exhibit ambiguities through appellativization, i.e., they act like common words and may denote different aspects of their referents. We propose to address the ambiguities of proper names through the lens of regular polysemy, which we formalize as dot objects. This paper introduces a combined word sense disambiguation (WSD) model for disambiguating common words against Chinese Wordnet (CWN) and proper names as dot objects. The model leverages the flexibility of a gloss-based model architecture, which takes advantage of the glosses and example sentences of CWN. We show that the model achieves competitive results on both common and proper nouns, even on a relatively sparse sense dataset. Aside from being a performant WSD tool, the model further facilitates the future development of the lexical resource.
+
+
+
+
+ + ☆ Large Language Model Lateral Spear Phishing: A Comparative Study in + Large-Scale Organizational Settings + + +
+ The critical threat of phishing emails has been further exacerbated by the +potential of LLMs to generate highly targeted, personalized, and automated +spear phishing attacks. Two critical problems concerning LLM-facilitated +phishing require further investigation: 1) Existing studies on lateral phishing +lack specific examination of LLM integration for large-scale attacks targeting +the entire organization, and 2) Current anti-phishing infrastructure, despite +its extensive development, lacks the capability to prevent LLM-generated +attacks, potentially impacting both employees and IT security incident +management. However, the execution of such investigative studies necessitates a +real-world environment, one that functions during regular business operations +and mirrors the complexity of a large organizational infrastructure. This +setting must also offer the flexibility required to facilitate a diverse array +of experimental conditions, particularly the incorporation of phishing emails +crafted by LLMs. This study is a pioneering exploration into the use of Large +Language Models (LLMs) for the creation of targeted lateral phishing emails, +targeting a large tier 1 university's operation and workforce of approximately +9,000 individuals over an 11-month period. It also evaluates the capability of +email filtering infrastructure to detect such LLM-generated phishing attempts, +providing insights into their effectiveness and identifying potential areas for +improvement. Based on our findings, we propose machine learning-based detection +techniques for such emails to detect LLM-generated phishing emails that were +missed by the existing infrastructure, with an F1-score of 98.96. + +
+
+
+
+
+ + ☆ Predicting Viral Rumors and Vulnerable Users for Infodemic Surveillance + + +
+ In the age of the infodemic, it is crucial to have tools for effectively monitoring the spread of rampant rumors that can quickly go viral, as well as identifying vulnerable users who may be more susceptible to spreading such misinformation. This proactive approach allows for timely preventive measures to be taken, mitigating the negative impact of false information on society. We propose a novel approach to predict viral rumors and vulnerable users using a unified graph neural network model. We pre-train network-based user embeddings and leverage a cross-attention mechanism between users and posts, together with a community-enhanced vulnerability propagation (CVP) method to improve user and propagation graph representations. Furthermore, we employ two multi-task training strategies to mitigate negative transfer effects among tasks in different settings, enhancing the overall performance of our approach. We also construct two datasets with ground-truth annotations on information virality and user vulnerability in rumor and non-rumor events, which are automatically derived from existing rumor detection datasets. Extensive evaluation results of our joint learning model confirm its superiority over strong baselines in all three tasks: rumor detection, virality prediction, and user vulnerability scoring. For instance, compared to the best baselines based on the Weibo dataset, our model makes 3.8% and 3.0% improvements on Accuracy and MacF1 for rumor detection, and reduces mean squared error (MSE) by 23.9% and 16.5% for virality prediction and user vulnerability scoring, respectively. Our findings suggest that our approach effectively captures the correlation between rumor virality and user vulnerability, leveraging this information to improve prediction performance and provide a valuable tool for infodemic surveillance.
+
+ comment: Accepted by IP&M +
+
+
+
+
+ + ☆ Curriculum Recommendations Using Transformer Base Model with InfoNCE + Loss And Language Switching Method + + +
+ The Curriculum Recommendations paradigm is dedicated to fostering learning equality within the ever-evolving realms of educational technology and curriculum development. It confronts the inherent obstacles posed by existing methodologies, notably content conflicts and disruptions introduced by language translation, hindrances that can impede the creation of an all-encompassing and personalized learning experience. The objective is to cultivate an educational environment that not only embraces diversity but also customizes learning experiences to the distinct needs of each learner. To overcome these challenges, our approach builds upon notable contributions in curriculum development and personalized learning and introduces three key innovations: the integration of a Transformer base model to enhance computational efficiency, the implementation of the InfoNCE loss for accurate content-topic matching, and the adoption of a language-switching strategy to alleviate translation-related ambiguities. Together, these innovations aim to forge a more equitable and effective learning journey for a diverse range of learners. Competitive cross-validation scores underscore the efficacy of sentence-transformers/LaBSE, which achieves 0.66314, showcasing our methodology's effectiveness across diverse linguistic nuances for content-alignment prediction. Index Terms: Curriculum Recommendation, Transformer model with InfoNCE Loss, Language Switching.
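For reference, the InfoNCE loss mentioned for content-topic matching is typically implemented with in-batch negatives, as in the generic PyTorch sketch below. This is a standard formulation, not the authors' exact code, and the embedding dimensions and temperature are illustrative.

```python
# A standard in-batch-negatives InfoNCE loss over paired (content, topic)
# embeddings. Generic formulation; dimensions and temperature are illustrative.
import torch
import torch.nn.functional as F

def info_nce(content_emb: torch.Tensor, topic_emb: torch.Tensor,
             temperature: float = 0.07) -> torch.Tensor:
    # Normalize, then score every content item against every topic in the batch.
    c = F.normalize(content_emb, dim=-1)
    t = F.normalize(topic_emb, dim=-1)
    logits = c @ t.T / temperature            # [batch, batch] similarity matrix
    targets = torch.arange(c.size(0))         # diagonal entries are the true pairs
    return F.cross_entropy(logits, targets)

content = torch.randn(8, 256)   # e.g. embeddings of content items (LaBSE-style)
topics = torch.randn(8, 256)    # embeddings of their aligned curriculum topics
print(info_nce(content, topics).item())
```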
+
+ comment: 4pages, 2 figures, ICAICA2023 +
+
+
+
+
+ + ☆ Can Large Language Model Summarizers Adapt to Diverse Scientific + Communication Goals? + + +
+ In this work, we investigate the controllability of large language models +(LLMs) on scientific summarization tasks. We identify key stylistic and content +coverage factors that characterize different types of summaries such as paper +reviews, abstracts, and lay summaries. By controlling stylistic features, we +find that non-fine-tuned LLMs outperform humans in the MuP review generation +task, both in terms of similarity to reference summaries and human preferences. +Also, we show that we can improve the controllability of LLMs with +keyword-based classifier-free guidance (CFG) while achieving lexical overlap +comparable to strong fine-tuned baselines on arXiv and PubMed. However, our +results also indicate that LLMs cannot consistently generate long summaries +with more than 8 sentences. Furthermore, these models exhibit limited capacity +to produce highly abstractive lay summaries. Although LLMs demonstrate strong +generic summarization competency, sophisticated content control without costly +fine-tuning remains an open problem for domain-specific applications. + +
+
+
+
+
+ + ☆ Learning High-Quality and General-Purpose Phrase Representations EACL 2024 + + +
+ Phrase representations play an important role in data science and natural language processing, benefiting various tasks like Entity Alignment, Record Linkage, Fuzzy Joins, and Paraphrase Classification. The current state-of-the-art method involves fine-tuning pre-trained language models for phrasal embeddings using contrastive learning. However, we have identified areas for improvement. First, these pre-trained models tend to be unnecessarily complex and require pre-training on a corpus with context sentences. Second, leveraging the phrase type and morphology gives phrase representations that are both more precise and more flexible. We propose an improved framework to learn phrase representations in a context-free fashion. The framework employs phrase type classification as an auxiliary task and incorporates character-level information more effectively into the phrase representation. Furthermore, we design three granularities of data augmentation to increase the diversity of training samples. Our experiments across a wide range of tasks show that our approach generates superior phrase embeddings compared to previous methods while requiring a smaller model size. The code is available at https://github.com/tigerchen52/PEARL
+
+ comment: Findings of EACL 2024 +
+
+
+
+
+ + ☆ Inconsistent dialogue responses and how to recover from them EACL 2024 + + +
+ One critical issue for chat systems is staying consistent about their own preferences, opinions, beliefs, and facts, which has been shown to be a difficult problem. In this work, we study methods to assess and bolster the utterance consistency of chat systems. A dataset is first developed for studying the inconsistencies, where inconsistent dialogue responses, explanations of the inconsistencies, and recovery utterances are authored by annotators. This covers the life span of inconsistencies, namely introduction, understanding, and resolution. Building on this, we introduce a set of tasks centered on dialogue consistency, specifically focused on its detection and resolution. Our experimental findings indicate that our dataset significantly aids progress in identifying and resolving conversational inconsistencies, and that current popular large language models like ChatGPT, while good at resolving inconsistencies, still struggle with detection.
+
+ comment: Accepted in EACL 2024. Code and dataset available at + https://github.com/mianzhang/CIDER +
+
+
+
+
+ + ☆ Bridging Cultural Nuances in Dialogue Agents through Cultural Value + Surveys EACL 2024 + + +
+ The cultural landscape of interactions with dialogue agents is a compelling +yet relatively unexplored territory. It's clear that various sociocultural +aspects -- from communication styles and beliefs to shared metaphors and +knowledge -- profoundly impact these interactions. To delve deeper into this +dynamic, we introduce cuDialog, a first-of-its-kind benchmark for dialogue +generation with a cultural lens. We also develop baseline models capable of +extracting cultural attributes from dialogue exchanges, with the goal of +enhancing the predictive accuracy and quality of dialogue agents. To +effectively co-learn cultural understanding and multi-turn dialogue +predictions, we propose to incorporate cultural dimensions with dialogue +encoding features. Our experimental findings highlight that incorporating +cultural value surveys boosts alignment with references and cultural markers, +demonstrating its considerable influence on personalization and dialogue +quality. To facilitate further exploration in this exciting domain, we publish +our benchmark publicly accessible at https://github.com/yongcaoplus/cuDialog. + +
+
+ comment: 16pages, 7 figures, EACL 2024 main +
+
+
+
+
+ + ☆ Noise Contrastive Estimation-based Matching Framework for Low-resource + Security Attack Pattern Recognition EACL 2024 + + +
+ Tactics, Techniques and Procedures (TTPs) represent sophisticated attack +patterns in the cybersecurity domain, described encyclopedically in textual +knowledge bases. Identifying TTPs in cybersecurity writing, often called TTP +mapping, is an important and challenging task. Conventional learning approaches +often target the problem in the classical multi-class or multilabel +classification setting. This setting hinders the learning ability of the model +due to a large number of classes (i.e., TTPs), the inevitable skewness of the +label distribution and the complex hierarchical structure of the label space. +We formulate the problem in a different learning paradigm, where the assignment +of a text to a TTP label is decided by the direct semantic similarity between +the two, thus reducing the complexity of competing solely over the large +labeling space. To that end, we propose a neural matching architecture with an +effective sampling-based learn-to-compare mechanism, facilitating the learning +process of the matching model despite constrained resources. + +
+
+ comment: accepted at EACL 2024, in ARR October 2023 +
+
+
+
+
+ + ♻ ☆ MetaTool Benchmark for Large Language Models: Deciding Whether to Use + Tools and Which to Use + + +
+ Large language models (LLMs) have garnered significant attention due to their impressive natural language processing (NLP) capabilities. Recently, many studies have focused on the tool utilization ability of LLMs. They primarily investigated how LLMs effectively collaborate with given specific tools. However, in scenarios where LLMs serve as intelligent agents, as seen in applications like AutoGPT and MetaGPT, LLMs are expected to engage in intricate decision-making processes that involve deciding whether to employ a tool and selecting the most suitable tool(s) from a collection of available tools to fulfill user requests. Therefore, in this paper, we introduce MetaTool, a benchmark designed to evaluate whether LLMs have tool usage awareness and can correctly choose tools. Specifically, we create a dataset called ToolE within the benchmark. This dataset contains various types of user queries in the form of prompts that trigger LLMs to use tools, including both single-tool and multi-tool scenarios. Subsequently, we set the tasks for both tool usage awareness and tool selection. We define four subtasks from different perspectives in tool selection, including tool selection with similar choices, tool selection in specific scenarios, tool selection with possible reliability issues, and multi-tool selection. We conduct experiments involving eight popular LLMs and find that the majority of them still struggle to effectively select tools, highlighting the existing gaps between LLMs and genuine intelligent agents. However, through the error analysis, we found there is still significant room for improvement. Finally, we conclude with insights for tool developers -- we strongly recommend that tool developers choose an appropriate rewrite model for generating new descriptions based on the downstream LLM the tool will apply to. Our code is available at https://github.com/HowieHwong/MetaTool.
+
+
+
+
+ + ♻ ☆ Functional Invariants to Watermark Large Transformers ICASSP 2024 + + +
+ The rapid growth of transformer-based models increases the concerns about their integrity and ownership insurance. Watermarking addresses this issue by embedding a unique identifier into the model, while preserving its performance. However, most existing approaches require optimizing the weights to imprint the watermark signal, which is not suitable at scale due to the computational cost. This paper explores watermarks with virtually no computational cost, applicable to a non-blind white-box setting (assuming access to both the original and watermarked networks). They generate functionally equivalent copies by leveraging the models' invariance, via operations like dimension permutations or scaling/unscaling. This makes it possible to watermark models without any change in their outputs while remaining stealthy. Experiments demonstrate the effectiveness of the approach and its robustness against various model transformations (fine-tuning, quantization, pruning), making it a practical solution to protect the integrity of large models.
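The kind of functional invariance such watermarks exploit can be demonstrated on a toy two-layer MLP: permuting the hidden units (rows of the first weight matrix and the matching columns of the second) leaves the network's function unchanged, so the chosen permutation itself can act as an identifier. This numpy sketch is illustrative only, not the paper's implementation.

```python
# Demonstration of a permutation invariance: relabeling the hidden units of a
# two-layer MLP (same permutation on W1's rows and W2's columns) leaves the
# function unchanged, so the permutation can encode a watermark. Sketch only.
import numpy as np

rng = np.random.default_rng(0)
d_in, d_hidden, d_out = 8, 16, 4
W1, b1 = rng.normal(size=(d_hidden, d_in)), rng.normal(size=d_hidden)
W2 = rng.normal(size=(d_out, d_hidden))

def mlp(x, W1, b1, W2):
    return W2 @ np.maximum(W1 @ x + b1, 0.0)       # ReLU MLP

perm = rng.permutation(d_hidden)                    # the "watermark"
W1_wm, b1_wm = W1[perm], b1[perm]                   # permute hidden units
W2_wm = W2[:, perm]                                 # permute matching columns

x = rng.normal(size=d_in)
print(np.allclose(mlp(x, W1, b1, W2), mlp(x, W1_wm, b1_wm, W2_wm)))  # True
```

Because the outputs are bit-for-bit equivalent in exact arithmetic, the watermark is invisible in the model's behavior and can only be read off by comparing weights against the original network, which matches the non-blind white-box setting described above.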
+
+ comment: Published at ICASSP 2024. Webpage at + https://pierrefdz.github.io/publications/invariancewm/ +
+
+
+
+
+ + ♻ ☆ Principled Instructions Are All You Need for Questioning LLaMA-1/2, + GPT-3.5/4 + + +
+ This paper introduces 26 guiding principles designed to streamline the +process of querying and prompting large language models. Our goal is to +simplify the underlying concepts of formulating questions for various scales of +large language models, examining their abilities, and enhancing user +comprehension on the behaviors of different scales of large language models +when feeding into different prompts. Extensive experiments are conducted on +LLaMA-1/2 (7B, 13B and 70B), GPT-3.5/4 to verify the effectiveness of the +proposed principles on instructions and prompts design. We hope that this work +can provide a better guide for researchers working on the prompting of large +language models. Project page is available at +https://github.com/VILA-Lab/ATLAS. + +
+
+ comment: Github at: https://github.com/VILA-Lab/ATLAS +
+
+
+
+
+ + ♻ ☆ Less is More for Long Document Summary Evaluation by LLMs EACL + + +
+ Large Language Models (LLMs) have shown promising performance in summary +evaluation tasks, yet they face challenges such as high computational costs and +the Lost-in-the-Middle problem where important information in the middle of +long documents is often overlooked. To address these issues, this paper +introduces a novel approach, Extract-then-Evaluate, which involves extracting +key sentences from a long source document and then evaluating the summary by +prompting LLMs. The results reveal that the proposed method not only +significantly reduces evaluation costs but also exhibits a higher correlation +with human evaluations. Furthermore, we provide practical recommendations for +optimal document length and sentence extraction methods, contributing to the +development of cost-effective yet more accurate methods for LLM-based text +generation evaluation. + +
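The Extract-then-Evaluate idea can be sketched as follows: select a small budget of sentences from the long source (here a simple lead heuristic, whereas the paper compares several extraction strategies) and evaluate the summary against only those. The prompt wording, budget, and scoring scale below are illustrative assumptions.

```python
# Sketch of Extract-then-Evaluate: extract a few key sentences from the long
# source, then build a compact evaluation prompt for the judging LLM.
# LEAD-style extraction and the prompt wording are illustrative only.
import re

def extract_key_sentences(document: str, budget: int = 5):
    sentences = re.split(r"(?<=[.!?])\s+", document.strip())
    return sentences[:budget]                 # simple lead heuristic

def build_eval_prompt(document: str, summary: str) -> str:
    extracted = " ".join(extract_key_sentences(document))
    return (
        "Score the summary from 1 (poor) to 5 (excellent) for consistency "
        "with the source.\n\nSource (extracted): " + extracted +
        "\n\nSummary: " + summary + "\n\nScore:"
    )

doc = ("Sentence one. Sentence two. Sentence three. Sentence four. "
       "Sentence five. Sentence six that would be dropped.")
print(build_eval_prompt(doc, "A short candidate summary."))
# The resulting prompt would then be sent to the evaluating LLM.
```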
+
+ comment: EACL (main) +
+
+
+
+
+ + ♻ ☆ Assertion Enhanced Few-Shot Learning: Instructive Technique for Large + Language Models to Generate Educational Explanations + + +
+ Human educators possess an intrinsic ability to anticipate and seek educational explanations from students, which drives them to pose thought-provoking questions when students cannot articulate these explanations independently. We aim to imbue Intelligent Tutoring Systems with this ability using the few-shot learning capability of Large Language Models. Our work proposes a novel prompting technique, Assertion Enhanced Few-Shot Learning, to facilitate the generation of accurate, detail-oriented educational explanations. Our central hypothesis is that, in the educational domain, few-shot demonstrations are a necessary but not sufficient condition for quality explanation generation. We conducted a study involving 12 in-service teachers, comparing our approach to Traditional Few-Shot Learning. The results show that Assertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and yields higher-quality explanations, as evaluated by teachers. We also conduct a qualitative ablation study to assess the impact of assertions and to provide educator-friendly prompting guidelines for generating explanations in their domain of interest.
+
+
+
+
+ + ♻ ☆ FactCHD: Benchmarking Fact-Conflicting Hallucination Detection + + +
+ Despite their impressive generative capabilities, LLMs are hindered by +fact-conflicting hallucinations in real-world applications. The accurate +identification of hallucinations in texts generated by LLMs, especially in +complex inferential scenarios, is a relatively unexplored area. To address this +gap, we present FactCHD, a dedicated benchmark designed for the detection of +fact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset +that spans various factuality patterns, including vanilla, multi-hop, +comparison, and set operation. A distinctive element of FactCHD is its +integration of fact-based evidence chains, significantly enhancing the depth of +evaluating the detectors' explanations. Experiments on different LLMs expose +the shortcomings of current approaches in detecting factual errors accurately. +Furthermore, we introduce Truth-Triangulator that synthesizes reflective +considerations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming +to yield more credible detection through the amalgamation of predictive results +and evidence. The benchmark dataset is available at +https://github.com/zjunlp/FactCHD. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ CodeKGC: Code Language Model for Generative Knowledge Graph Construction + + +
+ Current generative knowledge graph construction approaches usually fail to capture structural knowledge by simply flattening natural language into serialized texts or a specification language. However, large generative language models trained on structured data such as code have demonstrated impressive capabilities in understanding natural language for structural prediction and reasoning tasks. Intuitively, we address the task of generative knowledge graph construction with a code language model: given a code-format natural language input, the target is to generate triples, which can be framed as a code completion task. Specifically, we develop schema-aware prompts that effectively utilize the semantic structure within the knowledge graph. As code inherently possesses structure, such as class and function definitions, it serves as a useful model for prior semantic structural knowledge. Furthermore, we employ a rationale-enhanced generation method to boost the performance. Rationales provide intermediate steps, thereby improving knowledge extraction abilities. Experimental results indicate that the proposed approach can obtain better performance on benchmark datasets compared with baselines. Code and datasets are available at https://github.com/zjunlp/DeepKE/tree/main/example/llm.
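To illustrate the schema-as-code idea, the sketch below renders a tiny knowledge-graph schema as Python class and function definitions and then frames triple extraction as completing a `triples = [...]` list. The schema, relation names, and example sentence are invented for illustration and are not the CodeKGC prompts.

```python
# Illustration of rendering a KG schema as code so that triple extraction
# becomes code completion, in the spirit of CodeKGC. Schema, relations, and
# the example text are invented for illustration only.
SCHEMA = '''
class Entity:
    def __init__(self, name: str):
        self.name = name

class Person(Entity): pass
class Organization(Entity): pass

def works_for(head: Person, tail: Organization): ...
'''

def build_prompt(text: str) -> str:
    return (
        SCHEMA
        + f'\n# Extract triples from: "{text}"\n'
        + "# Complete the list below.\n"
        + "triples = [\n"
    )

prompt = build_prompt("Ada Lovelace collaborated with the Analytical Society.")
print(prompt)
# A code LLM would be asked to complete `triples = [...]`, e.g.
# [works_for(Person("Ada Lovelace"), Organization("Analytical Society"))]
```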
+
+ comment: ACM Transactions on Asian and Low-Resource Language Information + Processing +
+
+
+
+
+ + ♻ ☆ Conversational Process Modeling: Can Generative AI Empower Domain + Experts in Creating and Redesigning Process Models? + + +
+ AI-driven chatbots such as ChatGPT have caused tremendous hype lately. For BPM, several applications of AI-driven chatbots have been identified as promising for generating business value, including the explanation of process mining outcomes and the preparation of input data. However, a systematic analysis of chatbots for their support of conversational process modeling as a process-oriented capability is missing. This work aims at closing this gap by providing a systematic analysis of existing chatbots. Application scenarios are identified along the process life cycle. Then a systematic literature review on conversational process modeling is performed, resulting in a taxonomy of application scenarios for conversational process modeling, including paraphrasing and improvement of process descriptions. In addition, this work suggests and applies an evaluation method for the output of AI-driven chatbots with respect to the completeness and correctness of the process models. This method consists of a set of KPIs on a test set, a set of prompts for task and control flow extraction, as well as a survey with users. Based on the literature and the evaluation, recommendations for the usage (practical implications) and further development (research directions) of conversational process modeling are derived.
+
+
+
+
+ + ♻ ☆ Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation + with Large Language Models + + +
+ Large Language Models (LLMs) demonstrate impressive capabilities to generate +accurate code snippets given natural language intents in zero-shot, i.e., +without the need for specific fine-tuning. While prior studies have highlighted +the advantages of fine-tuning LLMs, this process incurs high computational +costs, making it impractical in resource-scarce environments, particularly for +models with billions of parameters. To address these challenges, previous +research explored In-Context Learning (ICL) as a strategy to guide the LLM +generative process with task-specific prompt examples. However, ICL introduces +inconveniences, such as the need for designing contextually relevant prompts +and the absence of learning task-specific parameters, thereby limiting +downstream task performance. In this context, we foresee Parameter-Efficient +Fine-Tuning (PEFT) techniques as a promising approach to efficiently specialize +LLMs to task-specific data while maintaining reasonable resource consumption. +In this paper, we deliver a comprehensive study of PEFT techniques for LLMs +under the automated code generation scenario. Our comprehensive investigation +of PEFT techniques for LLMs reveals their superiority and potential over ICL +across a diverse set of LLMs. Additionally, we demonstrate the extended +capabilities of PEFT, showcasing its ability to learn from two distinct +datasets jointly without compromising performance. Furthermore, our study +highlights the potential for tuning larger LLMs and significant reductions in +memory usage by combining PEFT with quantization. Therefore, this study opens +opportunities for broader applications of PEFT in software engineering +scenarios. Our code is available at +https://github.com/martin-wey/peft-llm-code/. + +
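As one concrete example of the PEFT techniques studied, the sketch below applies LoRA to a causal code LLM with the Hugging Face peft library. The model name, target module names, and hyperparameters are placeholders that must be adapted to the chosen architecture, and running it requires downloading the model weights.

```python
# Minimal LoRA setup with the Hugging Face peft library, as one example of a
# PEFT technique for code LLMs. Model name, target_modules, and hyperparameters
# are placeholders; adjust them to the architecture actually used.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

base = "codellama/CodeLlama-7b-hf"          # placeholder code LLM
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base)

lora = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],    # attention projections (LLaMA-style)
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()          # only the LoRA adapters are trained
# The wrapped model can then be fine-tuned on code-generation data as usual.
```

Combining such adapters with quantized base weights is what enables the memory reductions mentioned in the abstract, since only the small adapter matrices carry gradients.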
+
+
+
+
+ + ♻ ☆ Large Language Model-Enhanced Algorithm Selection: Towards Comprehensive + Algorithm Representation + + +
+ Algorithm selection aims to identify the most suitable algorithm for solving +a specific problem before execution, which has become a critical process of the +AutoML. Current mainstream algorithm selection techniques rely heavily on +feature representations of various problems and employ the performance of each +algorithm as supervised information. However, there is a significant research +gap concerning the consideration of algorithm features. This gap is primarily +attributed to the inherent complexity of algorithms, making it particularly +challenging to find a universally effective feature extraction method that is +applicable across a diverse range of algorithms. Unfortunately, neglecting this +aspect undoubtedly impacts the accuracy of algorithm selection and indirectly +necessitates an increased volume of problem data for training purposes. This +paper takes a significant stride towards addressing this gap by proposing an +approach that integrates algorithm representation into the algorithm selection +process. Specifically, our proposed model employs distinct modules to extract +representations of both problems and algorithms, where the algorithm +representation leverages the capabilities of pre-trained LLMs in the realm of +code comprehension. Following the extraction of embedding vectors for both +algorithms and problems, the most suitable algorithm is determined through +calculations of matching degrees. Our experiments not only validate the +effectiveness of the proposed model but also showcase the performance of +different embedded pre-trained LLMs, which suggests that the proposed algorithm +selection framework holds the potential to serve as a baseline task for +evaluating the code representation capabilities of LLMs. + +
+
+
+
+
+ + ♻ ☆ Debiasing Algorithm through Model Adaptation ICLR 2024 + + +
+ Large language models are becoming the go-to solution for various language tasks. However, with growing capacity, models are prone to rely on spurious correlations stemming from biases and stereotypes present in the training data. This work proposes a novel method for detecting and mitigating gender bias in language models. We perform causal analysis to identify problematic model components and discover that mid-upper feed-forward layers are most prone to convey biases. Based on the analysis results, we adapt the model by multiplying these layers by a linear projection. Our titular method, DAMA, significantly decreases bias as measured by diverse metrics while maintaining the model's performance on downstream tasks. We release code for our method and models, which retain LLaMA's state-of-the-art performance while being significantly less biased.
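The general mechanism of adapting a layer by multiplying it with a linear projection can be pictured with the numpy sketch below, where a feed-forward output matrix is projected so that a chosen "bias" direction is removed from its outputs. DAMA derives its projection from causal analysis rather than from a random direction; this only illustrates the weight edit itself.

```python
# Sketch of editing a feed-forward output matrix by left-multiplying it with a
# projection that nulls out a chosen direction. DAMA's actual projection comes
# from causal analysis; the direction here is random and purely illustrative.
import numpy as np

rng = np.random.default_rng(0)
d_model, d_ff = 16, 64
W_out = rng.normal(size=(d_model, d_ff))      # FFN output projection weights

bias_dir = rng.normal(size=d_model)
bias_dir /= np.linalg.norm(bias_dir)

# Projection matrix that removes the bias direction from the layer's output.
P = np.eye(d_model) - np.outer(bias_dir, bias_dir)
W_debiased = P @ W_out

h = rng.normal(size=d_ff)                     # some hidden activation
out = W_debiased @ h
print(abs(out @ bias_dir))                    # ~0: no component along bias_dir
```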
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Grammar-Constrained Decoding for Structured NLP Tasks without Finetuning EMNLP 2023 + + +
+ Despite their impressive performance, large language models (LMs) still +struggle with reliably generating complex output structures when not finetuned +to follow the required output format exactly. To address this issue, +grammar-constrained decoding (GCD) can be used to control the generation of +LMs, guaranteeing that the output follows a given structure. Most existing GCD +methods are, however, limited to specific tasks, such as parsing or code +generation. In this work, we demonstrate that formal grammars can describe the +output space for a much wider range of tasks and argue that GCD can serve as a +unified framework for structured NLP tasks in general. For increased +flexibility, we introduce input-dependent grammars, which allow the grammar to +depend on the input and thus enable the generation of different output +structures for different inputs. We then empirically demonstrate the power and +flexibility of GCD-enhanced LMs on (1) information extraction, (2) entity +disambiguation, and (3) constituency parsing. Our results indicate that +grammar-constrained LMs substantially outperform unconstrained LMs or even beat +task-specific finetuned models. Grammar constraints thus hold great promise for +harnessing off-the-shelf LMs for a wide range of structured NLP tasks, +especially where training data is scarce or finetuning is expensive. Code and +data: https://github.com/epfl-dlab/GCD. + +
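The core mechanism of grammar-constrained decoding can be illustrated with the toy below: at each step, tokens the grammar does not allow in the current state are masked out of the model's distribution before the next token is chosen. The tiny "grammar", states, and stand-in model scores are invented; a real GCD system drives a full parser over an actual LM vocabulary.

```python
# Toy illustration of grammar-constrained decoding: ungrammatical tokens are
# masked at every step, so any decoded sequence is well-formed by construction.
# The grammar, states, and "model scores" are invented stand-ins.
import numpy as np

VOCAB = ["[", "]", "PER", "LOC", ",", "<eos>"]
# Allowed next tokens per state of a tiny bracketed entity-list "grammar".
ALLOWED = {
    "start":  {"["},
    "open":   {"PER", "LOC"},
    "filled": {",", "]"},
    "end":    {"<eos>"},
}
NEXT_STATE = {"[": "open", "PER": "filled", "LOC": "filled",
              ",": "open", "]": "end", "<eos>": "closed"}

BASE_SCORES = {"[": 1.0, "]": 1.0, "PER": 2.0, "LOC": 1.5, ",": 0.5, "<eos>": 0.2}

def fake_model_logits():
    # Stand-in for the language model's next-token scores.
    return np.array([BASE_SCORES[t] for t in VOCAB])

state, output = "start", []
while state != "closed":
    logits = fake_model_logits()
    mask = np.array([tok in ALLOWED[state] for tok in VOCAB])
    logits[~mask] = -np.inf                   # forbid ungrammatical tokens
    tok = VOCAB[int(np.argmax(logits))]       # greedy constrained step
    output.append(tok)
    state = NEXT_STATE[tok]

print(" ".join(output))    # e.g. "[ PER ] <eos>" -- always grammatical
```

Input-dependent grammars, as proposed above, would simply make the ALLOWED sets a function of the input (for example, restricting entity tokens to spans that actually occur in the source text).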
+
+ comment: Accepted at EMNLP 2023 Main Conference +
+
+
+
+
+ + ♻ ☆ Understanding the Humans Behind Online Misinformation: An Observational + Study Through the Lens of the COVID-19 Pandemic + + +
+ The proliferation of online misinformation has emerged as one of the biggest +threats to society. Considerable efforts have focused on building +misinformation detection models, still the perils of misinformation remain +abound. Mitigating online misinformation and its ramifications requires a +holistic approach that encompasses not only an understanding of its intricate +landscape in relation to the complex issue and topic-rich information ecosystem +online, but also the psychological drivers of individuals behind it. Adopting a +time series analytic technique and robust causal inference-based design, we +conduct a large-scale observational study analyzing over 32 million COVID-19 +tweets and 16 million historical timeline tweets. We focus on understanding the +behavior and psychology of users disseminating misinformation during COVID-19 +and its relationship with the historical inclinations towards sharing +misinformation on Non-COVID domains before the pandemic. Our analysis +underscores the intricacies inherent to cross-domain misinformation, and +highlights that users' historical inclination toward sharing misinformation is +positively associated with their present behavior pertaining to misinformation +sharing on emergent topics and beyond. This work may serve as a valuable +foundation for designing user-centric inoculation strategies and +ecologically-grounded agile interventions for effectively tackling online +misinformation. + +
+
+
+
+
+ + ♻ ☆ Improving Domain Adaptation through Extended-Text Reading Comprehension + + +
+ To enhance the domain-specific capabilities of large language models, +continued pre-training on a domain-specific corpus is a prevalent method. +Recent work demonstrates that adapting models using reading comprehension data +formatted by regex-based patterns can significantly improve performance on +domain-specific tasks. However, regex-based patterns are incapable of parsing +raw corpora using domain-specific knowledge. Furthermore, the question and +answer pairs extracted directly from the corpus in predefined formats +offer limited context. To address this limitation, we improve reading +comprehension via an LLM and clustering. The LLM focuses on leveraging domain +knowledge within the corpus to refine the comprehension stage, while clustering +supplies relevant knowledge by extending the context to enrich the reading stage. +Additionally, our method incorporates parameter-efficient fine-tuning to +improve the efficiency of domain adaptation. In comparison to AdaptLLM, our +method achieves an improvement exceeding 5% in domain-specific tasks. Our code +will be available at https://github.com/microsoft/LMOps. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ♻ ☆ Contrastive Preference Optimization: Pushing the Boundaries of LLM + Performance in Machine Translation + + +
+ Moderate-sized large language models (LLMs) -- those with 7B or 13B +parameters -- exhibit promising machine translation (MT) performance. However, +even the top-performing 13B LLM-based translation models, like ALMA, do not +match the performance of state-of-the-art conventional encoder-decoder +translation models or larger-scale LLMs such as GPT-4. In this study, we bridge +this performance gap. We first assess the shortcomings of supervised +fine-tuning for LLMs in the MT task, emphasizing the quality issues present in +the reference data, despite being human-generated. Then, in contrast to SFT +which mimics reference translations, we introduce Contrastive Preference +Optimization (CPO), a novel approach that trains models to avoid generating +adequate but not perfect translations. Applying CPO to ALMA models with only +22K parallel sentences and 12M parameters yields significant improvements. The +resulting model, called ALMA-R, can match or exceed the performance of the WMT +competition winners and GPT-4 on WMT'21, WMT'22 and WMT'23 test datasets. + +
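As a rough illustration, the snippet below sketches a reference-free preference loss in the spirit of CPO: it contrasts the policy's log-probabilities of preferred and dis-preferred translations and adds a plain NLL term on the preferred output. The exact objective and regularization used in the paper may differ; treat the formulation and the hyperparameter `beta` as assumptions.

```python
import torch
import torch.nn.functional as F

def cpo_style_loss(logp_preferred: torch.Tensor,
                   logp_dispreferred: torch.Tensor,
                   beta: float = 0.1) -> torch.Tensor:
    """Sketch of a reference-free preference objective.

    Inputs are the policy's sequence log-probabilities of the preferred and
    dis-preferred translations for a batch of prompts. A plain NLL term on
    the preferred output is added as one common regularization choice.
    """
    preference = -F.logsigmoid(beta * (logp_preferred - logp_dispreferred))
    nll = -logp_preferred
    return (preference + nll).mean()

lp_w = torch.tensor([-12.3, -9.8])   # preferred translations (toy values)
lp_l = torch.tensor([-14.1, -11.0])  # dis-preferred translations (toy values)
print(cpo_style_loss(lp_w, lp_l))
```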
+
+
+
+
+ + ♻ ☆ MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and + Uni-Modal Adapter EMNLP + + +
+ Language Models (LMs) have demonstrated impressive molecule understanding +ability on various 1D text-related tasks. However, they inherently lack 2D +graph perception - a critical ability of human professionals in comprehending +molecules' topological structures. To bridge this gap, we propose MolCA: +Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal +Adapter. MolCA enables an LM (e.g., Galactica) to understand both text- and +graph-based molecular contents via the cross-modal projector. Specifically, the +cross-modal projector is implemented as a Q-Former to connect a graph encoder's +representation space and an LM's text space. Further, MolCA employs a uni-modal +adapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks. +Unlike previous studies that couple an LM with a graph encoder via cross-modal +contrastive learning, MolCA retains the LM's ability of open-ended text +generation and augments it with 2D graph information. To showcase its +effectiveness, we extensively benchmark MolCA on tasks of molecule captioning, +IUPAC name prediction, and molecule-text retrieval, on which MolCA +significantly outperforms the baselines. Our codes and checkpoints can be found +at https://github.com/acharkq/MolCA. + +
+
+ comment: EMNLP main conference. 9 pages +
+
+
+
+
+ + ♻ ☆ Panoptic Vision-Language Feature Fields + + +
+ Recently, methods have been proposed for 3D open-vocabulary semantic +segmentation. Such methods are able to segment scenes into arbitrary classes +based on text descriptions provided during runtime. In this paper, we propose +to the best of our knowledge the first algorithm for open-vocabulary panoptic +segmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature +Fields (PVLFF), learns a semantic feature field of the scene by distilling +vision-language features from a pretrained 2D model, and jointly fits an +instance feature field through contrastive learning using 2D instance segments +on input frames. Despite not being trained on the target classes, our method +achieves panoptic segmentation performance similar to the state-of-the-art +closed-set 3D systems on the HyperSim, ScanNet and Replica dataset and +additionally outperforms current 3D open-vocabulary systems in terms of +semantic segmentation. We ablate the components of our method to demonstrate +the effectiveness of our model architecture. Our code will be available at +https://github.com/ethz-asl/pvlff. + +
+
+ comment: This work has been accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ Developing ChatGPT for Biology and Medicine: A Complete Review of + Biomedical Question Answering + + +
+ ChatGPT explores a strategic blueprint of question answering (QA) in +delivering medical diagnosis, treatment recommendations, and other healthcare +support. This is achieved through the increasing incorporation of medical +domain data via natural language processing (NLP) and multimodal paradigms. By +transitioning the distribution of text, images, videos, and other modalities +from the general domain to the medical domain, these techniques have expedited +the progress of medical domain question answering (MDQA). They bridge the gap +between human natural language and sophisticated medical domain knowledge or +expert manual annotations, handling large-scale, diverse, unbalanced, or even +unlabeled data analysis scenarios in medical contexts. Central to our focus is +the utilization of language models and multimodal paradigms for medical question +answering, aiming to guide the research community in selecting appropriate +mechanisms for their specific medical research requirements. Specialized tasks +such as unimodal-related question answering, reading comprehension, reasoning, +diagnosis, relation extraction, probability modeling, and others, as well as +multimodal-related tasks like vision question answering, image captioning, +cross-modal retrieval, report summarization, and generation, are discussed in +detail. Each section delves into the intricate specifics of the respective +method under consideration. This paper highlights the structures and +advancements of medical domain explorations against general domain methods, +emphasizing their applications across different tasks and datasets. It also +outlines current challenges and opportunities for future medical domain +research, paving the way for continued innovation and application in this +rapidly evolving field. + +
+
+ comment: There are some mistakes in introducing medical language question + answering Models and medical multimodal question answering models, such as + their dataset should be displayed for pretraining +
+
+
+
+
+ + ♻ ☆ Exploring the Reasoning Abilities of Multimodal Large Language Models + (MLLMs): A Comprehensive Survey on Emerging Trends in Multimodal Reasoning + + +
+ Strong Artificial Intelligence (Strong AI) or Artificial General Intelligence +(AGI) with abstract reasoning ability is the goal of next-generation AI. Recent +advancements in Large Language Models (LLMs), along with the emerging field of +Multimodal Large Language Models (MLLMs), have demonstrated impressive +capabilities across a wide range of multimodal tasks and applications. +Particularly, various MLLMs, each with distinct model architectures, training +data, and training stages, have been evaluated across a broad range of MLLM +benchmarks. These studies have, to varying degrees, revealed different aspects +of the current capabilities of MLLMs. However, the reasoning abilities of MLLMs +have not been systematically investigated. In this survey, we comprehensively +review the existing evaluation protocols of multimodal reasoning, categorize +and illustrate the frontiers of MLLMs, introduce recent trends in applications +of MLLMs on reasoning-intensive tasks, and finally discuss current practices +and future directions. We believe our survey establishes a solid base and sheds +light on this important topic, multimodal reasoning. + +
+
+
+
+
+ + ♻ ☆ ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration + Measure ICLR 2023 + + +
+ Studies have shown that modern neural networks tend to be poorly calibrated +due to over-confident predictions. Traditionally, post-processing methods have +been used to calibrate the model after training. In recent years, various +trainable calibration measures have been proposed to incorporate them directly +into the training process. However, these methods all incorporate internal +hyperparameters, and the performance of these calibration objectives relies on +tuning these hyperparameters, incurring more computational costs as the size of +neural networks and datasets become larger. As such, we present Expected +Squared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable +calibration objective loss, where we view the calibration error from the +perspective of the squared difference between the two expectations. With +extensive experiments on several architectures (CNNs, Transformers) and +datasets, we demonstrate that (1) incorporating ESD into the training improves +model calibration in various batch size settings without the need for internal +hyperparameter tuning, (2) ESD yields the best-calibrated results compared with +previous approaches, and (3) ESD drastically improves the computational costs +required for calibration during training due to the absence of internal +hyperparameter. The code is publicly accessible at +https://github.com/hee-suk-yoon/ESD. + +
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ Separate the Wheat from the Chaff: Model Deficiency Unlearning via + Parameter-Efficient Module Operation AAAI 2024 + + +
+ Large language models (LLMs) have been widely used in various applications +but are known to suffer from issues related to untruthfulness and toxicity. +While parameter-efficient modules (PEMs) have demonstrated their effectiveness +in equipping models with new skills, leveraging PEMs for deficiency unlearning +remains underexplored. In this work, we propose a PEMs operation approach, +namely Extraction-before-Subtraction (Ext-Sub), to enhance the truthfulness and +detoxification of LLMs through the integration of ``expert'' PEM and +``anti-expert'' PEM. Remarkably, even anti-expert PEM possess valuable +capabilities due to their proficiency in generating fabricated content, which +necessitates language modeling and logical narrative competence. Rather than +merely negating the parameters, our approach involves extracting and +eliminating solely the deficiency capability within anti-expert PEM while +preserving the general capabilities. To evaluate the effectiveness of our +approach in terms of truthfulness and detoxification, we conduct extensive +experiments on LLMs, encompassing additional abilities such as language +modeling and mathematical reasoning. Our empirical results demonstrate that our +approach effectively improves truthfulness and detoxification, while largely +preserving the fundamental abilities of LLMs. + +
+
+ comment: AAAI 2024; The first two authors contributed equally to this paper +
+
+
+
+
+ + ♻ ☆ LoraHub: Efficient Cross-Task Generalization via Dynamic LoRA + Composition + + +
+ Low-rank adaptations (LoRA) are often employed to fine-tune large language +models (LLMs) for new tasks. This paper investigates LoRA composability for +cross-task generalization and introduces LoraHub, a simple framework devised +for the purposive assembly of LoRA modules trained on diverse given tasks, with +the objective of achieving adaptable performance on unseen tasks. With just a +few examples from a new task, LoraHub can fluidly combine multiple LoRA +modules, eliminating the need for human expertise and assumptions. Notably, the +composition requires neither additional model parameters nor gradients. +Empirical results on the Big-Bench Hard benchmark suggest that LoraHub, while +not surpassing the performance of in-context learning, offers a notable +performance-efficiency trade-off in few-shot scenarios by employing a +significantly reduced number of tokens per example during inference. Notably, +LoraHub establishes a better upper bound compared to in-context learning when +paired with different demonstration examples, demonstrating its potential for +future development. Our vision is to establish a platform for LoRA modules, +empowering users to share their trained LoRA modules. This collaborative +approach facilitates the seamless application of LoRA modules to novel tasks, +contributing to an adaptive ecosystem. Our code is available at +https://github.com/sail-sg/lorahub, and all the pre-trained LoRA modules are +released at https://huggingface.co/lorahub. + +
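The composition step can be pictured as a weighted sum of low-rank updates, as in the sketch below. The per-module weights are what LoraHub-style methods search over using a few examples from the new task (a gradient-free search in the paper; any optimizer could stand in here), and the shapes and coefficients are illustrative assumptions.

```python
import torch

def compose_lora(lora_modules, weights):
    """Combine several LoRA modules for one linear layer into a single
    low-rank update: delta_W = sum_i w_i * (B_i @ A_i).

    `lora_modules` is a list of (A, B) pairs with A of shape (r, d_in) and
    B of shape (d_out, r); `weights` are the per-module coefficients.
    """
    d_out = lora_modules[0][1].shape[0]
    d_in = lora_modules[0][0].shape[1]
    delta = torch.zeros(d_out, d_in)
    for (A, B), w in zip(lora_modules, weights):
        delta += w * (B @ A)
    return delta

d_in, d_out, r = 32, 32, 4
mods = [(torch.randn(r, d_in), torch.randn(d_out, r)) for _ in range(3)]
delta_W = compose_lora(mods, weights=[0.5, 0.3, 0.2])
print(delta_W.shape)  # torch.Size([32, 32]); add to the base weight matrix
```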
+
+ comment: Add more related work and experimental results +
+
+
+
+
+ + ♻ ☆ SpecTr: Fast Speculative Decoding via Optimal Transport NeurIPS 2023 + + +
+ Autoregressive sampling from large language models has led to +state-of-the-art results in several natural language tasks. However, +autoregressive sampling generates tokens one at a time making it slow, and even +prohibitive in certain tasks. One way to speed up sampling is +$\textit{speculative decoding}$: use a small model to sample a $\textit{draft}$ +(block or sequence of tokens), and then score all tokens in the draft by the +large language model in parallel. A subset of the tokens in the draft are +accepted (and the rest rejected) based on a statistical method to guarantee +that the final output follows the distribution of the large model. In this +work, we provide a principled understanding of speculative decoding through the +lens of optimal transport (OT) with $\textit{membership cost}$. This framework +can be viewed as an extension of the well-known $\textit{maximal-coupling}$ +problem. This new formulation enables us to generalize the speculative decoding +method to allow for a set of $k$ candidates at the token-level, which leads to +an improved optimal membership cost. We show that the optimal draft selection +algorithm (transport plan) can be computed via linear programming, whose +best-known runtime is exponential in $k$. We then propose a valid draft +selection algorithm whose acceptance probability is $(1-1/e)$-optimal +multiplicatively. Moreover, it can be computed in time almost linear with size +of domain of a single token. Using this $new draft selection$ algorithm, we +develop a new autoregressive sampling algorithm called $\textit{SpecTr}$, which +provides speedup in decoding while ensuring that there is no quality +degradation in the decoded output. We experimentally demonstrate that for +state-of-the-art large language models, the proposed approach achieves a wall +clock speedup of 2.13X, a further 1.37X speedup over speculative decoding on +standard benchmarks. + +
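For context, the snippet below sketches the standard token-level accept/reject rule of speculative decoding, i.e. the single-draft case that SpecTr generalizes to k candidates via optimal transport. The toy distributions are illustrative only.

```python
import numpy as np

def speculative_accept(draft_token: int, p_large: np.ndarray, q_draft: np.ndarray,
                       rng: np.random.Generator) -> int:
    """Accept/reject one draft token so the output is distributed exactly
    according to the large model: keep the token with probability
    min(1, p/q); on rejection, resample from the residual max(p - q, 0)."""
    p, q = p_large[draft_token], q_draft[draft_token]
    if rng.random() < min(1.0, p / q):
        return draft_token
    residual = np.maximum(p_large - q_draft, 0.0)
    residual /= residual.sum()
    return int(rng.choice(len(p_large), p=residual))

rng = np.random.default_rng(0)
p = np.array([0.1, 0.6, 0.3])   # large-model next-token distribution (toy)
q = np.array([0.3, 0.3, 0.4])   # draft-model next-token distribution (toy)
print(speculative_accept(draft_token=1, p_large=p, q_draft=q, rng=rng))
```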
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ E^2-LLM: Efficient and Extreme Length Extension of Large Language Models + + +
+ Typically, training LLMs with long context sizes is computationally +expensive, requiring extensive training hours and GPU resources. Existing +long-context extension methods usually need additional training procedures to +support corresponding long-context windows, where the long-context training +data (e.g., 32k) is needed, and high GPU training costs are assumed. To address +the aforementioned issues, we propose an Efficient and Extreme length extension +method for Large Language Models, called E^2-LLM, with only one training +procedure and dramatically reduced computation cost, which also removes the +need to collect long-context data. Concretely, first, the training data of our +E^2-LLM only requires a short length (e.g., 4k), which reduces the tuning cost +greatly. Second, the training procedure on the short training context window is +performed only once, and we can support different evaluation context +windows at inference. Third, in E^2-LLM, based on RoPE position embeddings, +we introduce two different augmentation methods on the scale and position index +parameters for different samples in training. It aims to make the model more +robust to different relative position differences when directly interpolating to an +arbitrary context length at inference. Comprehensive experimental results on +multiple benchmark datasets demonstrate the effectiveness of our E^2-LLM on +challenging long-context tasks. + +
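A minimal sketch of the kind of RoPE-side augmentation described: rotary angles are computed from position indices that are rescaled and offset per training sample, so a model trained on short windows can be interpolated to longer ones at inference. The sampling scheme for the scale and offset follows the paper and is not reproduced here; the parameter values below are assumptions.

```python
import numpy as np

def rope_angles(positions: np.ndarray, dim: int, scale: float = 1.0,
                offset: int = 0, base: float = 10000.0) -> np.ndarray:
    """Rotary-position-embedding angles with a scale factor and an offset
    applied to the position indices:
        angle[p, i] = ((p + offset) / scale) / base**(2i / dim).
    Varying `scale` and `offset` across training samples is the kind of
    augmentation sketched above (exact scheme per the paper)."""
    inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))
    scaled_pos = (positions + offset) / scale
    return np.outer(scaled_pos, inv_freq)   # shape: (len(positions), dim // 2)

angles = rope_angles(np.arange(4096), dim=128, scale=8.0, offset=0)
print(angles.shape)  # (4096, 64)
```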
+
+
+
+
+ + ♻ ☆ Language Control Diffusion: Efficiently Scaling through Space, Time, and + Tasks ICLR 2024 + + +
+ Training generalist agents is difficult across several axes, requiring us to +deal with high-dimensional inputs (space), long horizons (time), and +generalization to novel tasks. Recent advances with architectures have allowed +for improved scaling along one or two of these axes, but are still +computationally prohibitive to use. In this paper, we propose to address all +three axes by leveraging \textbf{L}anguage to \textbf{C}ontrol +\textbf{D}iffusion models as a hierarchical planner conditioned on language +(LCD). We effectively and efficiently scale diffusion models for planning in +extended temporal, state, and task dimensions to tackle long horizon control +problems conditioned on natural language instructions, as a step towards +generalist agents. Comparing LCD with other state-of-the-art models on the +CALVIN language robotics benchmark finds that LCD outperforms other SOTA +methods in multi-task success rates, whilst improving inference speed over +other comparable diffusion models by 3.3x~15x. We show that LCD can +successfully leverage the unique strength of diffusion models to produce +coherent long range plans while addressing their weakness in generating +low-level details and control. + +
+
+ comment: ICLR 2024, Project and code available at + https://github.com/ezhang7423/language-control-diffusion +
+
+
+
+
+ + ♻ ☆ MULTISCRIPT: Multimodal Script Learning for Supporting Open Domain + Everyday Tasks AAAI 2024 + + +
+ Automatically generating scripts (i.e. sequences of key steps described in +text) from video demonstrations and reasoning about the subsequent steps are +crucial to the modern AI virtual assistants to guide humans to complete +everyday tasks, especially unfamiliar ones. However, current methods for +generative script learning rely heavily on well-structured preceding steps +described in text and/or images or are limited to a certain domain, resulting +in a disparity with real-world user scenarios. To address these limitations, we +present a new benchmark challenge -- MultiScript, with two new tasks on +task-oriented multimodal script learning: (1) multimodal script generation, and +(2) subsequent step prediction. For both tasks, the input consists of a target +task name and a video illustrating what has been done to complete the target +task, and the expected output is (1) a sequence of structured step descriptions +in text based on the demonstration video, and (2) a single text description for +the subsequent step, respectively. Built from WikiHow, MultiScript covers +multimodal scripts in videos and text descriptions for over 6,655 human +everyday tasks across 19 diverse domains. To establish baseline performance on +MultiScript, we propose two knowledge-guided multimodal generative frameworks +that incorporate the task-related knowledge prompted from large language models +such as Vicuna. Experimental results show that our proposed approaches +significantly improve over the competitive baselines. + +
+
+ comment: Accepted by AAAI 2024. 11 pages, 9 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Directed Regular and Context-Free Languages + + +
+ We study the problem of deciding whether a given language is directed. A +language $L$ is \emph{directed} if every pair of words in $L$ have a common +(scattered) superword in $L$. Deciding directedness is a fundamental problem in +connection with ideal decompositions of downward closed sets. Another +motivation is that deciding whether two \emph{directed} context-free languages +have the same downward closures can be decided in polynomial time, whereas for +general context-free languages, this problem is known to be coNEXP-complete. + We show that the directedness problem for regular languages, given as NFAs, +belongs to $AC^1$, and thus polynomial time. Moreover, it is NL-complete for +fixed alphabet sizes. Furthermore, we show that for context-free languages, the +directedness problem is PSPACE-complete. + +
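For intuition, a brute-force directedness check on a finite toy language is sketched below; the subsequence test implements the scattered-subword relation. The paper's algorithms operate on NFAs and context-free grammars rather than explicit word sets, so this is only a worked example of the definition.

```python
from itertools import combinations

def is_scattered_subword(u: str, w: str) -> bool:
    """True iff u can be obtained from w by deleting letters."""
    it = iter(w)
    return all(ch in it for ch in u)

def is_directed(language: set[str]) -> bool:
    """Brute-force directedness for a *finite* language: every pair of words
    must have a common (scattered) superword inside the language."""
    return all(any(is_scattered_subword(u, w) and is_scattered_subword(v, w)
                   for w in language)
               for u, v in combinations(language, 2))

print(is_directed({"ab", "ba", "aba"}))  # True: "aba" covers both "ab" and "ba"
print(is_directed({"ab", "ba"}))         # False: no common superword in the language
```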
+
+
+
+
+ + ♻ ☆ Improving Faithfulness of Abstractive Summarization by Controlling + Confounding Effect of Irrelevant Sentences + + +
+ Lack of factual correctness is an issue that still plagues state-of-the-art +summarization systems despite their impressive progress on generating seemingly +fluent summaries. In this paper, we show that factual inconsistency can be +caused by irrelevant parts of the input text, which act as confounders. To that +end, we leverage information-theoretic measures of causal effects to quantify +the amount of confounding and precisely quantify how they affect the +summarization performance. Based on insights derived from our theoretical +results, we design a simple multi-task model to control such confounding by +leveraging human-annotated relevant sentences when available. Crucially, we +give a principled characterization of data distributions where such confounding +can be large thereby necessitating the use of human annotated relevant +sentences to generate factual summaries. Our approach improves faithfulness +scores by 20\% over strong baselines on AnswerSumm +\citep{fabbri2021answersumm}, a conversation summarization dataset where lack +of faithfulness is a significant issue due to the subjective nature of the +task. Our best method achieves the highest faithfulness score while also +achieving state-of-the-art results on standard metrics like ROUGE and METEOR. +We corroborate these improvements through human evaluation. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 142 + +
+
+
+ + ☆ ParaHome: Parameterizing Everyday Home Activities Towards 3D Generative + Modeling of Human-Object Interactions + + +
+ To enable machines to learn how humans interact with the physical world in +our daily activities, it is crucial to provide rich data that encompasses the +3D motion of humans as well as the motion of objects in a learnable 3D +representation. Ideally, this data should be collected in a natural setup, +capturing the authentic dynamic 3D signals during human-object interactions. To +address this challenge, we introduce the ParaHome system, designed to capture +and parameterize dynamic 3D movements of humans and objects within a common +home environment. Our system consists of a multi-view setup with 70 +synchronized RGB cameras, as well as wearable motion capture devices equipped +with an IMU-based body suit and hand motion capture gloves. By leveraging the +ParaHome system, we collect a novel large-scale dataset of human-object +interaction. Notably, our dataset offers key advancement over existing datasets +in three main aspects: (1) capturing 3D body and dexterous hand manipulation +motion alongside 3D object movement within a contextual home environment during +natural activities; (2) encompassing human interaction with multiple objects in +various episodic scenarios with corresponding descriptions in texts; (3) +including articulated objects with multiple parts expressed with parameterized +articulations. Building upon our dataset, we introduce new research tasks aimed +at building a generative model for learning and synthesizing human-object +interactions in a real-world room setting. + +
+
+
+
+
+ + ☆ OMG-Seg: Is One Model Good Enough For All Segmentation? + + +
+ In this work, we address various segmentation tasks, each traditionally +tackled by distinct or partially unified models. We propose OMG-Seg, One Model +that is Good enough to efficiently and effectively handle all the segmentation +tasks, including image semantic, instance, and panoptic segmentation, as well +as their video counterparts, open vocabulary settings, prompt-driven, +interactive segmentation like SAM, and video object segmentation. To our +knowledge, this is the first model to handle all these tasks in one model and +achieve satisfactory performance. We show that OMG-Seg, a transformer-based +encoder-decoder architecture with task-specific queries and outputs, can +support over ten distinct segmentation tasks and yet significantly reduce +computational and parameter overhead across various tasks and datasets. We +rigorously evaluate the inter-task influences and correlations during +co-training. Code and models are available at https://github.com/lxtGH/OMG-Seg. + +
+
+ comment: Project Page: https://lxtgh.github.io/project/omg_seg/ +
+
+
+
+
+ + ☆ RAP-SAM: Towards Real-Time All-Purpose Segment Anything + + +
+ Advanced by transformer architecture, vision foundation models (VFMs) achieve +remarkable progress in performance and generalization ability. Segment Anything +Model (SAM) is one remarkable model that can achieve generalized segmentation. +However, most VFMs cannot run in realtime, which makes it difficult to transfer +them into several products. On the other hand, current real-time segmentation +mainly has one purpose, such as semantic segmentation on the driving scene. We +argue that diverse outputs are needed for real applications. Thus, this work +explores a new real-time segmentation setting, named all-purpose segmentation +in real-time, to transfer VFMs in real-time deployment. It contains three +different tasks, including interactive segmentation, panoptic segmentation, and +video segmentation. We aim to use one model to achieve the above tasks in +real-time. We first benchmark several strong baselines. Then, we present +Real-Time All Purpose SAM (RAP-SAM). It contains an efficient encoder and an +efficient decoupled decoder to perform prompt-driven decoding. Moreover, we +further explore different training strategies and tuning methods to boost +co-training performance further. Our code and model are available at +https://github.com/xushilin1/RAP-SAM/. + +
+
+ comment: Project Page: https://xushilin1.github.io/rap_sam/ +
+
+
+
+
+ + ☆ A Simple Latent Diffusion Approach for Panoptic Segmentation and Mask + Inpainting + + +
+ Panoptic and instance segmentation networks are often trained with +specialized object detection modules, complex loss functions, and ad-hoc +post-processing steps to handle the permutation-invariance of the instance +masks. This work builds upon Stable Diffusion and proposes a latent diffusion +approach for panoptic segmentation, resulting in a simple architecture which +omits these complexities. Our training process consists of two steps: (1) +training a shallow autoencoder to project the segmentation masks to latent +space; (2) training a diffusion model to allow image-conditioned sampling in +latent space. The use of a generative model unlocks the exploration of mask +completion or inpainting, which has applications in interactive segmentation. +The experimental validation yields promising results for both panoptic +segmentation and mask inpainting. While not setting a new state-of-the-art, our +model's simplicity, generality, and mask completion capability are desirable +properties. + +
+
+ comment: Code: https://github.com/segments-ai/latent-diffusion-segmentation +
+
+
+
+
+ + ☆ Towards Language-Driven Video Inpainting via Multimodal Large Language + Models + + +
+ We introduce a new task -- language-driven video inpainting, which uses +natural language instructions to guide the inpainting process. This approach +overcomes the limitations of traditional video inpainting methods that depend +on manually labeled binary masks, a process often tedious and labor-intensive. +We present the Remove Objects from Videos by Instructions (ROVI) dataset, +containing 5,650 videos and 9,091 inpainting results, to support training and +evaluation for this task. We also propose a novel diffusion-based +language-driven video inpainting framework, the first end-to-end baseline for +this task, integrating Multimodal Large Language Models to understand and +execute complex language-based inpainting requests effectively. Our +comprehensive results showcase the dataset's versatility and the model's +effectiveness in various language-instructed inpainting scenarios. We will make +datasets, code, and models publicly available. + +
+
+ comment: Project Page: https://jianzongwu.github.io/projects/rovi +
+
+
+
+
+ + ☆ The Manga Whisperer: Automatically Generating Transcriptions for Comics + + +
+ In the past few decades, Japanese comics, commonly referred to as Manga, have +transcended both cultural and linguistic boundaries to become a true worldwide +sensation. Yet, the inherent reliance on visual cues and illustration within +manga renders it largely inaccessible to individuals with visual impairments. +In this work, we seek to address this substantial barrier, with the aim of +ensuring that manga can be appreciated and actively engaged by everyone. +Specifically, we tackle the problem of diarisation i.e. generating a +transcription of who said what and when, in a fully automatic way. + To this end, we make the following contributions: (1) we present a unified +model, Magi, that is able to (a) detect panels, text boxes and character boxes, +(b) cluster characters by identity (without knowing the number of clusters +apriori), and (c) associate dialogues to their speakers; (2) we propose a novel +approach that is able to sort the detected text boxes in their reading order +and generate a dialogue transcript; (3) we annotate an evaluation benchmark for +this task using publicly available [English] manga pages. The code, evaluation +datasets and the pre-trained model can be found at: +https://github.com/ragavsachdeva/magi. + +
+
+
+
+
+ + ☆ Supervised Fine-tuning in turn Improves Visual Foundation Models + + +
+ Image-text training like CLIP has dominated the pretraining of vision +foundation models in recent years. Subsequent efforts have been made to +introduce region-level visual learning into CLIP's pretraining but face +scalability challenges due to the lack of large-scale region-level datasets. +Drawing inspiration from supervised fine-tuning (SFT) in natural language +processing such as instruction tuning, we explore the potential of fine-grained +SFT in enhancing the generation of vision foundation models after their +pretraining. Thus a two-stage method ViSFT (Vision SFT) is proposed to unleash +the fine-grained knowledge of vision foundation models. In ViSFT, the vision +foundation model is enhanced by performing visual joint learning on some +in-domain tasks and then tested on out-of-domain benchmarks. With updating +using ViSFT on 8 V100 GPUs in less than 2 days, a vision transformer with over +4.4B parameters shows improvements across various out-of-domain benchmarks +including vision and vision-linguistic scenarios. + +
+
+ comment: 14 pages, 3 figures, Project page: + https://github.com/TencentARC/ViSFT/tree/main +
+
+
+
+
+ + ☆ AutoFT: Robust Fine-Tuning by Optimizing Hyperparameters on OOD Data + + +
+ Foundation models encode rich representations that can be adapted to a +desired task by fine-tuning on task-specific data. However, fine-tuning a model +on one particular data distribution often compromises the model's original +performance on other distributions. Current methods for robust fine-tuning +utilize hand-crafted regularization techniques to constrain the fine-tuning +process towards the base foundation model. Yet, it is hard to precisely specify +what characteristics of the foundation model to retain during fine-tuning, as +this depends on how the pre-training, fine-tuning, and evaluation data +distributions relate to each other. We propose AutoFT, a data-driven approach +for guiding foundation model fine-tuning. AutoFT optimizes fine-tuning +hyperparameters to maximize performance on a small out-of-distribution (OOD) +validation set. To guide fine-tuning in a granular way, AutoFT searches a +highly expressive hyperparameter space that includes weight coefficients for +many different losses, in addition to learning rate and weight decay values. We +evaluate AutoFT on nine natural distribution shifts which include domain shifts +and subpopulation shifts. Our experiments show that AutoFT significantly +improves generalization to new OOD data, outperforming existing robust +fine-tuning methods. Notably, AutoFT achieves new state-of-the-art performance +on the WILDS-iWildCam and WILDS-FMoW benchmarks, outperforming the previous +best methods by $6.0\%$ and $1.5\%$, respectively. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Edit One for All: Interactive Batch Image Editing + + +
+ In recent years, image editing has advanced remarkably. With increased human +control, it is now possible to edit an image in a plethora of ways; from +specifying in text what we want to change, to straight up dragging the contents +of the image in an interactive point-based manner. However, most of the focus +has remained on editing single images at a time. Whether and how we can +simultaneously edit large batches of images has remained understudied. With the +goal of minimizing human supervision in the editing process, this paper +presents a novel method for interactive batch image editing using StyleGAN as +the medium. Given an edit specified by users in an example image (e.g., make +the face frontal), our method can automatically transfer that edit to other +test images, so that regardless of their initial state (pose), they all arrive +at the same final state (e.g., all facing front). Extensive experiments +demonstrate that edits performed using our method have similar visual quality +to existing single-image-editing methods, while having more visual consistency +and saving significant time and human effort. + +
+
+ comment: Project page: https://thaoshibe.github.io/edit-one-for-all/ +
+
+
+
+
+ + ☆ Explaining the Implicit Neural Canvas: Connecting Pixels to Neurons by + Tracing their Contributions + + +
+ The many variations of Implicit Neural Representations (INRs), where a neural +network is trained as a continuous representation of a signal, have tremendous +practical utility for downstream tasks including novel view synthesis, video +compression, and image superresolution. Unfortunately, the inner workings of +these networks are seriously under-studied. Our work, eXplaining the Implicit +Neural Canvas (XINC), is a unified framework for explaining properties of INRs +by examining the strength of each neuron's contribution to each output pixel. +We call the aggregate of these contribution maps the Implicit Neural Canvas and +we use this concept to demonstrate that the INRs which we study learn to +''see'' the frames they represent in surprising ways. For example, INRs tend to +have highly distributed representations. While lacking high-level object +semantics, they have a significant bias for color and edges, and are almost +entirely space-agnostic. We arrive at our conclusions by examining how objects +are represented across time in video INRs, using clustering to visualize +similar neurons across layers and architectures, and show that this is +dominated by motion. These insights demonstrate the general usefulness of our +analysis framework. Our project page is available at +https://namithap10.github.io/xinc. + +
+
+ comment: Project site: https://namithap10.github.io/xinc +
+
+
+
+
+ + ☆ GPAvatar: Generalizable and Precise Head Avatar from Image(s) ICLR 2024 + + +
+ Head avatar reconstruction, crucial for applications in virtual reality, +online meetings, gaming, and film industries, has garnered substantial +attention within the computer vision community. The fundamental objective of +this field is to faithfully recreate the head avatar and precisely control +expressions and postures. Existing methods, categorized into 2D-based warping, +mesh-based, and neural rendering approaches, present challenges in maintaining +multi-view consistency, incorporating non-facial information, and generalizing +to new identities. In this paper, we propose a framework named GPAvatar that +reconstructs 3D head avatars from one or several images in a single forward +pass. The key idea of this work is to introduce a dynamic point-based +expression field driven by a point cloud to precisely and effectively capture +expressions. Furthermore, we use a Multi Tri-planes Attention (MTA) fusion +module in the tri-planes canonical field to leverage information from multiple +input images. The proposed method achieves faithful identity reconstruction, +precise expression control, and multi-view consistency, demonstrating promising +results for free-viewpoint rendering and novel view synthesis. + +
+
+ comment: ICLR 2024, code is available at https://github.com/xg-chu/GPAvatar +
+
+
+
+
+ + ☆ MM-Interleaved: Interleaved Image-Text Generative Modeling via + Multi-modal Feature Synchronizer + + +
+ Developing generative models for interleaved image-text data has both +research and practical value. It requires models to understand the interleaved +sequences and subsequently generate images and text. However, existing attempts +are limited by the issue that the fixed number of visual tokens cannot +efficiently capture image details, which is particularly problematic in the +multi-image scenarios. To address this, this paper presents MM-Interleaved, an +end-to-end generative model for interleaved image-text data. It introduces a +multi-scale and multi-image feature synchronizer module, allowing direct access +to fine-grained image features in the previous context during the generation +process. MM-Interleaved is end-to-end pre-trained on both paired and +interleaved image-text corpora. It is further enhanced through a supervised +fine-tuning phase, wherein the model improves its ability to follow complex +multi-modal instructions. Experiments demonstrate the versatility of +MM-Interleaved in recognizing visual details following multi-modal instructions +and generating consistent images following both textual and visual conditions. +Code and models are available at +\url{https://github.com/OpenGVLab/MM-Interleaved}. + +
+
+ comment: 20 pages, 9 figures, 17 tables +
+
+
+
+
+ + ☆ Divide and not forget: Ensemble of selectively trained experts in + Continual Learning ICLR2024 + + +
+ Class-incremental learning is becoming more popular as it helps models widen +their applicability while not forgetting what they already know. A trend in +this area is to use a mixture-of-expert technique, where different models work +together to solve the task. However, the experts are usually trained all at +once using whole task data, which makes them all prone to forgetting and +increasing computational burden. To address this limitation, we introduce a +novel approach named SEED. SEED selects only one, the most optimal expert for a +considered task, and uses data from this task to fine-tune only this expert. +For this purpose, each expert represents each class with a Gaussian +distribution, and the optimal expert is selected based on the similarity of +those distributions. Consequently, SEED increases diversity and heterogeneity +within the experts while maintaining the high stability of this ensemble +method. The extensive experiments demonstrate that SEED achieves +state-of-the-art performance in exemplar-free settings across various +scenarios, showing the potential of expert diversification through data in +continual learning. + +
+
+ comment: Accepted to ICLR2024 (main track), code is available at: + https://github.com/grypesc/SEED +
+
+
+
+
+ + ☆ Neural Echos: Depthwise Convolutional Filters Replicate Biological + Receptive Fields + + +
+ In this study, we present evidence suggesting that depthwise convolutional +kernels are effectively replicating the structural intricacies of the +biological receptive fields observed in the mammalian retina. We provide +analytics of trained kernels from various state-of-the-art models +substantiating this evidence. Inspired by this intriguing discovery, we propose +an initialization scheme that draws inspiration from the biological receptive +fields. Experimental analysis of the ImageNet dataset with multiple CNN +architectures featuring depthwise convolutions reveals a marked enhancement in +the accuracy of the learned model when initialized with biologically derived +weights. This underlies the potential for biologically inspired computational +models to further our understanding of vision processing systems and to improve +the efficacy of convolutional networks. + +
+
+
+
+
+ + ☆ Comprehensive OOD Detection Improvements + + +
+ As machine learning becomes increasingly prevalent in impactful decisions, +recognizing when inference data is outside the model's expected input +distribution is paramount for giving context to predictions. +Out-of-distribution (OOD) detection methods have been created for this task. +Such methods can be split into representation-based or logit-based methods from +whether they respectively utilize the model's embeddings or predictions for OOD +detection. In contrast to most papers which solely focus on one such group, we +address both. We employ dimensionality reduction on feature embeddings in +representation-based methods for both time speedups and improved performance. +Additionally, we propose DICE-COL, a modification of the popular logit-based +method Directed Sparsification (DICE) that resolves an unnoticed flaw. We +demonstrate the effectiveness of our methods on the OpenOODv1.5 benchmark +framework, where they significantly improve performance and set +state-of-the-art results. + +
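A generic representation-based pipeline along these lines is sketched below: PCA reduces the in-distribution feature embeddings before a k-NN distance is used as the OOD score. The component count and the k-NN scorer are assumptions for illustration, not the paper's exact configuration.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

def fit_ood_scorer(id_features: np.ndarray, n_components: int = 32, k: int = 10):
    """Fit a representation-based OOD scorer on in-distribution penultimate
    features: PCA for dimensionality reduction (speed), then mean k-NN
    distance in the reduced space as the score (higher = more likely OOD)."""
    pca = PCA(n_components=n_components).fit(id_features)
    nn = NearestNeighbors(n_neighbors=k).fit(pca.transform(id_features))

    def score(features: np.ndarray) -> np.ndarray:
        dists, _ = nn.kneighbors(pca.transform(features))
        return dists.mean(axis=1)

    return score

rng = np.random.default_rng(0)
scorer = fit_ood_scorer(rng.normal(size=(1000, 512)))
print(scorer(rng.normal(loc=3.0, size=(5, 512))))  # shifted data scores higher
```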
+
+
+
+
+ + ☆ SHINOBI: Shape and Illumination using Neural Object Decomposition via + BRDF Optimization In-the-wild + + +
+ We present SHINOBI, an end-to-end framework for the reconstruction of shape, +material, and illumination from object images captured with varying lighting, +pose, and background. Inverse rendering of an object based on unconstrained +image collections is a long-standing challenge in computer vision and graphics +and requires a joint optimization over shape, radiance, and pose. We show that +an implicit shape representation based on a multi-resolution hash encoding +enables faster and robust shape reconstruction with joint camera alignment +optimization that outperforms prior work. Further, to enable the editing of +illumination and object reflectance (i.e. material) we jointly optimize BRDF +and illumination together with the object's shape. Our method is class-agnostic +and works on in-the-wild image collections of objects to produce relightable 3D +assets for several use cases such as AR/VR, movies, games, etc. Project page: +https://shinobi.aengelhardt.com Video: +https://www.youtube.com/watch?v=iFENQ6AcYd8&feature=youtu.be + +
+
+
+
+
+ + ☆ VMamba: Visual State Space Model + + +
+ Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs) stand as +the two most popular foundation models for visual representation learning. +While CNNs exhibit remarkable scalability with linear complexity w.r.t. image +resolution, ViTs surpass them in fitting capabilities despite contending with +quadratic complexity. A closer inspection reveals that ViTs achieve superior +visual modeling performance through the incorporation of global receptive +fields and dynamic weights. This observation motivates us to propose a novel +architecture that inherits these components while enhancing computational +efficiency. To this end, we draw inspiration from the recently introduced state +space model and propose the Visual State Space Model (VMamba), which achieves +linear complexity without sacrificing global receptive fields. To address the +encountered direction-sensitive issue, we introduce the Cross-Scan Module (CSM) +to traverse the spatial domain and convert any non-causal visual image into +order patch sequences. Extensive experimental results substantiate that VMamba +not only demonstrates promising capabilities across various visual perception +tasks, but also exhibits more pronounced advantages over established benchmarks +as the image resolution increases. Source code has been available at +https://github.com/MzeroMiko/VMamba. + +
+
+ comment: 13 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Motion-Zero: Zero-Shot Moving Object Control Framework for + Diffusion-Based Video Generation IJCAI + + +
+ Recent large-scale pre-trained diffusion models have demonstrated a powerful +generative ability to produce high-quality videos from detailed text +descriptions. However, exerting control over the motion of objects in videos +generated by any video diffusion model is a challenging problem. In this paper, +we propose a novel zero-shot moving object trajectory control framework, +Motion-Zero, to enable a bounding-box-trajectories-controlled text-to-video +diffusion model. To this end, an initial noise prior module is designed to +provide a position-based prior to improve the stability of the appearance of +the moving object and the accuracy of position. In addition, based on the +attention map of the U-net, spatial constraints are directly applied to the +denoising process of diffusion models, which further ensures the positional and +spatial consistency of moving objects during the inference. Furthermore, +temporal consistency is guaranteed with a proposed shift temporal attention +mechanism. Our method can be flexibly applied to various state-of-the-art video +diffusion models without any training process. Extensive experiments +demonstrate our proposed method can control the motion trajectories of objects +and generate high-quality videos. + +
+
+ comment: 9 pages, 4 figures, IJCAI paper +
+
+
+
+
+ + ☆ Explicitly Disentangled Representations in Object-Centric Learning + + +
+ Extracting structured representations from raw visual data is an important +and long-standing challenge in machine learning. Recently, techniques for +unsupervised learning of object-centric representations have raised growing +interest. In this context, enhancing the robustness of the latent features can +improve the efficiency and effectiveness of the training of downstream tasks. A +promising step in this direction is to disentangle the factors that cause +variation in the data. Previously, Invariant Slot Attention disentangled +position, scale, and orientation from the remaining features. Extending this +approach, we focus on separating the shape and texture components. In +particular, we propose a novel architecture that biases object-centric models +toward disentangling shape and texture components into two non-overlapping +subsets of the latent space dimensions. These subsets are known a priori, hence +before the training process. Experiments on a range of object-centric +benchmarks reveal that our approach achieves the desired disentanglement while +also numerically improving baseline performance in most cases. In addition, we +show that our method can generate novel textures for a specific object or +transfer textures between objects with distinct shapes. + +
+
+
+
+
+ + ☆ Model Compression Techniques in Biometrics Applications: A Survey + + +
+ The development of deep learning algorithms has extensively empowered +humanity's task automatization capacity. However, the huge improvement in the +performance of these models is highly correlated with their increasing level of +complexity, limiting their usefulness in human-oriented applications, which are +usually deployed in resource-constrained devices. This led to the development +of compression techniques that drastically reduce the computational and memory +costs of deep learning models without significant performance degradation. This +paper aims to systematize the current literature on this topic by presenting a +comprehensive survey of model compression techniques in biometrics +applications, namely quantization, knowledge distillation and pruning. We +conduct a critical analysis of the comparative value of these techniques, +focusing on their advantages and disadvantages and presenting suggestions for +future work directions that can potentially improve the current methods. +Additionally, we discuss and analyze the link between model bias and model +compression, highlighting the need to direct compression research toward model +fairness in future works. + +
+
+ comment: Under review at IEEE Journal +
+
+
+
+
+ + ☆ Few-shot learning for COVID-19 Chest X-Ray Classification with + Imbalanced Data: An Inter vs. Intra Domain Study + + +
+ Medical image datasets are essential for training models used in +computer-aided diagnosis, treatment planning, and medical research. However, +some challenges are associated with these datasets, including variability in +data distribution, data scarcity, and transfer learning issues when using +models pre-trained from generic images. This work studies the effect of these +challenges at the intra- and inter-domain level in few-shot learning scenarios +with severe data imbalance. For this, we propose a methodology based on Siamese +neural networks in which a series of techniques are integrated to mitigate the +effects of data scarcity and distribution imbalance. Specifically, different +initialization and data augmentation methods are analyzed, and four adaptations +to Siamese networks of solutions to deal with imbalanced data are introduced, +including data balancing and weighted loss, both separately and combined, and +with a different balance of pairing ratios. Moreover, we also assess the +inference process considering four classifiers, namely Histogram, $k$NN, SVM, +and Random Forest. Evaluation is performed on three chest X-ray datasets with +annotated cases of both positive and negative COVID-19 diagnoses. The accuracy +of each technique proposed for the Siamese architecture is analyzed separately +and their results are compared to those obtained using equivalent methods on a +state-of-the-art CNN. We conclude that the introduced techniques offer +promising improvements over the baseline in almost all cases, and that the +selection of the technique may vary depending on the amount of data available +and the level of imbalance. + +
+
+ comment: Submited to Pattern Analysis and Applications +
+
+
+
+
+ + ☆ Sub2Full: split spectrum to boost OCT despeckling without clean data + + +
+ Optical coherence tomography (OCT) suffers from speckle noise, causing the +deterioration of image quality, especially in high-resolution modalities like +visible light OCT (vis-OCT). The potential of conventional supervised deep +learning denoising methods is limited by the difficulty of obtaining clean +data. Here, we proposed an innovative self-supervised strategy called Sub2Full +(S2F) for OCT despeckling without clean data. This approach works by acquiring +two repeated B-scans, splitting the spectrum of the first repeat as a +low-resolution input, and utilizing the full spectrum of the second repeat as +the high-resolution target. The proposed method was validated on vis-OCT +retinal images visualizing sublaminar structures in outer retina and +demonstrated superior performance over conventional Noise2Noise and Noise2Void +schemes. The code is available at +https://github.com/PittOCT/Sub2Full-OCT-Denoising. + +
+
+
+
+
+ + ☆ Exposing Lip-syncing Deepfakes from Mouth Inconsistencies + + +
+ A lip-syncing deepfake is a digitally manipulated video in which a person's +lip movements are created convincingly using AI models to match altered or +entirely new audio. Lip-syncing deepfakes are a dangerous type of deepfakes as +the artifacts are limited to the lip region and more difficult to discern. In +this paper, we describe a novel approach, LIP-syncing detection based on mouth +INConsistency (LIPINC), for lip-syncing deepfake detection by identifying +temporal inconsistencies in the mouth region. These inconsistencies are seen in +the adjacent frames and throughout the video. Our model can successfully +capture these irregularities and outperforms the state-of-the-art methods on +several benchmark deepfake datasets. + +
+
+
+
+
+ + ☆ VIPTR: A Vision Permutable Extractor for Fast and Efficient Scene Text + Recognition + + +
+ Scene Text Recognition (STR) is a challenging task that involves recognizing +text within images of natural scenes. Although current state-of-the-art models +for STR exhibit high performance, they typically suffer from low inference +efficiency due to their reliance on hybrid architectures comprised of visual +encoders and sequence decoders. In this work, we propose the VIsion Permutable +extractor for fast and efficient scene Text Recognition (VIPTR), which achieves +an impressive balance between high performance and rapid inference speeds in +the domain of STR. Specifically, VIPTR leverages a visual-semantic extractor +with a pyramid structure, characterized by multiple self-attention layers, +while eschewing the traditional sequence decoder. This design choice results in +a lightweight and efficient model capable of handling inputs of varying sizes. +Extensive experimental results on various standard datasets for both Chinese +and English scene text recognition validate the superiority of VIPTR. Notably, +the VIPTR-T (Tiny) variant delivers highly competitive accuracy on par with +other lightweight models and achieves SOTA inference speeds. Meanwhile, the +VIPTR-L (Large) variant attains greater recognition accuracy, while maintaining +a low parameter count and favorable inference speed. Our proposed method +provides a compelling solution for the STR challenge, which blends high +accuracy with efficiency and greatly benefits real-world applications requiring +fast and reliable text recognition. The code is publicly available at +https://github.com/cxfyxl/VIPTR. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2205.00159 by other authors +
+
+
+
+
+ + ☆ Cross-Modality Perturbation Synergy Attack for Person Re-identification + + +
+ In recent years, there has been significant research focusing on addressing +security concerns in single-modal person re-identification (ReID) systems that +are based on RGB images. However, the safety of cross-modality scenarios, which +are more commonly encountered in practical applications involving images +captured by infrared cameras, has not received adequate attention. The main +challenge in cross-modality ReID lies in effectively dealing with visual +differences between different modalities. For instance, infrared images are +typically grayscale, unlike visible images that contain color information. +Existing attack methods have primarily focused on the characteristics of the +visible image modality, overlooking the features of other modalities and the +variations in data distribution among different modalities. This oversight can +potentially undermine the effectiveness of these methods in image retrieval +across diverse modalities. This study represents the first exploration into the +security of cross-modality ReID models and proposes a universal perturbation +attack specifically designed for cross-modality ReID. This attack optimizes +perturbations by leveraging gradients from diverse modality data, thereby +disrupting the discriminator and reinforcing the differences between +modalities. We conducted experiments on two widely used cross-modality +datasets, namely RegDB and SYSU, which not only demonstrated the effectiveness +of our method but also provided insights for future enhancements in the +robustness of cross-modality ReID systems. + +
+
+
+
+
+ + ☆ DiffusionGPT: LLM-Driven Text-to-Image Generation System + + +
+ Diffusion models have opened up new avenues for the field of image
+generation, resulting in the proliferation of high-quality models shared on
+open-source platforms. However, a major challenge persists: current
+text-to-image systems are often unable to handle diverse inputs, or are limited
+to single-model results. Current unified attempts often address one of two
+orthogonal aspects: i) parsing diverse prompts at the input stage; ii)
+activating an expert model to produce the output. To combine the best of both
+worlds, we propose DiffusionGPT, which leverages Large Language Models (LLM) to
+offer a unified generation system capable of seamlessly accommodating various
+types of prompts and integrating domain-expert models. DiffusionGPT constructs
+domain-specific Trees for various generative models based on prior knowledge.
+When provided with an input, the LLM parses the prompt and employs the
+Trees-of-Thought to guide the selection of an appropriate model, thereby
+relaxing input constraints and ensuring exceptional performance across diverse
+domains. Moreover, we introduce Advantage Databases, where the Tree-of-Thought
+is enriched with human feedback, aligning the model selection process with
+human preferences. Through extensive experiments and comparisons, we
+demonstrate the effectiveness of DiffusionGPT, showcasing its potential for
+pushing the boundaries of image synthesis in diverse domains.
+
+
+
+
+
+ + ☆ ContextMix: A context-aware data augmentation method for industrial + visual inspection systems + + +
+ While deep neural networks have achieved remarkable performance, data +augmentation has emerged as a crucial strategy to mitigate overfitting and +enhance network performance. These techniques hold particular significance in +industrial manufacturing contexts. Recently, image mixing-based methods have +been introduced, exhibiting improved performance on public benchmark datasets. +However, their application to industrial tasks remains challenging. The +manufacturing environment generates massive amounts of unlabeled data on a +daily basis, with only a few instances of abnormal data occurrences. This leads +to severe data imbalance. Thus, creating well-balanced datasets is not +straightforward due to the high costs associated with labeling. Nonetheless, +this is a crucial step for enhancing productivity. For this reason, we +introduce ContextMix, a method tailored for industrial applications and +benchmark datasets. ContextMix generates novel data by resizing entire images +and integrating them into other images within the batch. This approach enables +our method to learn discriminative features based on varying sizes from resized +images and train informative secondary features for object recognition using +occluded images. With the minimal additional computation cost of image +resizing, ContextMix enhances performance compared to existing augmentation +techniques. We evaluate its effectiveness across classification, detection, and +segmentation tasks using various network architectures on public benchmark +datasets. Our proposed method demonstrates improved results across a range of +robustness tasks. Its efficacy in real industrial environments is particularly +noteworthy, as demonstrated using the passive component dataset. + +
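The abstract describes the augmentation only at a high level; a minimal sketch of a batch-level resize-and-paste mix in that spirit could look as follows (the area-proportional label mixing and the region-size range are illustrative assumptions, not details taken from the paper):

```python
import numpy as np

def resize(img, size):
    """Nearest-neighbour resize, kept dependency-free for the sketch."""
    H, W = img.shape[:2]
    h, w = size
    rows = np.arange(h) * H // h
    cols = np.arange(w) * W // w
    return img[rows][:, cols]

def contextmix_batch(images, labels, rng=None):
    """Sketch of a ContextMix-style augmentation: resize a whole donor image
    and paste it into a random region of each target image in the batch.

    images: float array (B, H, W, C); labels: one-hot array (B, K).
    The proportional label mixing below is an assumption for illustration.
    """
    rng = rng or np.random.default_rng()
    B, H, W, C = images.shape
    mixed_images = images.copy()
    mixed_labels = labels.astype(float)
    perm = rng.permutation(B)                    # donor index for each sample
    for i, j in enumerate(perm):
        # Random size and position of the pasted (resized) donor image.
        h = rng.integers(H // 4, H // 2 + 1)
        w = rng.integers(W // 4, W // 2 + 1)
        y = rng.integers(0, H - h + 1)
        x = rng.integers(0, W - w + 1)
        mixed_images[i, y:y + h, x:x + w] = resize(images[j], (h, w))
        lam = (h * w) / (H * W)                  # area ratio of pasted region
        mixed_labels[i] = (1 - lam) * labels[i] + lam * labels[j]
    return mixed_images, mixed_labels
```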
+
+ comment: Accepted to EAAI +
+
+
+
+
+ + ☆ Deep spatial context: when attention-based models meet spatial + regression + + +
+ We propose the 'Deep spatial context' (DSCon) method, which serves to
+investigate attention-based vision models using the concept of spatial
+context. It was inspired by histopathology; however, the method can be applied
+to various domains. DSCon provides a quantitative measure of the spatial
+context's role through three Spatial Context Measures, $SCM_{features}$,
+$SCM_{targets}$, and $SCM_{residuals}$, which distinguish whether the spatial
+context is observable within the features of neighboring regions, their target
+values (attention scores), or their residuals, respectively. This is achieved
+by integrating spatial regression into the pipeline, and it helps to verify
+research questions. The experiments reveal that spatial relationships are much
+stronger in the classification of tumor lesions than of normal tissue.
+Moreover, it turns out that the larger the neighborhood taken into account
+within spatial regression, the less valuable the contextual information is.
+Furthermore, the spatial context measure is largest when considered within the
+feature space, as opposed to the targets and residuals.
+
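The abstract does not spell out the regression setup; one illustrative way to score spatial context with a neighbour-based regression, loosely in the spirit of the SCM quantities, is sketched below (the k-nearest-neighbour averaging and the R^2 scoring are assumptions, not the paper's exact estimator):

```python
import numpy as np

def spatial_context_measure(values, coords, k=8):
    """Illustrative spatial-context score: how well a region's value is
    predicted from the mean value of its k nearest spatial neighbours,
    reported as the R^2 of a least-squares fit.

    values: (N,) or (N, D) per-region features, attention targets, or
    residuals; coords: (N, 2) region centre coordinates.
    """
    values = np.asarray(values, dtype=float)
    if values.ndim == 1:
        values = values[:, None]                    # (N, 1)
    coords = np.asarray(coords, dtype=float)

    dists = np.linalg.norm(coords[:, None] - coords[None], axis=-1)
    np.fill_diagonal(dists, np.inf)                 # exclude self-neighbours
    nbr_idx = np.argsort(dists, axis=1)[:, :k]      # (N, k)
    nbr_mean = values[nbr_idx].mean(axis=1)         # (N, D)

    # Regress each value dimension on the neighbour means (plus a bias term).
    X = np.concatenate([nbr_mean, np.ones((len(values), 1))], axis=1)
    coef, *_ = np.linalg.lstsq(X, values, rcond=None)
    pred = X @ coef
    ss_res = ((values - pred) ** 2).sum(axis=0)
    ss_tot = ((values - values.mean(axis=0)) ** 2).sum(axis=0) + 1e-12
    return float(np.mean(1.0 - ss_res / ss_tot))    # larger => stronger context
```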
+
+
+
+
+ + ☆ CMFN: Cross-Modal Fusion Network for Irregular Scene Text Recognition ICONIP 2023 + + +
+ Scene text recognition, as a cross-modal task involving vision and text, is +an important research topic in computer vision. Most existing methods use +language models to extract semantic information for optimizing visual +recognition. However, the guidance of visual cues is ignored in the process of +semantic mining, which limits the performance of the algorithm in recognizing +irregular scene text. To tackle this issue, we propose a novel cross-modal +fusion network (CMFN) for irregular scene text recognition, which incorporates +visual cues into the semantic mining process. Specifically, CMFN consists of a +position self-enhanced encoder, a visual recognition branch and an iterative +semantic recognition branch. The position self-enhanced encoder provides +character sequence position encoding for both the visual recognition branch and +the iterative semantic recognition branch. The visual recognition branch +carries out visual recognition based on the visual features extracted by CNN +and the position encoding information provided by the position self-enhanced +encoder. The iterative semantic recognition branch, which consists of a +language recognition module and a cross-modal fusion gate, simulates the way +that human recognizes scene text and integrates cross-modal visual cues for +text recognition. The experiments demonstrate that the proposed CMFN algorithm +achieves comparable performance to state-of-the-art algorithms, indicating its +effectiveness. + +
+
+ comment: Accepted to ICONIP 2023 +
+
+
+
+
+ + ☆ GPT4Ego: Unleashing the Potential of Pre-trained Models for Zero-Shot + Egocentric Action Recognition + + +
+ Vision-Language Models (VLMs), pre-trained on large-scale datasets, have +shown impressive performance in various visual recognition tasks. This +advancement paves the way for notable performance in Zero-Shot Egocentric +Action Recognition (ZS-EAR). Typically, VLMs handle ZS-EAR as a global +video-text matching task, which often leads to suboptimal alignment of vision +and linguistic knowledge. We propose a refined approach for ZS-EAR using VLMs, +emphasizing fine-grained concept-description alignment that capitalizes on the +rich semantic and contextual details in egocentric videos. In this paper, we +introduce GPT4Ego, a straightforward yet remarkably potent VLM framework for +ZS-EAR, designed to enhance the fine-grained alignment of concept and +description between vision and language. Extensive experiments demonstrate +GPT4Ego significantly outperforms existing VLMs on three large-scale egocentric +video benchmarks, i.e., EPIC-KITCHENS-100 (33.2%, +9.4%), EGTEA (39.6%, +5.5%), +and CharadesEgo (31.5%, +2.6%). + +
+
+
+
+
+ + ☆ Depth Over RGB: Automatic Evaluation of Open Surgery Skills Using Depth + Camera + + +
+ Purpose: In this paper, we present a novel approach to the automatic
+evaluation of open surgery skills using depth cameras. This work is intended to
+show that depth cameras achieve similar results to RGB cameras, which are the
+common choice for the automatic evaluation of open surgery skills. Moreover,
+depth cameras offer advantages such as robustness to lighting variations,
+camera positioning, simplified data compression, and enhanced privacy, making
+them a promising alternative to RGB cameras.
+ Methods: Experts and novice surgeons completed two simulators of open
+suturing. We focused on hand and tool detection, and action segmentation in
+suturing procedures. YOLOv8 was used for tool detection in RGB and depth
+videos. Furthermore, UVAST and MSTCN++ were used for action segmentation. Our
+study includes the collection and annotation of a dataset recorded with Azure
+Kinect.
+ Results: We demonstrated that using depth cameras in object detection and
+action segmentation achieves comparable results to RGB cameras. Furthermore, we
+analyzed 3D hand path length, revealing significant differences between experts
+and novice surgeons, emphasizing the potential of depth cameras in capturing
+surgical skills. We also investigated the influence of camera angles on
+measurement accuracy, highlighting the advantages of 3D cameras in providing a
+more accurate representation of hand movements.
+ Conclusion: Our research contributes to advancing the field of surgical skill
+assessment by leveraging depth cameras for more reliable and privacy-preserving
+evaluations. The findings suggest that depth cameras can be valuable in
+assessing surgical skills and provide a foundation for future research in this
+area.
+
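The 3D hand path length analysis mentioned in the Results is simple to reproduce in principle; a minimal sketch, assuming per-frame 3D hand positions have already been extracted from the depth stream, is:

```python
import numpy as np

def hand_path_length(positions):
    """Total 3D path length of a tracked hand.

    positions: (T, 3) array of hand-centre positions, one row per frame,
    in any consistent unit (e.g. millimetres). Missing frames should be
    interpolated or dropped beforehand. Returns the summed Euclidean
    distance between consecutive frames.
    """
    positions = np.asarray(positions, dtype=float)
    steps = np.diff(positions, axis=0)          # (T-1, 3) frame-to-frame motion
    return float(np.linalg.norm(steps, axis=1).sum())

# Example: compare an expert and a novice over the same task duration.
# expert_len = hand_path_length(expert_positions)
# novice_len = hand_path_length(novice_positions)
```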
+
+
+
+
+ + ☆ Text Region Multiple Information Perception Network for Scene Text + Detection ICASSP 2024 + + +
+ Segmentation-based scene text detection algorithms can handle arbitrary shape +scene texts and have strong robustness and adaptability, so it has attracted +wide attention. Existing segmentation-based scene text detection algorithms +usually only segment the pixels in the center region of the text, while +ignoring other information of the text region, such as edge information, +distance information, etc., thus limiting the detection accuracy of the +algorithm for scene text. This paper proposes a plug-and-play module called the +Region Multiple Information Perception Module (RMIPM) to enhance the detection +performance of segmentation-based algorithms. Specifically, we design an +improved module that can perceive various types of information about scene text +regions, such as text foreground classification maps, distance maps, direction +maps, etc. Experiments on MSRA-TD500 and TotalText datasets show that our +method achieves comparable performance with current state-of-the-art +algorithms. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ CPCL: Cross-Modal Prototypical Contrastive Learning for Weakly + Supervised Text-based Person Re-Identification + + +
+ Weakly supervised text-based person re-identification (TPRe-ID) seeks to
+retrieve images of a target person using textual descriptions without relying
+on identity annotations, which makes it more challenging and practical. The
+primary challenge is the intra-class differences, encompassing intra-modal
+feature variations and cross-modal semantic gaps. Prior works have focused on
+instance-level samples and ignored the prototypical features of each person,
+which are intrinsic and invariant. To this end, we propose a Cross-Modal
+Prototypical Contrastive Learning (CPCL) method. In practice, CPCL introduces
+the CLIP model to weakly supervised TPRe-ID for the first time, mapping visual
+and textual instances into a shared latent space. Subsequently, the proposed
+Prototypical Multi-modal Memory (PMM) module captures associations between
+heterogeneous modalities of image-text pairs belonging to the same person
+through the Hybrid Cross-modal Matching (HCM) module in a many-to-many mapping
+fashion. Moreover, the Outlier Pseudo Label Mining (OPLM) module further
+distinguishes valuable outlier samples from each modality, enhancing the
+creation of more reliable clusters by mining implicit relationships between
+image-text pairs. Experimental results demonstrate that our proposed CPCL
+attains state-of-the-art performance on all three public datasets, with a
+significant improvement of 11.58%, 8.77% and 5.25% in Rank@1 accuracy on the
+CUHK-PEDES, ICFG-PEDES and RSTPReid datasets, respectively. The code is
+available at https://github.com/codeGallery24/CPCL.
+
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Advancing Large Multi-modal Models with Explicit Chain-of-Reasoning and + Visual Question Generation + + +
+ The increasing demand for intelligent systems capable of interpreting and
+reasoning about visual content requires the development of Large Multi-Modal
+Models (LMMs) that are not only accurate but also have explicit reasoning
+capabilities. This paper presents a novel approach to imbue an LMM with the
+ability to conduct explicit reasoning based on visual content and textual
+instructions. We introduce a system that can ask a question to acquire
+necessary knowledge, thereby enhancing the robustness and explicability of the
+reasoning process. Our method comprises the development of a novel dataset
+generated by a Large Language Model (LLM), designed to promote chain-of-thought
+reasoning combined with a question-asking mechanism. We design an LMM with
+strong region-awareness capabilities to address the intricate requirements of
+image-text alignment. The model undergoes a three-stage training phase,
+starting with large-scale image-text alignment using a large-scale dataset,
+followed by instruction tuning, and fine-tuning with a focus on
+chain-of-thought reasoning. The results demonstrate a stride toward a more
+robust, accurate, and interpretable LMM, capable of reasoning explicitly and
+seeking information proactively when confronted with ambiguous visual input.
+
+
+
+
+
+ + ☆ BPDO:Boundary Points Dynamic Optimization for Arbitrary Shape Scene Text + Detection ICASSP 2024 + + +
+ Arbitrary shape scene text detection is of great importance in scene
+understanding tasks. Due to the complexity and diversity of text in natural
+scenes, existing scene text algorithms have limited accuracy for detecting
+arbitrary shape text. In this paper, we propose a novel arbitrary shape scene
+text detector through boundary points dynamic optimization (BPDO). The proposed
+model is designed with a text aware module (TAM) and a boundary point dynamic
+optimization module (DOM). Specifically, the model designs a text aware module
+based on segmentation to obtain boundary points describing the central region
+of the text by extracting a priori information about the text region. Then,
+based on the idea of deformable attention, it proposes a dynamic optimization
+model for boundary points, which gradually optimizes the exact position of the
+boundary points based on the information of the adjacent region of each
+boundary point. Experiments on the CTW-1500, Total-Text, and MSRA-TD500
+datasets show that the proposed model achieves performance better than or
+comparable to state-of-the-art algorithms, proving its effectiveness.
+
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ Developing an AI-based Integrated System for Bee Health Evaluation + + +
+ Honey bees pollinate about one-third of the world's food supply, but bee +colonies have alarmingly declined by nearly 40% over the past decade due to +several factors, including pesticides and pests. Traditional methods for +monitoring beehives, such as human inspection, are subjective, disruptive, and +time-consuming. To overcome these limitations, artificial intelligence has been +used to assess beehive health. However, previous studies have lacked an +end-to-end solution and primarily relied on data from a single source, either +bee images or sounds. This study introduces a comprehensive system consisting +of bee object detection and health evaluation. Additionally, it utilized a +combination of visual and audio signals to analyze bee behaviors. An +Attention-based Multimodal Neural Network (AMNN) was developed to adaptively +focus on key features from each type of signal for accurate bee health +assessment. The AMNN achieved an overall accuracy of 92.61%, surpassing eight +existing single-signal Convolutional Neural Networks and Recurrent Neural +Networks. It outperformed the best image-based model by 32.51% and the top +sound-based model by 13.98% while maintaining efficient processing times. +Furthermore, it improved prediction robustness, attaining an F1-score higher +than 90% across all four evaluated health conditions. The study also shows that +audio signals are more reliable than images for assessing bee health. By +seamlessly integrating AMNN with image and sound data in a comprehensive bee +health monitoring system, this approach provides a more efficient and +non-invasive solution for the early detection of bee diseases and the +preservation of bee colonies. + +
+
+
+
+
+ + ☆ WorldDreamer: Towards General World Models for Video Generation via + Predicting Masked Tokens + + +
+ World models play a crucial role in understanding and predicting the dynamics
+of the world, which is essential for video generation. However, existing world
+models are confined to specific scenarios such as gaming or driving, limiting
+their ability to capture the complexity of general world dynamic environments.
+Therefore, we introduce WorldDreamer, a pioneering world model that fosters a
+comprehensive understanding of general world physics and motions, which
+significantly enhances the capabilities of video generation. Drawing
+inspiration from the success of large language models, WorldDreamer frames
+world modeling as an unsupervised visual sequence modeling challenge. This is
+achieved by mapping visual inputs to discrete tokens and predicting the masked
+ones. During this process, we incorporate multi-modal prompts to facilitate
+interaction within the world model. Our experiments show that WorldDreamer
+excels in generating videos across different scenarios, including natural
+scenes and driving environments. WorldDreamer showcases versatility in
+executing tasks such as text-to-video conversion, image-to-video synthesis, and
+video editing. These results underscore WorldDreamer's effectiveness in
+capturing dynamic elements within diverse general world environments.
+
+
+ comment: project page: https://world-dreamer.github.io/ +
+
+
+
+
+ + ☆ Ventricular Segmentation: A Brief Comparison of U-Net Derivatives + + +
+ Medical imaging refers to the technologies and methods utilized to view the
+human body and its interior, in order to diagnose, monitor, or even treat
+medical disorders. This paper aims to explore the application of deep learning
+techniques in the semantic segmentation of cardiac short-axis MRI (Magnetic
+Resonance Imaging) images, aiming to enhance the diagnosis, monitoring, and
+treatment of medical disorders related to the heart. The focus centers on
+implementing various architectures that are derivatives of U-Net, to
+effectively isolate specific parts of the heart for comprehensive anatomical
+and functional analysis. Through a combination of images, graphs, and
+quantitative metrics, the efficacy of the models and their predictions is
+showcased. Additionally, this paper addresses the challenges encountered and
+outlines strategies for future improvements. This abstract provides a concise
+overview of the efforts in utilizing deep learning for cardiac image
+segmentation, emphasizing both the accomplishments and areas for further
+refinement.
+
+
+
+
+
+ + ☆ CustomVideo: Customizing Text-to-Video Generation with Multiple Subjects + + +
+ Customized text-to-video generation aims to generate high-quality videos
+guided by text prompts and subject references. Current approaches designed for
+single subjects struggle to handle multiple subjects, which is a more
+challenging and practical scenario. In this work, we aim to promote
+multi-subject guided text-to-video customization. We propose CustomVideo, a
+novel framework that can generate identity-preserving videos with the guidance
+of multiple subjects. To be specific, we first encourage the co-occurrence of
+multiple subjects by composing them in a single image. Further, upon a basic
+text-to-video diffusion model, we design a simple yet effective attention
+control strategy to disentangle different subjects in the latent space of the
+diffusion model. Moreover, to help the model focus on the specific object area,
+we segment the object from given reference images and provide a corresponding
+object mask for attention learning. Also, we collect a multi-subject
+text-to-video generation dataset as a comprehensive benchmark, with 69
+individual subjects and 57 meaningful pairs. Extensive qualitative,
+quantitative, and user study results demonstrate the superiority of our method
+compared with previous state-of-the-art approaches.
+
+
+ comment: 10 pages, 7 figures, 5 tables +
+
+
+
+
+ + ☆ Multi-task Learning for Joint Re-identification, Team Affiliation, and + Role Classification for Sports Visual Tracking + + +
+ Effective tracking and re-identification of players is essential for +analyzing soccer videos. But, it is a challenging task due to the non-linear +motion of players, the similarity in appearance of players from the same team, +and frequent occlusions. Therefore, the ability to extract meaningful +embeddings to represent players is crucial in developing an effective tracking +and re-identification system. In this paper, a multi-purpose part-based person +representation method, called PRTreID, is proposed that performs three tasks of +role classification, team affiliation, and re-identification, simultaneously. +In contrast to available literature, a single network is trained with +multi-task supervision to solve all three tasks, jointly. The proposed joint +method is computationally efficient due to the shared backbone. Also, the +multi-task learning leads to richer and more discriminative representations, as +demonstrated by both quantitative and qualitative results. To demonstrate the +effectiveness of PRTreID, it is integrated with a state-of-the-art tracking +method, using a part-based post-processing module to handle long-term tracking. +The proposed tracking method outperforms all existing tracking methods on the +challenging SoccerNet tracking dataset. + +
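The abstract describes a single shared backbone trained with three supervised objectives; a minimal PyTorch sketch of such a multi-task head (the backbone choice, layer sizes, and loss combination are illustrative assumptions, not the paper's exact design) might look like this:

```python
import torch.nn as nn
import torch.nn.functional as F
import torchvision

class MultiTaskReID(nn.Module):
    """Sketch of a PRTreID-style multi-task model: one shared backbone feeding
    a re-identification embedding, a role classifier and a team classifier.
    """
    def __init__(self, num_roles=4, num_teams=2, embed_dim=256):
        super().__init__()
        backbone = torchvision.models.resnet50(weights=None)
        backbone.fc = nn.Identity()              # keep the 2048-d pooled feature
        self.backbone = backbone
        self.reid_head = nn.Linear(2048, embed_dim)
        self.role_head = nn.Linear(2048, num_roles)
        self.team_head = nn.Linear(2048, num_teams)

    def forward(self, x):
        feat = self.backbone(x)                  # (B, 2048) shared representation
        embedding = F.normalize(self.reid_head(feat), dim=1)
        return embedding, self.role_head(feat), self.team_head(feat)

# Joint training would combine a metric loss on the embedding with
# cross-entropy losses on the two classification heads, e.g.:
# loss = triplet(emb, ids) + ce(role_logits, roles) + ce(team_logits, teams)
```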
+
+
+
+
+ + ☆ ICGNet: A Unified Approach for Instance-Centric Grasping + + +
+ Accurate grasping is the key to several robotic tasks including assembly and +household robotics. Executing a successful grasp in a cluttered environment +requires multiple levels of scene understanding: First, the robot needs to +analyze the geometric properties of individual objects to find feasible grasps. +These grasps need to be compliant with the local object geometry. Second, for +each proposed grasp, the robot needs to reason about the interactions with +other objects in the scene. Finally, the robot must compute a collision-free +grasp trajectory while taking into account the geometry of the target object. +Most grasp detection algorithms directly predict grasp poses in a monolithic +fashion, which does not capture the composability of the environment. In this +paper, we introduce an end-to-end architecture for object-centric grasping. The +method uses pointcloud data from a single arbitrary viewing direction as an +input and generates an instance-centric representation for each partially +observed object in the scene. This representation is further used for object +reconstruction and grasp detection in cluttered table-top scenes. We show the +effectiveness of the proposed method by extensively evaluating it against +state-of-the-art methods on synthetic datasets, indicating superior performance +for grasping and reconstruction. Additionally, we demonstrate real-world +applicability by decluttering scenes with varying numbers of objects. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ☆ MAMBA: Multi-level Aggregation via Memory Bank for Video Object + Detection + + +
+ State-of-the-art video object detection methods maintain a memory structure, +either a sliding window or a memory queue, to enhance the current frame using +attention mechanisms. However, we argue that these memory structures are not +efficient or sufficient because of two implied operations: (1) concatenating +all features in memory for enhancement, leading to a heavy computational cost; +(2) frame-wise memory updating, preventing the memory from capturing more +temporal information. In this paper, we propose a multi-level aggregation +architecture via memory bank called MAMBA. Specifically, our memory bank +employs two novel operations to eliminate the disadvantages of existing +methods: (1) light-weight key-set construction which can significantly reduce +the computational cost; (2) fine-grained feature-wise updating strategy which +enables our method to utilize knowledge from the whole video. To better enhance +features from complementary levels, i.e., feature maps and proposals, we +further propose a generalized enhancement operation (GEO) to aggregate +multi-level features in a unified manner. We conduct extensive evaluations on +the challenging ImageNetVID dataset. Compared with existing state-of-the-art +methods, our method achieves superior performance in terms of both speed and +accuracy. More remarkably, MAMBA achieves mAP of 83.7/84.6% at 12.6/9.1 FPS +with ResNet-101. Code is available at +https://github.com/guanxiongsun/video_feature_enhancement. + +
+
+
+
+
+ + ☆ BlenDA: Domain Adaptive Object Detection through diffusion-based + blending ICASSP + + +
+ Unsupervised domain adaptation (UDA) aims to transfer a model learned using
+labeled data from the source domain to unlabeled data in the target domain. To
+address the large domain gap issue between the source and target domains, we
+propose a novel regularization method for domain adaptive object detection,
+BlenDA, by generating pseudo samples of the intermediate domains and their
+corresponding soft domain labels for adaptation training. The intermediate
+samples are generated by dynamically blending the source images with their
+corresponding translated images using an off-the-shelf pre-trained
+text-to-image diffusion model, which takes the text label of the target domain
+as input and has demonstrated superior image-to-image translation quality.
+Based on experimental results from two adaptation benchmarks, our proposed
+approach can significantly enhance the performance of the state-of-the-art
+domain adaptive object detector, Adversarial Query Transformer (AQT).
+Particularly, in the Cityscapes to Foggy Cityscapes adaptation, we achieve an
+impressive 53.4% mAP on the Foggy Cityscapes dataset, surpassing the previous
+state-of-the-art by 1.5%. It is worth noting that our proposed method is also
+applicable to various paradigms of domain adaptive object detection. The code
+is available at: https://github.com/aiiu-lab/BlenDA
+
+
+ comment: ICASSP(2024):2024 IEEE International Conference on Acoustics, Speech + and Signal Processing +
+
+
+
+
+ + ☆ XAI-Enhanced Semantic Segmentation Models for Visual Quality Inspection + + +
+ Visual quality inspection systems, crucial in sectors like manufacturing and +logistics, employ computer vision and machine learning for precise, rapid +defect detection. However, their unexplained nature can hinder trust, error +identification, and system improvement. This paper presents a framework to +bolster visual quality inspection by using CAM-based explanations to refine +semantic segmentation models. Our approach consists of 1) Model Training, 2) +XAI-based Model Explanation, 3) XAI Evaluation, and 4) Annotation Augmentation +for Model Enhancement, informed by explanations and expert insights. +Evaluations show XAI-enhanced models surpass original DeepLabv3-ResNet101 +models, especially in intricate object segmentation. + +
+
+ comment: IEEE ICCE 2024 +
+
+
+
+
+ + ☆ Skeleton-Guided Instance Separation for Fine-Grained Segmentation in + Microscopy + + +
+ One of the fundamental challenges in microscopy (MS) image analysis is +instance segmentation (IS), particularly when segmenting cluster regions where +multiple objects of varying sizes and shapes may be connected or even +overlapped in arbitrary orientations. Existing IS methods usually fail in +handling such scenarios, as they rely on coarse instance representations such +as keypoints and horizontal bounding boxes (h-bboxes). In this paper, we +propose a novel one-stage framework named A2B-IS to address this challenge and +enhance the accuracy of IS in MS images. Our approach represents each instance +with a pixel-level mask map and a rotated bounding box (r-bbox). Unlike +two-stage methods that use box proposals for segmentations, our method +decouples mask and box predictions, enabling simultaneous processing to +streamline the model pipeline. Additionally, we introduce a Gaussian skeleton +map to aid the IS task in two key ways: (1) It guides anchor placement, +reducing computational costs while improving the model's capacity to learn +RoI-aware features by filtering out noise from background regions. (2) It +ensures accurate isolation of densely packed instances by rectifying erroneous +box predictions near instance boundaries. To further enhance the performance, +we integrate two modules into the framework: (1) An Atrous Attention Block +(A2B) designed to extract high-resolution feature maps with fine-grained +multiscale information, and (2) A Semi-Supervised Learning (SSL) strategy that +leverages both labeled and unlabeled images for model training. Our method has +been thoroughly validated on two large-scale MS datasets, demonstrating its +superiority over most state-of-the-art approaches. + +
+
+
+
+
+ + ☆ Question-Answer Cross Language Image Matching for Weakly Supervised + Semantic Segmentation ACM MM 2023 + + +
+ Class Activation Map (CAM) has emerged as a popular tool for weakly +supervised semantic segmentation (WSSS), allowing the localization of object +regions in an image using only image-level labels. However, existing CAM +methods suffer from under-activation of target object regions and +false-activation of background regions due to the fact that a lack of detailed +supervision can hinder the model's ability to understand the image as a whole. +In this paper, we propose a novel Question-Answer Cross-Language-Image Matching +framework for WSSS (QA-CLIMS), leveraging the vision-language foundation model +to maximize the text-based understanding of images and guide the generation of +activation maps. First, a series of carefully designed questions are posed to +the VQA (Visual Question Answering) model with Question-Answer Prompt +Engineering (QAPE) to generate a corpus of both foreground target objects and +backgrounds that are adaptive to query images. We then employ contrastive +learning in a Region Image Text Contrastive (RITC) network to compare the +obtained foreground and background regions with the generated corpus. Our +approach exploits the rich textual information from the open vocabulary as +additional supervision, enabling the model to generate high-quality CAMs with a +more complete object region and reduce false-activation of background regions. +We conduct extensive analysis to validate the proposed method and show that our +approach performs state-of-the-art on both PASCAL VOC 2012 and MS COCO +datasets. Code is available at: https://github.com/CVI-SZU/QA-CLIMS + +
+
+ comment: ACM MM 2023 +
+
+
+
+
+ + ☆ Boosting Few-Shot Segmentation via Instance-Aware Data Augmentation and + Local Consensus Guided Cross Attention + + +
+ Few-shot segmentation aims to train a segmentation model that can fast adapt +to a novel task for which only a few annotated images are provided. Most recent +models have adopted a prototype-based paradigm for few-shot inference. These +approaches may have limited generalization capacity beyond the standard 1- or +5-shot settings. In this paper, we closely examine and reevaluate the +fine-tuning based learning scheme that fine-tunes the classification layer of a +deep segmentation network pre-trained on diverse base classes. To improve the +generalizability of the classification layer optimized with sparsely annotated +samples, we introduce an instance-aware data augmentation (IDA) strategy that +augments the support images based on the relative sizes of the target objects. +The proposed IDA effectively increases the support set's diversity and promotes +the distribution consistency between support and query images. On the other +hand, the large visual difference between query and support images may hinder +knowledge transfer and cripple the segmentation performance. To cope with this +challenge, we introduce the local consensus guided cross attention (LCCA) to +align the query feature with support features based on their dense correlation, +further improving the model's generalizability to the query image. The +significant performance improvements on the standard few-shot segmentation +benchmarks PASCAL-$5^i$ and COCO-$20^i$ verify the efficacy of our proposed +method. + +
+
+
+
+
+ + ☆ Improving fine-grained understanding in image-text pre-training + + +
+ We introduce SPARse Fine-grained Contrastive Alignment (SPARC), a simple +method for pretraining more fine-grained multimodal representations from +image-text pairs. Given that multiple image patches often correspond to single +words, we propose to learn a grouping of image patches for every token in the +caption. To achieve this, we use a sparse similarity metric between image +patches and language tokens and compute for each token a language-grouped +vision embedding as the weighted average of patches. The token and +language-grouped vision embeddings are then contrasted through a fine-grained +sequence-wise loss that only depends on individual samples and does not require +other batch samples as negatives. This enables more detailed information to be +learned in a computationally inexpensive manner. SPARC combines this +fine-grained loss with a contrastive loss between global image and text +embeddings to learn representations that simultaneously encode global and local +information. We thoroughly evaluate our proposed method and show improved +performance over competing approaches both on image-level tasks relying on +coarse-grained information, e.g. classification, as well as region-level tasks +relying on fine-grained information, e.g. retrieval, object detection, and +segmentation. Moreover, SPARC improves model faithfulness and captioning in +foundational vision-language models. + +
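A rough sketch of the token-wise grouping step described above is given below; the min-max normalisation and hard threshold used for sparsification are illustrative assumptions, not SPARC's exact formulation:

```python
import torch
import torch.nn.functional as F

def language_grouped_vision_embeddings(patch_emb, token_emb, threshold=0.1):
    """For each caption token, form a vision embedding as a sparsely weighted
    average of image patches, in the spirit of the grouping described above.

    patch_emb: (B, P, D) patch embeddings; token_emb: (B, T, D) token
    embeddings. Returns (B, T, D) language-grouped vision embeddings.
    """
    sim = torch.einsum('btd,bpd->btp', token_emb, patch_emb)        # (B, T, P)
    # Normalise per token, then zero out weakly aligned patches (sparsity).
    sim_min = sim.min(dim=-1, keepdim=True).values
    sim_max = sim.max(dim=-1, keepdim=True).values
    weights = (sim - sim_min) / (sim_max - sim_min + 1e-6)
    weights = torch.where(weights > threshold, weights, torch.zeros_like(weights))
    weights = weights / (weights.sum(dim=-1, keepdim=True) + 1e-6)
    grouped = torch.einsum('btp,bpd->btd', weights, patch_emb)      # (B, T, D)
    return F.normalize(grouped, dim=-1)

# A fine-grained loss can then contrast each token embedding with its
# language-grouped vision embedding within the same sample (no cross-batch
# negatives), alongside the usual global image-text contrastive loss.
```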
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Temporal Insight Enhancement: Mitigating Temporal Hallucination in + Multimodal Large Language Models + + +
+ Recent advancements in Multimodal Large Language Models (MLLMs) have +significantly enhanced the comprehension of multimedia content, bringing +together diverse modalities such as text, images, and videos. However, a +critical challenge faced by these models, especially when processing video +inputs, is the occurrence of hallucinations - erroneous perceptions or +interpretations, particularly at the event level. This study introduces an +innovative method to address event-level hallucinations in MLLMs, focusing on +specific temporal understanding in video content. Our approach leverages a +novel framework that extracts and utilizes event-specific information from both +the event query and the provided video to refine MLLMs' response. We propose a +unique mechanism that decomposes on-demand event queries into iconic actions. +Subsequently, we employ models like CLIP and BLIP2 to predict specific +timestamps for event occurrences. Our evaluation, conducted using the +Charades-STA dataset, demonstrates a significant reduction in temporal +hallucinations and an improvement in the quality of event-related responses. +This research not only provides a new perspective in addressing a critical +limitation of MLLMs but also contributes a quantitatively measurable method for +evaluating MLLMs in the context of temporal-related questions. + +
+
+ comment: 7 pages, 7 figures +
+
+
+
+
+ + ☆ Enhancing the Fairness and Performance of Edge Cameras with Explainable + AI + + +
+ The rising use of Artificial Intelligence (AI) in human detection on Edge +camera systems has led to accurate but complex models, challenging to interpret +and debug. Our research presents a diagnostic method using Explainable AI (XAI) +for model debugging, with expert-driven problem identification and solution +creation. Validated on the Bytetrack model in a real-world office Edge network, +we found the training dataset as the main bias source and suggested model +augmentation as a solution. Our approach helps identify model biases, essential +for achieving fair and trustworthy models. + +
+
+ comment: IEEE ICCE 2024 +
+
+
+
+
+ + ☆ Exploring Latent Cross-Channel Embedding for Accurate 3D Human Pose + Reconstruction in a Diffusion Framework + + +
+ Monocular 3D human pose estimation poses significant challenges due to the +inherent depth ambiguities that arise during the reprojection process from 2D +to 3D. Conventional approaches that rely on estimating an over-fit projection +matrix struggle to effectively address these challenges and often result in +noisy outputs. Recent advancements in diffusion models have shown promise in +incorporating structural priors to address reprojection ambiguities. However, +there is still ample room for improvement as these methods often overlook the +exploration of correlation between the 2D and 3D joint-level features. In this +study, we propose a novel cross-channel embedding framework that aims to fully +explore the correlation between joint-level features of 3D coordinates and +their 2D projections. In addition, we introduce a context guidance mechanism to +facilitate the propagation of joint graph attention across latent channels +during the iterative diffusion process. To evaluate the effectiveness of our +proposed method, we conduct experiments on two benchmark datasets, namely +Human3.6M and MPI-INF-3DHP. Our results demonstrate a significant improvement +in terms of reconstruction accuracy compared to state-of-the-art methods. The +code for our method will be made available online for further reference. + +
+
+
+
+
+ + ☆ Slicer Networks + + +
+ In medical imaging, scans often reveal objects with varied contrasts but +consistent internal intensities or textures. This characteristic enables the +use of low-frequency approximations for tasks such as segmentation and +deformation field estimation. Yet, integrating this concept into neural network +architectures for medical image analysis remains underexplored. In this paper, +we propose the Slicer Network, a novel architecture designed to leverage these +traits. Comprising an encoder utilizing models like vision transformers for +feature extraction and a slicer employing a learnable bilateral grid, the +Slicer Network strategically refines and upsamples feature maps via a +splatting-blurring-slicing process. This introduces an edge-preserving +low-frequency approximation for the network outcome, effectively enlarging the +effective receptive field. The enhancement not only reduces computational +complexity but also boosts overall performance. Experiments across different +medical imaging applications, including unsupervised and keypoints-based image +registration and lesion segmentation, have verified the Slicer Network's +improved accuracy and efficiency. + +
+
+ comment: 8 figures and 3 tables +
+
+
+
+
+ + ☆ Enhanced Automated Quality Assessment Network for Interactive Building + Segmentation in High-Resolution Remote Sensing Imagery + + +
+ In this research, we introduce an enhanced automated quality assessment
+network (IBS-AQSNet), an innovative solution for assessing the quality of
+interactive building segmentation within high-resolution remote sensing
+imagery. This is a new challenge in segmentation quality assessment, and our
+proposed IBS-AQSNet alleviates it by identifying missed and mistaken segment
+areas. First of all, to acquire robust image features, our method combines a
+robust, pre-trained backbone with a lightweight counterpart for comprehensive
+feature extraction from imagery and segmentation results. These features are
+then fused through a simple combination of concatenation, convolution layers,
+and residual connections. Additionally, IBS-AQSNet incorporates a multi-scale
+differential quality assessment decoder, proficient in pinpointing areas where
+the segmentation result is either missed or mistaken. Experiments on a
+newly-built EVLab-BGZ dataset, which includes over 39,198 buildings,
+demonstrate the superiority of the proposed method in automating segmentation
+quality assessment, thereby setting a new benchmark in the field.
+
+
+ comment: The manuscript is submitted to IEEE International Geoscience and + Remote Sensing Symposium(IGARSS2024) +
+
+
+
+
+ + ☆ Boosting Few-Shot Semantic Segmentation Via Segment Anything Model + + +
+ In semantic segmentation, accurate prediction masks are crucial for +downstream tasks such as medical image analysis and image editing. Due to the +lack of annotated data, few-shot semantic segmentation (FSS) performs poorly in +predicting masks with precise contours. Recently, we have noticed that the +large foundation model segment anything model (SAM) performs well in processing +detailed features. Inspired by SAM, we propose FSS-SAM to boost FSS methods by +addressing the issue of inaccurate contour. The FSS-SAM is training-free. It +works as a post-processing tool for any FSS methods and can improve the +accuracy of predicted masks. Specifically, we use predicted masks from FSS +methods to generate prompts and then use SAM to predict new masks. To avoid +predicting wrong masks with SAM, we propose a prediction result selection (PRS) +algorithm. The algorithm can remarkably decrease wrong predictions. Experiment +results on public datasets show that our method is superior to base FSS methods +in both quantitative and qualitative aspects. + +
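The post-processing idea is simple enough to sketch. Below, a hypothetical `sam_predict` wrapper stands in for an actual SAM predictor, and the IoU-based acceptance rule is only a stand-in for the paper's prediction result selection (PRS) algorithm:

```python
import numpy as np

def refine_with_sam(coarse_mask, sam_predict, iou_threshold=0.5):
    """Turn a coarse few-shot mask into a box prompt, ask SAM for a refined
    mask, and keep the refinement only if it agrees with the coarse
    prediction; otherwise fall back to the original mask.

    coarse_mask: boolean (H, W) array from any FSS method.
    sam_predict: callable box -> boolean (H, W) mask (hypothetical wrapper
    around a SAM predictor, assumed here).
    """
    ys, xs = np.nonzero(coarse_mask)
    if len(ys) == 0:
        return coarse_mask                      # nothing to prompt with
    box = np.array([xs.min(), ys.min(), xs.max(), ys.max()])
    refined = sam_predict(box).astype(bool)

    inter = np.logical_and(coarse_mask, refined).sum()
    union = np.logical_or(coarse_mask, refined).sum()
    iou = inter / max(union, 1)
    return refined if iou > iou_threshold else coarse_mask
```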
+
+
+
+
+ + ☆ Enhancing Small Object Encoding in Deep Neural Networks: Introducing + Fast&Focused-Net with Volume-wise Dot Product Layer + + +
+ In this paper, we introduce Fast&Focused-Net, a novel deep neural network +architecture tailored for efficiently encoding small objects into fixed-length +feature vectors. Contrary to conventional Convolutional Neural Networks (CNNs), +Fast&Focused-Net employs a series of our newly proposed layer, the Volume-wise +Dot Product (VDP) layer, designed to address several inherent limitations of +CNNs. Specifically, CNNs often exhibit a smaller effective receptive field than +their theoretical counterparts, limiting their vision span. Additionally, the +initial layers in CNNs produce low-dimensional feature vectors, presenting a +bottleneck for subsequent learning. Lastly, the computational overhead of CNNs, +particularly in capturing diverse image regions by parameter sharing, is +significantly high. The VDP layer, at the heart of Fast&Focused-Net, aims to +remedy these issues by efficiently covering the entire image patch information +with reduced computational demand. Experimental results demonstrate the prowess +of Fast&Focused-Net in a variety of applications. For small object +classification tasks, our network outperformed state-of-the-art methods on +datasets such as CIFAR-10, CIFAR-100, STL-10, SVHN-Cropped, and Fashion-MNIST. +In the context of larger image classification, when combined with a transformer +encoder (ViT), Fast&Focused-Net produced competitive results for OpenImages V6, +ImageNet-1K, and Places365 datasets. Moreover, the same combination showcased +unparalleled performance in text recognition tasks across SVT, IC15, SVTP, and +HOST datasets. This paper presents the architecture, the underlying motivation, +and extensive empirical evidence suggesting that Fast&Focused-Net is a +promising direction for efficient and focused deep learning. + +
+
+
+
+
+ + ☆ Multilingual Visual Speech Recognition with a Single Model by Learning + with Discrete Visual Speech Units + + +
+ This paper explores sentence-level Multilingual Visual Speech Recognition +with a single model for the first time. As the massive multilingual modeling of +visual data requires huge computational costs, we propose a novel strategy, +processing with visual speech units. Motivated by the recent success of the +audio speech unit, the proposed visual speech unit is obtained by discretizing +the visual speech features extracted from the self-supervised visual speech +model. To correctly capture multilingual visual speech, we first train the +self-supervised visual speech model on 5,512 hours of multilingual audio-visual +data. Through analysis, we verify that the visual speech units mainly contain +viseme information while suppressing non-linguistic information. By using the +visual speech units as the inputs of our system, we pre-train the model to +predict corresponding text outputs on massive multilingual data constructed by +merging several VSR databases. As both the inputs and outputs are discrete, we +can greatly improve the training efficiency compared to the standard VSR +training. Specifically, the input data size is reduced to 0.016% of the +original video inputs. In order to complement the insufficient visual +information in speech recognition, we apply curriculum learning where the +inputs of the system begin with audio-visual speech units and gradually change +to visual speech units. After pre-training, the model is finetuned on +continuous features. We set new state-of-the-art multilingual VSR performances +by achieving comparable performances to the previous language-specific VSR +models, with a single trained model. + +
+
+
+
+
+ + ☆ Wavelet-Guided Acceleration of Text Inversion in Diffusion-Based Image + Editing ICASSP + + +
+ In the field of image editing, Null-text Inversion (NTI) enables fine-grained +editing while preserving the structure of the original image by optimizing null +embeddings during the DDIM sampling process. However, the NTI process is +time-consuming, taking more than two minutes per image. To address this, we +introduce an innovative method that maintains the principles of the NTI while +accelerating the image editing process. We propose the WaveOpt-Estimator, which +determines the text optimization endpoint based on frequency characteristics. +Utilizing wavelet transform analysis to identify the image's frequency +characteristics, we can limit text optimization to specific timesteps during +the DDIM sampling process. By adopting the Negative-Prompt Inversion (NPI) +concept, a target prompt representing the original image serves as the initial +text value for optimization. This approach maintains performance comparable to +NTI while reducing the average editing time by over 80% compared to the NTI +method. Our method presents a promising approach for efficient, high-quality +image editing based on diffusion models. + +
+
+ comment: The International Conference on Acoustics, Speech, & Signal + Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ BreastRegNet: A Deep Learning Framework for Registration of Breast + Faxitron and Histopathology Images + + +
+ A standard treatment protocol for breast cancer entails administering
+neoadjuvant therapy followed by surgical removal of the tumor and surrounding
+tissue. Pathologists typically rely on cabinet X-ray radiographs, known as
+Faxitron, to examine the excised breast tissue and diagnose the extent of
+residual disease. However, accurately determining the location, size, and
+focality of residual cancer can be challenging, and incorrect assessments can
+lead to clinical consequences. The utilization of automated methods can improve
+the histopathology process, allowing pathologists to choose regions for
+sampling more effectively and precisely. Despite the recognized necessity,
+there are currently no such methods available. Training such automated
+detection models requires accurate ground truth labels on ex-vivo radiology
+images, which can be acquired by registering Faxitron and histopathology
+images and mapping the extent of cancer from histopathology to X-ray images.
+This study introduces a deep learning-based image registration approach trained
+on mono-modal synthetic image pairs. The models were trained using data from 50
+women who received neoadjuvant chemotherapy and underwent surgery. The results
+demonstrate that our method is faster and yields significantly lower average
+landmark error ($2.1\pm1.96$ mm) than the state-of-the-art iterative
+($4.43\pm4.1$ mm) and deep learning ($4.02\pm3.15$ mm) approaches. The improved
+performance of our approach in integrating radiology and pathology information
+facilitates generating large datasets, which allows training models for more
+accurate breast cancer detection.
+
+
+
+
+
+ + ☆ Adaptive Self-training Framework for Fine-grained Scene Graph Generation ICLR 2024 + + +
+ Scene graph generation (SGG) models have suffered from inherent problems +regarding the benchmark datasets such as the long-tailed predicate distribution +and missing annotation problems. In this work, we aim to alleviate the +long-tailed problem of SGG by utilizing unannotated triplets. To this end, we +introduce a Self-Training framework for SGG (ST-SGG) that assigns pseudo-labels +for unannotated triplets based on which the SGG models are trained. While there +has been significant progress in self-training for image recognition, designing +a self-training framework for the SGG task is more challenging due to its +inherent nature such as the semantic ambiguity and the long-tailed distribution +of predicate classes. Hence, we propose a novel pseudo-labeling technique for +SGG, called Class-specific Adaptive Thresholding with Momentum (CATM), which is +a model-agnostic framework that can be applied to any existing SGG models. +Furthermore, we devise a graph structure learner (GSL) that is beneficial when +adopting our proposed self-training framework to the state-of-the-art +message-passing neural network (MPNN)-based SGG models. Our extensive +experiments verify the effectiveness of ST-SGG on various SGG models, +particularly in enhancing the performance on fine-grained predicate classes. + +
+
+ comment: 9 pages; ICLR 2024 +
+
+
+
+
+ + ☆ On the Audio Hallucinations in Large Audio-Video Language Models + + +
+ Large audio-video language models can generate descriptions for both video
+and audio. However, they sometimes ignore audio content, producing audio
+descriptions solely reliant on visual information. This paper refers to this as
+audio hallucination and analyzes it in large audio-video language models. We
+gather 1,000 sentences by inquiring about audio information and annotate
+whether they contain hallucinations. If a sentence is hallucinated, we also
+categorize the type of hallucination. The results reveal that 332 sentences are
+hallucinated, with distinct trends observed in nouns and verbs for each
+hallucination type. Based on this, we tackle the task of audio hallucination
+classification using pre-trained audio-text models in the zero-shot and
+fine-tuning settings. Our experimental results reveal that the zero-shot models
+achieve higher performance (52.2% F1) than the random baseline (40.3%), and the
+fine-tuned models achieve 87.9%, outperforming the zero-shot models.
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SEINE: Structure Encoding and Interaction Network for Nuclei Instance + Segmentation + + +
+ Nuclei instance segmentation in histopathological images is of great +importance for biological analysis and cancer diagnosis but remains challenging +for two reasons. (1) Similar visual presentation of intranuclear and +extranuclear regions of chromophobe nuclei often causes under-segmentation, and +(2) current methods lack the exploration of nuclei structure, resulting in +fragmented instance predictions. To address these problems, this paper proposes +a structure encoding and interaction network, termed SEINE, which develops the +structure modeling scheme of nuclei and exploits the structure similarity +between nuclei to improve the integrality of each segmented instance. +Concretely, SEINE introduces a contour-based structure encoding (SE) that +considers the correlation between nuclei structure and semantics, realizing a +reasonable representation of the nuclei structure. Based on the encoding, we +propose a structure-guided attention (SGA) that takes the clear nuclei as +prototypes to enhance the structure learning for the fuzzy nuclei. To +strengthen the structural learning ability, a semantic feature fusion (SFF) is +presented to boost the semantic consistency of semantic and structure branches. +Furthermore, a position enhancement (PE) method is applied to suppress +incorrect nuclei boundary predictions. Extensive experiments demonstrate the +superiority of our approaches, and SEINE achieves state-of-the-art (SOTA) +performance on four datasets. The code is available at +\href{https://github.com/zhangye-zoe/SEINE}{https://github.com/zhangye-zoe/SEINE}. + +
+
+ comment: 10 pages, 12 figures, 6 tables, submitted to TMI +
+
+
+
+
+ + ☆ CLIP Model for Images to Textual Prompts Based on Top-k Neighbors + + +
+ Text-to-image synthesis, a subfield of multimodal generation, has gained
+significant attention in recent years. We propose a cost-effective approach for
+image-to-prompt generation that leverages generative models to generate textual
+prompts without the need for large amounts of annotated data. Our method uses a
+combination of the CLIP model and the K-nearest neighbors (KNN) algorithm, and
+the proposed system consists of two main parts: an offline task and an online
+task. Our method achieves the highest score, 0.612, among these models, which
+is 0.013, 0.055, and 0.011 higher than CLIP and CLIP + KNN (top 10),
+respectively.
+
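A minimal sketch of the retrieval part of such a pipeline, assuming CLIP embeddings for the query image and for a bank of candidate prompts have already been computed offline (the plain top-k cosine ranking is an assumption, since the abstract does not detail the aggregation):

```python
import numpy as np

def knn_prompts(image_emb, bank_embs, bank_prompts, k=10):
    """Retrieve candidate textual prompts for an image by cosine similarity
    over a bank of (embedding, prompt) pairs.

    image_emb: (D,) CLIP image embedding of the query.
    bank_embs: (N, D) precomputed CLIP embeddings of the prompt bank.
    bank_prompts: list of N prompt strings.
    Returns the top-k (prompt, similarity) pairs, best first.
    """
    image_emb = image_emb / np.linalg.norm(image_emb)
    bank = bank_embs / np.linalg.norm(bank_embs, axis=1, keepdims=True)
    sims = bank @ image_emb                      # cosine similarity per entry
    top = np.argsort(-sims)[:k]
    return [(bank_prompts[i], float(sims[i])) for i in top]
```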
+
+ comment: CLIP model, KNN, image-to-prompts +
+
+
+
+
+ + ☆ SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech + Recognition + + +
+ Audio-visual speech recognition (AVSR) is a multimodal extension of automatic +speech recognition (ASR), using video as a complement to audio. In AVSR, +considerable efforts have been directed at datasets for facial features such as +lip-readings, while they often fall short in evaluating the image comprehension +capabilities in broader contexts. In this paper, we construct SlideAVSR, an +AVSR dataset using scientific paper explanation videos. SlideAVSR provides a +new benchmark where models transcribe speech utterances with texts on the +slides on the presentation recordings. As technical terminologies that are +frequent in paper explanations are notoriously challenging to transcribe +without reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR +problems. As a simple yet effective baseline, we propose DocWhisper, an AVSR +model that can refer to textual information from slides, and confirm its +effectiveness on SlideAVSR. + +
+
+
+
+
+ + ☆ Image Translation as Diffusion Visual Programmers + + +
+ We introduce the novel Diffusion Visual Programmer (DVP), a neuro-symbolic +image translation framework. Our proposed DVP seamlessly embeds a +condition-flexible diffusion model within the GPT architecture, orchestrating a +coherent sequence of visual programs (i.e., computer vision models) for various +pro-symbolic steps, which span RoI identification, style transfer, and position +manipulation, facilitating transparent and controllable image translation +processes. Extensive experiments demonstrate DVP's remarkable performance, +surpassing concurrent arts. This success can be attributed to several key +features of DVP: First, DVP achieves condition-flexible translation via +instance normalization, enabling the model to eliminate sensitivity caused by +the manual guidance and optimally focus on textual descriptions for +high-quality content generation. Second, the framework enhances in-context +reasoning by deciphering intricate high-dimensional concepts in feature spaces +into more accessible low-dimensional symbols (e.g., [Prompt], [RoI object]), +allowing for localized, context-free editing while maintaining overall +coherence. Last but not least, DVP improves systemic controllability and +explainability by offering explicit symbolic representations at each +programming stage, empowering users to intuitively interpret and modify +results. Our research marks a substantial step towards harmonizing artificial +image translation processes with cognitive intelligence, promising broader +applications. + +
+
+ comment: 25 pages, 20 figures +
+
+
+
+
+ + ☆ Measuring the Discrepancy between 3D Geometric Models using Directional + Distance Fields + + +
+ Quantifying the discrepancy between 3D geometric models, which may be represented as either point clouds or triangle meshes, is a pivotal issue with broad applications. Existing methods mainly focus on directly establishing the correspondence between two models and then aggregating the point-wise distances between corresponding points, which makes them either inefficient or ineffective. In this paper, we propose DirDist, an efficient, effective, robust, and differentiable distance metric for 3D geometry data. Specifically, we construct DirDist based on a proposed implicit representation of 3D models, the directional distance field (DDF), which defines the directional distances of 3D points to a model to capture its local surface geometry. We then recast the discrepancy between two 3D geometric models as the discrepancy between their DDFs defined on an identical domain, which naturally establishes model correspondence. To demonstrate the advantage of DirDist, we explore various distance-metric-driven 3D geometric modeling tasks, including template surface fitting, rigid registration, non-rigid registration, scene flow estimation, and human pose optimization. Extensive experiments show that DirDist achieves significantly higher accuracy on all tasks. As a generic distance metric, DirDist has the potential to advance the field of 3D geometric modeling. The source code is available at \url{https://github.com/rsy6318/DirDist}.
+
+
+
+
+ + ☆ Instance Brownian Bridge as Texts for Open-vocabulary Video Instance + Segmentation + + +
+ Temporally locating objects with arbitrary class texts is the primary pursuit of open-vocabulary Video Instance Segmentation (VIS). Because of the insufficient vocabulary of video data, previous methods leverage image-text pretraining models to recognize object instances by separately aligning each frame with the class texts, ignoring the correlation between frames. As a result, this separation breaks the instance movement context of videos, causing inferior alignment between video and text. To tackle this issue, we propose to link frame-level instance representations as a Brownian bridge to model instance dynamics and to align bridge-level instance representations with class texts for more precise open-vocabulary VIS (BriVIS). Specifically, we build our system upon a frozen video segmentor to generate frame-level instance queries, and design a Temporal Instance Resampler (TIR) to generate queries with temporal context from the frame queries. To mold instance queries to follow the Brownian bridge and accomplish alignment with class texts, we design Bridge-Text Alignment (BTA) to learn discriminative bridge-level representations of instances via contrastive objectives. Using MinVIS as the basic video segmentor, BriVIS surpasses the open-vocabulary SOTA (OV2Seg) by a clear margin. For example, on the challenging large-vocabulary VIS dataset (BURST), BriVIS achieves 7.43 mAP, a 49.49% improvement over OV2Seg (4.97 mAP).
+
+
+
+
+ + ☆ fast graph-based denoising for point cloud color information ICASSP 2024 + + +
+ Point clouds are utilized in various 3D applications such as cross-reality +(XR) and realistic 3D displays. In some applications, e.g., for live streaming +using a 3D point cloud, real-time point cloud denoising methods are required to +enhance the visual quality. However, conventional high-precision denoising +methods cannot be executed in real time for large-scale point clouds owing to +the complexity of graph constructions with K nearest neighbors and noise level +estimation. This paper proposes a fast graph-based denoising (FGBD) for a +large-scale point cloud. First, high-speed graph construction is achieved by +scanning a point cloud in various directions and searching adjacent +neighborhoods on the scanning lines. Second, we propose a fast noise level +estimation method using eigenvalues of the covariance matrix on a graph. +Finally, we also propose a new low-cost filter selection method to enhance +denoising accuracy to compensate for the degradation caused by the acceleration +algorithms. In our experiments, we succeeded in reducing the processing time +dramatically while maintaining accuracy relative to conventional denoising +methods. Denoising was performed at 30fps, with frames containing approximately +1 million points. + +
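For orientation, the following is a sketch of conventional graph-based color smoothing on a kNN graph, i.e., the kind of baseline that the accelerated method described above improves on; it does not reproduce the proposed scan-line graph construction, noise-level estimation, or filter selection, and all parameter choices are assumptions:

```python
import numpy as np
from scipy.sparse import csr_matrix
from scipy.spatial import cKDTree

def smooth_colors(points, colors, k=8, lam=0.5):
    """One step of graph-Laplacian smoothing of per-point colors.
    points: (N, 3) float array, colors: (N, 3) array in [0, 1]."""
    n = len(points)
    dists, idx = cKDTree(points).query(points, k=k + 1)   # self + k neighbours
    rows = np.repeat(np.arange(n), k)
    cols = idx[:, 1:].ravel()
    sigma = dists[:, 1:].mean() + 1e-12
    w = np.exp(-(dists[:, 1:].ravel() / sigma) ** 2)
    W = csr_matrix((w, (rows, cols)), shape=(n, n))
    W = 0.5 * (W + W.T)                                    # symmetrise weights
    deg_inv = 1.0 / np.maximum(np.asarray(W.sum(axis=1)).ravel(), 1e-12)
    neighbour_avg = deg_inv[:, None] * (W @ colors)        # D^{-1} W x
    return (1.0 - lam) * colors + lam * neighbour_avg
```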
+
+ comment: Published in the proceeding of 2024 IEEE International Conference on + Acoustics, Speech and Signal Processing (ICASSP 2024) +
+
+
+
+
+ + ☆ GaussianBody: Clothed Human Reconstruction via 3d Gaussian Splatting + + +
+ In this work, we propose a novel clothed human reconstruction method called GaussianBody, based on 3D Gaussian Splatting. Compared with costly neural-radiance-based models, 3D Gaussian Splatting has recently demonstrated strong performance in terms of training time and rendering quality. However, applying the static 3D Gaussian Splatting model to the dynamic human reconstruction problem is non-trivial due to complicated non-rigid deformations and rich cloth details. To address these challenges, our method uses explicit pose-guided deformation to associate dynamic Gaussians across the canonical space and the observation space; introducing a physically-based prior with regularized transformations helps mitigate the ambiguity between the two spaces. During training, we further propose a pose refinement strategy that updates the pose regression to compensate for inaccurate initial estimation, and a split-with-scale mechanism to enhance the density of the regressed point clouds. Experiments validate that our method achieves state-of-the-art photorealistic novel-view rendering results with high-quality details for dynamic clothed human bodies, along with explicit geometry reconstruction.
+
+
+
+
+ + ☆ HCVP: Leveraging Hierarchical Contrastive Visual Prompt for Domain + Generalization + + +
+ Domain Generalization (DG) endeavors to create machine learning models that +excel in unseen scenarios by learning invariant features. In DG, the prevalent +practice of constraining models to a fixed structure or uniform +parameterization to encapsulate invariant features can inadvertently blend +specific aspects. Such an approach struggles with nuanced differentiation of +inter-domain variations and may exhibit bias towards certain domains, hindering +the precise learning of domain-invariant features. Recognizing this, we +introduce a novel method designed to supplement the model with domain-level and +task-specific characteristics. This approach aims to guide the model in more +effectively separating invariant features from specific characteristics, +thereby boosting the generalization. Building on the emerging trend of visual +prompts in the DG paradigm, our work introduces the novel \textbf{H}ierarchical +\textbf{C}ontrastive \textbf{V}isual \textbf{P}rompt (HCVP) methodology. This +represents a significant advancement in the field, setting itself apart with a +unique generative approach to prompts, alongside an explicit model structure +and specialized loss functions. Differing from traditional visual prompts that +are often shared across entire datasets, HCVP utilizes a hierarchical prompt +generation network enhanced by prompt contrastive learning. These generative +prompts are instance-dependent, catering to the unique characteristics inherent +to different domains and tasks. Additionally, we devise a prompt modulation +network that serves as a bridge, effectively incorporating the generated visual +prompts into the vision transformer backbone. Experiments conducted on five DG +datasets demonstrate the effectiveness of HCVP, outperforming both established +DG algorithms and adaptation protocols. + +
+
+
+
+
+ + ☆ SkyEyeGPT: Unifying Remote Sensing Vision-Language Tasks via Instruction + Tuning with Large Language Model + + +
+ Large language models (LLMs) have recently been extended to the +vision-language realm, obtaining impressive general multi-modal capabilities. +However, the exploration of multi-modal large language models (MLLMs) for +remote sensing (RS) data is still in its infancy, and the performance is not +satisfactory. In this work, we introduce SkyEyeGPT, a unified multi-modal large +language model specifically designed for RS vision-language understanding. To +this end, we meticulously curate an RS multi-modal instruction tuning dataset, +including single-task and multi-task conversation instructions. After manual +verification, we obtain a high-quality RS instruction-following dataset with +968k samples. Our research demonstrates that with a simple yet effective +design, SkyEyeGPT works surprisingly well on considerably different tasks +without the need for extra encoding modules. Specifically, after projecting RS +visual features to the language domain via an alignment layer, they are fed +jointly with task-specific instructions into an LLM-based RS decoder to predict +answers for RS open-ended tasks. In addition, we design a two-stage tuning +method to enhance instruction-following and multi-turn dialogue ability at +different granularities. Experiments on 8 datasets for RS vision-language tasks +demonstrate SkyEyeGPT's superiority in image-level and region-level tasks, such +as captioning and visual grounding. In particular, SkyEyeGPT exhibits +encouraging results compared to GPT-4V in some qualitative tests. The online +demo, code, and dataset will be released in +https://github.com/ZhanYang-nwpu/SkyEyeGPT. + +
+
+
+
+
+ + ☆ P2Seg: Pointly-supervised Segmentation via Mutual Distillation ICLR2024 + + +
+ Point-level Supervised Instance Segmentation (PSIS) aims to enhance the +applicability and scalability of instance segmentation by utilizing low-cost +yet instance-informative annotations. Existing PSIS methods usually rely on +positional information to distinguish objects, but predicting precise +boundaries remains challenging due to the lack of contour annotations. +Nevertheless, weakly supervised semantic segmentation methods are proficient in +utilizing intra-class feature consistency to capture the boundary contours of +the same semantic regions. In this paper, we design a Mutual Distillation +Module (MDM) to leverage the complementary strengths of both instance position +and semantic information and achieve accurate instance-level object perception. +The MDM consists of Semantic to Instance (S2I) and Instance to Semantic (I2S). +S2I is guided by the precise boundaries of semantic regions to learn the +association between annotated points and instance contours. I2S leverages +discriminative relationships between instances to facilitate the +differentiation of various objects within the semantic map. Extensive +experiments substantiate the efficacy of MDM in fostering the synergy between +instance and semantic information, consequently improving the quality of +instance-level object representations. Our method achieves 55.7 mAP$_{50}$ and +17.6 mAP on the PASCAL VOC and MS COCO datasets, significantly outperforming +recent PSIS methods and several box-supervised instance segmentation +competitors. + +
+
+ comment: 14 pages, 12 figures, published to ICLR2024 +
+
+
+
+
+ + ☆ Eye Motion Matters for 3D Face Reconstruction + + +
+ Recent advances in single-image 3D face reconstruction have shown remarkable +progress in various applications. Nevertheless, prevailing techniques tend to +prioritize the global facial contour and expression, often neglecting the +nuanced dynamics of the eye region. In response, we introduce an Eye Landmark +Adjustment Module, complemented by a Local Dynamic Loss, designed to capture +the dynamic features of the eyes area. Our module allows for flexible +adjustment of landmarks, resulting in accurate recreation of various eye +states. In this paper, we present a comprehensive evaluation of our approach, +conducting extensive experiments on two datasets. The results underscore the +superior performance of our approach, highlighting its significant +contributions in addressing this particular challenge. + +
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) is widely adopted in computer vision to generate new images with arbitrary styles. This process leverages neural networks to merge the aesthetic elements of a style image with the structural aspects of a content image into a harmoniously integrated visual result. However, unauthorized NST can exploit artwork. Such misuse raises socio-technical concerns regarding artists' rights and motivates the development of technical approaches for the proactive protection of original creations. Adversarial attack is a concept primarily explored in machine learning security, and our work introduces this technique to protect artists' intellectual property. In this paper, we propose the Locally Adaptive Adversarial Color Attack (LAACA), a method for altering images in a manner imperceptible to the human eye but disruptive to NST. Specifically, we design perturbations targeting image areas rich in high-frequency content, generated by disrupting intermediate features. Our experiments and user study confirm that attacking NST with the proposed method results in visually worse neural style transfer, making it an effective solution for visual artwork protection.
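The sketch below shows a generic feature-disruption perturbation in this spirit (a PGD-style attack that pushes intermediate VGG-19 features away from those of the clean image); it omits LAACA's locally adaptive, color-space and high-frequency-targeted design, and the layer choice and step sizes are assumptions:

```python
import torch
import torchvision.models as models

def feature_disruption_attack(image, eps=8 / 255, alpha=2 / 255, steps=10):
    """PGD-style perturbation that pushes intermediate VGG-19 features away
    from those of the clean image. `image` is a (1, 3, H, W) tensor in [0, 1].
    The layer cut-off (:21) is an arbitrary mid-level choice."""
    vgg = models.vgg19(weights=models.VGG19_Weights.DEFAULT).features[:21].eval()
    for p in vgg.parameters():
        p.requires_grad_(False)
    with torch.no_grad():
        clean_feat = vgg(image)
    adv = image.detach().clone()
    for _ in range(steps):
        adv.requires_grad_(True)
        # minimising the negative MSE maximises the feature drift
        loss = -torch.nn.functional.mse_loss(vgg(adv), clean_feat)
        loss.backward()
        with torch.no_grad():
            adv = adv - alpha * adv.grad.sign()
            adv = image + (adv - image).clamp(-eps, eps)  # stay in the eps-ball
            adv = adv.clamp(0, 1)
    return adv.detach()
```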
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Towards Identifiable Unsupervised Domain Translation: A Diversified + Distribution Matching Approach + + +
+ Unsupervised domain translation (UDT) aims to find functions that convert samples from one domain (e.g., sketches) to another domain (e.g., photos) without changing the high-level semantic meaning (also referred to as ``content''). The translation functions are often sought by probability distribution matching of the transformed source domain and target domain. CycleGAN stands as arguably the most representative approach among this line of work. However, it was noticed in the literature that CycleGAN and its variants could fail to identify the desired translation functions and produce content-misaligned translations. This limitation arises due to the presence of multiple translation functions -- referred to as ``measure-preserving automorphisms'' (MPAs) -- in the solution space of the learning criteria. Despite awareness of such identifiability issues, solutions have remained elusive. This study delves into the core identifiability inquiry and introduces an MPA elimination theory. Our analysis shows that an MPA is unlikely to exist if multiple pairs of diverse cross-domain conditional distributions are matched by the learned translation function. Our theory leads to a UDT learner that uses distribution matching over auxiliary-variable-induced subsets of the domains, rather than over the entire data domains as in classical approaches. To the best of our knowledge, the proposed framework is the first to rigorously establish translation identifiability under reasonable UDT settings. Experiments corroborate our theoretical claims.
+
+
+
+
+ + ☆ M3BUNet: Mobile Mean Max UNet for Pancreas Segmentation on CT-Scans + + +
+ Segmenting organs in CT scan images is a necessary process for multiple downstream medical image analysis tasks. Currently, manual CT scan segmentation by radiologists is prevalent, especially for organs like the pancreas, which requires a high level of domain expertise for reliable segmentation due to factors like small organ size, occlusion, and varying shapes. When resorting to automated pancreas segmentation, these factors translate to limited reliable labeled data for training effective segmentation models. Consequently, the performance of contemporary pancreas segmentation models is still not within acceptable ranges. To improve on this, we propose M3BUNet, a fusion of the MobileNet and U-Net neural networks, equipped with a novel Mean-Max (MM) attention that operates in two stages to gradually segment pancreas CT images from coarse to fine with mask guidance for object detection. This approach empowers the network to surpass the segmentation performance achieved by similar network architectures and to achieve results on par with complex state-of-the-art methods, all while maintaining a low parameter count. Additionally, we introduce external contour segmentation as a preprocessing step for the coarse stage to assist in the segmentation process through image standardization. For the fine segmentation stage, we found that applying a wavelet decomposition filter to create multi-input images enhances pancreas segmentation performance. We extensively evaluate our approach on the widely known NIH pancreas dataset and the MSD pancreas dataset. Our approach demonstrates a considerable performance improvement, achieving an average Dice Similarity Coefficient (DSC) value of up to 89.53% and an Intersection Over Union (IOU) score of up to 81.16% for the NIH pancreas dataset, and 88.60% DSC and 79.90% IOU for the MSD pancreas dataset.
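A small sketch of the wavelet-based multi-input preprocessing mentioned for the fine stage, using PyWavelets; the choice of wavelet and of stacking the four sub-bands as channels is an assumption, not the authors' exact pipeline:

```python
import numpy as np
import pywt

def wavelet_inputs(ct_slice, wavelet="db1"):
    """Decompose a 2D CT slice into approximation and detail sub-bands and
    stack them as extra input channels for the fine-stage network."""
    cA, (cH, cV, cD) = pywt.dwt2(ct_slice.astype(np.float32), wavelet)
    return np.stack([cA, cH, cV, cD], axis=0)  # shape (4, H/2, W/2)
```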
+
+
+
+
+ + ☆ DataViz3D: A Novel Method Leveraging Online Holographic Modeling for Extensive Dataset Preprocessing and Visualization
+ DataViz3D is an innovative online software tool that transforms complex datasets into interactive 3D spatial models using holographic technology. This tool enables users to generate scatter plots within a 3D space, accurately mapped to the XYZ coordinates of the dataset, providing a vivid and intuitive understanding of the spatial relationships inherent in the data. DataViz3D's user-friendly interface makes advanced 3D modeling and holographic visualization accessible to a wide range of users, fostering new opportunities for collaborative research and education across various disciplines.
+
+
+
+
+ + ☆ Inflation with Diffusion: Efficient Temporal Adaptation for + Text-to-Video Super-Resolution WACV'24 + + +
+ We propose an efficient diffusion-based text-to-video super-resolution (SR) +tuning approach that leverages the readily learned capacity of pixel level +image diffusion model to capture spatial information for video generation. To +accomplish this goal, we design an efficient architecture by inflating the +weightings of the text-to-image SR model into our video generation framework. +Additionally, we incorporate a temporal adapter to ensure temporal coherence +across video frames. We investigate different tuning approaches based on our +inflated architecture and report trade-offs between computational costs and +super-resolution quality. Empirical evaluation, both quantitative and +qualitative, on the Shutterstock video dataset, demonstrates that our approach +is able to perform text-to-video SR generation with good visual quality and +temporal consistency. To evaluate temporal coherence, we also present +visualizations in video format in +https://drive.google.com/drive/folders/1YVc-KMSJqOrEUdQWVaI-Yfu8Vsfu_1aO?usp=sharing . + +
+
+ comment: WACV'24 workshop +
+
+
+
+
+ + ☆ Reconstructing the Invisible: Video Frame Restoration through Siamese + Masked Conditional Variational Autoencoder + + +
+ In the domain of computer vision, the restoration of missing information in +video frames is a critical challenge, particularly in applications such as +autonomous driving and surveillance systems. This paper introduces the Siamese +Masked Conditional Variational Autoencoder (SiamMCVAE), leveraging a siamese +architecture with twin encoders based on vision transformers. This innovative +design enhances the model's ability to comprehend lost content by capturing +intrinsic similarities between paired frames. SiamMCVAE proficiently +reconstructs missing elements in masked frames, effectively addressing issues +arising from camera malfunctions through variational inferences. Experimental +results robustly demonstrate the model's effectiveness in restoring missing +information, thus enhancing the resilience of computer vision systems. The +incorporation of Siamese Vision Transformer (SiamViT) encoders in SiamMCVAE +exemplifies promising potential for addressing real-world challenges in +computer vision, reinforcing the adaptability of autonomous systems in dynamic +environments. + +
+
+
+
+
+ + ☆ Analyzing and Mitigating Bias for Vulnerable Classes: Towards Balanced + Representation in Dataset + + +
+ The accuracy and fairness of perception systems in autonomous driving are +crucial, particularly for vulnerable road users. Mainstream research has looked +into improving the performance metrics for classification accuracy. However, +the hidden traits of bias inheritance in the AI models, class imbalances and +disparities in the datasets are often overlooked. In this context, our study +examines the class imbalances for vulnerable road users by focusing on class +distribution analysis, performance evaluation, and bias impact assessment. We +identify the concern of imbalances in class representation, leading to +potential biases in detection accuracy. Utilizing popular CNN models and Vision +Transformers (ViTs) with the nuScenes dataset, our performance evaluation +reveals detection disparities for underrepresented classes. We propose a +methodology for model optimization and bias mitigation, which includes data +augmentation, resampling, and metric-specific learning. Using the proposed +mitigation approaches, we see improvement in IoU(%) and NDS(%) metrics from +71.3 to 75.6 and 80.6 to 83.7 respectively, for the CNN model. Similarly, for +ViT, we observe improvement in IoU and NDS metrics from 74.9 to 79.2 and 83.8 +to 87.1 respectively. This research contributes to developing more reliable +models and datasets, enhancing inclusiveness for minority classes. + +
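One way to implement the resampling component of such a mitigation pipeline is inverse-frequency oversampling of rare (vulnerable) classes; the sketch below uses PyTorch's WeightedRandomSampler and is illustrative rather than the authors' exact procedure:

```python
import torch
from torch.utils.data import DataLoader, WeightedRandomSampler

def balanced_loader(dataset, labels, batch_size=32):
    """Oversample rare (vulnerable) classes with inverse-frequency weights."""
    labels = torch.as_tensor(labels)
    class_counts = torch.bincount(labels).float()
    sample_weights = 1.0 / class_counts[labels]      # one weight per sample
    sampler = WeightedRandomSampler(sample_weights,
                                    num_samples=len(labels),
                                    replacement=True)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler)
```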
+
+
+
+
+ + ☆ Agricultural Object Detection with You Look Only Once (YOLO) Algorithm: + A Bibliometric and Systematic Literature Review + + +
+ Vision is a major component in several digital technologies and tools used in +agriculture. The object detector, You Look Only Once (YOLO), has gained +popularity in agriculture in a relatively short span due to its +state-of-the-art performance. YOLO offers real-time detection with good +accuracy and is implemented in various agricultural tasks, including +monitoring, surveillance, sensing, automation, and robotics. The research and +application of YOLO in agriculture are accelerating rapidly but are fragmented +and multidisciplinary. Moreover, the performance characteristics (i.e., +accuracy, speed, computation) of the object detector influence the rate of +technology implementation and adoption in agriculture. Thus, the study aims to +collect extensive literature to document and critically evaluate the advances +and application of YOLO for agricultural object recognition. First, we +conducted a bibliometric review of 257 articles to understand the scholarly +landscape of YOLO in agricultural domain. Secondly, we conducted a systematic +review of 30 articles to identify current knowledge, gaps, and modifications in +YOLO for specific agricultural tasks. The study critically assesses and +summarizes the information on YOLO's end-to-end learning approach, including +data acquisition, processing, network modification, integration, and +deployment. We also discussed task-specific YOLO algorithm modification and +integration to meet the agricultural object or environment-specific challenges. +In general, YOLO-integrated digital tools and technologies show the potential +for real-time, automated monitoring, surveillance, and object handling to +reduce labor, production cost, and environmental impact while maximizing +resource efficiency. The study provides detailed documentation and +significantly advances the existing knowledge on applying YOLO in agriculture, +which can greatly benefit the scientific community. + +
+
+
+
+
+ + ☆ Harmonized Spatial and Spectral Learning for Robust and Generalized + Medical Image Segmentation + + +
+ Deep learning has demonstrated remarkable achievements in medical image segmentation. However, prevailing deep learning models struggle with poor generalization due to (i) intra-class variations, where the same class appears differently in different samples, and (ii) inter-class independence, which makes it difficult to capture intricate relationships between distinct objects and leads to more false negatives. This paper presents a novel approach that synergizes spatial and spectral representations to enhance domain-generalized medical image segmentation. We introduce the innovative Spectral Correlation Coefficient objective to improve the model's capacity to capture middle-order features and contextual long-range dependencies. This objective complements traditional spatial objectives by incorporating valuable spectral information. Extensive experiments reveal that optimizing this objective with existing architectures like UNet and TransUNet significantly enhances generalization, interpretability, and noise robustness, producing more confident predictions. For instance, in cardiac segmentation, we observe a 0.81 pp and 1.63 pp (pp = percentage point) improvement in DSC over UNet and TransUNet, respectively. Our interpretability study demonstrates that, in most tasks, objectives optimized with UNet outperform even TransUNet by introducing global contextual information alongside local details. These findings underscore the versatility and effectiveness of our proposed method across diverse imaging modalities and medical domains.
+
+
+
+
+ + ☆ ELRT: Efficient Low-Rank Training for Compact Convolutional Neural + Networks + + +
+ Low-rank compression, a popular model compression technique that produces compact convolutional neural networks (CNNs) with low rankness, has been well studied in the literature. On the other hand, low-rank training, as an alternative way to train low-rank CNNs from scratch, has so far received little attention. Unlike low-rank compression, low-rank training does not need pre-trained full-rank models, and the entire training phase is always performed on the low-rank structure, bringing attractive benefits for practical applications. However, existing low-rank training solutions still face several challenges, such as a considerable accuracy drop and/or the need to update full-size models during training. In this paper, we perform a systematic investigation of low-rank CNN training. By identifying the proper low-rank format and performance-improving strategy, we propose ELRT, an efficient low-rank training solution for high-accuracy, high-compactness, low-rank CNN models. Our extensive evaluation results for training various CNNs on different datasets demonstrate the effectiveness of ELRT.
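As a concrete picture of what training directly on a low-rank structure can mean, the sketch below factorizes a convolution into a rank-r bottleneck; this is one common low-rank format and is not necessarily the format identified in the paper:

```python
import torch.nn as nn

class LowRankConv2d(nn.Module):
    """Factorizes a k x k convolution into a rank-r bottleneck:
    a 1x1 projection to r channels followed by a k x k convolution."""
    def __init__(self, in_ch, out_ch, kernel_size, rank, stride=1, padding=0):
        super().__init__()
        self.reduce = nn.Conv2d(in_ch, rank, kernel_size=1, bias=False)
        self.expand = nn.Conv2d(rank, out_ch, kernel_size=kernel_size,
                                stride=stride, padding=padding, bias=False)

    def forward(self, x):
        return self.expand(self.reduce(x))
```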
+
+
+
+
+ + ♻ ☆ AGILE3D: Attention Guided Interactive Multi-object 3D Segmentation ICLR 2024 + + +
+ During interactive segmentation, a model and a user work together to +delineate objects of interest in a 3D point cloud. In an iterative process, the +model assigns each data point to an object (or the background), while the user +corrects errors in the resulting segmentation and feeds them back into the +model. The current best practice formulates the problem as binary +classification and segments objects one at a time. The model expects the user +to provide positive clicks to indicate regions wrongly assigned to the +background and negative clicks on regions wrongly assigned to the object. +Sequentially visiting objects is wasteful since it disregards synergies between +objects: a positive click for a given object can, by definition, serve as a +negative click for nearby objects. Moreover, a direct competition between +adjacent objects can speed up the identification of their common boundary. We +introduce AGILE3D, an efficient, attention-based model that (1) supports +simultaneous segmentation of multiple 3D objects, (2) yields more accurate +segmentation masks with fewer user clicks, and (3) offers faster inference. Our +core idea is to encode user clicks as spatial-temporal queries and enable +explicit interactions between click queries as well as between them and the 3D +scene through a click attention module. Every time new clicks are added, we +only need to run a lightweight decoder that produces updated segmentation +masks. In experiments with four different 3D point cloud datasets, AGILE3D sets +a new state-of-the-art. Moreover, we also verify its practicality in real-world +setups with real user studies. + +
+
+ comment: Accepted to ICLR 2024. Project page: https://ywyue.github.io/AGILE3D +
+
+
+
+
+ + ♻ ☆ On Error Propagation of Diffusion Models ICLR-2024 + + +
+ Although diffusion models (DMs) have shown promising performance in a number of tasks (e.g., speech synthesis and image generation), they might suffer from error propagation because of their sequential structure. However, this is not certain, because some sequential models, such as Conditional Random Fields (CRFs), are free from this problem. To address this issue, we develop a theoretical framework to mathematically formulate error propagation in the architecture of DMs. The framework contains three elements: the modular error, the cumulative error, and the propagation equation. The modular and cumulative errors are related by the propagation equation, which shows that DMs are indeed affected by error propagation. Our theoretical study also suggests that the cumulative error is closely related to the generation quality of DMs. Based on this finding, we apply the cumulative error as a regularization term to reduce error propagation. Because the term is computationally intractable, we derive its upper bound and design a bootstrap algorithm to efficiently estimate the bound for optimization. We have conducted extensive experiments on multiple image datasets, showing that our proposed regularization reduces error propagation, significantly improves vanilla DMs, and outperforms previous baselines.
+
+ comment: Accepted by ICLR-2024 +
+
+
+
+
+ + ♻ ☆ Benchmarking Robustness of Multimodal Image-Text Models under + Distribution Shift + + +
+ Multimodal image-text models have shown remarkable performance in the past +few years. However, evaluating robustness against distribution shifts is +crucial before adopting them in real-world applications. In this work, we +investigate the robustness of 12 popular open-sourced image-text models under +common perturbations on five tasks (image-text retrieval, visual reasoning, +visual entailment, image captioning, and text-to-image generation). In +particular, we propose several new multimodal robustness benchmarks by applying +17 image perturbation and 16 text perturbation techniques on top of existing +datasets. We observe that multimodal models are not robust to image and text +perturbations, especially to image perturbations. Among the tested perturbation +methods, character-level perturbations constitute the most severe distribution +shift for text, and zoom blur is the most severe shift for image data. We also +introduce two new robustness metrics (\textbf{MMI} for MultiModal Impact score +and \textbf{MOR} for Missing Object Rate) for proper evaluations of multimodal +models. We hope our extensive study sheds light on new directions for the +development of robust multimodal models. More details can be found on the +project webpage: \url{https://MMRobustness.github.io}. + +
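The two perturbations highlighted as most severe can be illustrated with naive implementations, sketched below under the assumption of RGB PIL images and plain ASCII text; benchmark-grade perturbations would be more carefully calibrated:

```python
import random
import numpy as np
from PIL import Image

def char_swap(text, p=0.1):
    """Character-level noise: randomly swap adjacent characters."""
    chars = list(text)
    for i in range(len(chars) - 1):
        if chars[i].isalpha() and random.random() < p:
            chars[i], chars[i + 1] = chars[i + 1], chars[i]
    return "".join(chars)

def zoom_blur(img, zooms=(1.0, 1.03, 1.06, 1.09)):
    """Zoom blur: average several slightly zoomed-in center crops."""
    img = img.convert("RGB")
    w, h = img.size
    acc = np.zeros((h, w, 3), dtype=np.float64)
    for z in zooms:
        cw, ch = int(w / z), int(h / z)
        left, top = (w - cw) // 2, (h - ch) // 2
        crop = img.crop((left, top, left + cw, top + ch)).resize((w, h))
        acc += np.asarray(crop, dtype=np.float64)
    return Image.fromarray((acc / len(zooms)).astype(np.uint8))
```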
+
+ comment: Accepted by Journal of Data-centric Machine Learning Research (DMLR) + 2024 +
+
+
+
+
+ + ♻ ☆ Soft Mixture Denoising: Beyond the Expressive Bottleneck of Diffusion + Models ICLR-2024 + + +
+ Because diffusion models have shown impressive performance in a number of tasks, such as image synthesis, there is a trend in recent works to prove (under certain assumptions) that these models have strong approximation capabilities. In this paper, we show that current diffusion models actually have an expressive bottleneck in backward denoising and that some assumptions made by existing theoretical guarantees are too strong. Based on this finding, we prove that diffusion models have unbounded errors in both local and global denoising. In light of our theoretical studies, we introduce soft mixture denoising (SMD), an expressive and efficient model for backward denoising. SMD not only permits diffusion models to approximate any Gaussian mixture distribution well in theory, but is also simple and efficient to implement. Our experiments on multiple image datasets show that SMD significantly improves different types of diffusion models (e.g., DDPM), especially when few backward iterations are used.
+
+ comment: Accepted by ICLR-2024 +
+
+
+
+
+ + ♻ ☆ Uncovering local aggregated air quality index with smartphone captured + images leveraging efficient deep convolutional neural network + + +
+ The prevalence and mobility of smartphones make them a widely used tool for environmental health research. However, their potential for determining the aggregated air quality index (AQI) based on PM2.5 concentration at specific locations remains largely unexplored in the existing literature. In this paper, we thoroughly examine the challenges associated with predicting location-specific PM2.5 concentration using images taken with smartphone cameras. The focus of our study is Dhaka, the capital of Bangladesh, due to its significant air pollution levels and the large population exposed to it. Our research involves the development of a Deep Convolutional Neural Network (DCNN), which we train using over a thousand outdoor images that we captured and annotated. These photos are taken at various locations in Dhaka, and their labels are based on PM2.5 concentration data obtained from the local US consulate, calculated using the NowCast algorithm. Through supervised learning, our model establishes a correlation index during training, enhancing its ability to function as a Picture-based Predictor of PM2.5 Concentration (PPPC). This enables the algorithm to calculate an equivalent daily averaged AQI from a smartphone image. Unlike popular, overly parameterized models, our model is resource-efficient because it uses fewer parameters. Furthermore, test results indicate that our model outperforms popular models like ViT and INN, as well as popular CNN-based models such as VGG19, ResNet50, and MobileNetV2, in predicting location-specific PM2.5 concentration. Our dataset is the first publicly available collection that includes atmospheric images and corresponding PM2.5 measurements from Dhaka. Our code and dataset are available at https://github.com/lepotatoguy/aqi.
+
+ comment: 18 pages, 7 figures, published to Nature Scientific Reports +
+
+
+
+
+ + ♻ ☆ Learn to Categorize or Categorize to Learn? Self-Coding for Generalized + Category Discovery NeurIPS 2023 + + +
+ In the quest for unveiling novel categories at test time, we confront the +inherent limitations of traditional supervised recognition models that are +restricted by a predefined category set. While strides have been made in the +realms of self-supervised and open-world learning towards test-time category +discovery, a crucial yet often overlooked question persists: what exactly +delineates a category? In this paper, we conceptualize a category through the +lens of optimization, viewing it as an optimal solution to a well-defined +problem. Harnessing this unique conceptualization, we propose a novel, +efficient and self-supervised method capable of discovering previously unknown +categories at test time. A salient feature of our approach is the assignment of +minimum length category codes to individual data instances, which encapsulates +the implicit category hierarchy prevalent in real-world datasets. This +mechanism affords us enhanced control over category granularity, thereby +equipping our model to handle fine-grained categories adeptly. Experimental +evaluations, bolstered by state-of-the-art benchmark comparisons, testify to +the efficacy of our solution in managing unknown categories at test time. +Furthermore, we fortify our proposition with a theoretical foundation, +providing proof of its optimality. Our code is available at +https://github.com/SarahRastegar/InfoSieve. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Hyperbolic Image-Text Representations ICML 2023 + + +
+ Visual and linguistic concepts naturally organize themselves in a hierarchy, +where a textual concept "dog" entails all images that contain dogs. Despite +being intuitive, current large-scale vision and language models such as CLIP do +not explicitly capture such hierarchy. We propose MERU, a contrastive model +that yields hyperbolic representations of images and text. Hyperbolic spaces +have suitable geometric properties to embed tree-like data, so MERU can better +capture the underlying hierarchy in image-text datasets. Our results show that +MERU learns a highly interpretable and structured representation space while +being competitive with CLIP's performance on standard multi-modal tasks like +image classification and image-text retrieval. Our code and models are +available at https://www.github.com/facebookresearch/meru + +
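A common way to realize such hyperbolic embeddings is the Lorentz (hyperboloid) model: Euclidean vectors are lifted onto the hyperboloid with the exponential map at the origin and compared with the geodesic distance. The sketch below illustrates this generic construction and is not MERU's exact parameterization:

```python
import torch

def exp_map_origin(v, c=1.0):
    """Lift tangent vectors v of shape (..., d) at the hyperboloid origin to
    points of shape (..., d + 1) on the Lorentz model with curvature -c."""
    sqrt_c = c ** 0.5
    norm = v.norm(dim=-1, keepdim=True).clamp_min(1e-8)
    x_time = torch.cosh(sqrt_c * norm) / sqrt_c
    x_space = torch.sinh(sqrt_c * norm) * v / (sqrt_c * norm)
    return torch.cat([x_time, x_space], dim=-1)

def lorentz_distance(x, y, c=1.0):
    """Geodesic distance d(x, y) = arccosh(-c <x, y>_L) / sqrt(c), where
    <x, y>_L = -x0 * y0 + <x_rest, y_rest> is the Lorentzian inner product."""
    inner = -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(dim=-1)
    return torch.acosh((-c * inner).clamp_min(1.0 + 1e-7)) / (c ** 0.5)
```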
+
+ comment: ICML 2023 (v3: Add link to code in abstract) +
+
+
+
+
+ + ♻ ☆ UMG-CLIP: A Unified Multi-Granularity Vision Generalist for Open-World + Understanding + + +
+ Vision-language foundation models, represented by Contrastive language-image +pre-training (CLIP), have gained increasing attention for jointly understanding +both vision and textual tasks. However, existing approaches primarily focus on +training models to match global image representations with textual +descriptions, thereby overlooking the critical alignment between local regions +and corresponding text tokens. This paper extends CLIP with multi-granularity +alignment. Notably, we deliberately construct a new dataset comprising pseudo +annotations at various levels of granularities, encompassing image-level, +region-level, and pixel-level captions/tags. Accordingly, we develop a unified +multi-granularity learning framework, named UMG-CLIP, that simultaneously +empowers the model with versatile perception abilities across different levels +of detail. Equipped with parameter efficient tuning, UMG-CLIP surpasses current +widely used CLIP models and achieves state-of-the-art performance on diverse +image understanding benchmarks, including open-world recognition, retrieval, +semantic segmentation, and panoptic segmentation tasks. We hope UMG-CLIP can +serve as a valuable option for advancing vision-language foundation models. + +
+
+ comment: The paper is undergoing internal legal review and will be resubmitted + once it passes the review +
+
+
+
+
+ + ♻ ☆ FactCHD: Benchmarking Fact-Conflicting Hallucination Detection + + +
+ Despite their impressive generative capabilities, LLMs are hindered by +fact-conflicting hallucinations in real-world applications. The accurate +identification of hallucinations in texts generated by LLMs, especially in +complex inferential scenarios, is a relatively unexplored area. To address this +gap, we present FactCHD, a dedicated benchmark designed for the detection of +fact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset +that spans various factuality patterns, including vanilla, multi-hop, +comparison, and set operation. A distinctive element of FactCHD is its +integration of fact-based evidence chains, significantly enhancing the depth of +evaluating the detectors' explanations. Experiments on different LLMs expose +the shortcomings of current approaches in detecting factual errors accurately. +Furthermore, we introduce Truth-Triangulator that synthesizes reflective +considerations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming +to yield more credible detection through the amalgamation of predictive results +and evidence. The benchmark dataset is available at +https://github.com/zjunlp/FactCHD. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Hierarchical Masked 3D Diffusion Model for Video Outpainting ACM MM 2023 + + +
+ Video outpainting aims to adequately complete missing areas at the edges of +video frames. Compared to image outpainting, it presents an additional +challenge as the model should maintain the temporal consistency of the filled +area. In this paper, we introduce a masked 3D diffusion model for video +outpainting. We use the technique of mask modeling to train the 3D diffusion +model. This allows us to use multiple guide frames to connect the results of +multiple video clip inferences, thus ensuring temporal consistency and reducing +jitter between adjacent frames. Meanwhile, we extract the global frames of the +video as prompts and guide the model to obtain information other than the +current video clip using cross-attention. We also introduce a hybrid +coarse-to-fine inference pipeline to alleviate the artifact accumulation +problem. The existing coarse-to-fine pipeline only uses the infilling strategy, +which brings degradation because the time interval of the sparse frames is too +large. Our pipeline benefits from bidirectional learning of the mask modeling +and thus can employ a hybrid strategy of infilling and interpolation when +generating sparse frames. Experiments show that our method achieves +state-of-the-art results in video outpainting tasks. More results and codes are +provided at our https://fanfanda.github.io/M3DDM/. + +
+
+ comment: Accepted to ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ GIVT: Generative Infinite-Vocabulary Transformers + + +
+ We introduce generative infinite-vocabulary transformers (GIVT) which +generate vector sequences with real-valued entries, instead of discrete tokens +from a finite vocabulary. To this end, we propose two surprisingly simple +modifications to decoder-only transformers: 1) at the input, we replace the +finite-vocabulary lookup table with a linear projection of the input vectors; +and 2) at the output, we replace the logits prediction (usually mapped to a +categorical distribution) with the parameters of a multivariate Gaussian +mixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT, +where transformers are used to model the discrete latent sequences of a VQ-VAE, +we use GIVT to model the unquantized real-valued latent sequences of a VAE. +When applying GIVT to class-conditional image generation with iterative masked +modeling, we show competitive results with MaskGIT, while our approach +outperforms both VQ-GAN and MaskGIT when using it for causal modeling. Finally, +we obtain competitive results outside of image generation when applying our +approach to panoptic segmentation and depth estimation with a VAE-based variant +of the UViM framework. + +
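The second modification, replacing the logits head with the parameters of a Gaussian mixture, can be sketched with torch.distributions as below; the hidden size, latent dimension, and number of components are illustrative assumptions:

```python
import torch
import torch.nn as nn
import torch.distributions as D

class GMMHead(nn.Module):
    """Predicts a k-component diagonal Gaussian mixture over d-dimensional
    real-valued latents, replacing the usual categorical logits head."""
    def __init__(self, hidden=768, d=16, k=16):
        super().__init__()
        self.d, self.k = d, k
        self.proj = nn.Linear(hidden, k + 2 * k * d)  # mixture logits, means, log-scales

    def forward(self, h):
        logits, rest = self.proj(h).split([self.k, 2 * self.k * self.d], dim=-1)
        mean, log_scale = rest.reshape(*h.shape[:-1], self.k, 2 * self.d).chunk(2, dim=-1)
        mixture = D.Categorical(logits=logits)
        components = D.Independent(D.Normal(mean, log_scale.exp()), 1)
        return D.MixtureSameFamily(mixture, components)

# training minimises the negative log-likelihood of the next latent vector:
#   loss = -head(hidden_states).log_prob(target_latents).mean()
```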
+
+ comment: v2: add related NLP work, loss details +
+
+
+
+
+ + ♻ ☆ FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image + Segmentation against Heterogeneous Annotation Noise AAAI'24 + + +
+ Federated learning (FL) has emerged as a promising paradigm for training +segmentation models on decentralized medical data, owing to its +privacy-preserving property. However, existing research overlooks the prevalent +annotation noise encountered in real-world medical datasets, which limits the +performance ceilings of FL. In this paper, we, for the first time, identify and +tackle this problem. For problem formulation, we propose a contour evolution +for modeling non-independent and identically distributed (Non-IID) noise across +pixels within each client and then extend it to the case of multi-source data +to form a heterogeneous noise model (i.e., Non-IID annotation noise across +clients). For robust learning from annotations with such two-level Non-IID +noise, we emphasize the importance of data quality in model aggregation, +allowing high-quality clients to have a greater impact on FL. To achieve this, +we propose Federated learning with Annotation quAlity-aware AggregatIon, named +FedA3I, by introducing a quality factor based on client-wise noise estimation. +Specifically, noise estimation at each client is accomplished through the +Gaussian mixture model and then incorporated into model aggregation in a +layer-wise manner to up-weight high-quality clients. Extensive experiments on +two real-world medical image segmentation datasets demonstrate the superior +performance of FedA$^3$I against the state-of-the-art approaches in dealing +with cross-client annotation noise. The code is available at +https://github.com/wnn2000/FedAAAI. + +
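A simplified sketch of quality-aware aggregation is shown below: client models are averaged with weights proportional to an estimated quality factor times the client's data size. The layer-wise weighting and the Gaussian-mixture-based noise estimation described in the abstract are not reproduced, and all names are illustrative:

```python
from collections import OrderedDict

def quality_weighted_aggregate(client_states, quality, sizes):
    """Average client state_dicts (of torch tensors) with weights proportional
    to (estimated annotation quality) x (client data size)."""
    raw = [q * n for q, n in zip(quality, sizes)]
    total = sum(raw)
    weights = [r / total for r in raw]
    agg = OrderedDict()
    for key in client_states[0]:
        agg[key] = sum(w * sd[key].float() for w, sd in zip(weights, client_states))
    return agg
```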
+
+ comment: Accepted at AAAI'24 +
+
+
+
+
+ + ♻ ☆ RTFS-Net: Recurrent time-frequency modelling for efficient audio-visual + speech separation ICLR 2024 + + +
+ Audio-visual speech separation methods aim to integrate different modalities +to generate high-quality separated speech, thereby enhancing the performance of +downstream tasks such as speech recognition. Most existing state-of-the-art +(SOTA) models operate in the time domain. However, their overly simplistic +approach to modeling acoustic features often necessitates larger and more +computationally intensive models in order to achieve SOTA performance. In this +paper, we present a novel time-frequency domain audio-visual speech separation +method: Recurrent Time-Frequency Separation Network (RTFS-Net), which applies +its algorithms on the complex time-frequency bins yielded by the Short-Time +Fourier Transform. We model and capture the time and frequency dimensions of +the audio independently using a multi-layered RNN along each dimension. +Furthermore, we introduce a unique attention-based fusion technique for the +efficient integration of audio and visual information, and a new mask +separation approach that takes advantage of the intrinsic spectral nature of +the acoustic features for a clearer separation. RTFS-Net outperforms the +previous SOTA method using only 10% of the parameters and 18% of the MACs. This +is the first time-frequency domain audio-visual speech separation method to +outperform all contemporary time-domain counterparts. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ HaGRID - HAnd Gesture Recognition Image Dataset + + +
+ This paper introduces an enormous dataset, HaGRID (HAnd Gesture Recognition Image Dataset), for building hand gesture recognition (HGR) systems focused on interaction with devices in order to manage them. For this reason, all 18 chosen gestures are endowed with a semiotic function and can be interpreted as a specific action. Although the gestures are static, they were chosen specifically so that several dynamic gestures can be designed from them. This allows the trained model to recognize not only static gestures such as "like" and "stop" but also dynamic gestures such as "swipes" and "drag and drop". HaGRID contains 554,800 images and bounding box annotations with gesture labels to support both hand detection and gesture classification tasks. The low variability in context and subjects of other datasets motivated the creation of a dataset without such limitations. Using crowdsourcing platforms allowed us to collect samples recorded by 37,583 subjects in at least as many scenes, with subject-to-camera distances from 0.5 to 4 meters under various natural lighting conditions. The influence of these diversity characteristics was assessed in ablation study experiments. We also demonstrate that HaGRID can be used for pretraining models on HGR tasks. HaGRID and the pretrained models are publicly available.
+
+ comment: 12 pages, 5 figures, open-source dataset for computer vision +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Baseline for Imbalanced Semi-Supervised + Learning + + +
+ Semi-supervised learning (SSL) has shown great promise in leveraging +unlabeled data to improve model performance. While standard SSL assumes uniform +data distribution, we consider a more realistic and challenging setting called +imbalanced SSL, where imbalanced class distributions occur in both labeled and +unlabeled data. Although there are existing endeavors to tackle this challenge, +their performance degenerates when facing severe imbalance since they can not +reduce the class imbalance sufficiently and effectively. In this paper, we +study a simple yet overlooked baseline -- SimiS -- which tackles data imbalance +by simply supplementing labeled data with pseudo-labels, according to the +difference in class distribution from the most frequent class. Such a simple +baseline turns out to be highly effective in reducing class imbalance. It +outperforms existing methods by a significant margin, e.g., 12.8%, 13.6%, and +16.7% over previous SOTA on CIFAR100-LT, FOOD101-LT, and ImageNet127 +respectively. The reduced imbalance results in faster convergence and better +pseudo-label accuracy of SimiS. The simplicity of our method also makes it +possible to be combined with other re-balancing techniques to improve the +performance further. Moreover, our method shows great robustness to a wide +range of data distributions, which holds enormous potential in practice. Code +will be publicly available. + +
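The core supplementation step can be sketched as follows: compute how far each class count falls short of the most frequent class, then fill the deficit with the most confident pseudo-labeled samples. Variable names are illustrative, not the authors' code:

```python
import numpy as np

def select_pseudo_labels(labeled_counts, pseudo_labels, confidences):
    """Return indices of unlabeled samples whose pseudo-labels fill the gap
    between each class count and the count of the most frequent class."""
    counts = np.asarray(labeled_counts)
    deficits = counts.max() - counts                 # per-class shortfall
    order = np.argsort(-np.asarray(confidences))     # most confident first
    chosen = []
    for i in order:
        c = pseudo_labels[i]
        if deficits[c] > 0:
            chosen.append(int(i))
            deficits[c] -= 1
    return chosen
```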
+
+ comment: Issues in the paper, will re-open later +
+
+
+
+
+ + ♻ ☆ Curvature-Balanced Feature Manifold Learning for Long-Tailed + Classification CVPR 2023 + + +
+ To address the challenges of long-tailed classification, researchers have +proposed several approaches to reduce model bias, most of which assume that +classes with few samples are weak classes. However, recent studies have shown +that tail classes are not always hard to learn, and model bias has been +observed on sample-balanced datasets, suggesting the existence of other factors +that affect model bias. In this work, we systematically propose a series of +geometric measurements for perceptual manifolds in deep neural networks, and +then explore the effect of the geometric characteristics of perceptual +manifolds on classification difficulty and how learning shapes the geometric +characteristics of perceptual manifolds. An unanticipated finding is that the +correlation between the class accuracy and the separation degree of perceptual +manifolds gradually decreases during training, while the negative correlation +with the curvature gradually increases, implying that curvature imbalance leads +to model bias. Therefore, we propose curvature regularization to facilitate the +model to learn curvature-balanced and flatter perceptual manifolds. Evaluations +on multiple long-tailed and non-long-tailed datasets show the excellent +performance and exciting generality of our approach, especially in achieving +significant performance improvements based on current state-of-the-art +techniques. Our work opens up a geometric analysis perspective on model bias +and reminds researchers to pay attention to model bias on non-long-tailed and +even sample-balanced datasets. The code and model will be made public. + +
+
+ comment: 20 pages, Accepted by CVPR 2023
+
+
+
+
+ + ♻ ☆ Information Recovery-Driven Deep Incomplete Multiview Clustering Network + + +
+ Incomplete multi-view clustering is a hot and emerging topic. It is well known that unavoidable data incompleteness greatly weakens the effective information of multi-view data. To date, existing incomplete multi-view clustering methods usually bypass unavailable views according to prior missing information, which is considered a second-best scheme based on evasion. Other methods that attempt to recover missing information are mostly applicable only to specific two-view datasets. To handle these problems, in this paper, we propose an information recovery-driven deep incomplete multi-view clustering network, termed RecFormer. Concretely, a two-stage autoencoder network with a self-attention structure is built to synchronously extract high-level semantic representations of multiple views and recover the missing data. Besides, we develop a recurrent graph reconstruction mechanism that cleverly leverages the restored views to promote representation learning and further data reconstruction. Visualizations of the recovery results are given, and extensive experimental results confirm that our RecFormer has obvious advantages over other top methods.
+
+ comment: Accepted by TNNLS 2023. Please contact me if you have any questions: + liucl1996@163.com. The code is available at: + https://github.com/justsmart/RecFormer +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Decoupling Contrastive Learning for Skeleton-based + Human Action Recognition + + +
+ Skeleton-based action recognition is a central task in human-computer +interaction. However, most previous methods suffer from two issues: (i) +semantic ambiguity arising from spatial-temporal information mixture; and (ii) +overlooking the explicit exploitation of the latent data distributions (i.e., +the intra-class variations and inter-class relations), thereby leading to +sub-optimum solutions of the skeleton encoders. To mitigate this, we propose a +spatial-temporal decoupling contrastive learning (STD-CL) framework to obtain +discriminative and semantically distinct representations from the sequences, +which can be incorporated into various previous skeleton encoders and can be +removed when testing. Specifically, we decouple the global features into +spatial-specific and temporal-specific features to reduce the spatial-temporal +coupling of features. Furthermore, to explicitly exploit the latent data +distributions, we employ the attentive features to contrastive learning, which +models the cross-sequence semantic relations by pulling together the features +from the positive pairs and pushing away the negative pairs. Extensive +experiments show that STD-CL with four various skeleton encoders (HCN, 2S-AGCN, +CTR-GCN, and Hyperformer) achieves solid improvements on NTU60, NTU120, and +NW-UCLA benchmarks. The code will be released soon. + +
+
+
+
+
+ + ♻ ☆ AMSP-UOD: When Vortex Convolution and Stochastic Perturbation Meet + Underwater Object Detection + + +
+ In this paper, we present a novel Amplitude-Modulated Stochastic Perturbation +and Vortex Convolutional Network, AMSP-UOD, designed for underwater object +detection. AMSP-UOD specifically addresses the impact of non-ideal imaging +factors on detection accuracy in complex underwater environments. To mitigate +the influence of noise on object detection performance, we propose AMSP Vortex +Convolution (AMSP-VConv) to disrupt the noise distribution, enhance feature +extraction capabilities, effectively reduce parameters, and improve network +robustness. We design the Feature Association Decoupling Cross Stage Partial +(FAD-CSP) module, which strengthens the association of long and short range +features, improving the network performance in complex underwater environments. +Additionally, our sophisticated post-processing method, based on Non-Maximum +Suppression (NMS) with aspect-ratio similarity thresholds, optimizes detection +in dense scenes, such as waterweed and schools of fish, improving object +detection accuracy. Extensive experiments on the URPC and RUOD datasets +demonstrate that our method outperforms existing state-of-the-art methods in +terms of accuracy and noise immunity. AMSP-UOD proposes an innovative solution +with the potential for real-world applications. Our code is available at +https://github.com/zhoujingchun03/AMSP-UOD. + +
+
+
+
+
+ + ♻ ☆ Semantic-Guided Generative Image Augmentation Method with Diffusion + Models for Image Classification AAAI 2024 + + +
+ Existing image augmentation methods consist of two categories: +perturbation-based methods and generative methods. Perturbation-based methods +apply pre-defined perturbations to augment an original image, but only locally +vary the image, thus lacking image diversity. In contrast, generative methods +bring more image diversity in the augmented images but may not preserve +semantic consistency, thus incorrectly changing the essential semantics of the +original image. To balance image diversity and semantic consistency in +augmented images, we propose SGID, a Semantic-guided Generative Image +augmentation method with Diffusion models for image classification. +Specifically, SGID employs diffusion models to generate augmented images with +good image diversity. More importantly, SGID takes image labels and captions as +guidance to maintain semantic consistency between the augmented and original +images. Experimental results show that SGID outperforms the best augmentation +baseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and +0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image +augmentation baselines and further improves the overall performance. We +demonstrate the semantic consistency and image diversity of SGID through +quantitative human and automated evaluations, as well as qualitative case +studies. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Synergistic Multiscale Detail Refinement via Intrinsic Supervision for + Underwater Image Enhancement + + +
+ Visually restoring underwater scenes primarily involves mitigating interference from underwater media. Existing methods ignore the inherent scale-related characteristics of underwater scenes. Therefore, we present synergistic multi-scale detail refinement via intrinsic supervision (SMDR-IS) for enhancing underwater scene details, which comprises multiple stages. The low-degradation stage, derived from the original images, furnishes the original stage with multi-scale details, achieved through feature propagation using the Adaptive Selective Intrinsic Supervised Feature (ASISF) module. By using intrinsic supervision, the ASISF module can precisely control and guide feature transmission across multi-degradation stages, enhancing multi-scale detail refinement and minimizing the interference from irrelevant information in the low-degradation stage. In the multi-degradation encoder-decoder framework of SMDR-IS, we introduce the Bifocal Intrinsic-Context Attention Module (BICA). Based on the intrinsic supervision principles, BICA efficiently exploits multi-scale scene information in images. BICA directs higher-resolution spaces by tapping into the insights of lower-resolution ones, underscoring the pivotal role of spatial contextual relationships in underwater image restoration. Throughout training, the inclusion of a multi-degradation loss function enhances the network, allowing it to adeptly extract information across diverse scales. When benchmarked against state-of-the-art methods, SMDR-IS consistently showcases superior performance. The code is publicly available at: https://github.com/zhoujingchun03/SMDR-IS. +
+
+
+
+
+ + ♻ ☆ Hierarchical Fashion Design with Multi-stage Diffusion Models + + +
+ Cross-modal fashion synthesis and editing offer intelligent support to fashion designers by enabling the automatic generation and local modification of design drafts. While current diffusion models demonstrate commendable stability and controllability in image synthesis, they still face significant challenges in generating fashion designs from abstract design elements and performing fine-grained editing. Abstract sensory expressions, e.g., office, business, and party, form the high-level design concepts, while measurable aspects like sleeve length, collar type, and pant length are considered the low-level attributes of clothing. Controlling and editing fashion images using lengthy text descriptions poses a difficulty. In this paper, we propose HieraFashDiff, a novel fashion design method using a shared multi-stage diffusion model encompassing high-level design concepts and low-level clothing attributes in a hierarchical structure. Specifically, we categorize the input text into different levels and feed them into the diffusion model at different time steps according to the criteria of professional clothing designers. HieraFashDiff allows designers to incrementally add low-level attributes after high-level prompts for interactive editing. In addition, we design a differentiable loss function in the sampling process with a mask to preserve non-edited areas. Comprehensive experiments performed on our newly constructed hierarchical fashion dataset demonstrate that our proposed method outperforms other state-of-the-art competitors. +
+
+
+
+
+ + ♻ ☆ Decoupled Contrastive Multi-View Clustering with High-Order Random Walks AAAI 2024 + + +
+ Recently, some robust contrastive multi-view clustering (MvC) methods have been proposed, which construct data pairs from neighborhoods to alleviate the false negative issue, i.e., some intra-cluster samples are wrongly treated as negative pairs. Although promising performance has been achieved by these methods, the false negative issue is still far from being addressed, and the false positive issue emerges because all in- and out-of-neighborhood samples are simply treated as positive and negative, respectively. To address these issues, we propose a novel robust method, dubbed decoupled contrastive multi-view clustering with high-order random walks (DIVIDE). In brief, DIVIDE leverages random walks to progressively identify data pairs in a global instead of local manner. As a result, DIVIDE can identify in-neighborhood negatives and out-of-neighborhood positives. Moreover, DIVIDE embraces a novel MvC architecture to perform inter- and intra-view contrastive learning in different embedding spaces, thus boosting clustering performance and embracing robustness against missing views. To verify the efficacy of DIVIDE, we carry out extensive experiments on four benchmark datasets, comparing it with nine state-of-the-art MvC methods in both complete and incomplete MvC settings. +
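A toy sketch of the high-order random-walk idea: powers of a row-normalized transition matrix propagate affinity beyond the immediate neighborhood, so pairs can be scored globally before being labeled positive or negative. The temperature, number of steps, and thresholding rule below are assumptions for illustration, not DIVIDE's actual procedure.

```python
import numpy as np

def high_order_affinity(features, steps=3, temperature=0.1):
    """features: (N, D) L2-normalized embeddings.
    Returns an (N, N) matrix of walk probabilities after `steps` hops."""
    sim = features @ features.T                   # cosine similarity
    np.fill_diagonal(sim, -np.inf)                # forbid self-transitions
    trans = np.exp(sim / temperature)
    trans /= trans.sum(axis=1, keepdims=True)     # row-stochastic transition matrix
    return np.linalg.matrix_power(trans, steps)   # high-order transition probabilities

feats = np.random.randn(32, 16)
feats /= np.linalg.norm(feats, axis=1, keepdims=True)
aff = high_order_affinity(feats, steps=3)
positives = aff > np.quantile(aff, 0.9)           # assumed global thresholding rule
```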
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ TopCoW: Benchmarking Topology-Aware Anatomical Segmentation of the + Circle of Willis (CoW) for CTA and MRA MICCAI + + +
+ The Circle of Willis (CoW) is an important network of arteries connecting +major circulations of the brain. Its vascular architecture is believed to +affect the risk, severity, and clinical outcome of serious neuro-vascular +diseases. However, characterizing the highly variable CoW anatomy is still a +manual and time-consuming expert task. The CoW is usually imaged by two +angiographic imaging modalities, magnetic resonance angiography (MRA) and +computed tomography angiography (CTA), but there exist limited public datasets +with annotations on CoW anatomy, especially for CTA. Therefore we organized the +TopCoW Challenge in 2023 with the release of an annotated CoW dataset. The +TopCoW dataset was the first public dataset with voxel-level annotations for +thirteen possible CoW vessel components, enabled by virtual-reality (VR) +technology. It was also the first large dataset with paired MRA and CTA from +the same patients. TopCoW challenge formalized the CoW characterization problem +as a multiclass anatomical segmentation task with an emphasis on topological +metrics. We invited submissions worldwide for the CoW segmentation task, which +attracted over 140 registered participants from four continents. The top +performing teams managed to segment many CoW components to Dice scores around +90%, but with lower scores for communicating arteries and rare variants. There +were also topological mistakes for predictions with high Dice scores. +Additional topological analysis revealed further areas for improvement in +detecting certain CoW components and matching CoW variant topology accurately. +TopCoW represented a first attempt at benchmarking the CoW anatomical +segmentation task for MRA and CTA, both morphologically and topologically. + +
+
+ comment: 23 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW + 2023 Challenge +
+
+
+
+
+ + ♻ ☆ Quantity-Aware Coarse-to-Fine Correspondence for Image-to-Point Cloud + Registration + + +
+ Image-to-point cloud registration aims to determine the relative camera pose +between an RGB image and a reference point cloud, serving as a general solution +for locating 3D objects from 2D observations. Matching individual points with +pixels can be inherently ambiguous due to modality gaps. To address this +challenge, we propose a framework to capture quantity-aware correspondences +between local point sets and pixel patches and refine the results at both the +point and pixel levels. This framework aligns the high-level semantics of point +sets and pixel patches to improve the matching accuracy. On a coarse scale, the +set-to-patch correspondence is expected to be influenced by the quantity of 3D +points. To achieve this, a novel supervision strategy is proposed to adaptively +quantify the degrees of correlation as continuous values. On a finer scale, +point-to-pixel correspondences are refined from a smaller search space through +a well-designed scheme, which incorporates both resampling and quantity-aware +priors. Particularly, a confidence sorting strategy is proposed to +proportionally select better correspondences at the final stage. Leveraging the +advantages of high-quality correspondences, the problem is successfully +resolved using an efficient Perspective-n-Point solver within the framework of +random sample consensus (RANSAC). Extensive experiments on the KITTI Odometry +and NuScenes datasets demonstrate the superiority of our method over the +state-of-the-art methods. + +
+
+
+
+
+ + ♻ ☆ SUCRe: Leveraging Scene Structure for Underwater Color Restoration + + +
+ Underwater images are altered by the physical characteristics of the medium +through which light rays pass before reaching the optical sensor. Scattering +and wavelength-dependent absorption significantly modify the captured colors +depending on the distance of observed elements to the image plane. In this +paper, we aim to recover an image of the scene as if the water had no effect on +light propagation. We introduce SUCRe, a novel method that exploits the scene's +3D structure for underwater color restoration. By following points in multiple +images and tracking their intensities at different distances to the sensor, we +constrain the optimization of the parameters in an underwater image formation +model and retrieve unattenuated pixel intensities. We conduct extensive +quantitative and qualitative analyses of our approach in a variety of scenarios +ranging from natural light to deep-sea environments using three underwater +datasets acquired from real-world scenarios and one synthetic dataset. We also +compare the performance of the proposed approach with that of a wide range of +existing state-of-the-art methods. The results demonstrate a consistent benefit +of exploiting multiple views across a spectrum of objective metrics. Our code +is publicly available at https://github.com/clementinboittiaux/sucre. + +
+
+
+
+
+ + ♻ ☆ Weakly Supervised Semantic Segmentation for Driving Scenes AAAI 2024 + + +
+ State-of-the-art techniques in weakly-supervised semantic segmentation (WSSS) using image-level labels exhibit severe performance degradation on driving scene datasets such as Cityscapes. To address this challenge, we develop a new WSSS framework tailored to driving scene datasets. Based on extensive analysis of dataset characteristics, we employ Contrastive Language-Image Pre-training (CLIP) as our baseline to obtain pseudo-masks. However, CLIP introduces two key challenges: (1) pseudo-masks from CLIP fall short in representing small object classes, and (2) these masks contain notable noise. We propose solutions for each issue as follows. (1) We devise Global-Local View Training that seamlessly incorporates small-scale patches during model training, thereby enhancing the model's capability to handle small-sized yet critical objects in driving scenes (e.g., traffic lights). (2) We introduce Consistency-Aware Region Balancing (CARB), a novel technique that discerns reliable and noisy regions by evaluating the consistency between CLIP masks and segmentation predictions. It prioritizes reliable pixels over noisy pixels via adaptive loss weighting. Notably, the proposed method achieves 51.8% mIoU on the Cityscapes test dataset, showcasing its potential as a strong WSSS baseline on driving scene datasets. Experimental results on CamVid and WildDash2 demonstrate the effectiveness of our method across diverse datasets, even with small-scale datasets or visually challenging conditions. The code is available at https://github.com/k0u-id/CARB. +
+
+ comment: AAAI 2024 accepted. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Unsupervised Multiple Domain Translation through Controlled + Disentanglement in Variational Autoencoder + + +
+ Unsupervised Multiple Domain Translation is the task of transforming data from one domain to other domains without having paired data to train the systems. Typically, methods based on Generative Adversarial Networks (GANs) are used to address this task. However, our proposal relies exclusively on a modified version of a Variational Autoencoder. This modification consists of the use of two latent variables disentangled in a controlled way by design. One of these latent variables is constrained to depend exclusively on the domain, while the other must depend on the rest of the variability factors of the data. Additionally, the conditions imposed on the domain latent variable allow for better control and understanding of the latent space. We empirically demonstrate that our approach works on different vision datasets, improving on the performance of other well-known methods. Finally, we prove that, indeed, one of the latent variables stores all the information related to the domain while the other hardly contains any domain information. +
+
+
+
+
+ + ♻ ☆ MISS: A Generative Pretraining and Finetuning Approach for Med-VQA + + +
+ Medical visual question answering (VQA) is a challenging multimodal task, +where Vision-Language Pre-training (VLP) models can effectively improve the +generalization performance. However, most methods in the medical field treat +VQA as an answer classification task which is difficult to transfer to +practical application scenarios. Additionally, due to the privacy of medical +images and the expensive annotation process, large-scale medical image-text +pairs datasets for pretraining are severely lacking. In this paper, we propose +a large-scale MultI-task Self-Supervised learning based framework (MISS) for +medical VQA tasks. Unlike existing methods, we treat medical VQA as a +generative task. We unify the text encoder and multimodal encoder and align +image-text features through multi-task learning. Furthermore, we propose a +Transfer-and-Caption method that extends the feature space of single-modal +image datasets using large language models (LLMs), enabling those traditional +medical vision field task data to be applied to VLP. Experiments show that our +method achieves excellent results with fewer multimodal datasets and +demonstrates the advantages of generative VQA models. The code and model +weights will be released upon the paper's acceptance. + +
+
+
+
+
+ + ♻ ☆ Real-time Photorealistic Dynamic Scene Representation and Rendering with + 4D Gaussian Splatting ICLR 2024 + + +
+ Reconstructing dynamic 3D scenes from 2D images and generating diverse views +over time is challenging due to scene complexity and temporal dynamics. Despite +advancements in neural implicit models, limitations persist: (i) Inadequate +Scene Structure: Existing methods struggle to reveal the spatial and temporal +structure of dynamic scenes from directly learning the complex 6D plenoptic +function. (ii) Scaling Deformation Modeling: Explicitly modeling scene element +deformation becomes impractical for complex dynamics. To address these issues, +we consider the spacetime as an entirety and propose to approximate the +underlying spatio-temporal 4D volume of a dynamic scene by optimizing a +collection of 4D primitives, with explicit geometry and appearance modeling. +Learning to optimize the 4D primitives enables us to synthesize novel views at +any desired time with our tailored rendering routine. Our model is conceptually +simple, consisting of a 4D Gaussian parameterized by anisotropic ellipses that +can rotate arbitrarily in space and time, as well as view-dependent and +time-evolved appearance represented by the coefficient of 4D spherindrical +harmonics. This approach offers simplicity, flexibility for variable-length +video and end-to-end training, and efficient real-time rendering, making it +suitable for capturing complex dynamic scene motions. Experiments across +various benchmarks, including monocular and multi-view scenarios, demonstrate +our 4DGS model's superior visual quality and efficiency. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation for + Semi-Supervised Medical Image Segmentation + + +
+ While supervised learning has achieved remarkable success, obtaining large-scale labeled datasets in biomedical imaging is often impractical due to high costs and the time-consuming annotations required from radiologists. Semi-supervised learning emerges as an effective strategy to overcome this limitation by leveraging useful information from unlabeled datasets. In this paper, we present a novel semi-supervised learning method, Dual-Decoder Consistency via Pseudo-Labels Guided Data Augmentation (DCPA), for medical image segmentation. We devise a consistency regularization to promote consistent representations during the training process. Specifically, we use distinct decoders for the student and teacher networks while maintaining the same encoder. Moreover, to learn from unlabeled data, we create pseudo-labels generated by the teacher network and augment the training data with them. Both techniques contribute to enhancing the performance of the proposed method. The method is evaluated on three representative medical image segmentation datasets. Comprehensive comparisons with state-of-the-art semi-supervised medical image segmentation methods were conducted under typical scenarios, utilizing 10% and 20% labeled data, as well as in the extreme scenario of only 5% labeled data. The experimental results consistently demonstrate the superior performance of our method compared to other methods across the three semi-supervised settings. The source code is publicly available at https://github.com/BinYCn/DCPA.git. +
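A minimal sketch of the teacher-student mechanics described above: the teacher is an exponential moving average of the student and produces pseudo-labels on unlabeled scans that supervise the student. It omits the paper's dual decoders and pseudo-label-guided augmentation; the momentum and loss choices below are assumptions.

```python
import torch
import torch.nn.functional as F

@torch.no_grad()
def ema_update(teacher, student, momentum=0.99):
    # usual mean-teacher weight update; the momentum value is an assumption
    for t, s in zip(teacher.parameters(), student.parameters()):
        t.mul_(momentum).add_(s, alpha=1.0 - momentum)

def unlabeled_loss(student, teacher, unlabeled_batch):
    # teacher predicts pseudo-labels on unlabeled images; student is trained to match them
    with torch.no_grad():
        pseudo = teacher(unlabeled_batch).argmax(dim=1)   # (B, H, W) class map
    return F.cross_entropy(student(unlabeled_batch), pseudo)
```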
+
+
+
+
+ + ♻ ☆ Towards Lightweight Super-Resolution with Dual Regression Learning + + +
+ Deep neural networks have exhibited remarkable performance in image +super-resolution (SR) tasks by learning a mapping from low-resolution (LR) +images to high-resolution (HR) images. However, the SR problem is typically an +ill-posed problem and existing methods would come with several limitations. +First, the possible mapping space of SR can be extremely large since there may +exist many different HR images that can be super-resolved from the same LR +image. As a result, it is hard to directly learn a promising SR mapping from +such a large space. Second, it is often inevitable to develop very large models +with extremely high computational cost to yield promising SR performance. In +practice, one can use model compression techniques to obtain compact models by +reducing model redundancy. Nevertheless, it is hard for existing model +compression methods to accurately identify the redundant components due to the +extremely large SR mapping space. To alleviate the first challenge, we propose +a dual regression learning scheme to reduce the space of possible SR mappings. +Specifically, in addition to the mapping from LR to HR images, we learn an +additional dual regression mapping to estimate the downsampling kernel and +reconstruct LR images. In this way, the dual mapping acts as a constraint to +reduce the space of possible mappings. To address the second challenge, we +propose a dual regression compression (DRC) method to reduce model redundancy +in both layer-level and channel-level based on channel pruning. Specifically, +we first develop a channel number search method that minimizes the dual +regression loss to determine the redundancy of each layer. Given the searched +channel numbers, we further exploit the dual regression manner to evaluate the +importance of channels and prune the redundant ones. Extensive experiments show +the effectiveness of our method in obtaining accurate and efficient SR models. + +
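The dual regression constraint lends itself to a compact sketch: in addition to the primal LR-to-HR mapping, a dual network maps the super-resolved output back to LR, and its reconstruction error regularizes training, shrinking the space of admissible SR mappings. The network interfaces, L1 losses, and weighting factor are assumptions, not the paper's exact configuration.

```python
import torch
import torch.nn.functional as F

def dual_regression_loss(lr, hr, primal_net, dual_net, lam=0.1):
    """Primal LR->HR loss plus a dual HR->LR reconstruction constraint."""
    sr = primal_net(lr)                  # LR -> HR estimate
    lr_rec = dual_net(sr)                # HR estimate -> LR reconstruction
    primal_loss = F.l1_loss(sr, hr)
    dual_loss = F.l1_loss(lr_rec, lr)    # constrains the possible SR mappings
    return primal_loss + lam * dual_loss
```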
+
+ comment: Journal extension of DRN for lightweight super-resolution +
+
+
+
+
+ + ♻ ☆ UAE: Universal Anatomical Embedding on Multi-modality Medical Images + + +
+ Identifying specific anatomical structures (e.g., lesions or landmarks) in medical images plays a fundamental role in medical image analysis. Exemplar-based landmark detection methods are receiving increasing attention since they can detect arbitrary anatomical points at inference time while requiring no landmark annotations during training. They use self-supervised learning to acquire a discriminative embedding for each voxel within the image. These approaches can identify corresponding landmarks through nearest neighbor matching and have demonstrated promising results across various tasks. However, current methods still face challenges in: (1) differentiating voxels with similar appearance but different semantic meanings (e.g., two adjacent structures without clear borders); (2) matching voxels with similar semantics but markedly different appearance (e.g., the same vessel before and after contrast injection); and (3) cross-modality matching (e.g., CT-MRI landmark-based registration). To overcome these challenges, we propose universal anatomical embedding (UAE), a unified framework designed to learn appearance, semantic, and cross-modality anatomical embeddings. Specifically, UAE incorporates three key innovations: (1) semantic embedding learning with a prototypical contrastive loss; (2) a fixed-point-based matching strategy; and (3) an iterative approach for cross-modality embedding learning. We thoroughly evaluated UAE across intra- and inter-modality tasks, including one-shot landmark detection, lesion tracking on longitudinal CT scans, and CT-MRI affine/rigid registration with varying fields of view. Our results suggest that UAE outperforms state-of-the-art methods, offering a robust and versatile approach for landmark-based medical image analysis tasks. Code and trained models are available at: https://shorturl.at/bgsB3 +
+
+
+
+
+ + ♻ ☆ Panoptic Vision-Language Feature Fields + + +
+ Recently, methods have been proposed for 3D open-vocabulary semantic +segmentation. Such methods are able to segment scenes into arbitrary classes +based on text descriptions provided during runtime. In this paper, we propose +to the best of our knowledge the first algorithm for open-vocabulary panoptic +segmentation in 3D scenes. Our algorithm, Panoptic Vision-Language Feature +Fields (PVLFF), learns a semantic feature field of the scene by distilling +vision-language features from a pretrained 2D model, and jointly fits an +instance feature field through contrastive learning using 2D instance segments +on input frames. Despite not being trained on the target classes, our method +achieves panoptic segmentation performance similar to the state-of-the-art +closed-set 3D systems on the HyperSim, ScanNet and Replica dataset and +additionally outperforms current 3D open-vocabulary systems in terms of +semantic segmentation. We ablate the components of our method to demonstrate +the effectiveness of our model architecture. Our code will be available at +https://github.com/ethz-asl/pvlff. + +
+
+ comment: This work has been accepted by IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ Text-driven Talking Face Synthesis by Reprogramming Audio-driven Models ICASSP 2024 + + +
+ In this paper, we present a method for reprogramming pre-trained audio-driven +talking face synthesis models to operate in a text-driven manner. Consequently, +we can easily generate face videos that articulate the provided textual +sentences, eliminating the necessity of recording speech for each inference, as +required in the audio-driven model. To this end, we propose to embed the input +text into the learned audio latent space of the pre-trained audio-driven model, +while preserving the face synthesis capability of the original pre-trained +model. Specifically, we devise a Text-to-Audio Embedding Module (TAEM) which +maps a given text input into the audio latent space by modeling pronunciation +and duration characteristics. Furthermore, to consider the speaker +characteristics in audio while using text inputs, TAEM is designed to accept a +visual speaker embedding. The visual speaker embedding is derived from a single +target face image and enables improved mapping of input text to the learned +audio latent space by incorporating the speaker characteristics inherent in the +audio. The main advantages of the proposed framework are that 1) it can be +applied to diverse audio-driven talking face synthesis models and 2) we can +generate talking face videos with either text inputs or audio inputs with high +flexibility. + +
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ GSB: Group Superposition Binarization for Vision Transformer with + Limited Training Samples + + +
+ Vision Transformer (ViT) has performed remarkably in various computer vision tasks. Nonetheless, affected by its massive number of parameters, ViT usually suffers from serious overfitting problems with a relatively limited number of training samples. In addition, ViT generally demands heavy computing resources, which limits its deployment on resource-constrained devices. As a type of model-compression method, model binarization is potentially a good choice to solve the above problems. Compared with the full-precision one, a binarized model replaces complex tensor multiplication with simple bit-wise binary operations and represents full-precision model parameters and activations with only 1-bit ones, which potentially addresses the problems of model size and computational complexity, respectively. In this paper, we investigate a binarized ViT model. Empirically, we observe that the existing binarization technology designed for Convolutional Neural Networks (CNN) cannot migrate well to a ViT's binarization task. We also find that the decline in accuracy of the binary ViT model is mainly due to the information loss of the Attention module and the Value vector. Therefore, we propose a novel model binarization technique, called Group Superposition Binarization (GSB), to deal with these issues. Furthermore, in order to further improve the performance of the binarized model, we have investigated the gradient calculation procedure in the binarization process and derived more proper gradient calculation equations for GSB to reduce the influence of gradient mismatch. Then, the knowledge distillation technique is introduced to alleviate the performance degradation caused by model binarization. Analytically, model binarization can limit the parameter search space during parameter updates while training a model... +
+
+ comment: Accepted by Neural Networks +
+
+
+
+
+ + ♻ ☆ GPT-Prompt Controlled Diffusion for Weakly-Supervised Semantic + Segmentation + + +
+ Weakly supervised semantic segmentation (WSSS), aiming to train segmentation +models solely using image-level labels, has received significant attention. +Existing approaches mainly concentrate on creating high-quality pseudo labels +by utilizing existing images and their corresponding image-level labels. +However, the quality of pseudo labels degrades significantly when the size of +available dataset is limited. Thus, in this paper, we tackle this problem from +a different view by introducing a novel approach called GPT-Prompt Controlled +Diffusion (GPCD) for data augmentation. This approach enhances the current +labeled datasets by augmenting with a variety of images, achieved through +controlled diffusion guided by GPT prompts. In this process, the existing +images and image-level labels provide the necessary control information, where +GPT is employed to enrich the prompts, leading to the generation of diverse +backgrounds. Moreover, we integrate data source information as tokens into the +Vision Transformer (ViT) framework. These tokens are specifically designed to +improve the ability of downstream WSSS framework to recognize the origins of +augmented images. Our proposed GPCD approach clearly surpasses existing +state-of-the-art methods. This effect is more obvious when the amount of +available data is small, demonstrating the effectiveness of our method. + +
+
+
+
+
+ + ♻ ☆ Transcending the Limit of Local Window: Advanced Super-Resolution + Transformer with Adaptive Token Dictionary + + +
+ Single Image Super-Resolution is a classic computer vision problem that +involves estimating high-resolution (HR) images from low-resolution (LR) ones. +Although deep neural networks (DNNs), especially Transformers for +super-resolution, have seen significant advancements in recent years, +challenges still remain, particularly in limited receptive field caused by +window-based self-attention. To address these issues, we introduce a group of +auxiliary Adaptive Token Dictionary to SR Transformer and establish an ATD-SR +method. The introduced token dictionary could learn prior information from +training data and adapt the learned prior to specific testing image through an +adaptive refinement step. The refinement strategy could not only provide global +information to all input tokens but also group image tokens into categories. +Based on category partitions, we further propose a category-based +self-attention mechanism designed to leverage distant but similar tokens for +enhancing input features. The experimental results show that our method +achieves the best performance on various single image super-resolution +benchmarks. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ DKiS: Decay weight invertible image steganography with private key + + +
+ Image steganography, defined as the practice of concealing information within +another image, traditionally encounters security challenges when its methods +become publicly known or are under attack. To address this, a novel private +key-based image steganography technique has been introduced. This approach +ensures the security of the hidden information, as access requires a +corresponding private key, regardless of the public knowledge of the +steganography method. Experimental evidence has been presented, demonstrating +the effectiveness of our method and showcasing its real-world applicability. +Furthermore, a critical challenge in the invertible image steganography process +has been identified by us: the transfer of non-essential, or `garbage', +information from the secret to the host pipeline. To tackle this issue, the +decay weight has been introduced to control the information transfer, +effectively filtering out irrelevant data and enhancing the performance of +image steganography. The code for this technique is publicly accessible at +https://github.com/yanghangAI/DKiS, and a practical demonstration can be found +at http://yanghang.site/hidekey. + +
+
+
+
+
+ + ♻ ☆ Mean Teacher DETR with Masked Feature Alignment: A Robust Domain + Adaptive Detection Transformer Framework AAAI2024 + + +
+ Unsupervised domain adaptation object detection (UDAOD) research on the Detection Transformer (DETR) mainly focuses on feature alignment, and existing methods can be divided into two kinds, each of which has its unresolved issues. One-stage feature alignment methods can easily lead to performance fluctuation and training stagnation. Two-stage feature alignment methods based on the mean teacher comprise a pretraining stage followed by a self-training stage, each facing problems in obtaining a reliable pretrained model and achieving consistent performance gains. The methods mentioned above have not yet explored how to utilize a third related domain, such as a target-like domain, to assist adaptation. To address these issues, we propose a two-stage framework named MTM, i.e., Mean Teacher-DETR with Masked Feature Alignment. In the pretraining stage, we utilize labeled target-like images produced by image style transfer to avoid performance fluctuation. In the self-training stage, we leverage unlabeled target images via pseudo-labels based on the mean teacher and propose a module called Object Queries Knowledge Transfer (OQKT) to ensure consistent performance gains of the student model. Most importantly, we propose masked feature alignment methods, including Masked Domain Query-based Feature Alignment (MDQFA) and Masked Token-wise Feature Alignment (MTWFA), to alleviate domain shift in a more robust way, which not only prevent training stagnation and lead to a robust pretrained model in the pretraining stage, but also enhance the model's target performance in the self-training stage. Experiments on three challenging scenarios and a theoretical analysis verify the effectiveness of MTM. +
+
+ comment: AAAI2024 +
+
+
+
+
+ + ♻ ☆ Affective Video Content Analysis: Decade Review and New Perspectives + + +
+ Video content is rich in semantics and has the ability to evoke various +emotions in viewers. In recent years, with the rapid development of affective +computing and the explosive growth of visual data, affective video content +analysis (AVCA) as an essential branch of affective computing has become a +widely researched topic. In this study, we comprehensively review the +development of AVCA over the past decade, particularly focusing on the most +advanced methods adopted to address the three major challenges of video feature +extraction, expression subjectivity, and multimodal feature fusion. We first +introduce the widely used emotion representation models in AVCA and describe +commonly used datasets. We summarize and compare representative methods in the +following aspects: (1) unimodal AVCA models, including facial expression +recognition and posture emotion recognition; (2) multimodal AVCA models, +including feature fusion, decision fusion, and attention-based multimodal +models; (3) model performance evaluation standards. Finally, we discuss future +challenges and promising research directions, such as emotion recognition and +public opinion analysis, human-computer interaction, and emotional +intelligence. + +
+
+
+
+
+ + ♻ ☆ ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration + Measure ICLR 2023 + + +
+ Studies have shown that modern neural networks tend to be poorly calibrated due to over-confident predictions. Traditionally, post-processing methods have been used to calibrate the model after training. In recent years, various trainable calibration measures have been proposed that can be incorporated directly into the training process. However, these methods all involve internal hyperparameters, and the performance of these calibration objectives relies on tuning them, incurring more computational cost as neural networks and datasets grow larger. As such, we present Expected Squared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable calibration objective loss, where we view the calibration error from the perspective of the squared difference between the two expectations. With extensive experiments on several architectures (CNNs, Transformers) and datasets, we demonstrate that (1) incorporating ESD into the training improves model calibration in various batch size settings without the need for internal hyperparameter tuning, (2) ESD yields the best-calibrated results compared with previous approaches, and (3) ESD drastically reduces the computational cost required for calibration during training due to the absence of internal hyperparameters. The code is publicly accessible at https://github.com/hee-suk-yoon/ESD. +
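One simplified reading of a tuning-free calibration penalty built from "the squared difference between the two expectations" is sketched below: it penalizes the squared gap between batch-average confidence and batch-average correctness, with no internal hyperparameter. This is an illustrative stand-in under that assumed reading, not the paper's exact ESD objective.

```python
import torch
import torch.nn.functional as F

def expected_squared_difference(logits, labels):
    """Squared gap between expected confidence and expected correctness over a batch.
    A hedged, simplified stand-in for a trainable calibration penalty."""
    probs = F.softmax(logits, dim=-1)
    conf, pred = probs.max(dim=-1)
    correct = (pred == labels).float()
    return (conf.mean() - correct.mean()) ** 2

# typically added to the task loss, e.g. loss = ce_loss + expected_squared_difference(logits, labels)
```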
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ B-Cos Aligned Transformers Learn Human-Interpretable Features MICCAI 2023 + + +
+ Vision Transformers (ViTs) and Swin Transformers (Swin) are currently +state-of-the-art in computational pathology. However, domain experts are still +reluctant to use these models due to their lack of interpretability. This is +not surprising, as critical decisions need to be transparent and +understandable. The most common approach to understanding transformers is to +visualize their attention. However, attention maps of ViTs are often +fragmented, leading to unsatisfactory explanations. Here, we introduce a novel +architecture called the B-cos Vision Transformer (BvT) that is designed to be +more interpretable. It replaces all linear transformations with the B-cos +transform to promote weight-input alignment. In a blinded study, medical +experts clearly ranked BvTs above ViTs, suggesting that our network is better +at capturing biomedically relevant structures. This is also true for the B-cos +Swin Transformer (Bwin). Compared to the Swin Transformer, it even improves the +F1-score by up to 4.7% on two public datasets. + +
+
+ comment: Accepted at MICCAI 2023 (oral). Camera-ready available at + https://doi.org/10.1007/978-3-031-43993-3_50 +
+
+
+
+
+ + ♻ ☆ Latent Degradation Representation Constraint for Single Image Deraining ICASSP 2024 + + +
+ Since rain streaks show a variety of shapes and directions, learning the degradation representation is extremely challenging for single image deraining. Existing methods are mainly targeted at designing complicated modules to implicitly learn the latent degradation representation from coupled rainy images. In this way, it is hard to decouple the content-independent degradation representation due to the lack of explicit constraints, resulting in over- or under-enhancement problems. To tackle this issue, we propose a novel Latent Degradation Representation Constraint Network (LDRCNet) that consists of a Direction-Aware Encoder (DAEncoder), a UNet Deraining Network, and a Multi-Scale Interaction Block (MSIBlock). Specifically, the DAEncoder is proposed to adaptively extract the latent degradation representation by using deformable convolutions to exploit the direction consistency of rain streaks. Next, a constraint loss is introduced to explicitly constrain the degradation representation learning during training. Last, we propose an MSIBlock to fuse the learned degradation representation with the decoder features of the deraining network for adaptive information interaction, which enables the deraining network to remove various complicated rainy patterns and reconstruct image details. Experimental results on synthetic and real datasets demonstrate that our method achieves new state-of-the-art performance. +
+
+ comment: This paper is accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Process + + +
+ Neural radiance fields (NeRFs) have gained popularity across various +applications. However, they face challenges in the sparse view setting, lacking +sufficient constraints from volume rendering. Reconstructing and understanding +a 3D scene from sparse and unconstrained cameras is a long-standing problem in +classical computer vision with diverse applications. While recent works have +explored NeRFs in sparse, unconstrained view scenarios, their focus has been +primarily on enhancing reconstruction and novel view synthesis. Our approach +takes a broader perspective by posing the question: "from where has each point +been seen?" -- which gates how well we can understand and reconstruct it. In +other words, we aim to determine the origin or provenance of each 3D point and +its associated information under sparse, unconstrained views. We introduce +ProvNeRF, a model that enriches a traditional NeRF representation by +incorporating per-point provenance, modeling likely source locations for each +point. We achieve this by extending implicit maximum likelihood estimation +(IMLE) for stochastic processes. Notably, our method is compatible with any +pre-trained NeRF model and the associated training camera poses. We demonstrate +that modeling per-point provenance offers several advantages, including +uncertainty estimation, criteria-based view selection, and improved novel view +synthesis, compared to state-of-the-art methods. Please visit our project page +at https://provnerf.github.io + +
+
+
+
+
+ + ♻ ☆ NODI: Out-Of-Distribution Detection with Noise from Diffusion + + +
+ Out-of-distribution (OOD) detection is a crucial part of deploying machine learning models safely. It has been extensively studied, with a plethora of methods developed in the literature. This problem is typically tackled by computing an OOD score; however, previous methods compute the OOD scores with limited usage of the in-distribution dataset. For instance, the OOD scores are computed with information from only a small portion of the in-distribution data. Furthermore, these methods encode images with a neural image encoder, and their robustness is rarely checked with respect to image encoders of different training methods and architectures. In this work, we introduce the diffusion process into the OOD task. The diffusion model integrates information on the whole training set into the predicted noise vectors. Moreover, we derive a closed-form solution for the noise vector (stable point). The noise vector is then converted into our OOD score, and we test both the model-predicted noise vector and the closed-form noise vector on the OpenOOD benchmarks. Our method outperforms previous OOD methods across all types of image encoders. A 3.5% performance gain is achieved with the MAE-based image encoder. Moreover, we studied the robustness of OOD methods by applying different types of image encoders. Some OOD methods fail to generalize well when switching image encoders from ResNet to Vision Transformers, while our method exhibits good robustness with all the image encoders. +
+
+
+
+
+ + ♻ ☆ Improved Implicit Neural Representation with Fourier Bases Reparameterized Training + +
+ Implicit Neural Representation (INR), as a powerful representation paradigm, has achieved success in various computer vision tasks recently. Due to the low-frequency bias issue of the vanilla multi-layer perceptron (MLP), existing methods have investigated advanced techniques, such as positional encoding and periodic activation functions, to improve the accuracy of INR. In this paper, we connect the network training bias with the reparameterization technique and theoretically prove that weight reparameterization provides a chance to alleviate the spectral bias of MLP. Based on our theoretical analysis, we propose a Fourier reparameterization method which learns a coefficient matrix of fixed Fourier bases to compose the weights of the MLP. We evaluate the proposed Fourier reparameterization method on different INR tasks with various MLP architectures, including the vanilla MLP, MLP with positional encoding, and MLP with advanced activation functions. The superior approximation results on different MLP architectures clearly validate the advantage of our proposed method. Armed with our Fourier reparameterization method, better INR with more textures and fewer artifacts can be learned from the training data. +
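The core idea, composing MLP weights from a learned coefficient matrix over fixed Fourier bases, can be sketched as a drop-in linear layer. The specific basis construction below (frequencies, sampling grid, initialization) is an assumption, not the paper's exact recipe.

```python
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class FourierReparamLinear(nn.Module):
    """Linear layer with weight W = C @ B, where B is a fixed bank of sine/cosine
    bases over the input dimension and C is a learned coefficient matrix."""
    def __init__(self, in_features, out_features, n_bases=64):
        super().__init__()
        t = torch.linspace(0.0, 1.0, in_features)
        freqs = torch.arange(1, n_bases // 2 + 1).float()
        bases = torch.cat([torch.sin(2 * math.pi * freqs[:, None] * t[None, :]),
                           torch.cos(2 * math.pi * freqs[:, None] * t[None, :])], dim=0)
        self.register_buffer('bases', bases)                                   # (n_bases, in)
        self.coeff = nn.Parameter(torch.randn(out_features, bases.shape[0]) * 0.01)
        self.bias = nn.Parameter(torch.zeros(out_features))

    def forward(self, x):
        weight = self.coeff @ self.bases                                       # (out, in)
        return F.linear(x, weight, self.bias)

layer = FourierReparamLinear(256, 256)
y = layer(torch.randn(4, 256))   # behaves like nn.Linear, but weights live on Fourier bases
```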
+
+
+
+
+ + ♻ ☆ Enhancing High-Resolution 3D Generation through Pixel-wise Gradient + Clipping ICLR 2024 + + +
+ High-resolution 3D object generation remains a challenging task primarily due +to the limited availability of comprehensive annotated training data. Recent +advancements have aimed to overcome this constraint by harnessing image +generative models, pretrained on extensive curated web datasets, using +knowledge transfer techniques like Score Distillation Sampling (SDS). +Efficiently addressing the requirements of high-resolution rendering often +necessitates the adoption of latent representation-based models, such as the +Latent Diffusion Model (LDM). In this framework, a significant challenge +arises: To compute gradients for individual image pixels, it is necessary to +backpropagate gradients from the designated latent space through the frozen +components of the image model, such as the VAE encoder used within LDM. +However, this gradient propagation pathway has never been optimized, remaining +uncontrolled during training. We find that the unregulated gradients adversely +affect the 3D model's capacity in acquiring texture-related information from +the image generative model, leading to poor quality appearance synthesis. To +address this overarching challenge, we propose an innovative operation termed +Pixel-wise Gradient Clipping (PGC) designed for seamless integration into +existing 3D generative models, thereby enhancing their synthesis quality. +Specifically, we control the magnitude of stochastic gradients by clipping the +pixel-wise gradients efficiently, while preserving crucial texture-related +gradient directions. Despite this simplicity and minimal extra cost, extensive +experiments demonstrate the efficacy of our PGC in enhancing the performance of +existing 3D generative models for high-resolution object rendering. + +
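Pixel-wise gradient clipping as described, bounding each pixel's gradient magnitude while keeping its direction, can be sketched with a backward hook on the rendered image tensor. The clipping threshold and the per-pixel norm over the channel dimension are assumptions for illustration.

```python
import torch

def pixelwise_gradient_clip(image, max_norm=0.1):
    """Clip the gradient flowing into a rendered image (B, C, H, W) so each pixel's
    C-dimensional gradient vector has norm at most max_norm, preserving direction."""
    def hook(grad):
        norm = grad.norm(dim=1, keepdim=True).clamp_min(1e-12)   # (B, 1, H, W)
        scale = (max_norm / norm).clamp(max=1.0)
        return grad * scale
    image.register_hook(hook)
    return image

# e.g. inserted between the differentiable renderer and the frozen latent encoder:
# rendered = pixelwise_gradient_clip(renderer(params_3d))
# loss = distillation_loss(latent_encoder(rendered), ...)
```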
+
+ comment: Accepted at ICLR 2024. Project page: + https://fudan-zvg.github.io/PGC-3D +
+
+
+
+
+ + ♻ ☆ Accelerating Globally Optimal Consensus Maximization in Geometric Vision + + +
+ Branch-and-bound-based consensus maximization stands out due to its important ability of retrieving the globally optimal solution to outlier-affected geometric problems. However, while the discovery of such solutions carries high scientific value, its application in practical scenarios is often prohibited by its computational complexity growing exponentially as a function of the dimensionality of the problem at hand. In this work, we present a novel, general technique that allows us to branch over an n-1 dimensional space for an n-dimensional problem. The remaining degree of freedom can be solved globally optimally within each bound calculation by applying the efficient interval stabbing technique. While each individual bound derivation is harder to compute owing to the additional need for solving a sorting problem, the reduced number of intervals and tighter bounds in practice lead to a significant reduction in the overall number of required iterations. Besides an abstract introduction of the approach, we present applications to four fundamental geometric computer vision problems: camera resectioning, relative camera pose estimation, point set registration, and rotation and focal length estimation. Through our exhaustive tests, we demonstrate significant speed-up factors at times exceeding two orders of magnitude, thereby increasing the viability of globally optimal consensus maximizers in online application scenarios. +
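The interval stabbing subroutine mentioned above, finding a value of the remaining degree of freedom contained in (stabbed by) the largest number of intervals, reduces to a sweep over sorted endpoints. The sketch below shows that subroutine in isolation; the event ordering for closed intervals is an assumption.

```python
def max_interval_stabbing(intervals):
    """Return (best_count, point) maximizing the number of intervals that contain the point."""
    events = []
    for lo, hi in intervals:
        events.append((lo, +1))
        events.append((hi, -1))
    # process starts before ends at equal coordinates so closed intervals count
    events.sort(key=lambda e: (e[0], -e[1]))
    best, count, best_point = 0, 0, None
    for x, delta in events:
        count += delta
        if count > best:
            best, best_point = count, x
    return best, best_point

print(max_interval_stabbing([(0, 2), (1, 3), (2, 5), (4, 6)]))  # (3, 2)
```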
+
+ comment: Accepted by IEEE Transactions on Pattern Analysis and Machine + Intelligence, 2024 +
+
+
+
+
+ + ♻ ☆ Virchow: A Million-Slide Digital Pathology Foundation Model + + +
+ The use of artificial intelligence to enable precision medicine and decision +support systems through the analysis of pathology images has the potential to +revolutionize the diagnosis and treatment of cancer. Such applications will +depend on models' abilities to capture the diverse patterns observed in +pathology images. To address this challenge, we present Virchow, a foundation +model for computational pathology. Using self-supervised learning empowered by +the DINOv2 algorithm, Virchow is a vision transformer model with 632 million +parameters trained on 1.5 million hematoxylin and eosin stained whole slide +images from diverse tissue and specimen types, which is orders of magnitude +more data than previous works. The Virchow model enables the development of a +pan-cancer detection system with 0.949 overall specimen-level AUC across 17 +different cancer types, while also achieving 0.937 AUC on 7 rare cancer types. +The Virchow model sets the state-of-the-art on the internal and external image +tile level benchmarks and slide level biomarker prediction tasks. The gains in +performance highlight the importance of training on massive pathology image +datasets, suggesting scaling up the data and network architecture can improve +the accuracy for many high-impact computational pathology applications where +limited amounts of training data are available. + +
+
+
+
+
+ + ♻ ☆ Skeletal Video Anomaly Detection using Deep Learning: Survey, Challenges + and Future Directions + + +
+ The existing methods for video anomaly detection mostly utilize videos +containing identifiable facial and appearance-based features. The use of videos +with identifiable faces raises privacy concerns, especially when used in a +hospital or community-based setting. Appearance-based features can also be +sensitive to pixel-based noise, straining the anomaly detection methods to +model the changes in the background and making it difficult to focus on the +actions of humans in the foreground. Structural information in the form of +skeletons describing the human motion in the videos is privacy-protecting and +can overcome some of the problems posed by appearance-based features. In this +paper, we present a survey of privacy-protecting deep learning anomaly +detection methods using skeletons extracted from videos. We present a novel +taxonomy of algorithms based on the various learning approaches. We conclude +that skeleton-based approaches for anomaly detection can be a plausible +privacy-protecting alternative for video anomaly detection. Lastly, we identify +major open research questions and provide guidelines to address them. + +
+
+ comment: This work has been accepted by IEEE Transactions on Emerging Topics + in Computational Intelligence +
+
+
+
+
+ + ♻ ☆ Stream Query Denoising for Vectorized HD Map Construction + + +
+ To enhance perception performance in complex and extensive scenarios within +the realm of autonomous driving, there has been a noteworthy focus on temporal +modeling, with a particular emphasis on streaming methods. The prevailing trend +in streaming models involves the utilization of stream queries for the +propagation of temporal information. Despite the prevalence of this approach, +the direct application of the streaming paradigm to the construction of +vectorized high-definition maps (HD-maps) fails to fully harness the inherent +potential of temporal information. This paper introduces the Stream Query +Denoising (SQD) strategy as a novel approach for temporal modeling in +high-definition map (HD-map) construction. SQD is designed to facilitate the +learning of temporal consistency among map elements within the streaming model. +The methodology involves denoising the queries that have been perturbed by the +addition of noise to the ground-truth information from the preceding frame. +This denoising process aims to reconstruct the ground-truth information for the +current frame, thereby simulating the prediction process inherent in stream +queries. The SQD strategy can be applied to those streaming methods (e.g., +StreamMapNet) to enhance the temporal modeling. The proposed SQD-MapNet is the +StreamMapNet equipped with SQD. Extensive experiments on nuScenes and +Argoverse2 show that our method is remarkably superior to other existing +methods across all settings of close range and long range. The code will be +available soon. + +
+
+
+
+
+ + ♻ ☆ Mixture of Cluster-conditional LoRA Experts for Vision-language + Instruction Tuning + + +
+ Instruction tuning of Large Vision-language Models (LVLMs) has revolutionized the development of versatile models with zero-shot generalization across a wide range of downstream vision-language tasks. However, the diversity of training tasks from different sources and formats leads to inevitable task conflicts, where different tasks compete for the same set of model parameters, resulting in sub-optimal instruction-following abilities. To address this, we propose the Mixture of Cluster-conditional LoRA Experts (MoCLE), a novel Mixture of Experts (MoE) architecture designed to activate task-customized model parameters based on instruction clusters. A separate universal expert is further incorporated to improve the generalization capabilities of MoCLE for novel instructions. Extensive experiments on 10 zero-shot tasks demonstrate the effectiveness of MoCLE. +
+
+
+
+
+ + ♻ ☆ Free Lunch for Generating Effective Outlier Supervision + + +
+ When deployed in practical applications, computer vision systems will encounter numerous unexpected images (i.e., out-of-distribution data). Due to the potential safety risks, such unseen data should be carefully identified and handled. Generally, existing approaches to out-of-distribution (OOD) detection mainly focus on the statistical difference between the features of OOD and in-distribution (ID) data extracted by the classifiers. Although many of these schemes have brought considerable performance improvements, reducing the false positive rate (FPR) when processing open-set images, they nonetheless lack reliable theoretical analysis and generalization guarantees. Unlike these approaches, in this paper, we investigate the OOD detection problem based on the Bayes rule and present a convincing description of the reason for the failures encountered by conventional classifiers. Concretely, our analysis reveals that refining the probability distribution yielded by the vanilla neural networks is necessary for OOD detection, alleviating the issue of assigning high confidence to OOD data. To achieve this effortlessly, we propose an ultra-effective method to generate near-realistic outlier supervision. Extensive experiments on large-scale benchmarks reveal that our proposed BayesAug significantly reduces the FPR95 by over 12.50% compared with previous schemes, boosting the reliability of machine learning systems. The code will be made publicly available. +
+
+ comment: We have rewritten this paper, and published as "Image Background + Serves as Good Proxy for Out-of-distribution Data" arXiv:2307.00519 +
+
+
+
+
+ + ♻ ☆ Enhancing Video Super-Resolution via Implicit Resampling-based Alignment + + +
+ In video super-resolution, it is common to use a frame-wise alignment to +support the propagation of information over time. The role of alignment is +well-studied for low-level enhancement in video, but existing works overlook a +critical step -- resampling. We show through extensive experiments that for +alignment to be effective, the resampling should preserve the reference +frequency spectrum while minimizing spatial distortions. However, most existing +works simply use a default choice of bilinear interpolation for resampling even +though bilinear interpolation has a smoothing effect and hinders +super-resolution. From these observations, we propose an implicit +resampling-based alignment. The sampling positions are encoded by a sinusoidal +positional encoding, while the value is estimated with a coordinate network and +a window-based cross-attention. We show that bilinear interpolation inherently +attenuates high-frequency information while an MLP-based coordinate network can +approximate more frequencies. Experiments on synthetic and real-world datasets +show that alignment with our proposed implicit resampling enhances the +performance of state-of-the-art frameworks with minimal impact on both compute +and parameters. + +
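A generic sketch of encoding resampling positions with a sinusoidal positional encoding before feeding them to a coordinate network, in the spirit of the implicit resampling described above; the frequency schedule and dimensionality are assumptions, not the paper's exact design.

```python
import math
import torch

def sinusoidal_position_encoding(coords, n_freqs=8):
    """coords: (..., 2) fractional sampling offsets.
    Returns (..., 4 * n_freqs) sin/cos features at geometrically spaced frequencies."""
    freqs = 2.0 ** torch.arange(n_freqs, dtype=coords.dtype)   # (F,)
    angles = coords.unsqueeze(-1) * freqs * math.pi            # (..., 2, F)
    return torch.cat([angles.sin(), angles.cos()], dim=-1).flatten(-2)

enc = sinusoidal_position_encoding(torch.rand(16, 2))          # (16, 32), input to a coordinate MLP
```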
+
+
+
+
+ + ♻ ☆ Improved DDIM Sampling with Moment Matching Gaussian Mixtures + + +
+ We propose using a Gaussian Mixture Model (GMM) as reverse transition +operator (kernel) within the Denoising Diffusion Implicit Models (DDIM) +framework, which is one of the most widely used approaches for accelerated +sampling from pre-trained Denoising Diffusion Probabilistic Models (DDPM). +Specifically we match the first and second order central moments of the DDPM +forward marginals by constraining the parameters of the GMM. We see that moment +matching is sufficient to obtain samples with equal or better quality than the +original DDIM with Gaussian kernels. We provide experimental results with +unconditional models trained on CelebAHQ and FFHQ and class-conditional models +trained on ImageNet datasets respectively. Our results suggest that using the +GMM kernel leads to significant improvements in the quality of the generated +samples when the number of sampling steps is small, as measured by FID and IS +metrics. For example on ImageNet 256x256, using 10 sampling steps, we achieve a +FID of 6.94 and IS of 207.85 with a GMM kernel compared to 10.15 and 196.73 +respectively with a Gaussian kernel. + +
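Moment matching in one dimension, the nub of the GMM kernel idea, can be illustrated with a toy function: given a target mean and variance (standing in for the moments of the forward marginal), choose component means and a shared component variance so the mixture reproduces those two moments. The parameterization below is an illustrative assumption, not the paper's transition kernel.

```python
import numpy as np

def match_gmm_moments(target_mean, target_var, weights, offsets):
    """Return component means and a shared component variance so the 1-D mixture's
    first two moments equal (target_mean, target_var)."""
    weights = np.asarray(weights, dtype=float)
    offsets = np.asarray(offsets, dtype=float)
    offsets = offsets - weights @ offsets            # enforce weighted zero-mean offsets
    means = target_mean + offsets
    between = weights @ (means - target_mean) ** 2   # between-component variance
    comp_var = target_var - between                  # shared within-component variance
    assert comp_var > 0, "offsets too spread out for the target variance"
    return means, comp_var

means, var = match_gmm_moments(0.0, 1.0, [0.5, 0.5], [-0.5, 0.5])
# mixture mean = 0.0; mixture variance = between (0.25) + within (0.75) = 1.0
```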
+
+ comment: 29 pages, 14 figures; Analysis of DDIM-GMM as a multimodal denoiser; + Additional experiments on LSUN datasets and text-to-image generation with + Stable Diffusion; Comparison with DPM-Solver; Ablations on GMM parameters; + Updated equations with bold font for vectors and matrices +
+
+
+
+
+ + ♻ ☆ Prismer: A Vision-Language Model with Multi-Task Experts + + +
+ Recent vision-language models have shown impressive multi-modal generation +capabilities. However, typically they require training huge models on massive +datasets. As a more scalable alternative, we introduce Prismer, a data- and +parameter-efficient vision-language model that leverages an ensemble of +task-specific experts. Prismer only requires training of a small number of +components, with the majority of network weights inherited from multiple +readily-available, pre-trained experts, and kept frozen during training. By +leveraging experts from a wide range of domains, we show Prismer can +efficiently pool this expert knowledge and adapt it to various vision-language +reasoning tasks. In our experiments, we show that Prismer achieves fine-tuned +and few-shot learning performance which is competitive with current +state-of-the-arts, whilst requiring up to two orders of magnitude less training +data. Code is available at https://github.com/NVlabs/prismer. + +
+
+ comment: Published at TMLR 2024. Project Page: + https://shikun.io/projects/prismer Code: https://github.com/NVlabs/prismer +
+
+
+
+
+ + ♻ ☆ Predicting breast cancer with AI for individual risk-adjusted MRI + screening and early detection + + +
+ Women with an increased life-time risk of breast cancer undergo supplemental +annual screening MRI. We propose to predict the risk of developing breast +cancer within one year based on the current MRI, with the objective of reducing +screening burden and facilitating early detection. An AI algorithm was +developed on 53,858 breasts from 12,694 patients who underwent screening or +diagnostic MRI and accrued over 12 years, with 2,331 confirmed cancers. A first +U-Net was trained to segment lesions and identify regions of concern. A second +convolutional network was trained to detect malignant cancer using features +extracted by the U-Net. This network was then fine-tuned to estimate the risk +of developing cancer within a year in cases that radiologists considered normal +or likely benign. Risk predictions from this AI were evaluated with a +retrospective analysis of 9,183 breasts from a high-risk screening cohort, +which were not used for training. Statistical analysis focused on the tradeoff +between number of omitted exams versus negative predictive value, and number of +potential early detections versus positive predictive value. The AI algorithm +identified regions of concern that coincided with future tumors in 52% of +screen-detected cancers. Upon directed review, a radiologist found that 71.3% +of cancers had a visible correlate on the MRI prior to diagnosis, 65% of these +correlates were identified by the AI model. Reevaluating these regions in 10% +of all cases with higher AI-predicted risk could have resulted in up to 33% +early detections by a radiologist. Additionally, screening burden could have +been reduced in 16% of lower-risk cases by recommending a later follow-up +without compromising current interval cancer rate. With increasing datasets and +improving image quality we expect this new AI-aided, adaptive screening to +meaningfully reduce screening burden and improve early detection. + +
+
+ comment: Major revisions and rewriting in progress +
+
+
+
+
+ + ♻ ☆ Link Prediction for Flow-Driven Spatial Networks + + +
+ Link prediction algorithms aim to infer the existence of connections (or +links) between nodes in network-structured data and are typically applied to +refine the connectivity among nodes. In this work, we focus on link prediction +for flow-driven spatial networks, which are embedded in a Euclidean space and +relate to physical exchange and transportation processes (e.g., blood flow in +vessels or traffic flow in road networks). To this end, we propose the Graph +Attentive Vectors (GAV) link prediction framework. GAV models simplified +dynamics of physical flow in spatial networks via an attentive, +neighborhood-aware message-passing paradigm, updating vector embeddings in a +constrained manner. We evaluate GAV on eight flow-driven spatial networks given +by whole-brain vessel graphs and road networks. GAV demonstrates superior +performances across all datasets and metrics and outperformed the +state-of-the-art on the ogbl-vessel benchmark at the time of submission by 12% +(98.38 vs. 87.98 AUC). All code is publicly available on GitHub. + +
+
+
+
+
+ + ♻ ☆ How Deep is Your Art: An Experimental Study on the Limits of Artistic + Understanding in a Single-Task, Single-Modality Neural Network + + +
+ Computational modeling of artwork meaning is complex and difficult. This is because art interpretation is multidimensional and highly subjective. This paper experimentally investigated the degree to which a state-of-the-art Deep Convolutional Neural Network (DCNN), a popular Machine Learning approach, can correctly classify modern conceptual artwork into the galleries devised by art curators. Two hypotheses were proposed, stating that the DCNN model uses Exhibited Properties for classification, like shape and color, but not Non-Exhibited Properties, such as historical context and artist intention. The two hypotheses were experimentally validated using a methodology designed for this purpose. A VGG-11 DCNN pre-trained on the ImageNet dataset and discriminatively fine-tuned was trained on handcrafted datasets designed from real-world conceptual photography galleries. Experimental results supported the two hypotheses, showing that the DCNN model ignores Non-Exhibited Properties and uses only Exhibited Properties for artwork classification. This work points to current DCNN limitations, which should be addressed by future DNN models.
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ ChatQA: Building GPT-4 Level Conversational QA Models + + +
+ In this work, we introduce ChatQA, a family of conversational question answering (QA) models that obtain GPT-4-level accuracy. Specifically, we propose a two-stage instruction tuning method that can significantly improve the zero-shot conversational QA results from large language models (LLMs). To handle retrieval in conversational QA, we fine-tune a dense retriever on a multi-turn QA dataset, which provides comparable results to using the state-of-the-art query rewriting model while largely reducing deployment cost. Notably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10 conversational QA datasets (54.14 vs. 53.90), without relying on any synthetic data from OpenAI GPT models.
+
+
+
+
+ + ☆ Comparing Traditional and LLM-based Search for Image Geolocation + + +
+ Web search engines have long served as indispensable tools for information +retrieval; user behavior and query formulation strategies have been well +studied. The introduction of search engines powered by large language models +(LLMs) suggested more conversational search and new types of query strategies. +In this paper, we compare traditional and LLM-based search for the task of +image geolocation, i.e., determining the location where an image was captured. +Our work examines user interactions, with a particular focus on query +formulation strategies. In our study, 60 participants were assigned either +traditional or LLM-based search engines as assistants for geolocation. +Participants using traditional search more accurately predicted the location of +the image compared to those using the LLM-based search. Distinct strategies +emerged between users depending on the type of assistant. Participants using +the LLM-based search issued longer, more natural language queries, but had +shorter search sessions. When reformulating their search queries, traditional +search participants tended to add more terms to their initial queries, whereas +participants using the LLM-based search consistently rephrased their initial +queries. + +
+
+
+
+
+ + ☆ LOCALINTEL: Generating Organizational Threat Intelligence from Global + and Local Cyber Knowledge + + +
+ Security Operations Center (SoC) analysts gather threat reports from openly accessible global threat databases and customize them manually to suit a particular organization's needs. These analysts also depend on internal repositories, which act as a private local knowledge database for an organization. Credible cyber intelligence, critical operational details, and relevant organizational information are all stored in these local knowledge databases. Analysts undertake the labor-intensive task of utilizing these global and local knowledge databases to manually create an organization's unique threat response and mitigation strategies. Recently, Large Language Models (LLMs) have shown the capability to efficiently process large and diverse knowledge sources. We leverage this ability to process global and local knowledge databases to automate the generation of organization-specific threat intelligence.

 In this work, we present LOCALINTEL, a novel automated knowledge contextualization system that, upon prompting, retrieves threat reports from the global threat repositories and uses its local knowledge database to contextualize them for a specific organization. LOCALINTEL comprises three key phases: global threat intelligence retrieval, local knowledge retrieval, and contextualized completion generation. The first retrieves intelligence from global threat repositories, while the second retrieves pertinent knowledge from the local knowledge database. Finally, the fusion of these knowledge sources is orchestrated through a generator to produce a contextualized completion.
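A toy sketch of the three phases might look as follows; the documents, the bag-of-words retriever, and the final prompt construction are all hypothetical stand-ins for the system's actual retrievers and generator.

```python
from collections import Counter
import math

GLOBAL_THREATS = [
    "CVE-XXXX: remote code execution in web server via crafted header",
    "Phishing campaign targeting VPN credentials of remote employees",
]
LOCAL_KNOWLEDGE = [
    "Our perimeter uses the affected web server version behind a WAF",
    "All employees authenticate to the VPN with hardware tokens",
]

def cosine(a: str, b: str) -> float:
    """Bag-of-words cosine similarity between two short documents."""
    ca, cb = Counter(a.lower().split()), Counter(b.lower().split())
    dot = sum(ca[t] * cb[t] for t in ca)
    na = math.sqrt(sum(v * v for v in ca.values()))
    nb = math.sqrt(sum(v * v for v in cb.values()))
    return dot / (na * nb) if na and nb else 0.0

def retrieve(query: str, corpus: list[str]) -> str:
    return max(corpus, key=lambda doc: cosine(query, doc))

query = "web server remote code execution"
global_hit = retrieve(query, GLOBAL_THREATS)   # phase 1: global threat intel
local_hit = retrieve(query, LOCAL_KNOWLEDGE)   # phase 2: local context
prompt = (f"Threat report: {global_hit}\n"
          f"Organization context: {local_hit}\n"
          "Write an organization-specific mitigation plan.")
print(prompt)  # phase 3 would pass this prompt to an LLM generator
```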
+
+
+
+
+ + ☆ HGAttack: Transferable Heterogeneous Graph Adversarial Attack + + +
+ Heterogeneous Graph Neural Networks (HGNNs) are increasingly recognized for +their performance in areas like the web and e-commerce, where resilience +against adversarial attacks is crucial. However, existing adversarial attack +methods, which are primarily designed for homogeneous graphs, fall short when +applied to HGNNs due to their limited ability to address the structural and +semantic complexity of HGNNs. This paper introduces HGAttack, the first +dedicated gray box evasion attack method for heterogeneous graphs. We design a +novel surrogate model to closely resemble the behaviors of the target HGNN and +utilize gradient-based methods for perturbation generation. Specifically, the +proposed surrogate model effectively leverages heterogeneous information by +extracting meta-path induced subgraphs and applying GNNs to learn node +embeddings with distinct semantics from each subgraph. This approach improves +the transferability of generated attacks on the target HGNN and significantly +reduces memory costs. For perturbation generation, we introduce a +semantics-aware mechanism that leverages subgraph gradient information to +autonomously identify vulnerable edges across a wide range of relations within +a constrained perturbation budget. We validate HGAttack's efficacy with +comprehensive experiments on three datasets, providing empirical analyses of +its generated perturbations. Outperforming baseline methods, HGAttack +demonstrated significant efficacy in diminishing the performance of target HGNN +models, affirming the effectiveness of our approach in evaluating the +robustness of HGNNs against adversarial attacks. + +
+
+
+
+
+ + ☆ Source Code Clone Detection Using Unsupervised Similarity Measures + + +
+ Assessing similarity in source code has gained significant attention in +recent years due to its importance in software engineering tasks such as clone +detection and code search and recommendation. This work presents a comparative +analysis of unsupervised similarity measures for identifying source code clone +detection. The goal is to overview the current state-of-the-art techniques, +their strengths, and weaknesses. To do that, we compile the existing +unsupervised strategies and evaluate their performance on a benchmark dataset +to guide software engineers in selecting appropriate methods for their specific +use cases. The source code of this study is available at +\url{https://github.com/jorge-martinez-gil/codesim} + +
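One of the simplest unsupervised measures that could appear in such a comparison is Jaccard similarity over token sets; this is only an illustrative baseline, not necessarily one of the better-performing measures in the study.

```python
def token_jaccard(code_a: str, code_b: str) -> float:
    """Jaccard similarity over the sets of whitespace-delimited tokens."""
    tokens_a, tokens_b = set(code_a.split()), set(code_b.split())
    if not tokens_a and not tokens_b:
        return 1.0
    return len(tokens_a & tokens_b) / len(tokens_a | tokens_b)

snippet_1 = "def add(a, b): return a + b"
snippet_2 = "def add(x, y): return x + y"
print(round(token_jaccard(snippet_1, snippet_2), 3))  # ~0.27: partial overlap
```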
+
+ comment: Accepted for publication as Full Paper in the Software Quality Days + 2024, Vienna, Austria +
+
+
+
+
+ + ☆ MatSciRE: Leveraging Pointer Networks to Automate Entity and Relation + Extraction for Material Science Knowledge-base Construction + + +
+ Material science literature is a rich source of factual information about +various categories of entities (like materials and compositions) and various +relations between these entities, such as conductivity, voltage, etc. +Automatically extracting this information to generate a material science +knowledge base is a challenging task. In this paper, we propose MatSciRE +(Material Science Relation Extractor), a Pointer Network-based encoder-decoder +framework, to jointly extract entities and relations from material science +articles as a triplet ($entity1, relation, entity2$). Specifically, we target +the battery materials and identify five relations to work on - conductivity, +coulombic efficiency, capacity, voltage, and energy. Our proposed approach +achieved a much better F1-score (0.771) than a previous attempt using +ChemDataExtractor (0.716). The overall graphical framework of MatSciRE is shown +in Fig 1. The material information is extracted from material science +literature in the form of entity-relation triplets using MatSciRE. + +
+
+
+
+
+ + ☆ Enhancing Image-Text Matching with Adaptive Feature Aggregation ICASSP 2024 + + +
+ Image-text matching aims to find matched cross-modal pairs accurately. While +current methods often rely on projecting cross-modal features into a common +embedding space, they frequently suffer from imbalanced feature representations +across different modalities, leading to unreliable retrieval results. To +address these limitations, we introduce a novel Feature Enhancement Module that +adaptively aggregates single-modal features for more balanced and robust +image-text retrieval. Additionally, we propose a new loss function that +overcomes the shortcomings of original triplet ranking loss, thereby +significantly improving retrieval performance. The proposed model has been +evaluated on two public datasets and achieves competitive retrieval performance +when compared with several state-of-the-art models. Implementation codes can be +found here. + +
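The abstract does not spell out the new loss, but the baseline it improves on, the hinge-based triplet ranking loss with hardest-negative mining commonly used in image-text retrieval, can be sketched as follows.

```python
import torch

def triplet_ranking_loss(img_emb, txt_emb, margin=0.2):
    """Hinge-based triplet ranking loss with hardest-negative mining.
    img_emb, txt_emb: (batch, dim), assumed L2-normalized; diagonal = positives."""
    scores = img_emb @ txt_emb.t()                     # cosine similarity matrix
    pos = scores.diag().view(-1, 1)
    cost_txt = (margin + scores - pos).clamp(min=0)    # image -> hard text negatives
    cost_img = (margin + scores - pos.t()).clamp(min=0)
    mask = torch.eye(scores.size(0), dtype=torch.bool)
    cost_txt = cost_txt.masked_fill(mask, 0)
    cost_img = cost_img.masked_fill(mask, 0)
    return cost_txt.max(1)[0].mean() + cost_img.max(0)[0].mean()

img = torch.nn.functional.normalize(torch.randn(8, 128), dim=1)
txt = torch.nn.functional.normalize(torch.randn(8, 128), dim=1)
print(triplet_ranking_loss(img, txt))
```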
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ EfficientRec an unlimited user-item scale recommendation system based on + clustering and users interaction embedding profile + + +
+ Recommendation systems are of great interest to technology companies nowadays. These businesses are constantly growing their user bases and product catalogs, causing the number of users and items to increase continuously over time to very large numbers. Traditional recommendation algorithms, whose complexity depends on the number of users and items, are therefore difficult to adapt to the industrial environment. In this paper, we introduce a new method applying graph neural networks with a contrastive learning framework to extract user preferences. We incorporate a soft clustering architecture that significantly reduces the computational cost of the inference process. Experiments show that the model is able to learn user preferences with low computational cost in both the training and prediction phases, while achieving very good accuracy. We call this architecture EfficientRec, with the implication of model compactness and the ability to scale to an unlimited number of users and products.
+
+ comment: Published in 14th Asian Conference on Intelligent Information and + Database Systems (ACIIDS), 2022 +
+
+
+
+
+ + ☆ Improving One-class Recommendation with Multi-tasking on Various + Preference Intensities RecSys 2020 + + +
+ In the one-class recommendation problem, recommendations must be made based on users' implicit feedback, which is inferred from their actions and inactions. Existing works obtain representations of users and items by encoding the positive and negative interactions observed in training data. However, these efforts assume that all positive signals from implicit feedback reflect a fixed preference intensity, which is not realistic. Consequently, representations learned with these methods usually fail to capture informative entity features that reflect various preference intensities.

 In this paper, we propose a multi-tasking framework that takes the various preference intensities of each signal from implicit feedback into consideration. Representations of entities are required to satisfy the objective of each subtask simultaneously, making them more robust and generalizable. Furthermore, we incorporate attentive graph convolutional layers to explore high-order relationships in the user-item bipartite graph and dynamically capture the latent tendencies of users toward the items they interact with. Experimental results show that our method performs better than state-of-the-art methods by a large margin on three large-scale real-world benchmark datasets.
+
+ comment: RecSys 2020 (ACM Conference on Recommender Systems 2020) +
+
+
+
+
+ + ♻ ☆ A Survey on Modern Recommendation System based on Big Data + + +
+ This survey provides an exhaustive exploration of the evolution and current +state of recommendation systems, which have seen widespread integration in +various web applications. It focuses on the advancement of personalized +recommendation strategies for online products or services. We categorize +recommendation techniques into four primary types: content-based, collaborative +filtering-based, knowledge-based, and hybrid-based, each addressing unique +scenarios. The survey offers a detailed examination of the historical context +and the latest innovative approaches in recommendation systems, particularly +those employing big data. Additionally, it identifies and discusses key +challenges faced by modern recommendation systems, such as data sparsity, +scalability issues, and the need for diversity in recommendations. The survey +concludes by highlighting these challenges as potential areas for fruitful +future research in the field. + +
+
+ comment: 10 pages, 8 figures, 1 table +
+
+
+
+
+ + ♻ ☆ FactCHD: Benchmarking Fact-Conflicting Hallucination Detection + + +
+ Despite their impressive generative capabilities, LLMs are hindered by +fact-conflicting hallucinations in real-world applications. The accurate +identification of hallucinations in texts generated by LLMs, especially in +complex inferential scenarios, is a relatively unexplored area. To address this +gap, we present FactCHD, a dedicated benchmark designed for the detection of +fact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset +that spans various factuality patterns, including vanilla, multi-hop, +comparison, and set operation. A distinctive element of FactCHD is its +integration of fact-based evidence chains, significantly enhancing the depth of +evaluating the detectors' explanations. Experiments on different LLMs expose +the shortcomings of current approaches in detecting factual errors accurately. +Furthermore, we introduce Truth-Triangulator that synthesizes reflective +considerations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming +to yield more credible detection through the amalgamation of predictive results +and evidence. The benchmark dataset is available at +https://github.com/zjunlp/FactCHD. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ CodeKGC: Code Language Model for Generative Knowledge Graph Construction + + +
+ Current generative knowledge graph construction approaches usually fail to capture structural knowledge by simply flattening natural language into serialized texts or a specification language. However, large generative language models trained on structured data such as code have demonstrated impressive capabilities in understanding natural language for structural prediction and reasoning tasks. Intuitively, we address the task of generative knowledge graph construction with a code language model: given code-format natural language input, the target is to generate triples, which can be framed as a code completion task. Specifically, we develop schema-aware prompts that effectively utilize the semantic structure within the knowledge graph. As code inherently possesses structure, such as class and function definitions, it serves as a useful model of prior semantic structural knowledge. Furthermore, we employ a rationale-enhanced generation method to boost performance. Rationales provide intermediate steps, thereby improving knowledge extraction abilities. Experimental results indicate that the proposed approach obtains better performance on benchmark datasets compared with baselines. Code and datasets are available at https://github.com/zjunlp/DeepKE/tree/main/example/llm.
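A minimal sketch of what a schema-as-code prompt could look like is given below; the schema classes and the example sentence are invented for illustration and are not taken from the paper's datasets.

```python
# Schema expressed as Python class definitions, so a code LLM can complete
# extraction as ordinary code (illustrative schema, not the paper's exact one).
SCHEMA = '''
class Entity:
    def __init__(self, name: str): self.name = name

class Triple:
    def __init__(self, head: Entity, relation: str, tail: Entity):
        self.head, self.relation, self.tail = head, relation, tail
'''

def build_prompt(sentence: str) -> str:
    return (
        SCHEMA
        + f'\n# Sentence: "{sentence}"\n'
        + "# Complete the list of triples expressed by the sentence.\n"
        + "triples = [\n"
    )

prompt = build_prompt("Marie Curie was born in Warsaw.")
print(prompt)
# A code LLM would be expected to continue with something like:
#     Triple(Entity("Marie Curie"), "birthplace", Entity("Warsaw")),
# ]
```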
+
+ comment: ACM Transactions on Asian and Low-Resource Language Information + Processing +
+
+
+
+
+ + ♻ ☆ Leveraging Negative Signals with Self-Attention for Sequential Music + Recommendation RecSys 2023 + + +
+ Music streaming services heavily rely on their recommendation engines to +continuously provide content to their consumers. Sequential recommendation +consequently has seen considerable attention in current literature, where state +of the art approaches focus on self-attentive models leveraging contextual +information such as long and short-term user history and item features; +however, most of these studies focus on long-form content domains (retail, +movie, etc.) rather than short-form, such as music. Additionally, many do not +explore incorporating negative session-level feedback during training. In this +study, we investigate the use of transformer-based self-attentive architectures +to learn implicit session-level information for sequential music +recommendation. We additionally propose a contrastive learning task to +incorporate negative feedback (e.g skipped tracks) to promote positive hits and +penalize negative hits. This task is formulated as a simple loss term that can +be incorporated into a variety of deep learning architectures for sequential +recommendation. Our experiments show that this results in consistent +performance gains over the baseline architectures ignoring negative user +feedback. + +
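One simple way to realize such a loss term is to push scores of completed tracks above scores of skipped tracks; the formulation below is a hedged sketch of that idea, not necessarily the paper's exact contrastive objective.

```python
import torch
import torch.nn.functional as F

def negative_feedback_loss(scores, pos_mask, neg_mask):
    """Push scores of positive (completed) items above scores of skipped items.
    scores: (batch, n_items) model scores for candidate next tracks;
    pos_mask / neg_mask: binary masks over items with positive / skip feedback."""
    pos = (scores * pos_mask).sum(1) / pos_mask.sum(1).clamp(min=1)
    neg = (scores * neg_mask).sum(1) / neg_mask.sum(1).clamp(min=1)
    return F.softplus(neg - pos).mean()   # small when positives outscore skips

scores = torch.randn(4, 10)
pos_mask = (torch.rand(4, 10) > 0.7).float()
neg_mask = (torch.rand(4, 10) > 0.7).float()
print(negative_feedback_loss(scores, pos_mask, neg_mask))
```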
+
+ comment: Accepted to the 1st Workshop on Music Recommender Systems, co-located + with the 17th ACM Conference on Recommender Systems (MuRS @ RecSys 2023) +
+
+
+
+
+
+
+
+ + Machine Learning 163 + +
+
+
+ + ☆ A Simple Latent Diffusion Approach for Panoptic Segmentation and Mask + Inpainting + + +
+ Panoptic and instance segmentation networks are often trained with +specialized object detection modules, complex loss functions, and ad-hoc +post-processing steps to handle the permutation-invariance of the instance +masks. This work builds upon Stable Diffusion and proposes a latent diffusion +approach for panoptic segmentation, resulting in a simple architecture which +omits these complexities. Our training process consists of two steps: (1) +training a shallow autoencoder to project the segmentation masks to latent +space; (2) training a diffusion model to allow image-conditioned sampling in +latent space. The use of a generative model unlocks the exploration of mask +completion or inpainting, which has applications in interactive segmentation. +The experimental validation yields promising results for both panoptic +segmentation and mask inpainting. While not setting a new state-of-the-art, our +model's simplicity, generality, and mask completion capability are desirable +properties. + +
+
+ comment: Code: https://github.com/segments-ai/latent-diffusion-segmentation +
+
+
+
+
+ + ☆ ChatQA: Building GPT-4 Level Conversational QA Models + + +
+ In this work, we introduce ChatQA, a family of conversational question answering (QA) models that obtain GPT-4-level accuracy. Specifically, we propose a two-stage instruction tuning method that can significantly improve the zero-shot conversational QA results from large language models (LLMs). To handle retrieval in conversational QA, we fine-tune a dense retriever on a multi-turn QA dataset, which provides comparable results to using the state-of-the-art query rewriting model while largely reducing deployment cost. Notably, our ChatQA-70B can outperform GPT-4 in terms of average score on 10 conversational QA datasets (54.14 vs. 53.90), without relying on any synthetic data from OpenAI GPT models.
+
+
+
+
+ + ☆ AutoFT: Robust Fine-Tuning by Optimizing Hyperparameters on OOD Data + + +
+ Foundation models encode rich representations that can be adapted to a +desired task by fine-tuning on task-specific data. However, fine-tuning a model +on one particular data distribution often compromises the model's original +performance on other distributions. Current methods for robust fine-tuning +utilize hand-crafted regularization techniques to constrain the fine-tuning +process towards the base foundation model. Yet, it is hard to precisely specify +what characteristics of the foundation model to retain during fine-tuning, as +this depends on how the pre-training, fine-tuning, and evaluation data +distributions relate to each other. We propose AutoFT, a data-driven approach +for guiding foundation model fine-tuning. AutoFT optimizes fine-tuning +hyperparameters to maximize performance on a small out-of-distribution (OOD) +validation set. To guide fine-tuning in a granular way, AutoFT searches a +highly expressive hyperparameter space that includes weight coefficients for +many different losses, in addition to learning rate and weight decay values. We +evaluate AutoFT on nine natural distribution shifts which include domain shifts +and subpopulation shifts. Our experiments show that AutoFT significantly +improves generalization to new OOD data, outperforming existing robust +fine-tuning methods. Notably, AutoFT achieves new state-of-the-art performance +on the WILDS-iWildCam and WILDS-FMoW benchmarks, outperforming the previous +best methods by $6.0\%$ and $1.5\%$, respectively. + +
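Schematically, this kind of data-driven hyperparameter selection can be written as a search loop scored on the OOD validation set; the random-search strategy, the hyperparameter ranges, and the toy stand-in functions below are illustrative only, not the paper's actual search procedure.

```python
import random

def autoft_style_search(finetune_fn, evaluate_fn, n_trials=20, seed=0):
    """Randomly search fine-tuning hyperparameters and keep the configuration
    that maximizes a score on a small out-of-distribution validation set."""
    rng = random.Random(seed)
    best_score, best_config = float("-inf"), None
    for _ in range(n_trials):
        config = {
            "lr": 10 ** rng.uniform(-6, -3),
            "weight_decay": 10 ** rng.uniform(-6, -2),
            # weights on individual loss terms (e.g. task loss vs. regularizers)
            "loss_weights": [rng.uniform(0.0, 1.0) for _ in range(3)],
        }
        model = finetune_fn(config)      # user-supplied fine-tuning routine
        score = evaluate_fn(model)       # accuracy on the OOD validation set
        if score > best_score:
            best_score, best_config = score, config
    return best_config, best_score

# Toy stand-ins so the sketch runs end to end; replace with real training code.
best, score = autoft_style_search(
    finetune_fn=lambda cfg: cfg,
    evaluate_fn=lambda model: -abs(model["lr"] - 1e-4),  # pretend 1e-4 is best
)
print(best["lr"], score)
```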
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Enabling Efficient Equivariant Operations in the Fourier Basis via Gaunt + Tensor Products ICLR 2024 + + +
+ Developing equivariant neural networks for the E(3) group plays an important +role in modeling 3D data across real-world applications. Enforcing this +equivariance primarily involves the tensor products of irreducible +representations (irreps). However, the computational complexity of such +operations increases significantly as higher-order tensors are used. In this +work, we propose a systematic approach to substantially accelerate the +computation of the tensor products of irreps. We mathematically connect the +commonly used Clebsch-Gordan coefficients to the Gaunt coefficients, which are +integrals of products of three spherical harmonics. Through Gaunt coefficients, +the tensor product of irreps becomes equivalent to the multiplication between +spherical functions represented by spherical harmonics. This perspective +further allows us to change the basis for the equivariant operations from +spherical harmonics to a 2D Fourier basis. Consequently, the multiplication +between spherical functions represented by a 2D Fourier basis can be +efficiently computed via the convolution theorem and Fast Fourier Transforms. +This transformation reduces the complexity of full tensor products of irreps +from $\mathcal{O}(L^6)$ to $\mathcal{O}(L^3)$, where $L$ is the max degree of +irreps. Leveraging this approach, we introduce the Gaunt Tensor Product, which +serves as a new method to construct efficient equivariant operations across +different model architectures. Our experiments on the Open Catalyst Project and +3BPA datasets demonstrate both the increased efficiency and improved +performance of our approach. + +
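The enabling fact, that pointwise multiplication of functions corresponds to convolution of their Fourier coefficients and can therefore be evaluated with FFTs, can be verified numerically. The 1-D check below is a simplification of the 2D Fourier basis used in the paper.

```python
import numpy as np

rng = np.random.default_rng(0)
n = 64
F = rng.standard_normal(n) + 1j * rng.standard_normal(n)   # Fourier coeffs of f
G = rng.standard_normal(n) + 1j * rng.standard_normal(n)   # Fourier coeffs of g

# Coefficients of the product f*g, the slow way: explicit circular convolution
# of the coefficient arrays (quadratic cost).
slow = np.array(
    [sum(F[k] * G[(m - k) % n] for k in range(n)) for m in range(n)]
) / n

# The fast way via the convolution theorem: inverse FFT to the "spatial"
# domain, multiply pointwise, then FFT back (n log n cost).
fast = np.fft.fft(np.fft.ifft(F) * np.fft.ifft(G))

print(np.allclose(slow, fast))   # True
```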
+
+ comment: 36 pages; ICLR 2024 (Spotlight Presentation); Code: + https://github.com/lsj2408/Gaunt-Tensor-Product +
+
+
+
+
+ + ☆ Eclectic Rule Extraction for Explainability of Deep Neural Network based + Intrusion Detection Systems + + +
+ This paper addresses trust issues created from the ubiquity of black box +algorithms and surrogate explainers in Explainable Intrusion Detection Systems +(X-IDS). While Explainable Artificial Intelligence (XAI) aims to enhance +transparency, black box surrogate explainers, such as Local Interpretable +Model-Agnostic Explanation (LIME) and SHapley Additive exPlanation (SHAP), are +difficult to trust. The black box nature of these surrogate explainers makes +the process behind explanation generation opaque and difficult to understand. +To avoid this problem, one can use transparent white box algorithms such as +Rule Extraction (RE). There are three types of RE algorithms: pedagogical, +decompositional, and eclectic. Pedagogical methods offer fast but untrustworthy +white-box explanations, while decompositional RE provides trustworthy +explanations with poor scalability. This work explores eclectic rule +extraction, which strikes a balance between scalability and trustworthiness. By +combining techniques from pedagogical and decompositional approaches, eclectic +rule extraction leverages the advantages of both, while mitigating some of +their drawbacks. The proposed Hybrid X-IDS architecture features eclectic RE as +a white box surrogate explainer for black box Deep Neural Networks (DNN). The +presented eclectic RE algorithm extracts human-readable rules from hidden +layers, facilitating explainable and trustworthy rulesets. Evaluations on +UNSW-NB15 and CIC-IDS-2017 datasets demonstrate the algorithm's ability to +generate rulesets with 99.9% accuracy, mimicking DNN outputs. The contributions +of this work include the hybrid X-IDS architecture, the eclectic rule +extraction algorithm applicable to intrusion detection datasets, and a thorough +analysis of performance and explainability, demonstrating the trade-offs +involved in rule extraction speed and accuracy. + +
+
+
+
+
+ + ☆ Divide and not forget: Ensemble of selectively trained experts in + Continual Learning ICLR2024 + + +
+ Class-incremental learning is becoming more popular as it helps models widen +their applicability while not forgetting what they already know. A trend in +this area is to use a mixture-of-expert technique, where different models work +together to solve the task. However, the experts are usually trained all at +once using whole task data, which makes them all prone to forgetting and +increasing computational burden. To address this limitation, we introduce a +novel approach named SEED. SEED selects only one, the most optimal expert for a +considered task, and uses data from this task to fine-tune only this expert. +For this purpose, each expert represents each class with a Gaussian +distribution, and the optimal expert is selected based on the similarity of +those distributions. Consequently, SEED increases diversity and heterogeneity +within the experts while maintaining the high stability of this ensemble +method. The extensive experiments demonstrate that SEED achieves +state-of-the-art performance in exemplar-free settings across various +scenarios, showing the potential of expert diversification through data in +continual learning. + +
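A toy version of expert selection from per-class Gaussian statistics could look like the following; choosing the expert whose class distributions are most mutually separated is an assumption about the selection criterion, and the symmetric KL divergence is just one possible notion of similarity.

```python
import numpy as np

def sym_kl_diag(mu1, var1, mu2, var2):
    """Symmetric KL divergence between two diagonal Gaussians."""
    def kl(m1, v1, m2, v2):
        return 0.5 * np.sum(np.log(v2 / v1) + (v1 + (m1 - m2) ** 2) / v2 - 1.0)
    return kl(mu1, var1, mu2, var2) + kl(mu2, var2, mu1, var1)

def class_gaussians(features_per_class):
    """Fit a diagonal Gaussian (mean, variance) to each class's feature matrix."""
    return [(f.mean(axis=0), f.var(axis=0) + 1e-6) for f in features_per_class]

def select_expert(experts_features):
    """experts_features[e][c]: features of new-task class c from expert e's backbone.
    Pick the expert whose class Gaussians are most mutually dissimilar."""
    scores = []
    for features_per_class in experts_features:
        gs = class_gaussians(features_per_class)
        divs = [sym_kl_diag(*gs[i], *gs[j])
                for i in range(len(gs)) for j in range(i + 1, len(gs))]
        scores.append(np.mean(divs))
    return int(np.argmax(scores))

# Toy usage: two experts, two new classes, 16-dim features.
rng = np.random.default_rng(0)
experts = [[rng.normal(0, 1, (50, 16)), rng.normal(0.2, 1, (50, 16))],  # overlapping
           [rng.normal(0, 1, (50, 16)), rng.normal(3.0, 1, (50, 16))]]  # separated
print(select_expert(experts))   # 1
```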
+
+ comment: Accepted to ICLR2024 (main track), code is available at: + https://github.com/grypesc/SEED +
+
+
+
+
+ + ☆ A Kaczmarz-inspired approach to accelerate the optimization of neural + network wavefunctions + + +
+ Neural network wavefunctions optimized using the variational Monte Carlo +method have been shown to produce highly accurate results for the electronic +structure of atoms and small molecules, but the high cost of optimizing such +wavefunctions prevents their application to larger systems. We propose the +Subsampled Projected-Increment Natural Gradient Descent (SPRING) optimizer to +reduce this bottleneck. SPRING combines ideas from the recently introduced +minimum-step stochastic reconfiguration optimizer (MinSR) and the classical +randomized Kaczmarz method for solving linear least-squares problems. We +demonstrate that SPRING outperforms both MinSR and the popular +Kronecker-Factored Approximate Curvature method (KFAC) across a number of small +atoms and molecules, given that the learning rates of all methods are optimally +tuned. For example, on the oxygen atom, SPRING attains chemical accuracy after +forty thousand training iterations, whereas both MinSR and KFAC fail to do so +even after one hundred thousand iterations. + +
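For reference, the classical randomized Kaczmarz iteration that SPRING draws on can be sketched on a small consistent linear system; this is the textbook method, not the SPRING optimizer itself.

```python
import numpy as np

def randomized_kaczmarz(A, b, n_iters=2000, seed=0):
    """Classical randomized Kaczmarz: project the iterate onto the hyperplane of
    one row per step, sampling rows with probability proportional to ||a_i||^2."""
    rng = np.random.default_rng(seed)
    x = np.zeros(A.shape[1])
    row_norms_sq = np.sum(A * A, axis=1)
    probs = row_norms_sq / row_norms_sq.sum()
    for _ in range(n_iters):
        i = rng.choice(A.shape[0], p=probs)
        x += (b[i] - A[i] @ x) / row_norms_sq[i] * A[i]
    return x

rng = np.random.default_rng(1)
A = rng.standard_normal((200, 20))
x_true = rng.standard_normal(20)
b = A @ x_true                          # consistent system
x_hat = randomized_kaczmarz(A, b)
print(np.linalg.norm(x_hat - x_true))   # close to zero
```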
+
+
+
+
+ + ☆ Chem-FINESE: Validating Fine-Grained Few-shot Entity Extraction through + Text Reconstruction EACL 2024 + + +
+ Fine-grained few-shot entity extraction in the chemical domain faces two +unique challenges. First, compared with entity extraction tasks in the general +domain, sentences from chemical papers usually contain more entities. Moreover, +entity extraction models usually have difficulty extracting entities of +long-tailed types. In this paper, we propose Chem-FINESE, a novel +sequence-to-sequence (seq2seq) based few-shot entity extraction approach, to +address these two challenges. Our Chem-FINESE has two components: a seq2seq +entity extractor to extract named entities from the input sentence and a +seq2seq self-validation module to reconstruct the original input sentence from +extracted entities. Inspired by the fact that a good entity extraction system +needs to extract entities faithfully, our new self-validation module leverages +entity extraction results to reconstruct the original input sentence. Besides, +we design a new contrastive loss to reduce excessive copying during the +extraction process. Finally, we release ChemNER+, a new fine-grained chemical +entity extraction dataset that is annotated by domain experts with the ChemNER +schema. Experiments in few-shot settings with both ChemNER+ and CHEMET datasets +show that our newly proposed framework has contributed up to 8.26% and 6.84% +absolute F1-score gains respectively. + +
+
+ comment: 16 pages. Accepted by Findings of the Association for Computational + Linguistics: EACL 2024. Code and resources are available at + https://github.com/EagleW/Chem-FINESE +
+
+
+
+
+ + ☆ Transfer Learning in Human Activity Recognition: A Survey + + +
+ Sensor-based human activity recognition (HAR) has been an active research +area, owing to its applications in smart environments, assisted living, +fitness, healthcare, etc. Recently, deep learning based end-to-end training has +resulted in state-of-the-art performance in domains such as computer vision and +natural language, where large amounts of annotated data are available. However, +large quantities of annotated data are not available for sensor-based HAR. +Moreover, the real-world settings on which the HAR is performed differ in terms +of sensor modalities, classification tasks, and target users. To address this +problem, transfer learning has been employed extensively. In this survey, we +focus on these transfer learning methods in the application domains of smart +home and wearables-based HAR. In particular, we provide a problem-solution +perspective by categorizing and presenting the works in terms of their +contributions and the challenges they address. We also present an updated view +of the state-of-the-art for both application domains. Based on our analysis of +205 papers, we highlight the gaps in the literature and provide a roadmap for +addressing them. This survey provides a reference to the HAR community, by +summarizing the existing works and providing a promising research agenda. + +
+
+ comment: 40 pages, 5 figures, 7 tables +
+
+
+
+
+ + ☆ Comprehensive OOD Detection Improvements + + +
+ As machine learning becomes increasingly prevalent in impactful decisions, +recognizing when inference data is outside the model's expected input +distribution is paramount for giving context to predictions. +Out-of-distribution (OOD) detection methods have been created for this task. +Such methods can be split into representation-based or logit-based methods from +whether they respectively utilize the model's embeddings or predictions for OOD +detection. In contrast to most papers which solely focus on one such group, we +address both. We employ dimensionality reduction on feature embeddings in +representation-based methods for both time speedups and improved performance. +Additionally, we propose DICE-COL, a modification of the popular logit-based +method Directed Sparsification (DICE) that resolves an unnoticed flaw. We +demonstrate the effectiveness of our methods on the OpenOODv1.5 benchmark +framework, where they significantly improve performance and set +state-of-the-art results. + +
+
+
+
+
+ + ☆ Multi-Agent Reinforcement Learning for Maritime Operational Technology + Cyber Security + + +
+ This paper demonstrates the potential for autonomous cyber defence to be +applied on industrial control systems and provides a baseline environment to +further explore Multi-Agent Reinforcement Learning's (MARL) application to this +problem domain. It introduces a simulation environment, IPMSRL, of a generic +Integrated Platform Management System (IPMS) and explores the use of MARL for +autonomous cyber defence decision-making on generic maritime based IPMS +Operational Technology (OT). OT cyber defensive actions are less mature than +they are for Enterprise IT. This is due to the relatively brittle nature of OT +infrastructure originating from the use of legacy systems, design-time +engineering assumptions, and lack of full-scale modern security controls. There +are many obstacles to be tackled across the cyber landscape due to continually +increasing cyber-attack sophistication and the limitations of traditional +IT-centric cyber defence solutions. Traditional IT controls are rarely deployed +on OT infrastructure, and where they are, some threats aren't fully addressed. +In our experiments, a shared critic implementation of Multi Agent Proximal +Policy Optimisation (MAPPO) outperformed Independent Proximal Policy +Optimisation (IPPO). MAPPO reached an optimal policy (episode outcome mean of +1) after 800K timesteps, whereas IPPO was only able to reach an episode outcome +mean of 0.966 after one million timesteps. Hyperparameter tuning greatly +improved training performance. Across one million timesteps the tuned +hyperparameters reached an optimal policy whereas the default hyperparameters +only managed to win sporadically, with most simulations resulting in a draw. We +tested a real-world constraint, attack detection alert success, and found that +when alert success probability is reduced to 0.75 or 0.9, the MARL defenders +were still able to win in over 97.5% or 99.5% of episodes, respectively. + +
+
+ comment: 13 pages, 7 figures, Proceedings of the Conference on Applied Machine + Learning in Information Security 2023 (CAMLIS) +
+
+
+
+
+ + ☆ Explicitly Disentangled Representations in Object-Centric Learning + + +
+ Extracting structured representations from raw visual data is an important +and long-standing challenge in machine learning. Recently, techniques for +unsupervised learning of object-centric representations have raised growing +interest. In this context, enhancing the robustness of the latent features can +improve the efficiency and effectiveness of the training of downstream tasks. A +promising step in this direction is to disentangle the factors that cause +variation in the data. Previously, Invariant Slot Attention disentangled +position, scale, and orientation from the remaining features. Extending this +approach, we focus on separating the shape and texture components. In +particular, we propose a novel architecture that biases object-centric models +toward disentangling shape and texture components into two non-overlapping +subsets of the latent space dimensions. These subsets are known a priori, hence +before the training process. Experiments on a range of object-centric +benchmarks reveal that our approach achieves the desired disentanglement while +also numerically improving baseline performance in most cases. In addition, we +show that our method can generate novel textures for a specific object or +transfer textures between objects with distinct shapes. + +
+
+
+
+
+ + ☆ Spatial-Temporal Large Language Model for Traffic Prediction + + +
+ Traffic prediction, a critical component for intelligent transportation +systems, endeavors to foresee future traffic at specific locations using +historical data. Although existing traffic prediction models often emphasize +developing complex neural network structures, their accuracy has not seen +improvements accordingly. Recently, Large Language Models (LLMs) have shown +outstanding capabilities in time series analysis. Differing from existing +models, LLMs progress mainly through parameter expansion and extensive +pre-training while maintaining their fundamental structures. In this paper, we +propose a Spatial-Temporal Large Language Model (ST-LLM) for traffic +prediction. Specifically, ST-LLM redefines the timesteps at each location as +tokens and incorporates a spatial-temporal embedding module to learn the +spatial location and global temporal representations of tokens. Then these +representations are fused to provide each token with unified spatial and +temporal information. Furthermore, we propose a novel partially frozen +attention strategy of the LLM, which is designed to capture spatial-temporal +dependencies for traffic prediction. Comprehensive experiments on real traffic +datasets offer evidence that ST-LLM outperforms state-of-the-art models. +Notably, the ST-LLM also exhibits robust performance in both few-shot and +zero-shot prediction scenarios. + +
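The "partially frozen attention" strategy is not detailed in the abstract; one hedged reading, freezing the backbone and leaving only the self-attention modules of the last few layers trainable, can be sketched as follows with a generic transformer standing in for the pre-trained LLM.

```python
import torch.nn as nn

# Stand-in for a pre-trained LLM backbone (a real model would be loaded from a
# checkpoint); 6 encoder layers with self-attention and feed-forward blocks.
backbone = nn.TransformerEncoder(
    nn.TransformerEncoderLayer(d_model=256, nhead=8, batch_first=True),
    num_layers=6,
)

# Partially frozen attention (illustrative): freeze everything, then unfreeze
# only the self-attention modules of the last two layers.
for p in backbone.parameters():
    p.requires_grad = False
for layer in backbone.layers[-2:]:
    for p in layer.self_attn.parameters():
        p.requires_grad = True

trainable = sum(p.numel() for p in backbone.parameters() if p.requires_grad)
total = sum(p.numel() for p in backbone.parameters())
print(f"trainable: {trainable}/{total} parameters")
```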
+
+
+
+
+ + ☆ Towards Principled Graph Transformers + + +
+ Graph learning architectures based on the k-dimensional Weisfeiler-Leman +(k-WL) hierarchy offer a theoretically well-understood expressive power. +However, such architectures often fail to deliver solid predictive performance +on real-world tasks, limiting their practical impact. In contrast, global +attention-based models such as graph transformers demonstrate strong +performance in practice, but comparing their expressive power with the k-WL +hierarchy remains challenging, particularly since these architectures rely on +positional or structural encodings for their expressivity and predictive +performance. To address this, we show that the recently proposed Edge +Transformer, a global attention model operating on node pairs instead of nodes, +has at least 3-WL expressive power. Empirically, we demonstrate that the Edge +Transformer surpasses other theoretically aligned architectures regarding +predictive performance while not relying on positional or structural encodings. + +
+
+
+
+
+ + ☆ Comparison analysis between standard polysomnographic data and + in-ear-EEG signals: A preliminary study + + +
+ Study Objectives: Polysomnography (PSG) currently serves as the benchmark for +evaluating sleep disorders. Its discomfort, impracticality for home-use, and +introduction of bias in sleep quality assessment necessitate the exploration of +less invasive, cost-effective, and portable alternatives. One promising +contender is the in-ear-EEG sensor, which offers advantages in terms of +comfort, fixed electrode positions, resistance to electromagnetic interference, +and user-friendliness. This study aims to establish a methodology to assess the +similarity between the in-ear-EEG signal and standard PSG. + Methods: We assess the agreement between the PSG and in-ear-EEG derived +hypnograms. We extract features in the time- and frequency- domain from PSG and +in-ear-EEG 30-second epochs. We only consider the epochs where the PSG-scorers +and the in-ear-EEG-scorers were in agreement. We introduce a methodology to +quantify the similarity between PSG derivations and the single-channel +in-ear-EEG. The approach relies on a comparison of distributions of selected +features -- extracted for each sleep stage and subject on both PSG and the +in-ear-EEG signals -- via a Jensen-Shannon Divergence Feature-based Similarity +Index (JSD-FSI). + Results: We found a high intra-scorer variability, mainly due to the +uncertainty the scorers had in evaluating the in-ear-EEG signals. We show that +the similarity between PSG and in-ear-EEG signals is high (JSD-FSI: 0.61 +/- +0.06 in awake, 0.60 +/- 0.07 in NREM and 0.51 +/- 0.08 in REM), and in line +with the similarity values computed independently on standard +PSG-channel-combinations. + Conclusions: In-ear-EEG is a valuable solution for home-based sleep +monitoring, however further studies with a larger and more heterogeneous +dataset are needed. + +
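A minimal version of a Jensen-Shannon-divergence-based similarity between two feature samples is sketched below; defining the index as 1 - JSD computed from histograms with base-2 logarithms is an assumption about the exact formula, and the feature values are synthetic.

```python
import numpy as np

def jsd_similarity(x, y, bins=30):
    """Jensen-Shannon-divergence-based similarity between two feature samples.
    Histograms share a common support; similarity = 1 - JSD (base 2, in [0, 1])."""
    lo, hi = min(x.min(), y.min()), max(x.max(), y.max())
    p, _ = np.histogram(x, bins=bins, range=(lo, hi))
    q, _ = np.histogram(y, bins=bins, range=(lo, hi))
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    def kl(a, b):
        mask = a > 0
        return np.sum(a[mask] * np.log2(a[mask] / b[mask]))
    jsd = 0.5 * kl(p, m) + 0.5 * kl(q, m)
    return 1.0 - jsd

rng = np.random.default_rng(0)
psg_feature = rng.normal(0.0, 1.0, 1000)      # e.g. a band-power feature from PSG
inear_feature = rng.normal(0.1, 1.1, 1000)    # the same feature from in-ear EEG
print(round(jsd_similarity(psg_feature, inear_feature), 3))   # close to 1
```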
+
+ comment: 29 pages, 12 figures, 1 table +
+
+
+
+
+ + ☆ Learning shallow quantum circuits + + +
+ Despite fundamental interests in learning quantum circuits, the existence of +a computationally efficient algorithm for learning shallow quantum circuits +remains an open question. Because shallow quantum circuits can generate +distributions that are classically hard to sample from, existing learning +algorithms do not apply. In this work, we present a polynomial-time classical +algorithm for learning the description of any unknown $n$-qubit shallow quantum +circuit $U$ (with arbitrary unknown architecture) within a small diamond +distance using single-qubit measurement data on the output states of $U$. We +also provide a polynomial-time classical algorithm for learning the description +of any unknown $n$-qubit state $\lvert \psi \rangle = U \lvert 0^n \rangle$ +prepared by a shallow quantum circuit $U$ (on a 2D lattice) within a small +trace distance using single-qubit measurements on copies of $\lvert \psi +\rangle$. Our approach uses a quantum circuit representation based on local +inversions and a technique to combine these inversions. This circuit +representation yields an optimization landscape that can be efficiently +navigated and enables efficient learning of quantum circuits that are +classically hard to simulate. + +
+
+ comment: 10 pages, 14 figures (7 inline; 7 floating) + 76-page appendix +
+
+
+
+
+ + ☆ Optimizing Medication Decisions for Patients with Atrial Fibrillation + through Path Development Network + + +
+ Atrial fibrillation (AF) is a common cardiac arrhythmia characterized by +rapid and irregular contractions of the atria. It significantly elevates the +risk of strokes due to slowed blood flow in the atria, especially in the left +atrial appendage, which is prone to blood clot formation. Such clots can +migrate into cerebral arteries, leading to ischemic stroke. To assess whether +AF patients should be prescribed anticoagulants, doctors often use the +CHA2DS2-VASc scoring system. However, anticoagulant use must be approached with +caution as it can impact clotting functions. This study introduces a machine +learning algorithm that predicts whether patients with AF should be recommended +anticoagulant therapy using 12-lead ECG data. In this model, we use STOME to +enhance time-series data and then process it through a Convolutional Neural +Network (CNN). By incorporating a path development layer, the model achieves a +specificity of 30.6% under the condition of an NPV of 1. In contrast, LSTM +algorithms without path development yield a specificity of only 2.7% under the +same NPV condition. + +
+
+ comment: Master's thesis +
+
+
+
+
+ + ☆ Developing an AI-based Integrated System for Bee Health Evaluation + + +
+ Honey bees pollinate about one-third of the world's food supply, but bee +colonies have alarmingly declined by nearly 40% over the past decade due to +several factors, including pesticides and pests. Traditional methods for +monitoring beehives, such as human inspection, are subjective, disruptive, and +time-consuming. To overcome these limitations, artificial intelligence has been +used to assess beehive health. However, previous studies have lacked an +end-to-end solution and primarily relied on data from a single source, either +bee images or sounds. This study introduces a comprehensive system consisting +of bee object detection and health evaluation. Additionally, it utilized a +combination of visual and audio signals to analyze bee behaviors. An +Attention-based Multimodal Neural Network (AMNN) was developed to adaptively +focus on key features from each type of signal for accurate bee health +assessment. The AMNN achieved an overall accuracy of 92.61%, surpassing eight +existing single-signal Convolutional Neural Networks and Recurrent Neural +Networks. It outperformed the best image-based model by 32.51% and the top +sound-based model by 13.98% while maintaining efficient processing times. +Furthermore, it improved prediction robustness, attaining an F1-score higher +than 90% across all four evaluated health conditions. The study also shows that +audio signals are more reliable than images for assessing bee health. By +seamlessly integrating AMNN with image and sound data in a comprehensive bee +health monitoring system, this approach provides a more efficient and +non-invasive solution for the early detection of bee diseases and the +preservation of bee colonies. + +
+
+
+
+
+ + ☆ FLex&Chill: Improving Local Federated Learning Training with Logit + Chilling + + +
+ Federated learning is inherently hampered by data heterogeneity: non-iid training data distributed over local clients. We propose a novel model training approach for federated learning, FLex&Chill, which exploits the Logit Chilling method. Through extensive evaluations, we demonstrate that, in the presence of the non-iid data characteristics inherent in federated learning systems, this approach can expedite model convergence and improve inference accuracy. Quantitatively, in our experiments we observe up to a 6X improvement in global federated learning model convergence time, and up to a 3.37% improvement in inference accuracy.
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Ventricular Segmentation: A Brief Comparison of U-Net Derivatives + + +
+ Medical imaging refers to the technologies and methods utilized to view the human body and its interior, in order to diagnose, monitor, or even treat medical disorders. This paper explores the application of deep learning techniques to the semantic segmentation of cardiac short-axis MRI (Magnetic Resonance Imaging) images, aiming to enhance the diagnosis, monitoring, and treatment of medical disorders related to the heart. The focus is on implementing various architectures derived from U-Net to effectively isolate specific parts of the heart for comprehensive anatomical and functional analysis. Through a combination of images, graphs, and quantitative metrics, the efficacy of the models and their predictions is showcased. Additionally, this paper addresses encountered challenges and outlines strategies for future improvements. This abstract provides a concise overview of the efforts in utilizing deep learning for cardiac image segmentation, emphasizing both the accomplishments and the areas for further refinement.
+
+
+
+
+ + ☆ False Discovery Rate Control for Gaussian Graphical Models via + Neighborhood Screening + + +
+ Gaussian graphical models emerge in a wide range of fields. They model the +statistical relationships between variables as a graph, where an edge between +two variables indicates conditional dependence. Unfortunately, well-established +estimators, such as the graphical lasso or neighborhood selection, are known to +be susceptible to a high prevalence of false edge detections. False detections +may encourage inaccurate or even incorrect scientific interpretations, with +major implications in applications, such as biomedicine or healthcare. In this +paper, we introduce a nodewise variable selection approach to graph learning +and provably control the false discovery rate of the selected edge set at a +self-estimated level. A novel fusion method of the individual neighborhoods +outputs an undirected graph estimate. The proposed method is parameter-free and +does not require tuning by the user. Benchmarks against competing false +discovery rate controlling methods in numerical experiments considering +different graph topologies show a significant gain in performance. + +
+
+
+
+
+ + ☆ Through the Dual-Prism: A Spectral Perspective on Graph Data + Augmentation for Graph Classification + + +
+ Graph Neural Networks (GNNs) have become the preferred tool to process graph +data, with their efficacy being boosted through graph data augmentation +techniques. Despite the evolution of augmentation methods, issues like graph +property distortions and restricted structural changes persist. This leads to +the question: Is it possible to develop more property-conserving and +structure-sensitive augmentation methods? Through a spectral lens, we +investigate the interplay between graph properties, their augmentation, and +their spectral behavior, and found that keeping the low-frequency eigenvalues +unchanged can preserve the critical properties at a large scale when generating +augmented graphs. These observations inform our introduction of the Dual-Prism +(DP) augmentation method, comprising DP-Noise and DP-Mask, which adeptly +retains essential graph properties while diversifying augmented graphs. +Extensive experiments validate the efficiency of our approach, providing a new +and promising direction for graph data augmentation. + +
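A bare-bones version of the spectral recipe, perturbing only the high-frequency Laplacian eigenvalues and reconstructing the graph, is sketched below; the keep ratio, noise scale, and use of the combinatorial Laplacian are illustrative choices rather than the DP-Noise/DP-Mask specifics.

```python
import numpy as np

def spectral_augment(adj, keep_ratio=0.3, noise_scale=0.1, seed=0):
    """Perturb only high-frequency Laplacian eigenvalues, keeping the lowest
    `keep_ratio` fraction unchanged, then reconstruct a (weighted) adjacency."""
    rng = np.random.default_rng(seed)
    deg = np.diag(adj.sum(axis=1))
    lap = deg - adj                                # combinatorial Laplacian
    eigvals, eigvecs = np.linalg.eigh(lap)         # ascending: low frequencies first
    n_keep = int(np.ceil(keep_ratio * len(eigvals)))
    noisy = eigvals.copy()
    noisy[n_keep:] += noise_scale * rng.standard_normal(len(eigvals) - n_keep)
    lap_aug = eigvecs @ np.diag(noisy) @ eigvecs.T
    adj_aug = np.diag(np.diag(lap_aug)) - lap_aug  # back to adjacency form
    return adj_aug

# Toy 4-node cycle graph.
adj = np.array([[0, 1, 0, 1],
                [1, 0, 1, 0],
                [0, 1, 0, 1],
                [1, 0, 1, 0]], dtype=float)
print(np.round(spectral_augment(adj), 3))
```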
+
+
+
+
+ + ☆ SymbolNet: Neural Symbolic Regression with Adaptive Dynamic Pruning + + +
+ Contrary to the use of genetic programming, the neural network approach to +symbolic regression can scale well with high input dimension and leverage +gradient methods for faster equation searching. Common ways of constraining +expression complexity have relied on multistage pruning methods with +fine-tuning, but these often lead to significant performance loss. In this +work, we propose SymbolNet, a neural network approach to symbolic regression in +a novel framework that enables dynamic pruning of model weights, input +features, and mathematical operators in a single training, where both training +loss and expression complexity are optimized simultaneously. We introduce a +sparsity regularization term per pruning type, which can adaptively adjust its +own strength and lead to convergence to a target sparsity level. In contrast to +most existing symbolic regression methods that cannot efficiently handle +datasets with more than $O$(10) inputs, we demonstrate the effectiveness of our +model on the LHC jet tagging task (16 inputs), MNIST (784 inputs), and SVHN +(3072 inputs). + +
+
+ comment: 11 pages. Submitted to IEEE TNNLS, under review +
+
+
+
+
+ + ☆ HGAttack: Transferable Heterogeneous Graph Adversarial Attack + + +
+ Heterogeneous Graph Neural Networks (HGNNs) are increasingly recognized for +their performance in areas like the web and e-commerce, where resilience +against adversarial attacks is crucial. However, existing adversarial attack +methods, which are primarily designed for homogeneous graphs, fall short when +applied to HGNNs due to their limited ability to address the structural and +semantic complexity of HGNNs. This paper introduces HGAttack, the first +dedicated gray box evasion attack method for heterogeneous graphs. We design a +novel surrogate model to closely resemble the behaviors of the target HGNN and +utilize gradient-based methods for perturbation generation. Specifically, the +proposed surrogate model effectively leverages heterogeneous information by +extracting meta-path induced subgraphs and applying GNNs to learn node +embeddings with distinct semantics from each subgraph. This approach improves +the transferability of generated attacks on the target HGNN and significantly +reduces memory costs. For perturbation generation, we introduce a +semantics-aware mechanism that leverages subgraph gradient information to +autonomously identify vulnerable edges across a wide range of relations within +a constrained perturbation budget. We validate HGAttack's efficacy with +comprehensive experiments on three datasets, providing empirical analyses of +its generated perturbations. Outperforming baseline methods, HGAttack +demonstrated significant efficacy in diminishing the performance of target HGNN +models, affirming the effectiveness of our approach in evaluating the +robustness of HGNNs against adversarial attacks. + +
+
+
+
+
+ + ☆ WindSeer: Real-time volumetric wind prediction over complex terrain + aboard a small UAV + + +
+ Real-time high-resolution wind predictions are beneficial for various +applications including safe manned and unmanned aviation. Current weather +models require too much compute and lack the necessary predictive capabilities +as they are valid only at the scale of multiple kilometers and hours - much +lower spatial and temporal resolutions than these applications require. Our +work, for the first time, demonstrates the ability to predict low-altitude wind +in real-time on limited-compute devices, from only sparse measurement data. We +train a neural network, WindSeer, using only synthetic data from computational +fluid dynamics simulations and show that it can successfully predict real wind +fields over terrain with known topography from just a few noisy and spatially +clustered wind measurements. WindSeer can generate accurate predictions at +different resolutions and domain sizes on previously unseen topography without +retraining. We demonstrate that the model successfully predicts historical wind +data collected by weather stations and wind measured onboard drones. + +
+
+
+
+
+ + ☆ Infinite-Horizon Graph Filters: Leveraging Power Series to Enhance + Sparse Information Aggregation + + +
+ Graph Neural Networks (GNNs) have shown considerable effectiveness in a
+variety of graph learning tasks in recent years, particularly those based on
+the message-passing approach. However, their performance is often constrained
+by a limited receptive field, a challenge that becomes more acute in the
+presence of sparse graphs. Motivated by power series, which possess infinite
+expansion capability, we propose a novel \underline{G}raph \underline{P}ower
+\underline{F}ilter \underline{N}eural Network (GPFN) that enhances node
+classification by employing a power series graph filter to augment the
+receptive field. Concretely, our GPFN designs a new way to build a graph filter
+with an infinite receptive field based on a convergent power series, which can
+be analyzed in the spectral and spatial domains. Moreover, we theoretically
+prove that our GPFN is a general framework that can integrate any power series
+and capture long-range dependencies. Finally, experimental results on three
+datasets demonstrate the superiority of our GPFN over state-of-the-art
+baselines.
+
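+ A minimal sketch of a convergent power-series graph filter of the kind the
+abstract describes (a geometric series over the normalized adjacency, truncated
+at K terms; the coefficients and normalization are illustrative assumptions):
+
+import numpy as np
+
+def power_series_filter(A, x, beta=0.5, K=50):
+    # symmetric normalization keeps the spectral radius at most 1, so the
+    # geometric series sum_k beta^k * A_hat^k converges for |beta| < 1
+    d = np.maximum(A.sum(axis=1), 1e-12)
+    D_inv_sqrt = np.diag(1.0 / np.sqrt(d))
+    A_hat = D_inv_sqrt @ A @ D_inv_sqrt
+    out, term = x.copy(), x.copy()
+    for _ in range(K):
+        term = beta * (A_hat @ term)   # beta^k * A_hat^k x
+        out += term
+    return out                         # approximates (I - beta * A_hat)^{-1} x
+
+A = np.array([[0, 1, 0], [1, 0, 1], [0, 1, 0]], dtype=float)  # toy path graph
+x = np.eye(3)                                                  # one-hot node features
+print(power_series_filter(A, x))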
+
+ comment: v1 +
+
+
+
+
+ + ☆ Biases in Expected Goals Models Confound Finishing Ability + + +
+ Expected Goals (xG) has emerged as a popular tool for evaluating finishing
+skill in soccer analytics. It involves comparing a player's cumulative xG with
+their actual goal output, where consistent overperformance indicates strong
+finishing ability. However, the assessment of finishing skill in soccer using
+xG remains contentious due to players' difficulty in consistently outperforming
+their cumulative xG. In this paper, we aim to address the limitations and
+nuances surrounding the evaluation of finishing skill using xG statistics.
+Specifically, we explore three hypotheses: (1) the deviation between actual and
+expected goals is an inadequate metric due to the high variance of shot
+outcomes and limited sample sizes, (2) the inclusion of all shots in cumulative
+xG calculation may be inappropriate, and (3) xG models contain biases arising
+from interdependencies in the data that affect skill measurement. We found that
+sustained overperformance of cumulative xG requires both high shot volumes and
+exceptional finishing, that including all shot types can obscure the finishing
+ability of proficient strikers, and that there is a persistent bias that makes
+actual and expected goals appear closer for excellent finishers than they
+really are. Overall, our analysis indicates that we need more nuanced
+quantitative approaches for investigating a player's finishing ability, which
+we achieved using a technique from AI fairness to learn an xG model that is
+calibrated for multiple subgroups of players. As a concrete use case, we show
+that (1) the standard biased xG model underestimates Messi's GAX by 17% and (2)
+Messi's GAX is 27% higher than that of the typical elite high-shot-volume
+attacker, indicating that Messi is an even more exceptional finisher than
+commonly believed.
+
+
+
+
+
+ + ☆ Probabilistic Truly Unordered Rule Sets + + +
+ Rule set learning has recently been frequently revisited because of its +interpretability. Existing methods have several shortcomings though. First, +most existing methods impose orders among rules, either explicitly or +implicitly, which makes the models less comprehensible. Second, due to the +difficulty of handling conflicts caused by overlaps (i.e., instances covered by +multiple rules), existing methods often do not consider probabilistic rules. +Third, learning classification rules for multi-class target is understudied, as +most existing methods focus on binary classification or multi-class +classification via the ``one-versus-rest" approach. + To address these shortcomings, we propose TURS, for Truly Unordered Rule +Sets. To resolve conflicts caused by overlapping rules, we propose a novel +model that exploits the probabilistic properties of our rule sets, with the +intuition of only allowing rules to overlap if they have similar probabilistic +outputs. We next formalize the problem of learning a TURS model based on the +MDL principle and develop a carefully designed heuristic algorithm. We +benchmark against a wide range of rule-based methods and demonstrate that our +method learns rule sets that have lower model complexity and highly competitive +predictive performance. In addition, we empirically show that rules in our +model are empirically ``independent" and hence truly unordered. + +
+
+ comment: Submitted to JMLR +
+
+
+
+
+ + ☆ Enabling On-device Continual Learning with Binary Neural Networks + + +
+ On-device learning remains a formidable challenge, especially when dealing +with resource-constrained devices that have limited computational capabilities. +This challenge is primarily rooted in two key issues: first, the memory +available on embedded devices is typically insufficient to accommodate the +memory-intensive back-propagation algorithm, which often relies on +floating-point precision. Second, the development of learning algorithms on +models with extreme quantization levels, such as Binary Neural Networks (BNNs), +is critical due to the drastic reduction in bit representation. In this study, +we propose a solution that combines recent advancements in the field of +Continual Learning (CL) and Binary Neural Networks to enable on-device training +while maintaining competitive performance. Specifically, our approach leverages +binary latent replay (LR) activations and a novel quantization scheme that +significantly reduces the number of bits required for gradient computation. The +experimental validation demonstrates a significant accuracy improvement in +combination with a noticeable reduction in memory requirement, confirming the +suitability of our approach in expanding the practical applications of deep +learning in real-world scenarios. + +
+
+
+
+
+ + ☆ Qadence: a differentiable interface for digital-analog programs + + +
+ Digital-analog quantum computing (DAQC) is an alternative paradigm for +universal quantum computation combining digital single-qubit gates with global +analog operations acting on a register of interacting qubits. Currently, no +available open-source software is tailored to express, differentiate, and +execute programs within the DAQC paradigm. In this work, we address this +shortfall by presenting Qadence, a high-level programming interface for +building complex digital-analog quantum programs developed at Pasqal. Thanks to +its flexible interface, native differentiability, and focus on real-device +execution, Qadence aims at advancing research on variational quantum algorithms +built for native DAQC platforms such as Rydberg atom arrays. + +
+
+
+
+
+ + ☆ Interplay between depth and width for interpolation in neural ODEs + + +
+ Neural ordinary differential equations (neural ODEs) have emerged as a +natural tool for supervised learning from a control perspective, yet a complete +understanding of their optimal architecture remains elusive. In this work, we +examine the interplay between their width $p$ and number of layer transitions +$L$ (effectively the depth $L+1$). Specifically, we assess the model +expressivity in terms of its capacity to interpolate either a finite dataset +$D$ comprising $N$ pairs of points or two probability measures in +$\mathbb{R}^d$ within a Wasserstein error margin $\varepsilon>0$. Our findings +reveal a balancing trade-off between $p$ and $L$, with $L$ scaling as +$O(1+N/p)$ for dataset interpolation, and +$L=O\left(1+(p\varepsilon^d)^{-1}\right)$ for measure interpolation. + In the autonomous case, where $L=0$, a separate study is required, which we +undertake focusing on dataset interpolation. We address the relaxed problem of +$\varepsilon$-approximate controllability and establish an error decay of +$\varepsilon\sim O(\log(p)p^{-1/d})$. This decay rate is a consequence of +applying a universal approximation theorem to a custom-built Lipschitz vector +field that interpolates $D$. In the high-dimensional setting, we further +demonstrate that $p=O(N)$ neurons are likely sufficient to achieve exact +control. + +
+
+ comment: 16 pages, 10 figures, double column +
+
+
+
+
+ + ☆ A Survey on Hardware Accelerators for Large Language Models + + +
+ Large Language Models (LLMs) have emerged as powerful tools for natural +language processing tasks, revolutionizing the field with their ability to +understand and generate human-like text. As the demand for more sophisticated +LLMs continues to grow, there is a pressing need to address the computational +challenges associated with their scale and complexity. This paper presents a +comprehensive survey on hardware accelerators designed to enhance the +performance and energy efficiency of Large Language Models. By examining a +diverse range of accelerators, including GPUs, FPGAs, and custom-designed +architectures, we explore the landscape of hardware solutions tailored to meet +the unique computational demands of LLMs. The survey encompasses an in-depth +analysis of architecture, performance metrics, and energy efficiency +considerations, providing valuable insights for researchers, engineers, and +decision-makers aiming to optimize the deployment of LLMs in real-world +applications. + +
+
+
+
+
+ + ☆ Cooperative Edge Caching Based on Elastic Federated and Multi-Agent Deep + Reinforcement Learning in Next-Generation Network + + +
+ Edge caching is a promising solution for next-generation networks by
+empowering caching units in small-cell base stations (SBSs), which allows user
+equipments (UEs) to fetch users' requested contents that have been pre-cached
+in SBSs. It is crucial for SBSs to accurately predict popular contents through
+learning while protecting users' personal information. Traditional federated
+learning (FL) can protect users' privacy but the data discrepancies among UEs
+can lead to a degradation in model quality. Therefore, it is necessary to train
+personalized local models for each UE to predict popular contents accurately.
+In addition, the cached contents can be shared among adjacent SBSs in
+next-generation networks, thus caching predicted popular contents in different
+SBSs may affect the cost to fetch contents. Hence, it is critical to determine
+where the popular contents are cached cooperatively. To address these issues,
+we propose a cooperative edge caching scheme based on elastic federated and
+multi-agent deep reinforcement learning (CEFMR) to optimize the cost in the
+network. We first propose an elastic FL algorithm to train the personalized
+model for each UE, where an adversarial autoencoder (AAE) model is adopted for
+training to improve the prediction accuracy; then, a popular content prediction
+algorithm is proposed to predict the popular contents for each SBS based on the
+trained AAE model. Finally, we propose a multi-agent deep reinforcement
+learning (MADRL) based algorithm to decide where the predicted popular contents
+are collaboratively cached among SBSs. Our experimental results demonstrate the
+superiority of our proposed scheme to existing baseline caching schemes.
+
+
+ comment: This paper has been submitted to IEEE TNSM. The source code has been + released at: + https://github.com/qiongwu86/Edge-Caching-Based-on-Multi-Agent-Deep-Reinforcement-Learning-and-Federated-Learning +
+
+
+
+
+ + ☆ GA-SmaAt-GNet: Generative Adversarial Small Attention GNet for Extreme + Precipitation Nowcasting + + +
+ In recent years, data-driven modeling approaches have gained considerable
+traction in various meteorological applications, particularly in the realm of
+weather forecasting. However, these approaches often encounter challenges when
+dealing with extreme weather conditions. In light of this, we propose
+GA-SmaAt-GNet, a novel generative adversarial architecture that makes use of
+two methodologies aimed at enhancing the performance of deep learning models
+for extreme precipitation nowcasting. Firstly, it uses a novel SmaAt-GNet built
+upon the successful SmaAt-UNet architecture as its generator. This network
+incorporates precipitation masks (binarized precipitation maps) as an
+additional data source, leveraging valuable information for improved
+predictions. Additionally, GA-SmaAt-GNet utilizes an attention-augmented
+discriminator inspired by the well-established Pix2Pix architecture.
+Furthermore, we assess the performance of GA-SmaAt-GNet using a real-life
+precipitation dataset from the Netherlands. Our experimental results reveal a
+notable improvement both in overall performance and for extreme precipitation
+events. We also conduct uncertainty analysis on the proposed GA-SmaAt-GNet
+model as well as on the precipitation dataset, providing additional insights
+into the predictive capabilities of the model. Finally, we offer further
+insights into the predictions of our proposed model using Grad-CAM. This visual
+explanation technique generates activation heatmaps, illustrating areas of the
+input that are more activated for various parts of the network.
+
+
+ comment: 16 pages, 11 figures
+
+
+
+
+
+ + ☆ Attention-Based Recurrent Neural Network For Automatic Behavior Laying + Hen Recognition + + +
+ One of the interests of modern poultry farming is the vocalization of laying
+hens, which contains very useful information on health behavior. This
+information is used as health and well-being indicators that help breeders
+better monitor laying hens, enabling early detection of problems for rapid and
+more effective intervention. In this work, we focus on sound analysis for
+recognizing the types of calls of laying hens in order to propose a robust
+system for characterizing their behavior and enabling better monitoring. To do
+this, we first collected and annotated laying hen call signals, then designed
+an optimal acoustic characterization based on the combination of time and
+frequency domain features. We then used these features to build multi-label
+classification models based on recurrent neural networks that assign a semantic
+class to each vocalization characterizing the laying hen's behavior. The
+results show that our model based on the combination of time and frequency
+domain features obtained the highest F1-score (F1=92.75), a gain of 17% over
+the models using the frequency domain features and of 8% over the compared
+approaches from the literature.
+
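+ A minimal sketch of combining time-domain and frequency-domain descriptors for
+one call segment, the kind of hand-crafted acoustic characterization the
+abstract refers to before the recurrent classifier. The specific feature set
+and the synthetic test signal are illustrative assumptions.
+
+import numpy as np
+
+def call_features(signal, sr=16000):
+    rms = np.sqrt(np.mean(signal ** 2))                  # time domain: energy
+    zcr = np.mean(np.abs(np.diff(np.sign(signal)))) / 2  # time domain: zero crossings
+    spectrum = np.abs(np.fft.rfft(signal))               # frequency domain
+    freqs = np.fft.rfftfreq(len(signal), d=1.0 / sr)
+    centroid = np.sum(freqs * spectrum) / np.sum(spectrum)
+    bandwidth = np.sqrt(np.sum((freqs - centroid) ** 2 * spectrum) / np.sum(spectrum))
+    return np.array([rms, zcr, centroid, bandwidth])
+
+sr = 16000
+t = np.linspace(0, 0.5, sr // 2, endpoint=False)
+call = np.sin(2 * np.pi * 3000 * t) * np.hanning(t.size)  # synthetic call segment
+print(call_features(call, sr))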
+
+
+
+
+ + ☆ Reconciling Spatial and Temporal Abstractions for Goal Representation ICLR 2024 + + +
+ Goal representation affects the performance of Hierarchical Reinforcement +Learning (HRL) algorithms by decomposing the complex learning problem into +easier subtasks. Recent studies show that representations that preserve +temporally abstract environment dynamics are successful in solving difficult +problems and provide theoretical guarantees for optimality. These methods +however cannot scale to tasks where environment dynamics increase in complexity +i.e. the temporally abstract transition relations depend on larger number of +variables. On the other hand, other efforts have tried to use spatial +abstraction to mitigate the previous issues. Their limitations include +scalability to high dimensional environments and dependency on prior knowledge. + In this paper, we propose a novel three-layer HRL algorithm that introduces, +at different levels of the hierarchy, both a spatial and a temporal goal +abstraction. We provide a theoretical study of the regret bounds of the learned +policies. We evaluate the approach on complex continuous control tasks, +demonstrating the effectiveness of spatial and temporal abstractions learned by +this approach. + +
+
+ comment: Accepted for publication in ICLR 2024 +
+
+
+
+
+ + ☆ Improving fine-grained understanding in image-text pre-training + + +
+ We introduce SPARse Fine-grained Contrastive Alignment (SPARC), a simple +method for pretraining more fine-grained multimodal representations from +image-text pairs. Given that multiple image patches often correspond to single +words, we propose to learn a grouping of image patches for every token in the +caption. To achieve this, we use a sparse similarity metric between image +patches and language tokens and compute for each token a language-grouped +vision embedding as the weighted average of patches. The token and +language-grouped vision embeddings are then contrasted through a fine-grained +sequence-wise loss that only depends on individual samples and does not require +other batch samples as negatives. This enables more detailed information to be +learned in a computationally inexpensive manner. SPARC combines this +fine-grained loss with a contrastive loss between global image and text +embeddings to learn representations that simultaneously encode global and local +information. We thoroughly evaluate our proposed method and show improved +performance over competing approaches both on image-level tasks relying on +coarse-grained information, e.g. classification, as well as region-level tasks +relying on fine-grained information, e.g. retrieval, object detection, and +segmentation. Moreover, SPARC improves model faithfulness and captioning in +foundational vision-language models. + +
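+ A minimal sketch of one way to form the "language-grouped vision embeddings"
+described above: per-token similarities over patches are sparsified and the
+surviving weights average the patch embeddings. The thresholding rule and
+shapes are illustrative assumptions, not the paper's exact sparsification.
+
+import torch
+import torch.nn.functional as F
+
+def language_grouped_vision(patch_emb, token_emb, threshold=0.0):
+    # patch_emb: (P, d) image patch embeddings, token_emb: (T, d) caption tokens
+    sim = token_emb @ patch_emb.T                                   # (T, P)
+    sim = torch.where(sim > threshold, sim, torch.zeros_like(sim))  # sparsify
+    weights = sim / sim.sum(dim=-1, keepdim=True).clamp_min(1e-6)
+    return weights @ patch_emb   # (T, d): one grouped vision embedding per token
+
+patches = F.normalize(torch.randn(49, 128), dim=-1)
+tokens = F.normalize(torch.randn(12, 128), dim=-1)
+print(language_grouped_vision(patches, tokens).shape)  # torch.Size([12, 128])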
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Evolutionary Multi-Objective Optimization of Large Language Model + Prompts for Balancing Sentiments + + +
+ The advent of large language models (LLMs) such as ChatGPT has attracted
+considerable attention in various domains due to their remarkable performance
+and versatility. As the use of these models continues to grow, the importance
+of effective prompt engineering has come to the fore. Prompt optimization
+emerges as a crucial challenge, as it has a direct impact on model performance
+and the extraction of relevant information. Recently, evolutionary algorithms
+(EAs) have shown promise in addressing this issue, paving the way for novel
+optimization strategies. In this work, we propose an evolutionary
+multi-objective (EMO) approach specifically tailored for prompt optimization,
+called EMO-Prompts, using sentiment analysis as a case study. We use sentiment
+analysis capabilities as our experimental targets. Our results demonstrate that
+EMO-Prompts effectively generates prompts capable of guiding the LLM to produce
+texts embodying two conflicting emotions simultaneously.
+
+
+ comment: Accepted in EvoApps at EvoStar 2024 +
+
+
+
+
+ + ☆ FREED++: Improving RL Agents for Fragment-Based Molecule Generation by + Thorough Reproduction + + +
+ A rational design of new therapeutic drugs aims to find a molecular structure +with desired biological functionality, e.g., an ability to activate or suppress +a specific protein via binding to it. Molecular docking is a common technique +for evaluating protein-molecule interactions. Recently, Reinforcement Learning +(RL) has emerged as a promising approach to generating molecules with the +docking score (DS) as a reward. In this work, we reproduce, scrutinize and +improve the recent RL model for molecule generation called FREED +(arXiv:2110.01219). Extensive evaluation of the proposed method reveals several +limitations and challenges despite the outstanding results reported for three +target proteins. Our contributions include fixing numerous implementation bugs +and simplifying the model while increasing its quality, significantly extending +experiments, and conducting an accurate comparison with current +state-of-the-art methods for protein-conditioned molecule generation. We show +that the resulting fixed model is capable of producing molecules with superior +docking scores compared to alternative approaches. + +
+
+ comment: 37 pages, 10 figures, to be published in TMLR journal + (https://www.jmlr.org/tmlr/) +
+
+
+
+
+ + ☆ PPNet: A Novel Neural Network Structure for End-to-End Near-Optimal Path + Planning + + +
+ Classical path planners, such as sampling-based path planners, have the
+limitations of sensitivity to the initial solution and slow convergence to the
+optimal solution. However, finding a near-optimal solution in a short period is
+challenging in many applications such as autonomous vehicles with limited
+power/fuel. To achieve an end-to-end near-optimal path planner, we first divide
+the path planning problem into two subproblems: path space segmentation and
+waypoint generation in the given path space. We further propose a two-level
+cascade neural network named Path Planning Network (PPNet) to solve the path
+planning problem by solving the abovementioned subproblems. Moreover, we
+propose a novel efficient data generation method for path planning named
+EDaGe-PP. The results show that the total computation time is less than 1/33
+that of other methods, and that the success rate of PPNet trained on the
+dataset generated by EDaGe-PP is about $2 \times$ that of other methods. We
+validate PPNet against state-of-the-art path planning methods. The results show
+PPNet can find a near-optimal solution in 15.3 ms, which is much shorter than
+the state-of-the-art path planners.
+
+
+
+
+
+ + ☆ Clickbait vs. Quality: How Engagement-Based Optimization Shapes the + Content Landscape in Online Platforms + + +
+ Online content platforms commonly use engagement-based optimization when +making recommendations. This encourages content creators to invest in quality, +but also rewards gaming tricks such as clickbait. To understand the total +impact on the content landscape, we study a game between content creators +competing on the basis of engagement metrics and analyze the equilibrium +decisions about investment in quality and gaming. First, we show the content +created at equilibrium exhibits a positive correlation between quality and +gaming, and we empirically validate this finding on a Twitter dataset. Using +the equilibrium structure of the content landscape, we then examine the +downstream performance of engagement-based optimization along several axes. +Perhaps counterintuitively, the average quality of content consumed by users +can decrease at equilibrium as gaming tricks become more costly for content +creators to employ. Moreover, engagement-based optimization can perform worse +in terms of user utility than a baseline with random recommendations, and +engagement-based optimization is also suboptimal in terms of realized +engagement relative to quality-based optimization. Altogether, our results +highlight the need to consider content creator incentives when evaluating a +platform's choice of optimization metric. + +
+
+
+
+
+ + ☆ A Fast, Performant, Secure Distributed Training Framework For Large + Language Model ICASSP2024 + + +
+ The distributed (federated) LLM is an important method for co-training the +domain-specific LLM using siloed data. However, maliciously stealing model +parameters and data from the server or client side has become an urgent problem +to be solved. In this paper, we propose a secure distributed LLM based on model +slicing. In this case, we deploy the Trusted Execution Environment (TEE) on +both the client and server side, and put the fine-tuned structure (LoRA or +embedding of P-tuning v2) into the TEE. Then, secure communication is executed +in the TEE and general environments through lightweight encryption. In order to +further reduce the equipment cost as well as increase the model performance and +accuracy, we propose a split fine-tuning scheme. In particular, we split the +LLM by layers and place the latter layers in a server-side TEE (the client does +not need a TEE). We then combine the proposed Sparsification Parameter +Fine-tuning (SPF) with the LoRA part to improve the accuracy of the downstream +task. Numerous experiments have shown that our method guarantees accuracy while +maintaining security. + +
+
+ comment: Accept ICASSP2024 +
+
+
+
+
+ + ☆ PatchAD: Patch-based MLP-Mixer for Time Series Anomaly Detection IJCAI 2024 + + +
+ Anomaly detection stands as a crucial aspect of time series analysis, aiming
+to identify abnormal events in time series samples. The central challenge of
+this task lies in effectively learning the representations of normal and
+abnormal patterns in a label-lacking scenario. Previous research mostly relied
+on reconstruction-based approaches, restricting the representational abilities
+of the models. In addition, most of the current deep learning-based methods are
+not lightweight enough, which prompts us to design a more efficient framework
+for anomaly detection. In this study, we introduce PatchAD, a novel multi-scale
+patch-based MLP-Mixer architecture that leverages contrastive learning for
+representation extraction and anomaly detection. Specifically, PatchAD is
+composed of four distinct MLP Mixers, exclusively utilizing the MLP
+architecture for high efficiency and a lightweight design. Additionally, we
+crafted a dual project constraint module to mitigate potential model
+degradation. Comprehensive experiments demonstrate that PatchAD achieves
+state-of-the-art results across multiple real-world multivariate time series
+datasets. Our code is publicly available at
+https://github.com/EmorZz1G/PatchAD.
+
+
+ comment: 13 pages, 16 figures, IJCAI 2024 under review, paper id 3166 +
+
+
+
+
+ + ☆ BreastRegNet: A Deep Learning Framework for Registration of Breast + Faxitron and Histopathology Images + + +
+ A standard treatment protocol for breast cancer entails administering
+neoadjuvant therapy followed by surgical removal of the tumor and surrounding
+tissue. Pathologists typically rely on cabinet X-ray radiographs, known as
+Faxitron, to examine the excised breast tissue and diagnose the extent of
+residual disease. However, accurately determining the location, size, and
+focality of residual cancer can be challenging, and incorrect assessments can
+lead to clinical consequences. The utilization of automated methods can improve
+the histopathology process, allowing pathologists to choose regions for
+sampling more effectively and precisely. Despite the recognized necessity,
+there are currently no such methods available. Training such automated
+detection models requires accurate ground truth labels on ex-vivo radiology
+images, which can be acquired through registering Faxitron and histopathology
+images and mapping the extent of cancer from histopathology to x-ray images.
+This study introduces a deep learning-based image registration approach trained
+on mono-modal synthetic image pairs. The models were trained using data from 50
+women who received neoadjuvant chemotherapy and underwent surgery. The results
+demonstrate that our method is faster and yields significantly lower average
+landmark error ($2.1\pm1.96$ mm) than the state-of-the-art iterative
+($4.43\pm4.1$ mm) and deep learning ($4.02\pm3.15$ mm) approaches. Improved
+performance of our approach in integrating radiology and pathology information
+facilitates generating large datasets, which allows training models for more
+accurate breast cancer detection.
+
+
+
+
+
+ + ☆ Querying Easily Flip-flopped Samples for Deep Active Learning ICLR 2024 + + +
+ Active learning is a machine learning paradigm that aims to improve the +performance of a model by strategically selecting and querying unlabeled data. +One effective selection strategy is to base it on the model's predictive +uncertainty, which can be interpreted as a measure of how informative a sample +is. The sample's distance to the decision boundary is a natural measure of +predictive uncertainty, but it is often intractable to compute, especially for +complex decision boundaries formed in multiclass classification tasks. To +address this issue, this paper proposes the {\it least disagree metric} (LDM), +defined as the smallest probability of disagreement of the predicted label, and +an estimator for LDM proven to be asymptotically consistent under mild +assumptions. The estimator is computationally efficient and can be easily +implemented for deep learning models using parameter perturbation. The +LDM-based active learning is performed by querying unlabeled data with the +smallest LDM. Experimental results show that our LDM-based active learning +algorithm obtains state-of-the-art overall performance on all considered +datasets and deep architectures. + +
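+ A minimal sketch of the parameter-perturbation idea behind the LDM estimator:
+perturb the weights several times with small Gaussian noise and record how
+often each sample's predicted label flips. A higher flip rate under fixed noise
+corresponds to a smaller LDM (a sample closer to the decision boundary), so
+those samples are queried first. The noise scale, number of draws, and toy
+model are assumptions, not the paper's estimator.
+
+import copy
+import torch
+
+def flip_rate(model, x, n_draws=20, sigma=0.05):
+    base = model(x).argmax(dim=-1)
+    flips = torch.zeros(x.shape[0])
+    for _ in range(n_draws):
+        noisy = copy.deepcopy(model)
+        with torch.no_grad():
+            for p in noisy.parameters():
+                p.add_(sigma * torch.randn_like(p))   # perturb parameters
+        flips += (noisy(x).argmax(dim=-1) != base).float()
+    return flips / n_draws
+
+model = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 3))
+pool_x = torch.randn(100, 4)                          # unlabeled pool
+query = flip_rate(model, pool_x).argsort(descending=True)[:10]  # smallest-LDM proxies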
+
+ comment: 34 pages, 17 figures, 5 tables. Accepted to the 12th International + Conference on Learning Representations (ICLR 2024) +
+
+
+
+
+ + ☆ Towards Learning from Graphs with Heterophily: Progress and Future + + +
+ Graphs are structured data that model complex relations between real-world
+entities. Heterophilous graphs, where linked nodes are prone to have different
+labels or dissimilar features, have recently attracted significant attention
+and found many applications. Meanwhile, increasing efforts have been made to
+advance learning from heterophilous graphs. Although there exist surveys on the
+relevant topic, they focus on heterophilous GNNs, which are only sub-topics of
+heterophilous graph learning. In this survey, we comprehensively overview
+existing works on learning from graphs with heterophily. First, we collect over
+180 publications and introduce the development of this field. Then, we
+systematically categorize existing methods based on a hierarchical taxonomy
+including learning strategies, model architectures and practical applications.
+Finally, we discuss the primary challenges of existing studies and highlight
+promising avenues for future research. More publication details and
+corresponding open-source codes can be accessed and will be continuously
+updated at our repository:
+https://github.com/gongchenghua/Awesome-Survey-Graphs-with-Heterophily.
+
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Explaining Drift using Shapley Values + + +
+ Machine learning models often deteriorate in their performance when they are +used to predict the outcomes over data on which they were not trained. These +scenarios can often arise in real world when the distribution of data changes +gradually or abruptly due to major events like a pandemic. There have been many +attempts in machine learning research to come up with techniques that are +resilient to such Concept drifts. However, there is no principled framework to +identify the drivers behind the drift in model performance. In this paper, we +propose a novel framework - DBShap that uses Shapley values to identify the +main contributors of the drift and quantify their respective contributions. The +proposed framework not only quantifies the importance of individual features in +driving the drift but also includes the change in the underlying relation +between the input and output as a possible driver. The explanation provided by +DBShap can be used to understand the root cause behind the drift and use it to +make the model resilient to the drift. + +
+
+
+
+
+ + ☆ Universally Robust Graph Neural Networks by Preserving Neighbor + Similarity + + +
+ Despite the tremendous success of graph neural networks in learning
+relational data, it has been widely investigated that graph neural networks are
+vulnerable to structural attacks on homophilic graphs. Motivated by this, a
+surge of robust models has been crafted to enhance the adversarial robustness
+of graph neural networks on homophilic graphs. However, their vulnerability on
+heterophilic graphs remains a mystery. To bridge this gap, in this paper, we
+explore the vulnerability of graph neural networks on heterophilic graphs and
+theoretically prove that the update of the negative classification loss is
+negatively correlated with the pairwise similarities based on the powered
+aggregated neighbor features. This theoretical proof explains the empirical
+observations that the graph attacker tends to connect dissimilar node pairs
+based on the similarities of neighbor features instead of ego features both on
+homophilic and heterophilic graphs. Guided by this, we introduce a novel robust
+model termed NSPGNN, which incorporates a dual-kNN graphs pipeline to supervise
+the neighbor similarity-guided propagation. This propagation utilizes the
+low-pass filter to smooth the features of node pairs along the positive kNN
+graphs and the high-pass filter to discriminate the features of node pairs
+along the negative kNN graphs. Extensive experiments on both homophilic and
+heterophilic graphs validate the universal robustness of NSPGNN compared to the
+state-of-the-art methods.
+
+
+
+
+
+ + ☆ Applications of Machine Learning to Optimizing Polyolefin Manufacturing + + +
+ This chapter is a preprint from our book, focusing on leveraging machine
+learning (ML) in chemical and polyolefin manufacturing optimization. It is
+crafted for both novices and seasoned professionals keen on the latest ML
+applications in chemical processes. We trace the evolution of AI and ML in
+chemical industries, delineate core ML components, and provide resources for ML
+beginners. A detailed discussion on various ML methods is presented, covering
+regression, classification, and unsupervised learning techniques, with
+performance metrics and examples. Ensemble methods and deep learning networks,
+including MLP, DNNs, RNNs, CNNs, and transformers, are explored for their
+growing role in chemical applications. Practical workshops guide readers
+through predictive modeling using advanced ML algorithms. The chapter
+culminates with insights into science-guided ML, advocating for a hybrid
+approach that enhances model accuracy. The extensive bibliography offers
+resources for further research and practical implementation. This chapter aims
+to be a thorough primer on ML's practical application in chemical engineering,
+particularly for polyolefin production, and sets the stage for continued
+learning in subsequent chapters. Please cite the original work [169,170] when
+referencing.
+
+
+
+
+
+ + ☆ Improving Speaker-independent Speech Emotion Recognition Using Dynamic + Joint Distribution Adaptation ICASSP 2024 + + +
+ In speaker-independent speech emotion recognition, the training and testing +samples are collected from diverse speakers, leading to a multi-domain shift +challenge across the feature distributions of data from different speakers. +Consequently, when the trained model is confronted with data from new speakers, +its performance tends to degrade. To address the issue, we propose a Dynamic +Joint Distribution Adaptation (DJDA) method under the framework of multi-source +domain adaptation. DJDA firstly utilizes joint distribution adaptation (JDA), +involving marginal distribution adaptation (MDA) and conditional distribution +adaptation (CDA), to more precisely measure the multi-domain distribution +shifts caused by different speakers. This helps eliminate speaker bias in +emotion features, allowing for learning discriminative and speaker-invariant +speech emotion features from coarse-level to fine-level. Furthermore, we +quantify the adaptation contributions of MDA and CDA within JDA by using a +dynamic balance factor based on $\mathcal{A}$-Distance, promoting to +effectively handle the unknown distributions encountered in data from new +speakers. Experimental results demonstrate the superior performance of our DJDA +as compared to other state-of-the-art (SOTA) methods. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ Exploration and Anti-Exploration with Distributional Random Network + Distillation ICML 2024 + + +
+ Exploration remains a critical issue in deep reinforcement learning for an +agent to attain high returns in unknown environments. Although the prevailing +exploration Random Network Distillation (RND) algorithm has been demonstrated +to be effective in numerous environments, it often needs more discriminative +power in bonus allocation. This paper highlights the ``bonus inconsistency'' +issue within RND, pinpointing its primary limitation. To address this issue, we +introduce the Distributional RND (DRND), a derivative of the RND. DRND enhances +the exploration process by distilling a distribution of random networks and +implicitly incorporating pseudo counts to improve the precision of bonus +allocation. This refinement encourages agents to engage in more extensive +exploration. Our method effectively mitigates the inconsistency issue without +introducing significant computational overhead. Both theoretical analysis and +experimental results demonstrate the superiority of our approach over the +original RND algorithm. Our method excels in challenging online exploration +scenarios and effectively serves as an anti-exploration mechanism in D4RL +offline tasks. + +
+
+ comment: Submitted to ICML 2024 +
+
+
+
+
+ + ☆ Bootstrapping OTS-Funcimg Pre-training Model (Botfip) -- A Comprehensive + Symbolic Regression Framework + + +
+ In the field of scientific computing, many problem-solving approaches tend to
+focus only on the process and final outcome. Even in AI for science, there is a
+lack of deep multimodal information mining behind the data, and a multimodal
+framework akin to that in the image-text domain is missing. In this paper, we
+take Symbolic Regression (SR) as our focal point and, drawing inspiration from
+the BLIP model in the image-text domain, propose a scientific computing
+multimodal framework based on Function Images (Funcimg) and Operation Tree
+Sequence (OTS), named Bootstrapping OTS-Funcimg Pre-training Model (Botfip). In
+SR experiments, we validate the advantages of Botfip in low-complexity SR
+problems, showcasing its potential. As a MED framework, Botfip holds promise
+for future applications in a broader range of scientific computing problems.
+
+
+
+
+
+ + ☆ Offline Imitation Learning by Controlling the Effective Planning Horizon + + +
+ In offline imitation learning (IL), we generally assume only a handful of +expert trajectories and a supplementary offline dataset from suboptimal +behaviors to learn the expert policy. While it is now common to minimize the +divergence between state-action visitation distributions so that the agent also +considers the future consequences of an action, a sampling error in an offline +dataset may lead to erroneous estimates of state-action visitations in the +offline case. In this paper, we investigate the effect of controlling the +effective planning horizon (i.e., reducing the discount factor) as opposed to +imposing an explicit regularizer, as previously studied. Unfortunately, it +turns out that the existing algorithms suffer from magnified approximation +errors when the effective planning horizon is shortened, which results in a +significant degradation in performance. We analyze the main cause of the +problem and provide the right remedies to correct the algorithm. We show that +the corrected algorithm improves on popular imitation learning benchmarks by +controlling the effective planning horizon rather than an explicit +regularization. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ EfficientRec an unlimited user-item scale recommendation system based on + clustering and users interaction embedding profile + + +
+ Technology companies are highly interested in recommendation systems
+nowadays. Their user and product bases are constantly growing, causing the
+numbers of users and items to increase continuously over time to very large
+values. Traditional recommendation algorithms, whose complexity depends on the
+numbers of users and items, are therefore difficult to adapt to the industrial
+environment. In this paper, we introduce a new method applying graph neural
+networks with a contrastive learning framework to extract user preferences. We
+incorporate a soft clustering architecture that significantly reduces the
+computational cost of the inference process. Experiments show that the model is
+able to learn user preferences with low computational cost in both training and
+prediction phases. At the same time, the model achieves very good accuracy. We
+call this architecture EfficientRec, implying model compactness and the ability
+to scale to unlimited users and products.
+
+
+ comment: Published in 14th Asian Conference on Intelligent Information and + Database Systems (ACIIDS), 2022 +
+
+
+
+
+ + ☆ Imitation Learning Inputting Image Feature to Each Layer of Neural + Network + + +
+ Imitation learning enables robots to learn and replicate human behavior from +training data. Recent advances in machine learning enable end-to-end learning +approaches that directly process high-dimensional observation data, such as +images. However, these approaches face a critical challenge when processing +data from multiple modalities, inadvertently ignoring data with a lower +correlation to the desired output, especially when using short sampling +periods. This paper presents a useful method to address this challenge, which +amplifies the influence of data with a relatively low correlation to the output +by inputting the data into each neural network layer. The proposed approach +effectively incorporates diverse data sources into the learning process. +Through experiments using a simple pick-and-place operation with raw images and +joint information as input, significant improvements in success rates are +demonstrated even when dealing with data from short sampling periods. + +
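+ A minimal sketch of the idea of feeding the image feature into every layer so
+that lower-correlation inputs are not drowned out; the layer sizes, names, and
+the use of a plain MLP are illustrative assumptions.
+
+import torch
+import torch.nn as nn
+
+class FeatureEveryLayerPolicy(nn.Module):
+    def __init__(self, img_dim=64, joint_dim=7, hidden=128, action_dim=7):
+        super().__init__()
+        self.fc1 = nn.Linear(joint_dim + img_dim, hidden)
+        self.fc2 = nn.Linear(hidden + img_dim, hidden)      # image feature re-injected
+        self.fc3 = nn.Linear(hidden + img_dim, action_dim)  # ...and again here
+
+    def forward(self, joints, img_feat):
+        h = torch.relu(self.fc1(torch.cat([joints, img_feat], dim=-1)))
+        h = torch.relu(self.fc2(torch.cat([h, img_feat], dim=-1)))
+        return self.fc3(torch.cat([h, img_feat], dim=-1))
+
+policy = FeatureEveryLayerPolicy()
+action = policy(torch.randn(1, 7), torch.randn(1, 64))  # joint state + image feature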
+
+ comment: IEEE The 18th International Workshop on Advanced Motion Control + (AMC2024) +
+
+
+
+
+ + ☆ Comparative Study on the Performance of Categorical Variable Encoders in + Classification and Regression Tasks + + +
+ Categorical variables often appear in datasets for classification and +regression tasks, and they need to be encoded into numerical values before +training. Since many encoders have been developed and can significantly impact +performance, choosing the appropriate encoder for a task becomes a +time-consuming yet important practical issue. This study broadly classifies +machine learning models into three categories: 1) ATI models that implicitly +perform affine transformations on inputs, such as multi-layer perceptron neural +network; 2) Tree-based models that are based on decision trees, such as random +forest; and 3) the rest, such as kNN. Theoretically, we prove that the one-hot +encoder is the best choice for ATI models in the sense that it can mimic any +other encoders by learning suitable weights from the data. We also explain why +the target encoder and its variants are the most suitable encoders for +tree-based models. This study conducted comprehensive computational experiments +to evaluate 14 encoders, including one-hot and target encoders, along with +eight common machine-learning models on 28 datasets. The computational results +agree with our theoretical analysis. The findings in this study shed light on +how to select the suitable encoder for data scientists in fields such as fraud +detection, disease diagnosis, etc. + +
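+ A minimal sketch contrasting the two encoder families discussed above: one-hot
+encoding (argued to suit models that apply affine transformations to their
+inputs) and smoothed mean target encoding (argued to suit tree-based models).
+The toy data and the smoothing constant are illustrative assumptions.
+
+import pandas as pd
+
+df = pd.DataFrame({"city": ["a", "a", "b", "b", "c"], "y": [1, 0, 1, 1, 0]})
+
+# One-hot: one indicator column per category, convenient for linear/MLP models.
+one_hot = pd.get_dummies(df["city"], prefix="city")
+
+# Target encoding: replace each category by a smoothed mean of the target,
+# giving tree-based models a single, well-ordered split variable.
+prior, m = df["y"].mean(), 2.0
+stats = df.groupby("city")["y"].agg(["mean", "count"])
+encoding = (stats["count"] * stats["mean"] + m * prior) / (stats["count"] + m)
+df["city_te"] = df["city"].map(encoding)
+print(one_hot)
+print(df)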
+
+
+
+
+ + ☆ Harnessing Density Ratios for Online Reinforcement Learning ICLR 2024 + + +
+ The theories of offline and online reinforcement learning, despite having +evolved in parallel, have begun to show signs of the possibility for a +unification, with algorithms and analysis techniques for one setting often +having natural counterparts in the other. However, the notion of density ratio +modeling, an emerging paradigm in offline RL, has been largely absent from +online RL, perhaps for good reason: the very existence and boundedness of +density ratios relies on access to an exploratory dataset with good coverage, +but the core challenge in online RL is to collect such a dataset without having +one to start. In this work we show -- perhaps surprisingly -- that density +ratio-based algorithms have online counterparts. Assuming only the existence of +an exploratory distribution with good coverage, a structural condition known as +coverability (Xie et al., 2023), we give a new algorithm (GLOW) that uses +density ratio realizability and value function realizability to perform +sample-efficient online exploration. GLOW addresses unbounded density ratios +via careful use of truncation, and combines this with optimism to guide +exploration. GLOW is computationally inefficient; we complement it with a more +efficient counterpart, HyGLOW, for the Hybrid RL setting (Song et al., 2022) +wherein online RL is augmented with additional offline data. HyGLOW is derived +as a special case of a more general meta-algorithm that provides a provable +black-box reduction from hybrid RL to offline RL, which may be of independent +interest. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Artwork Protection Against Neural Style Transfer Using Locally Adaptive + Adversarial Color Attack + + +
+ Neural style transfer (NST) is widely adopted in computer vision to generate
+new images with arbitrary styles. This process leverages neural networks to
+merge aesthetic elements of a style image with the structural aspects of a
+content image into a harmoniously integrated visual result. However,
+unauthorized NST can exploit artwork. Such misuse raises socio-technical
+concerns regarding artists' rights and motivates the development of technical
+approaches for the proactive protection of original creations. Adversarial
+attack is a concept primarily explored in machine learning security. Our work
+introduces this technique to protect artists' intellectual property. In this
+paper, we propose the Locally Adaptive Adversarial Color Attack (LAACA), a
+method for altering images in a manner imperceptible to the human eye but
+disruptive to NST. Specifically, we design perturbations targeting image areas
+rich in high-frequency content, generated by disrupting intermediate features.
+Our experiments and user study confirm that attacking NST with the proposed
+method results in visually worse neural style transfer outputs, making it an
+effective solution for visual artwork protection.
+
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Towards Identifiable Unsupervised Domain Translation: A Diversified + Distribution Matching Approach + + +
+ Unsupervised domain translation (UDT) aims to find functions that convert +samples from one domain (e.g., sketches) to another domain (e.g., photos) +without changing the high-level semantic meaning (also referred to as +``content''). The translation functions are often sought by probability +distribution matching of the transformed source domain and target domain. +CycleGAN stands as arguably the most representative approach among this line of +work. However, it was noticed in the literature that CycleGAN and variants +could fail to identify the desired translation functions and produce +content-misaligned translations. This limitation arises due to the presence of +multiple translation functions -- referred to as ``measure-preserving +automorphism" (MPA) -- in the solution space of the learning criteria. Despite +awareness of such identifiability issues, solutions have remained elusive. This +study delves into the core identifiability inquiry and introduces an MPA +elimination theory. Our analysis shows that MPA is unlikely to exist, if +multiple pairs of diverse cross-domain conditional distributions are matched by +the learning function. Our theory leads to a UDT learner using distribution +matching over auxiliary variable-induced subsets of the domains -- other than +over the entire data domains as in the classical approaches. The proposed +framework is the first to rigorously establish translation identifiability +under reasonable UDT settings, to our best knowledge. Experiments corroborate +with our theoretical claims. + +
+
+
+
+
+ + ☆ Accelerating Distributed Stochastic Optimization via Self-Repellent + Random Walks ICLR 2024 + + +
+ We study a family of distributed stochastic optimization algorithms where
+gradients are sampled by a token traversing a network of agents in random-walk
+fashion. Typically, these random walks are chosen to be Markov chains that
+asymptotically sample from a desired target distribution, and play a critical
+role in the convergence of the optimization iterates. In this paper, we take a
+novel approach by replacing the standard linear Markovian token by one which
+follows a nonlinear Markov chain - namely the Self-Repellent Random Walk
+(SRRW). Defined for any given 'base' Markov chain, the SRRW, parameterized by a
+positive scalar $\alpha$, is less likely to transition to states that were
+highly visited in the past, thus the name. In the context of MCMC sampling on a
+graph, a recent breakthrough in Doshi et al. (2023) shows that the SRRW
+achieves an $O(1/\alpha)$ decrease in the asymptotic variance for sampling. We
+propose the use of a 'generalized' version of the SRRW to drive token
+algorithms for distributed stochastic optimization in the form of stochastic
+approximation, termed SA-SRRW. We prove that the optimization iterate errors of
+the resulting SA-SRRW converge to zero almost surely and prove a central limit
+theorem, deriving the explicit form of the resulting asymptotic covariance
+matrix corresponding to iterate errors. This asymptotic covariance is always
+smaller than that of an algorithm driven by the base Markov chain and decreases
+at rate $O(1/\alpha^2)$, so the performance benefit of using the SRRW is
+amplified in the stochastic optimization context. Empirical results support our
+theoretical findings.
+
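+ A minimal sketch of the self-repellent reweighting behaviour described above:
+transitions of a base chain toward frequently visited states are down-weighted
+by (empirical visit frequency)^(-alpha) and renormalized. The exact SRRW kernel
+in the paper also involves the target distribution; this toy version only
+illustrates the "less likely to revisit" effect.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+P = np.array([[0.0, 0.5, 0.5],
+              [0.5, 0.0, 0.5],
+              [0.5, 0.5, 0.0]])       # base Markov chain on a triangle graph
+alpha, n_states, state = 2.0, 3, 0
+visits = np.ones(n_states)            # uniform pseudo-counts avoid division by zero
+
+for _ in range(1000):
+    freq = visits / visits.sum()
+    weights = P[state] * freq ** (-alpha)   # repel from heavily visited states
+    state = rng.choice(n_states, p=weights / weights.sum())
+    visits[state] += 1
+
+print(visits / visits.sum())          # empirical occupation stays close to uniform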
+
+ comment: Accepted for oral presentation at the Twelfth International + Conference on Learning Representations (ICLR 2024) +
+
+
+
+
+ + ☆ Mobility Accelerates Learning: Convergence Analysis on Hierarchical + Federated Learning in Vehicular Networks + + +
+ Hierarchical federated learning (HFL) enables distributed training of models +across multiple devices with the help of several edge servers and a cloud edge +server in a privacy-preserving manner. In this paper, we consider HFL with +highly mobile devices, mainly targeting at vehicular networks. Through +convergence analysis, we show that mobility influences the convergence speed by +both fusing the edge data and shuffling the edge models. While mobility is +usually considered as a challenge from the perspective of communication, we +prove that it increases the convergence speed of HFL with edge-level +heterogeneous data, since more diverse data can be incorporated. Furthermore, +we demonstrate that a higher speed leads to faster convergence, since it +accelerates the fusion of data. Simulation results show that mobility increases +the model accuracy of HFL by up to 15.1% when training a convolutional neural +network on the CIFAR-10 dataset. + +
+
+ comment: Submitted to IEEE for possible publication +
+
+
+
+
+ + ☆ M3BUNet: Mobile Mean Max UNet for Pancreas Segmentation on CT-Scans + + +
+ Segmenting organs in CT scan images is a necessary process for multiple +downstream medical image analysis tasks. Currently, manual CT scan segmentation +by radiologists is prevalent, especially for organs like the pancreas, which +requires a high level of domain expertise for reliable segmentation due to +factors like small organ size, occlusion, and varying shapes. When resorting to +automated pancreas segmentation, these factors translate to limited reliable +labeled data to train effective segmentation models. Consequently, the +performance of contemporary pancreas segmentation models is still not within +acceptable ranges. To improve that, we propose M3BUNet, a fusion of MobileNet +and U-Net neural networks, equipped with a novel Mean-Max (MM) attention that +operates in two stages to gradually segment pancreas CT images from coarse to +fine with mask guidance for object detection. This approach empowers the +network to surpass segmentation performance achieved by similar network +architectures and achieve results that are on par with complex state-of-the-art +methods, all while maintaining a low parameter count. Additionally, we +introduce external contour segmentation as a preprocessing step for the coarse +stage to assist in the segmentation process through image standardization. For +the fine segmentation stage, we found that applying a wavelet decomposition +filter to create multi-input images enhances pancreas segmentation performance. +We extensively evaluate our approach on the widely known NIH pancreas dataset +and MSD pancreas dataset. Our approach demonstrates a considerable performance +improvement, achieving an average Dice Similarity Coefficient (DSC) value of up +to 89.53% and an Intersection Over Union (IOU) score of up to 81.16 for the NIH +pancreas dataset, and 88.60% DSC and 79.90% IOU for the MSD Pancreas dataset. + +
+
+
+
+
+ + ☆ Differentially Private and Adversarially Robust Machine Learning: An + Empirical Evaluation AAAI + + +
+ Malicious adversaries can attack machine learning models to infer sensitive
+information or damage the system by launching a series of evasion attacks.
+Although various works address privacy and security concerns, they focus on
+individual defenses, whereas in practice models may undergo simultaneous
+attacks. This study explores the combination of adversarial training and
+differentially private training to defend against simultaneous attacks. While
+differentially-private adversarial training, as presented in DP-Adv,
+outperforms the other state-of-the-art methods in performance, it lacks formal
+privacy guarantees and empirical validation. Thus, in this work, we benchmark
+the performance of this technique using a membership inference attack and
+empirically show that the resulting approach is as private as non-robust
+private models. This work also highlights the need to explore privacy
+guarantees in dynamic training paradigms.
+
+
+ comment: Accepted at PPAI-24: The 5th AAAI Workshop on Privacy-Preserving + Artificial Intelligence +
+
+
+
+
+ + ☆ Deep Dict: Deep Learning-based Lossy Time Series Compressor for IoT Data + + +
+ We propose Deep Dict, a deep learning-based lossy time series compressor
+designed to achieve a high compression ratio while maintaining decompression
+error within a predefined range. Deep Dict incorporates two essential
+components: the Bernoulli transformer autoencoder (BTAE) and a distortion
+constraint. BTAE extracts Bernoulli representations from time series data,
+reducing the size of the representations compared to conventional autoencoders.
+The distortion constraint limits the prediction error of BTAE to the desired
+range. Moreover, in order to address the limitations of common regression
+losses such as L1/L2, we introduce a novel loss function called quantized
+entropy loss (QEL). QEL takes into account the specific characteristics of the
+problem, enhancing robustness to outliers and alleviating optimization
+challenges. Our evaluation of Deep Dict across ten diverse time series datasets
+from various domains reveals that Deep Dict outperforms state-of-the-art lossy
+compressors in terms of compression ratio by a significant margin of up to
+53.66%.
+
+
+ comment: 6 pages, 13 figures, IEEE International Conference on Communications + (ICC) 2024 +
+
+
+
+
+ + ☆ Distribution Consistency based Self-Training for Graph Neural Networks + with Sparse Labels WSDM 2024 + + +
+ Few-shot node classification poses a significant challenge for Graph Neural +Networks (GNNs) due to insufficient supervision and potential distribution +shifts between labeled and unlabeled nodes. Self-training has emerged as a +widely popular framework to leverage the abundance of unlabeled data, which +expands the training set by assigning pseudo-labels to selected unlabeled +nodes. Efforts have been made to develop various selection strategies based on +confidence, information gain, etc. However, none of these methods takes into +account the distribution shift between the training and testing node sets. The +pseudo-labeling step may amplify this shift and even introduce new ones, +hindering the effectiveness of self-training. Therefore, in this work, we +explore the potential of explicitly bridging the distribution shift between the +expanded training set and test set during self-training. To this end, we +propose a novel Distribution-Consistent Graph Self-Training (DC-GST) framework +to identify pseudo-labeled nodes that are both informative and capable of +redeeming the distribution discrepancy and formulate it as a differentiable +optimization task. A distribution-shift-aware edge predictor is further adopted +to augment the graph and increase the model's generalizability in assigning +pseudo labels. We evaluate our proposed method on four publicly available +benchmark datasets and extensive experiments demonstrate that our framework +consistently outperforms state-of-the-art baselines. + +
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ☆ Catastrophic Interference is Mitigated in Naturalistic Power-Law + Learning Environments + + +
+ Neural networks often suffer from catastrophic interference (CI): performance +on previously learned tasks drops off significantly when learning a new task. +This contrasts strongly with humans, who can sequentially learn new tasks +without appreciably forgetting previous tasks. Prior work has explored various +techniques for mitigating CI such as regularization, rehearsal, generative +replay, and distillation methods. The current work takes a different approach, +one guided by cognitive science research showing that in naturalistic +environments, the probability of encountering a task decreases as a power-law +of the time since it was last performed. We argue that a realistic evaluation +of techniques for the mitigation of CI should be performed in simulated +naturalistic learning environments. Thus, we evaluate the extent of mitigation +of CI when training simple rehearsal-based methods in power-law environments +similar to the ones humans face. Our work explores this novel rehearsal-based +approach for a domain-incremental task: learning permutations in the MNIST +task. We compare our rehearsal environment with other baselines to show its +efficacy in promoting continual learning. Additionally, we investigate whether +this environment shows forward facilitation, i.e., faster learning of later +tasks. Next, we explore the robustness of our learning environment to the +number of tasks, model size, and amount of data rehearsed after each task. +Notably, our results show that the performance is comparable or superior to +that of models trained using popular regularization methods and also to +rehearsals in non-power-law environments. The benefits of this training +paradigm include simplicity and the lack of a need for extra neural circuitry. +In addition, because our method is orthogonal to other methods, future research +can combine training in power-law environments with other continual learning +mechanisms. + +
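A minimal sketch of the power-law rehearsal schedule the abstract alludes to: the chance of rehearsing a task decays as a power law of the time since it was last practiced. The exponent and the `+1` offset are illustrative choices, not values from the paper.

```python
import numpy as np

def sample_rehearsal_task(steps_since_last_seen, alpha=1.0, rng=None):
    """Pick a previously learned task to rehearse, with power-law recency weighting."""
    rng = rng or np.random.default_rng()
    t = np.asarray(steps_since_last_seen, dtype=float)
    weights = (t + 1.0) ** (-alpha)        # recently seen tasks are rehearsed more often
    return int(rng.choice(len(t), p=weights / weights.sum()))

# Example: task 0 was practiced 1 step ago, task 1 was practiced 100 steps ago,
# so task 0 is rehearsed far more often (with alpha = 1).
print(sample_rehearsal_task([1, 100]))
```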
+
+
+
+
+ + ☆ Noninvasive Acute Compartment Syndrome Diagnosis Using Random Forest + Machine Learning + + +
+ Acute compartment syndrome (ACS) is an orthopedic emergency, caused by
+elevated pressure within a muscle compartment, that leads to permanent tissue
+damage and eventually death. Diagnosis of ACS relies heavily on
+patient-reported symptoms, a method that is clinically unreliable and often
+supplemented with invasive intracompartmental pressure measurements. This study
+proposes a continuous, objective, noninvasive diagnostic for ACS. The device
+detects ACS through a random forest machine learning model that uses pressure
+readings from force-sensitive resistors (FSRs) placed on the skin. The final
+diagnosis is exported in real time to a web application via Bluetooth. To
+validate the diagnostic, a data set containing FSR measurements and the
+corresponding simulated intracompartmental pressure was created. The diagnostic
+achieved an accuracy of 97%, on par with the invasive gold standard. The device
+excelled in key performance metrics including precision, sensitivity, and F1
+score. Manufactured for 73 USD, our device may be an economical alternative to
+needle-based diagnostics. These results demonstrate the potential of
+noninvasive ACS diagnostics to meet clinical standards and enhance patient
+care.
+
+
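As a rough illustration of the modeling setup (not the authors' data or features), a random forest classifier over skin-surface FSR readings could be trained and scored as follows; the arrays below are placeholders.

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

# Placeholder data: each row holds readings from several skin-mounted FSRs;
# the label marks whether the simulated intracompartmental pressure indicated ACS.
rng = np.random.default_rng(0)
X = rng.uniform(0, 50, size=(500, 4))
y = (X.mean(axis=1) > 25).astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)
pred = clf.predict(X_te)
print("accuracy:", accuracy_score(y_te, pred), "F1:", f1_score(y_te, pred))
```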
+
+
+
+
+ + ☆ Approximation of Solution Operators for High-dimensional PDEs + + +
+ We propose a finite-dimensional control-based method to approximate solution
+operators for evolutional partial differential equations (PDEs), particularly
+in high dimensions. By employing a general reduced-order model, such as a deep
+neural network, we connect the evolution of the model parameters with
+trajectories in a corresponding function space. Using the computational
+technique of neural ordinary differential equations, we learn the control over
+the parameter space such that from any initial starting point, the controlled
+trajectories closely approximate the solutions to the PDE. Approximation
+accuracy is justified for a general class of second-order nonlinear PDEs.
+Numerical results are presented for several high-dimensional PDEs, including
+real-world applications to solving Hamilton-Jacobi-Bellman equations,
+demonstrating the accuracy and efficiency of the proposed method.
+
+
+
+ comment: 14 pages, 4 page appendix, 4 figures +
+
+
+
+
+ + ☆ Cooperative Multi-Agent Graph Bandits: UCB Algorithm and Regret Analysis + + +
+ In this paper, we formulate the multi-agent graph bandit problem as a +multi-agent extension of the graph bandit problem introduced by Zhang, +Johansson, and Li [CISS 57, 1-6 (2023)]. In our formulation, $N$ cooperative +agents travel on a connected graph $G$ with $K$ nodes. Upon arrival at each +node, agents observe a random reward drawn from a node-dependent probability +distribution. The reward of the system is modeled as a weighted sum of the +rewards the agents observe, where the weights capture the decreasing marginal +reward associated with multiple agents sampling the same node at the same time. +We propose an Upper Confidence Bound (UCB)-based learning algorithm, +Multi-G-UCB, and prove that its expected regret over $T$ steps is bounded by +$O(N\log(T)[\sqrt{KT} + DK])$, where $D$ is the diameter of graph $G$. Lastly, +we numerically test our algorithm by comparing it to alternative methods. + +
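For intuition, the per-node statistics that a UCB-style index maintains can be sketched as below; the actual Multi-G-UCB algorithm additionally plans the agents' paths on the graph and accounts for the diminishing marginal reward of co-located agents, which is omitted here.

```python
import math

class NodeUCBIndex:
    """Per-node empirical mean plus an exploration bonus (classic UCB1 form)."""

    def __init__(self, num_nodes):
        self.counts = [0] * num_nodes
        self.means = [0.0] * num_nodes

    def update(self, node, reward):
        self.counts[node] += 1
        self.means[node] += (reward - self.means[node]) / self.counts[node]

    def index(self, node, t):
        if self.counts[node] == 0:
            return float("inf")                      # force each node to be tried once
        bonus = math.sqrt(2.0 * math.log(t) / self.counts[node])
        return self.means[node] + bonus
```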
+
+
+
+
+ + ☆ Vulnerabilities of Foundation Model Integrated Federated Learning Under + Adversarial Threats + + +
+ Federated Learning (FL) addresses critical issues in machine learning related
+to data privacy and security, yet suffers from data insufficiency and
+imbalance under certain circumstances. The emergence of foundation models (FMs)
+offers potential solutions to the limitations of existing FL frameworks, e.g.,
+by generating synthetic data for model initialization. However, due to the
+inherent safety concerns of FMs, integrating FMs into FL could introduce new
+risks, which remains largely unexplored. To address this gap, we conduct the
+first investigation on the vulnerability of FM-integrated FL (FM-FL) under
+adversarial threats. Based on a unified framework of FM-FL, we introduce a
+novel attack strategy that exploits safety issues of FM to compromise FL client
+models. Through extensive experiments with well-known models and benchmark
+datasets in both image and text domains, we reveal the high susceptibility of
+FM-FL to this new threat under various FL configurations. Furthermore, we
+find that existing FL defense strategies offer limited protection against this
+novel attack approach. This research highlights the critical need for enhanced
+security measures in FL in the era of FMs.
+
+
+
+ comment: Chen Wu and Xi Li are equal contribution. The corresponding author is + Jiaqi Wang +
+
+
+
+
+ + ☆ Harmonized Spatial and Spectral Learning for Robust and Generalized + Medical Image Segmentation + + +
+ Deep learning has demonstrated remarkable achievements in medical image
+segmentation. However, prevailing deep learning models struggle with poor
+generalization due to (i) intra-class variations, where the same class appears
+differently in different samples, and (ii) inter-class independence, resulting
+in difficulties capturing intricate relationships between distinct objects,
+leading to higher false negative cases. This paper presents a novel approach
+that synergizes spatial and spectral representations to enhance
+domain-generalized medical image segmentation. We introduce the innovative
+Spectral Correlation Coefficient objective to improve the model's capacity to
+capture middle-order features and contextual long-range dependencies. This
+objective complements traditional spatial objectives by incorporating valuable
+spectral information. Extensive experiments reveal that optimizing this
+objective with existing architectures like UNet and TransUNet significantly
+enhances generalization, interpretability, and noise robustness, producing more
+confident predictions. For instance, in cardiac segmentation, we observe a 0.81
+pp and 1.63 pp (pp = percentage point) improvement in DSC over UNet and
+TransUNet, respectively. Our interpretability study demonstrates that, in most
+tasks, objectives optimized with UNet outperform even TransUNet by introducing
+global contextual information alongside local details. These findings
+underscore the versatility and effectiveness of our proposed method across
+diverse imaging modalities and medical domains.
+
+
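One plausible way to combine a spatial segmentation loss with a frequency-domain correlation term is sketched below; this is only a guess at the general shape of such an objective, not the paper's Spectral Correlation Coefficient definition.

```python
import torch
import torch.nn.functional as F

def spectral_correlation_loss(pred, target, eps=1e-8):
    """1 - Pearson correlation between the magnitude spectra of prediction and target."""
    P = torch.fft.rfft2(pred).abs().flatten(1)
    T = torch.fft.rfft2(target).abs().flatten(1)
    P = P - P.mean(dim=1, keepdim=True)
    T = T - T.mean(dim=1, keepdim=True)
    corr = (P * T).sum(dim=1) / (P.norm(dim=1) * T.norm(dim=1) + eps)
    return 1.0 - corr.mean()

def total_loss(logits, target, lam=0.1):
    # Spatial term (Dice/CE in practice; BCE here for brevity) plus the spectral term.
    spatial = F.binary_cross_entropy_with_logits(logits, target)
    return spatial + lam * spectral_correlation_loss(torch.sigmoid(logits), target)
```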
+
+
+
+
+ + ☆ Langevin Unlearning: A New Perspective of Noisy Gradient Descent for + Machine Unlearning + + +
+ Machine unlearning has raised significant interest with the adoption of laws
+ensuring the "right to be forgotten". Researchers have provided a
+probabilistic notion of approximate unlearning under a definition similar to
+Differential Privacy (DP), where privacy is defined as statistical
+indistinguishability from retraining from scratch. We propose Langevin
+unlearning, an unlearning framework based on noisy gradient descent with
+privacy guarantees for approximate unlearning problems. Langevin unlearning
+unifies the DP learning process and the privacy-certified unlearning process
+with many algorithmic benefits. These include approximate certified unlearning
+for non-convex problems, complexity savings compared to retraining, and
+sequential and batch unlearning for multiple unlearning requests. We verify the
+practicality of Langevin unlearning by studying its privacy-utility-complexity
+trade-off via experiments on benchmark datasets, and also demonstrate its
+superiority against gradient-descent-plus-output-perturbation based approximate
+unlearning.
+
+
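The core primitive here, noisy gradient descent, can be sketched in a few lines; the noise scale, step size, and any projection required for formal guarantees depend on properties of the loss and are not derived in this sketch.

```python
import torch

def noisy_gd_step(params, grad_fn, lr=0.01, sigma=0.05):
    """One Langevin-style update: gradient step plus isotropic Gaussian noise.

    `grad_fn(params)` is assumed to return gradients of the training loss with
    respect to each parameter tensor (illustrative interface).
    """
    grads = grad_fn(params)
    with torch.no_grad():
        for p, g in zip(params, grads):
            p.add_(-lr * g + sigma * torch.randn_like(p))
    return params
```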
+
+
+
+
+ + ☆ Deep Generative Modeling for Financial Time Series with Application in + VaR: A Comparative Review + + +
+ In the financial services industry, forecasting the risk factor distribution +conditional on the history and the current market environment is the key to +market risk modeling in general and value at risk (VaR) model in particular. As +one of the most widely adopted VaR models in commercial banks, Historical +simulation (HS) uses the empirical distribution of daily returns in a +historical window as the forecast distribution of risk factor returns in the +next day. The objectives for financial time series generation are to generate +synthetic data paths with good variety, and similar distribution and dynamics +to the original historical data. In this paper, we apply multiple existing deep +generative methods (e.g., CGAN, CWGAN, Diffusion, and Signature WGAN) for +conditional time series generation, and propose and test two new methods for +conditional multi-step time series generation, namely Encoder-Decoder CGAN and +Conditional TimeVAE. Furthermore, we introduce a comprehensive framework with a +set of KPIs to measure the quality of the generated time series for financial +modeling. The KPIs cover distribution distance, autocorrelation and +backtesting. All models (HS, parametric and neural networks) are tested on both +historical USD yield curve data and additional data simulated from GARCH and +CIR processes. The study shows that top performing models are HS, GARCH and +CWGAN models. Future research directions in this area are also discussed. + +
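For reference, the historical simulation (HS) baseline mentioned above reduces to taking an empirical quantile of recent returns; a minimal sketch with simulated placeholder data:

```python
import numpy as np

def historical_var(returns, alpha=0.99, window=250):
    """One-day VaR by historical simulation: the (1 - alpha) empirical quantile
    of returns over the most recent `window` days, reported as a positive loss."""
    recent = np.asarray(returns, dtype=float)[-window:]
    return -np.quantile(recent, 1.0 - alpha)

rng = np.random.default_rng(0)
daily_returns = rng.normal(0.0, 0.01, size=1000)   # placeholder risk-factor returns
print("99% one-day VaR:", historical_var(daily_returns))
```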
+
+
+
+
+ + ☆ Using LLM such as ChatGPT for Designing and Implementing a RISC + Processor: Execution,Challenges and Limitations + + +
+ This paper discusses the feasibility of using Large Language Models (LLMs)
+for code generation, with a particular application to designing a RISC
+processor. The paper also reviews the associated steps such as parsing,
+tokenization, encoding, the attention mechanism, token sampling, and iterations
+during code generation. The generated code for the RISC components is verified
+through testbenches and hardware implementation on an FPGA board. Four metrics
+are used to compare the efficiency of using LLMs in programming: correct output
+on the first iteration, the number of errors embedded in the code, the number
+of trials required to achieve working code, and failure to generate the code
+after three iterations. In all the cases, the generated code had significant
+errors and human intervention was always required to fix the bugs. LLMs can
+therefore be used to complement a programmer's code design.
+
+
+
+
+
+
+ + ☆ Hierarchical Federated Learning in Multi-hop Cluster-Based VANETs + + +
+ The usage of federated learning (FL) in Vehicular Ad hoc Networks (VANET) has +garnered significant interest in research due to the advantages of reducing +transmission overhead and protecting user privacy by communicating local +dataset gradients instead of raw data. However, implementing FL in VANETs faces +challenges, including limited communication resources, high vehicle mobility, +and the statistical diversity of data distributions. In order to tackle these +issues, this paper introduces a novel framework for hierarchical federated +learning (HFL) over multi-hop clustering-based VANET. The proposed method +utilizes a weighted combination of the average relative speed and cosine +similarity of FL model parameters as a clustering metric to consider both data +diversity and high vehicle mobility. This metric ensures convergence with +minimum changes in cluster heads while tackling the complexities associated +with non-independent and identically distributed (non-IID) data scenarios. +Additionally, the framework includes a novel mechanism to manage seamless +transitions of cluster heads (CHs), followed by transferring the most recent FL +model parameter to the designated CH. Furthermore, the proposed approach +considers the option of merging CHs, aiming to reduce their count and, +consequently, mitigate associated overhead. Through extensive simulations, the +proposed hierarchical federated learning over clustered VANET has been +demonstrated to improve accuracy and convergence time significantly while +maintaining an acceptable level of packet overhead compared to previously +proposed clustering algorithms and non-clustered VANET. + +
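The clustering metric described above (a weighted combination of average relative speed and cosine similarity of FL model parameters) might look roughly like the sketch below; the normalization and weighting are illustrative, not the paper's exact formula.

```python
import numpy as np

def clustering_score(avg_rel_speed, params_i, params_j, w=0.5):
    """Higher score favors placing two vehicles in the same cluster.

    `avg_rel_speed` is assumed to be normalized to [0, 1]; `params_i`/`params_j`
    are flattened local FL model parameter vectors.
    """
    cos = float(np.dot(params_i, params_j) /
                (np.linalg.norm(params_i) * np.linalg.norm(params_j) + 1e-12))
    return w * (1.0 - avg_rel_speed) + (1.0 - w) * cos
```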
+
+
+
+
+ + ☆ Excuse me, sir? Your language model is leaking (information) + + +
+ We introduce a cryptographic method to hide an arbitrary secret payload in +the response of a Large Language Model (LLM). A secret key is required to +extract the payload from the model's response, and without the key it is +provably impossible to distinguish between the responses of the original LLM +and the LLM that hides a payload. In particular, the quality of generated text +is not affected by the payload. Our approach extends a recent result of Christ, +Gunn and Zamir (2023) who introduced an undetectable watermarking scheme for +LLMs. + +
+
+
+
+
+ + ☆ Intelligent Optimization and Machine Learning Algorithms for Structural + Anomaly Detection using Seismic Signals + + +
+ The lack of anomaly detection methods during mechanized tunnelling can cause +financial loss and deficits in drilling time. On-site excavation requires hard +obstacles to be recognized prior to drilling in order to avoid damaging the +tunnel boring machine and to adjust the propagation velocity. The efficiency of +the structural anomaly detection can be increased with intelligent optimization +techniques and machine learning. In this research, the anomaly in a simple +structure is detected by comparing the experimental measurements of the +structural vibrations with numerical simulations using parameter estimation +methods. + +
+
+
+
+
+ + ☆ Towards providing reliable job completion time predictions using PCS + + +
+ In this paper we build a case for providing job completion time predictions +to cloud users, similar to the delivery date of a package or arrival time of a +booked ride. Our analysis reveals that providing predictability can come at the +expense of performance and fairness. Existing cloud scheduling systems optimize +for extreme points in the trade-off space, making them either extremely +unpredictable or impractical. + To address this challenge, we present PCS, a new scheduling framework that +aims to provide predictability while balancing other traditional objectives. +The key idea behind PCS is to use Weighted-Fair-Queueing (WFQ) and find a +suitable configuration of different WFQ parameters (e.g., class weights) that +meets specific goals for predictability. It uses a simulation-aided search +strategy, to efficiently discover WFQ configurations that lie on the Pareto +front of the trade-off space between these objectives. We implement and +evaluate PCS in the context of DNN job scheduling on GPUs. Our evaluation, on a +small scale GPU testbed and larger-scale simulations, shows that PCS can +provide accurate completion time estimates while marginally compromising on +performance and fairness. + +
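A bare-bones Weighted-Fair-Queueing sketch follows, to make concrete the knob PCS tunes: jobs of higher-weight classes receive earlier virtual finish times. The simulation-aided search over class weights that PCS performs is not reproduced.

```python
import heapq

class WeightedFairQueue:
    """Minimal WFQ: order jobs by virtual finish time, scaled by class weight."""

    def __init__(self, class_weights):
        self.weights = dict(class_weights)
        self.virtual_finish = {c: 0.0 for c in self.weights}
        self.heap = []

    def submit(self, job_id, cls, service_time):
        vf = self.virtual_finish[cls] + service_time / self.weights[cls]
        self.virtual_finish[cls] = vf
        heapq.heappush(self.heap, (vf, job_id))

    def next_job(self):
        return heapq.heappop(self.heap)[1] if self.heap else None

q = WeightedFairQueue({"high": 4.0, "low": 1.0})
q.submit("a", "low", 10.0)
q.submit("b", "high", 10.0)
print(q.next_job())   # "b": the higher-weight class finishes (virtually) first
```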
+
+
+
+
+ + ☆ MELODY: Robust Semi-Supervised Hybrid Model for Entity-Level Online + Anomaly Detection with Multivariate Time Series + + +
+ In large IT systems, software deployment is a crucial process in online
+services as their code is regularly updated. However, a faulty code change may
+degrade the target service's performance and cause cascading outages in
+downstream services. Thus, software deployments should be comprehensively
+monitored, and their anomalies should be detected in a timely manner. In this
+paper, we study the problem of anomaly detection for deployments. We begin by
+identifying the challenges unique to this anomaly detection problem, which is
+at the entity level (e.g., deployments), relative to the more typical problem
+of anomaly detection in multivariate time series (MTS). The unique challenges
+include the heterogeneity of deployments, the low latency tolerance, the
+ambiguous anomaly definition, and the limited supervision. To address them, we
+propose a novel framework, semi-supervised hybrid Model for Entity-Level Online
+Detection of anomalY (MELODY). MELODY first transforms the MTS of different
+entities to the same feature space by an online feature extractor, then uses a
+newly proposed semi-supervised deep one-class model for detecting anomalous
+entities. We evaluated MELODY on real data of cloud services with 1.2M+ time
+series. The relative F1 score improvement of MELODY over the state-of-the-art
+methods ranges from 7.6% to 56.5%. The user evaluation suggests MELODY is
+suitable for monitoring deployments in large online systems.
+
+
+
+
+
+
+ + ☆ Noise Contrastive Estimation-based Matching Framework for Low-resource + Security Attack Pattern Recognition EACL 2024 + + +
+ Tactics, Techniques and Procedures (TTPs) represent sophisticated attack +patterns in the cybersecurity domain, described encyclopedically in textual +knowledge bases. Identifying TTPs in cybersecurity writing, often called TTP +mapping, is an important and challenging task. Conventional learning approaches +often target the problem in the classical multi-class or multilabel +classification setting. This setting hinders the learning ability of the model +due to a large number of classes (i.e., TTPs), the inevitable skewness of the +label distribution and the complex hierarchical structure of the label space. +We formulate the problem in a different learning paradigm, where the assignment +of a text to a TTP label is decided by the direct semantic similarity between +the two, thus reducing the complexity of competing solely over the large +labeling space. To that end, we propose a neural matching architecture with an +effective sampling-based learn-to-compare mechanism, facilitating the learning +process of the matching model despite constrained resources. + +
+
+ comment: accepted at EACL 2024, in ARR October 2023 +
+
+
+
+
+ + ♻ ☆ Unboxing Tree Ensembles for interpretability: a hierarchical + visualization tool and a multivariate optimal re-built tree + + +
+ The interpretability of models has become a crucial issue in Machine Learning
+because of algorithmic decisions' growing impact on real-world applications.
+Tree ensemble methods, such as Random Forests or XGBoost, are powerful learning
+tools for classification tasks. However, while combining multiple trees may
+provide higher prediction quality than a single one, it sacrifices
+interpretability, resulting in "black-box" models. In light of this, we
+aim to develop an interpretable representation of a tree-ensemble model that
+can provide valuable insights into its behavior. First, given a target
+tree-ensemble model, we develop a hierarchical visualization tool based on a
+heatmap representation of the forest's feature use, considering the frequency
+of a feature and the level at which it is selected as an indicator of
+importance. Next, we propose a mixed-integer linear programming (MILP)
+formulation for constructing a single optimal multivariate tree that accurately
+mimics the target model predictions. The goal is to provide an interpretable
+surrogate model based on oblique hyperplane splits, which uses only the most
+relevant features according to the defined forest's importance indicators. The
+MILP model includes a penalty on feature selection based on their frequency in
+the forest to further induce sparsity of the splits. The natural formulation
+has been strengthened to improve the computational performance of
+mixed-integer software. Computational experiments are carried out on benchmark
+datasets from the UCI repository using a state-of-the-art off-the-shelf solver.
+Results show that the proposed model is effective in yielding a shallow
+interpretable tree approximating the tree-ensemble decision function.
+
+
+
+ comment: 44 pages, 9 figures, 20 tables +
+
+
+
+
+ + ♻ ☆ Compositional Program Generation for Few-Shot Systematic Generalization + + +
+ Compositional generalization is a key ability of humans that enables us to
+learn new concepts from only a handful of examples. Neural machine learning
+models, including the now ubiquitous Transformers, struggle to generalize in
+this way, and typically require thousands of examples of a concept during
+training in order to generalize meaningfully. This difference in ability
+between humans and artificial neural architectures motivates this study on a
+neuro-symbolic architecture called the Compositional Program Generator (CPG).
+CPG has three key features: modularity, composition, and abstraction, in the
+form of grammar rules, that enable it to generalize both systematically to new
+concepts in a few-shot manner, as well as productively by length on various
+sequence-to-sequence language tasks. For each input, CPG uses a grammar of the
+input language and a parser to generate a parse in which each grammar rule is
+assigned its own unique semantic module, a probabilistic copy or substitution
+program. Instances with the same parse are always processed with the same
+composed modules, while those with different parses may be processed with
+different modules. CPG learns parameters for the modules and is able to learn
+the semantics for new rules and types incrementally, without forgetting or
+retraining on rules it has already seen. It achieves perfect generalization on
+both the SCAN and COGS benchmarks using just 14 examples for SCAN and 22
+examples for COGS -- state-of-the-art accuracy with a 1000x improvement in
+sample efficiency.
+
+
+
+ comment: 7 pages of text with 1 page of references +
+
+
+
+
+ + ♻ ☆ On Error Propagation of Diffusion Models ICLR-2024 + + +
+ Although diffusion models (DMs) have shown promising performances in a number
+of tasks (e.g., speech synthesis and image generation), they might suffer from
+error propagation because of their sequential structure. However, this is not
+certain because some sequential models, such as Conditional Random Fields
+(CRFs), are free from this problem. To address this issue, we develop a
+theoretical framework to mathematically formulate error propagation in the
+architecture of DMs. The framework contains three elements: modular error,
+cumulative error, and the propagation equation. The modular and cumulative
+errors are related by the equation, which shows that DMs are indeed affected by
+error propagation. Our theoretical study also suggests that the cumulative
+error is closely related to the generation quality of DMs. Based on this
+finding, we apply the cumulative error as a regularization term to reduce error
+propagation. Because the term is computationally intractable, we derive its
+upper bound and design a bootstrap algorithm to efficiently estimate the bound
+for optimization. We have conducted extensive experiments on multiple image
+datasets, showing that our proposed regularization reduces error propagation,
+significantly improves vanilla DMs, and outperforms previous baselines.
+
+
+
+ comment: Accepted by ICLR-2024 +
+
+
+
+
+ + ♻ ☆ Soft Mixture Denoising: Beyond the Expressive Bottleneck of Diffusion + Models ICLR-2024 + + +
+ Because diffusion models have shown impressive performances in a number of
+tasks, such as image synthesis, there is a trend in recent works to prove (with
+certain assumptions) that these models have strong approximation capabilities.
+In this paper, we show that current diffusion models actually have an
+expressive bottleneck in backward denoising and that some assumptions made by
+existing theoretical guarantees are too strong. Based on this finding, we prove
+that diffusion models have unbounded errors in both local and global denoising.
+In light of our theoretical studies, we introduce soft mixture denoising (SMD),
+an expressive and efficient model for backward denoising. SMD not only permits
+diffusion models to well approximate any Gaussian mixture distributions in
+theory, but is also simple and efficient to implement. Our experiments on
+multiple image datasets show that SMD significantly improves different types of
+diffusion models (e.g., DDPM), especially in the situation of few backward
+iterations.
+
+
+
+ comment: Accepted by ICLR-2024 +
+
+
+
+
+ + ♻ ☆ Learn to Categorize or Categorize to Learn? Self-Coding for Generalized + Category Discovery NeurIPS 2023 + + +
+ In the quest for unveiling novel categories at test time, we confront the +inherent limitations of traditional supervised recognition models that are +restricted by a predefined category set. While strides have been made in the +realms of self-supervised and open-world learning towards test-time category +discovery, a crucial yet often overlooked question persists: what exactly +delineates a category? In this paper, we conceptualize a category through the +lens of optimization, viewing it as an optimal solution to a well-defined +problem. Harnessing this unique conceptualization, we propose a novel, +efficient and self-supervised method capable of discovering previously unknown +categories at test time. A salient feature of our approach is the assignment of +minimum length category codes to individual data instances, which encapsulates +the implicit category hierarchy prevalent in real-world datasets. This +mechanism affords us enhanced control over category granularity, thereby +equipping our model to handle fine-grained categories adeptly. Experimental +evaluations, bolstered by state-of-the-art benchmark comparisons, testify to +the efficacy of our solution in managing unknown categories at test time. +Furthermore, we fortify our proposition with a theoretical foundation, +providing proof of its optimality. Our code is available at +https://github.com/SarahRastegar/InfoSieve. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ ICML 2023 Topological Deep Learning Challenge : Design and Results + + +
+ This paper presents the computational challenge on topological deep learning +that was hosted within the ICML 2023 Workshop on Topology and Geometry in +Machine Learning. The competition asked participants to provide open-source +implementations of topological neural networks from the literature by +contributing to the python packages TopoNetX (data processing) and TopoModelX +(deep learning). The challenge attracted twenty-eight qualifying submissions in +its two-month duration. This paper describes the design of the challenge and +summarizes its main findings. + +
+
+
+
+
+ + ♻ ☆ Hyperbolic Image-Text Representations ICML 2023 + + +
+ Visual and linguistic concepts naturally organize themselves in a hierarchy, +where a textual concept "dog" entails all images that contain dogs. Despite +being intuitive, current large-scale vision and language models such as CLIP do +not explicitly capture such hierarchy. We propose MERU, a contrastive model +that yields hyperbolic representations of images and text. Hyperbolic spaces +have suitable geometric properties to embed tree-like data, so MERU can better +capture the underlying hierarchy in image-text datasets. Our results show that +MERU learns a highly interpretable and structured representation space while +being competitive with CLIP's performance on standard multi-modal tasks like +image classification and image-text retrieval. Our code and models are +available at https://www.github.com/facebookresearch/meru + +
+
+ comment: ICML 2023 (v3: Add link to code in abstract) +
+
+
+
+
+ + ♻ ☆ BasisFormer: Attention-based Time Series Forecasting with Learnable and + Interpretable Basis NeurIPS 2023 + + +
+ Bases have become an integral part of modern deep learning-based models for +time series forecasting due to their ability to act as feature extractors or +future references. To be effective, a basis must be tailored to the specific +set of time series data and exhibit distinct correlation with each time series +within the set. However, current state-of-the-art methods are limited in their +ability to satisfy both of these requirements simultaneously. To address this +challenge, we propose BasisFormer, an end-to-end time series forecasting +architecture that leverages learnable and interpretable bases. This +architecture comprises three components: First, we acquire bases through +adaptive self-supervised learning, which treats the historical and future +sections of the time series as two distinct views and employs contrastive +learning. Next, we design a Coef module that calculates the similarity +coefficients between the time series and bases in the historical view via +bidirectional cross-attention. Finally, we present a Forecast module that +selects and consolidates the bases in the future view based on the similarity +coefficients, resulting in accurate future predictions. Through extensive +experiments on six datasets, we demonstrate that BasisFormer outperforms +previous state-of-the-art methods by 11.04\% and 15.78\% respectively for +univariate and multivariate forecasting tasks. Code is available at: +\url{https://github.com/nzl5116190/Basisformer} + +
+
+ comment: NeurIPS 2023(poster) +
+
+
+
+
+ + ♻ ☆ FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel + Identification + + +
+ Highly parallelized workloads like machine learning training, inference and
+general HPC tasks are greatly accelerated using GPU devices. In a cloud
+computing cluster, serving a GPU's computation power through multi-task sharing
+is in high demand, since there are always more task requests than the number of
+GPUs available. Existing GPU sharing solutions focus on reducing task-level
+waiting time or task-level switching costs when multiple jobs compete for a
+single GPU. Continuously arriving computation requests come with different
+priorities and have asymmetric impacts on QoS when sharing a GPU device.
+Existing work misses the kernel-level optimization opportunity brought by this
+setting. To address this problem, we present a novel kernel-level scheduling
+strategy called FIKIT: Filling Inter-kernel Idle Time. FIKIT incorporates
+task-level priority information, fine-grained kernel identification, and kernel
+measurement, allowing low-priority tasks to execute during high-priority tasks'
+inter-kernel idle time. Thereby, it fills the GPU's device runtime more fully
+and reduces the overall impact of GPU sharing on cloud services. Across a set
+of ML models, the FIKIT-based inference system accelerated high-priority tasks
+by 1.33 to 14.87 times compared to the JCT in GPU sharing mode, and more than
+half of the cases are accelerated by more than 3.5 times. Alternatively, under
+preemptive sharing, the low-priority tasks have a JCT comparable to the default
+GPU sharing mode, with a ratio of 0.84 to 1. We further limit the kernel
+measurement and runtime fine-grained kernel scheduling overhead to less than
+5%.
+
+
+
+ comment: 20 pages, 20 figures. Update the Abstract on the arXiv page +
+
+
+
+
+ + ♻ ☆ Recovering Simultaneously Structured Data via Non-Convex Iteratively + Reweighted Least Squares + + +
+ We propose a new algorithm for the problem of recovering data that adheres to +multiple, heterogeneous low-dimensional structures from linear observations. +Focusing on data matrices that are simultaneously row-sparse and low-rank, we +propose and analyze an iteratively reweighted least squares (IRLS) algorithm +that is able to leverage both structures. In particular, it optimizes a +combination of non-convex surrogates for row-sparsity and rank, a balancing of +which is built into the algorithm. We prove locally quadratic convergence of +the iterates to a simultaneously structured data matrix in a regime of minimal +sample complexity (up to constants and a logarithmic factor), which is known to +be impossible for a combination of convex surrogates. In experiments, we show +that the IRLS method exhibits favorable empirical convergence, identifying +simultaneously row-sparse and low-rank matrices from fewer measurements than +state-of-the-art methods. Code is available at +https://github.com/ckuemmerle/simirls. + +
+
+ comment: 35 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Model-Based Solution to the Offline Multi-Agent Reinforcement Learning + Coordination Problem + + +
+ Training multiple agents to coordinate is an essential problem with +applications in robotics, game theory, economics, and social sciences. However, +most existing Multi-Agent Reinforcement Learning (MARL) methods are online and +thus impractical for real-world applications in which collecting new +interactions is costly or dangerous. While these algorithms should leverage +offline data when available, doing so gives rise to what we call the offline +coordination problem. Specifically, we identify and formalize the strategy +agreement (SA) and the strategy fine-tuning (SFT) coordination challenges, two +issues at which current offline MARL algorithms fail. Concretely, we reveal +that the prevalent model-free methods are severely deficient and cannot handle +coordination-intensive offline multi-agent tasks in either toy or MuJoCo +domains. To address this setback, we emphasize the importance of inter-agent +interactions and propose the very first model-based offline MARL method. Our +resulting algorithm, Model-based Offline Multi-Agent Proximal Policy +Optimization (MOMA-PPO) generates synthetic interaction data and enables agents +to converge on a strategy while fine-tuning their policies accordingly. This +simple model-based solution solves the coordination-intensive offline tasks, +significantly outperforming the prevalent model-free methods even under severe +partial observability and with learned world models. + +
+
+
+
+
+ + ♻ ☆ FactCHD: Benchmarking Fact-Conflicting Hallucination Detection + + +
+ Despite their impressive generative capabilities, LLMs are hindered by +fact-conflicting hallucinations in real-world applications. The accurate +identification of hallucinations in texts generated by LLMs, especially in +complex inferential scenarios, is a relatively unexplored area. To address this +gap, we present FactCHD, a dedicated benchmark designed for the detection of +fact-conflicting hallucinations from LLMs. FactCHD features a diverse dataset +that spans various factuality patterns, including vanilla, multi-hop, +comparison, and set operation. A distinctive element of FactCHD is its +integration of fact-based evidence chains, significantly enhancing the depth of +evaluating the detectors' explanations. Experiments on different LLMs expose +the shortcomings of current approaches in detecting factual errors accurately. +Furthermore, we introduce Truth-Triangulator that synthesizes reflective +considerations by tool-enhanced ChatGPT and LoRA-tuning based on Llama2, aiming +to yield more credible detection through the amalgamation of predictive results +and evidence. The benchmark dataset is available at +https://github.com/zjunlp/FactCHD. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ CodeKGC: Code Language Model for Generative Knowledge Graph Construction + + +
+ Current generative knowledge graph construction approaches usually fail to
+capture structural knowledge by simply flattening natural language into
+serialized texts or a specification language. However, large generative
+language models trained on structured data such as code have demonstrated
+impressive capability in understanding natural language for structural
+prediction and reasoning tasks. Intuitively, we address the task of generative
+knowledge graph construction with a code language model: given a code-format
+natural language input, the target is to generate triples, which can be
+represented as code completion tasks. Specifically, we develop schema-aware
+prompts that effectively utilize the semantic structure within the knowledge
+graph. As code inherently possesses structure, such as class and function
+definitions, it serves as a useful model for prior semantic structural
+knowledge. Furthermore, we employ a rationale-enhanced generation method to
+boost the performance. Rationales provide intermediate steps, thereby improving
+knowledge extraction abilities. Experimental results indicate that the proposed
+approach can obtain better performance on benchmark datasets compared with
+baselines. Code and datasets are available at
+https://github.com/zjunlp/DeepKE/tree/main/example/llm.
+
+
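A toy rendering of a code-format, schema-aware prompt is shown below to illustrate the idea; the schema, names, and layout are made up, and the actual prompts used by CodeKGC may differ.

```python
def schema_aware_prompt(schema, text):
    """Render a Python-flavored prompt asking a code LM to complete a triple list."""
    class_defs = "\n".join(f"class {c}: ..." for c in schema["entity_types"])
    rel_defs = "\n".join(f"def {r}(head, tail): ..." for r in schema["relations"])
    return (
        f"# Knowledge graph schema\n{class_defs}\n{rel_defs}\n\n"
        f'text = """{text}"""\n'
        "# Extract (head, relation, tail) triples from `text`:\n"
        "triples = [\n"
    )

print(schema_aware_prompt(
    {"entity_types": ["Person", "Organization"], "relations": ["works_for"]},
    "Alice is a researcher at Acme Labs."))
```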
+
+ comment: ACM Transactions on Asian and Low-Resource Language Information + Processing +
+
+
+
+
+ + ♻ ☆ Understanding Augmentation-based Self-Supervised Representation Learning + via RKHS Approximation and Regression ICLR 2024 + + +
+ Data augmentation is critical to the empirical success of modern +self-supervised representation learning, such as contrastive learning and +masked language modeling. However, a theoretical understanding of the exact +role of augmentation remains limited. Recent work has built the connection +between self-supervised learning and the approximation of the top eigenspace of +a graph Laplacian operator, suggesting that learning a linear probe atop such +representation can be connected to RKHS regression. Building on this insight, +this work delves into a statistical analysis of augmentation-based pretraining. +Starting from the isometry property, a geometric characterization of the target +function given by the augmentation, we disentangle the effects of the model and +the augmentation, and prove two generalization bounds that are free of model +complexity. Our first bound works for an arbitrary encoder, where the +prediction error is decomposed as the sum of an estimation error incurred by +fitting a linear probe with RKHS regression, and an approximation error +entailed by RKHS approximation. Our second bound specifically addresses the +case where the encoder is near-optimal, that is it approximates the top-d +eigenspace of the RKHS induced by the augmentation. A key ingredient in our +analysis is the augmentation complexity, which we use to quantitatively compare +different augmentations and analyze their impact on downstream performance. + +
+
+ comment: ICLR 2024 spotlight. 34 pages +
+
+
+
+
+ + ♻ ☆ Determinantal Point Process Attention Over Grid Cell Code Supports Out + of Distribution Generalization + + +
+ Deep neural networks have made tremendous gains in emulating human-like
+intelligence, and have been used increasingly as ways of understanding how the
+brain may solve the complex computational problems on which this relies.
+However, these still fall short of, and therefore fail to provide insight into,
+how the brain supports strong forms of generalization of which humans are
+capable. One such case is out-of-distribution (OOD) generalization: successful
+performance on test examples that lie outside the distribution of the training
+set. Here, we identify properties of processing in the brain that may
+contribute to this ability. We describe a two-part algorithm that draws on
+specific features of neural computation to achieve OOD generalization, and
+provide a proof of concept by evaluating performance on two challenging
+cognitive tasks. First, we draw on the fact that the mammalian brain represents
+metric spaces using grid cell code (e.g., in entorhinal cortex): abstract
+representations of relational structure, organized in recurring motifs that
+cover the representational space. Second, we propose an attentional mechanism
+that operates over the grid cell code using Determinantal Point Processes
+(DPP), which we call DPP attention (DPP-A) -- a transformation that ensures
+maximum sparseness in the coverage of that space. We show that a loss function
+that combines standard task-optimized error with DPP-A can exploit the
+recurring motifs in the grid cell code, and can be integrated with common
+architectures to achieve strong OOD generalization performance on analogy and
+arithmetic tasks. This provides both an interpretation of how the grid cell
+code in the mammalian brain may contribute to generalization performance, and
+at the same time a potential means for improving such capabilities in
+artificial neural networks.
+
+
+
+ comment: 29 pages (including Appendix), 21 figures +
+
+
+
+
+ + ♻ ☆ Labeling Neural Representations with Inverse Recognition + + +
+ Deep Neural Networks (DNNs) demonstrate remarkable capabilities in learning +complex hierarchical data representations, but the nature of these +representations remains largely unknown. Existing global explainability +methods, such as Network Dissection, face limitations such as reliance on +segmentation masks, lack of statistical significance testing, and high +computational demands. We propose Inverse Recognition (INVERT), a scalable +approach for connecting learned representations with human-understandable +concepts by leveraging their capacity to discriminate between these concepts. +In contrast to prior work, INVERT is capable of handling diverse types of +neurons, exhibits less computational complexity, and does not rely on the +availability of segmentation masks. Moreover, INVERT provides an interpretable +metric assessing the alignment between the representation and its corresponding +explanation and delivering a measure of statistical significance. We +demonstrate the applicability of INVERT in various scenarios, including the +identification of representations affected by spurious correlations, and the +interpretation of the hierarchical structure of decision-making within the +models. + +
+
+ comment: 25 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Exploring Parameter-Efficient Fine-Tuning Techniques for Code Generation + with Large Language Models + + +
+ Large Language Models (LLMs) demonstrate impressive capabilities to generate +accurate code snippets given natural language intents in zero-shot, i.e., +without the need for specific fine-tuning. While prior studies have highlighted +the advantages of fine-tuning LLMs, this process incurs high computational +costs, making it impractical in resource-scarce environments, particularly for +models with billions of parameters. To address these challenges, previous +research explored In-Context Learning (ICL) as a strategy to guide the LLM +generative process with task-specific prompt examples. However, ICL introduces +inconveniences, such as the need for designing contextually relevant prompts +and the absence of learning task-specific parameters, thereby limiting +downstream task performance. In this context, we foresee Parameter-Efficient +Fine-Tuning (PEFT) techniques as a promising approach to efficiently specialize +LLMs to task-specific data while maintaining reasonable resource consumption. +In this paper, we deliver a comprehensive study of PEFT techniques for LLMs +under the automated code generation scenario. Our comprehensive investigation +of PEFT techniques for LLMs reveals their superiority and potential over ICL +across a diverse set of LLMs. Additionally, we demonstrate the extended +capabilities of PEFT, showcasing its ability to learn from two distinct +datasets jointly without compromising performance. Furthermore, our study +highlights the potential for tuning larger LLMs and significant reductions in +memory usage by combining PEFT with quantization. Therefore, this study opens +opportunities for broader applications of PEFT in software engineering +scenarios. Our code is available at +https://github.com/martin-wey/peft-llm-code/. + +
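For readers unfamiliar with PEFT in practice, a LoRA setup with the Hugging Face `peft` library looks roughly as follows; the base checkpoint and hyperparameters here are illustrative, and some models require `target_modules` to be set explicitly.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model

base = "bigcode/starcoderbase-1b"        # illustrative code LLM; any causal LM works
model = AutoModelForCausalLM.from_pretrained(base)
tokenizer = AutoTokenizer.from_pretrained(base)

lora = LoraConfig(task_type=TaskType.CAUSAL_LM, r=8, lora_alpha=16, lora_dropout=0.05)
model = get_peft_model(model, lora)       # freezes the base model; trains only adapters
model.print_trainable_parameters()
# ...then fine-tune on code-generation data with a standard Trainer/accelerate loop.
```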
+
+
+
+
+ + ♻ ☆ Chat Failures and Troubles: Reasons and Solutions + + +
+ This paper examines some common problems in Human-Robot Interaction (HRI)
+that cause failures and troubles in chat. A given use case's design decisions
+start with the suitable robot, the suitable chatting model, identifying common
+problems that cause failures, identifying potential solutions, and planning
+continuous improvement. In conclusion, it is recommended to use a closed-loop
+control algorithm that guides the use of pre-trained Artificial Intelligence
+(AI) models and provides vocabulary filtering, re-trains batched models on new
+datasets, learns online from data streams, and/or uses reinforcement learning
+models to self-update the trained models and reduce errors.
+
+
+
+ comment: In WTF Workshop Proceedings (arXiv:2401.04108) held in conjunction + with the ACM conference on Conversational User Interfaces (CUI), 19 - 21/07 + 2023, in Eindhoven, The Netherlands +
+
+
+
+
+ + ♻ ☆ FedA3I: Annotation Quality-Aware Aggregation for Federated Medical Image + Segmentation against Heterogeneous Annotation Noise AAAI'24 + + +
+ Federated learning (FL) has emerged as a promising paradigm for training +segmentation models on decentralized medical data, owing to its +privacy-preserving property. However, existing research overlooks the prevalent +annotation noise encountered in real-world medical datasets, which limits the +performance ceilings of FL. In this paper, we, for the first time, identify and +tackle this problem. For problem formulation, we propose a contour evolution +for modeling non-independent and identically distributed (Non-IID) noise across +pixels within each client and then extend it to the case of multi-source data +to form a heterogeneous noise model (i.e., Non-IID annotation noise across +clients). For robust learning from annotations with such two-level Non-IID +noise, we emphasize the importance of data quality in model aggregation, +allowing high-quality clients to have a greater impact on FL. To achieve this, +we propose Federated learning with Annotation quAlity-aware AggregatIon, named +FedA3I, by introducing a quality factor based on client-wise noise estimation. +Specifically, noise estimation at each client is accomplished through the +Gaussian mixture model and then incorporated into model aggregation in a +layer-wise manner to up-weight high-quality clients. Extensive experiments on +two real-world medical image segmentation datasets demonstrate the superior +performance of FedA$^3$I against the state-of-the-art approaches in dealing +with cross-client annotation noise. The code is available at +https://github.com/wnn2000/FedAAAI. + +
+
+ comment: Accepted at AAAI'24 +
+
+
+
+
+ + ♻ ☆ CTAGE: Curvature-Based Topology-Aware Graph Embedding for Learning + Molecular Representations + + +
+ AI-driven drug design relies significantly on predicting molecular +properties, which is a complex task. In current approaches, the most commonly +used feature representations for training deep neural network models are based +on SMILES and molecular graphs. While these methods are concise and efficient, +they have limitations in capturing complex spatial information. Recently, +researchers have recognized the importance of incorporating three-dimensional +information of molecular structures into models. However, capturing spatial +information requires the introduction of additional units in the generator, +bringing additional design and computational costs. Therefore, it is necessary +to develop a method for predicting molecular properties that effectively +combines spatial structural information while maintaining the simplicity and +efficiency of graph neural networks. In this work, we propose an embedding +approach CTAGE, utilizing $k$-hop discrete Ricci curvature to extract +structural insights from molecular graph data. This effectively integrates +spatial structural information while preserving the training complexity of the +network. Experimental results indicate that introducing node curvature +significantly improves the performance of current graph neural network +frameworks, validating that the information from k-hop node curvature +effectively reflects the relationship between molecular structure and function. + +
+
+
+
+
+ + ♻ ☆ Nearly $d$-Linear Convergence Bounds for Diffusion Models via Stochastic + Localization + + +
+ Denoising diffusions are a powerful method to generate approximate samples +from high-dimensional data distributions. Recent results provide polynomial +bounds on their convergence rate, assuming $L^2$-accurate scores. Until now, +the tightest bounds were either superlinear in the data dimension or required +strong smoothness assumptions. We provide the first convergence bounds which +are linear in the data dimension (up to logarithmic factors) assuming only +finite second moments of the data distribution. We show that diffusion models +require at most $\tilde O(\frac{d \log^2(1/\delta)}{\varepsilon^2})$ steps to +approximate an arbitrary distribution on $\mathbb{R}^d$ corrupted with Gaussian +noise of variance $\delta$ to within $\varepsilon^2$ in KL divergence. Our +proof extends the Girsanov-based methods of previous works. We introduce a +refined treatment of the error from discretizing the reverse SDE inspired by +stochastic localization. + +
+
+
+
+
+ + ♻ ☆ An Embarrassingly Simple Baseline for Imbalanced Semi-Supervised + Learning + + +
+ Semi-supervised learning (SSL) has shown great promise in leveraging +unlabeled data to improve model performance. While standard SSL assumes uniform +data distribution, we consider a more realistic and challenging setting called +imbalanced SSL, where imbalanced class distributions occur in both labeled and +unlabeled data. Although there are existing endeavors to tackle this challenge, +their performance degenerates when facing severe imbalance since they can not +reduce the class imbalance sufficiently and effectively. In this paper, we +study a simple yet overlooked baseline -- SimiS -- which tackles data imbalance +by simply supplementing labeled data with pseudo-labels, according to the +difference in class distribution from the most frequent class. Such a simple +baseline turns out to be highly effective in reducing class imbalance. It +outperforms existing methods by a significant margin, e.g., 12.8%, 13.6%, and +16.7% over previous SOTA on CIFAR100-LT, FOOD101-LT, and ImageNet127 +respectively. The reduced imbalance results in faster convergence and better +pseudo-label accuracy of SimiS. The simplicity of our method also makes it +possible to be combined with other re-balancing techniques to improve the +performance further. Moreover, our method shows great robustness to a wide +range of data distributions, which holds enormous potential in practice. Code +will be publicly available. + +
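The supplementation rule described above is simple enough to state in a few lines; the sketch below assumes a pool of confidently pseudo-labeled samples per class and fills each class up to the size of the most frequent one (illustrative interface, not the authors' code).

```python
def supplement_with_pseudo_labels(labeled_counts, pseudo_pool):
    """Return, per class, the pseudo-labeled sample ids used to top up that class.

    `labeled_counts` maps class -> number of labeled samples; `pseudo_pool`
    maps class -> list of confidently pseudo-labeled sample ids.
    """
    target = max(labeled_counts.values())          # size of the most frequent class
    return {cls: pseudo_pool.get(cls, [])[: target - count]
            for cls, count in labeled_counts.items()}

# Class 1 is short by 70 samples, so up to 70 pseudo-labels are added to it.
print(supplement_with_pseudo_labels({0: 100, 1: 30}, {1: list(range(200))}))
```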
+
+ comment: Issues in the paper, will re-open later +
+
+
+
+
+ + ♻ ☆ Upper and lower bounds for the Lipschitz constant of random neural + networks + + +
+ Empirical studies have widely demonstrated that neural networks are highly +sensitive to small, adversarial perturbations of the input. The worst-case +robustness against these so-called adversarial examples can be quantified by +the Lipschitz constant of the neural network. In this paper, we study upper and +lower bounds for the Lipschitz constant of random ReLU neural networks. +Specifically, we assume that the weights and biases follow a generalization of +the He initialization, where general symmetric distributions for the biases are +permitted. For shallow neural networks, we characterize the Lipschitz constant +up to an absolute numerical constant. For deep networks with fixed depth and +sufficiently large width, our established upper bound is larger than the lower +bound by a factor that is logarithmic in the width. + +
+
+
+
+
+ + ♻ ☆ On Mitigating the Utility-Loss in Differentially Private Learning: A new + Perspective by a Geometrically Inspired Kernel Approach + + +
+ Privacy-utility tradeoff remains as one of the fundamental issues of +differentially private machine learning. This paper introduces a geometrically +inspired kernel-based approach to mitigate the accuracy-loss issue in +classification. In this approach, a representation of the affine hull of given +data points is learned in Reproducing Kernel Hilbert Spaces (RKHS). This leads +to a novel distance measure that hides privacy-sensitive information about +individual data points and improves the privacy-utility tradeoff via +significantly reducing the risk of membership inference attacks. The +effectiveness of the approach is demonstrated through experiments on MNIST +dataset, Freiburg groceries dataset, and a real biomedical dataset. It is +verified that the approach remains computationally practical. The application +of the approach to federated learning is considered and it is observed that the +accuracy-loss due to data being distributed is either marginal or not +significantly high. + +
+
+
+
+
+ + ♻ ☆ Large Language Model-Enhanced Algorithm Selection: Towards Comprehensive + Algorithm Representation + + +
+ Algorithm selection aims to identify the most suitable algorithm for solving +a specific problem before execution, which has become a critical process of the +AutoML. Current mainstream algorithm selection techniques rely heavily on +feature representations of various problems and employ the performance of each +algorithm as supervised information. However, there is a significant research +gap concerning the consideration of algorithm features. This gap is primarily +attributed to the inherent complexity of algorithms, making it particularly +challenging to find a universally effective feature extraction method that is +applicable across a diverse range of algorithms. Unfortunately, neglecting this +aspect undoubtedly impacts the accuracy of algorithm selection and indirectly +necessitates an increased volume of problem data for training purposes. This +paper takes a significant stride towards addressing this gap by proposing an +approach that integrates algorithm representation into the algorithm selection +process. Specifically, our proposed model employs distinct modules to extract +representations of both problems and algorithms, where the algorithm +representation leverages the capabilities of pre-trained LLMs in the realm of +code comprehension. Following the extraction of embedding vectors for both +algorithms and problems, the most suitable algorithm is determined through +calculations of matching degrees. Our experiments not only validate the +effectiveness of the proposed model but also showcase the performance of +different embedded pre-trained LLMs, which suggests that the proposed algorithm +selection framework holds the potential to serve as a baseline task for +evaluating the code representation capabilities of LLMs. + +
+
+
+
+
+ + ♻ ☆ Semantic-Guided Generative Image Augmentation Method with Diffusion + Models for Image Classification AAAI 2024 + + +
+ Existing image augmentation methods consist of two categories: +perturbation-based methods and generative methods. Perturbation-based methods +apply pre-defined perturbations to augment an original image, but only locally +vary the image, thus lacking image diversity. In contrast, generative methods +bring more image diversity in the augmented images but may not preserve +semantic consistency, thus incorrectly changing the essential semantics of the +original image. To balance image diversity and semantic consistency in +augmented images, we propose SGID, a Semantic-guided Generative Image +augmentation method with Diffusion models for image classification. +Specifically, SGID employs diffusion models to generate augmented images with +good image diversity. More importantly, SGID takes image labels and captions as +guidance to maintain semantic consistency between the augmented and original +images. Experimental results show that SGID outperforms the best augmentation +baseline by 1.72% on ResNet-50 (from scratch), 0.33% on ViT (ImageNet-21k), and +0.14% on CLIP-ViT (LAION-2B). Moreover, SGID can be combined with other image +augmentation baselines and further improves the overall performance. We +demonstrate the semantic consistency and image diversity of SGID through +quantitative human and automated evaluations, as well as qualitative case +studies. + +
+
+ comment: AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Grammar-Constrained Decoding for Structured NLP Tasks without Finetuning EMNLP 2023 + + +
+ Despite their impressive performance, large language models (LMs) still +struggle with reliably generating complex output structures when not finetuned +to follow the required output format exactly. To address this issue, +grammar-constrained decoding (GCD) can be used to control the generation of +LMs, guaranteeing that the output follows a given structure. Most existing GCD +methods are, however, limited to specific tasks, such as parsing or code +generation. In this work, we demonstrate that formal grammars can describe the +output space for a much wider range of tasks and argue that GCD can serve as a +unified framework for structured NLP tasks in general. For increased +flexibility, we introduce input-dependent grammars, which allow the grammar to +depend on the input and thus enable the generation of different output +structures for different inputs. We then empirically demonstrate the power and +flexibility of GCD-enhanced LMs on (1) information extraction, (2) entity +disambiguation, and (3) constituency parsing. Our results indicate that +grammar-constrained LMs substantially outperform unconstrained LMs or even beat +task-specific finetuned models. Grammar constraints thus hold great promise for +harnessing off-the-shelf LMs for a wide range of structured NLP tasks, +especially where training data is scarce or finetuning is expensive. Code and +data: https://github.com/epfl-dlab/GCD. + +
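A minimal sketch of the core idea behind grammar-constrained decoding: at each step, only tokens the grammar allows keep probability mass, and the rest are masked out before the next token is chosen. The grammar engine that produces `allowed_token_ids` (for an input-dependent grammar this set would depend on the current parse state) is assumed rather than shown, and the names are illustrative, not the paper's implementation.

```python
import math

def constrained_next_token(logits, allowed_token_ids):
    """One grammar-constrained decoding step: tokens the grammar forbids get
    zero probability, then the remaining mass is renormalized.

    logits: per-token scores from the language model.
    allowed_token_ids: non-empty set of token ids permitted by the (possibly
    input-dependent) grammar at this position -- supplied by a hypothetical
    grammar engine, e.g. an incremental parser state.
    """
    allowed = sorted(allowed_token_ids)
    m = max(logits[i] for i in allowed)                       # for numerical stability
    weights = {i: math.exp(logits[i] - m) for i in allowed}
    z = sum(weights.values())
    probs = {i: w / z for i, w in weights.items()}            # constrained distribution
    return max(probs, key=probs.get)                          # greedy choice among legal tokens
```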
+
+ comment: Accepted at EMNLP 2023 Main Conference +
+
+
+
+
+ + ♻ ☆ Relaxing the Additivity Constraints in Decentralized No-Regret + High-Dimensional Bayesian Optimization + + +
+ Bayesian Optimization (BO) is typically used to optimize an unknown function +$f$ that is noisy and costly to evaluate, by exploiting an acquisition function +that must be maximized at each optimization step. Even if provably +asymptotically optimal BO algorithms are efficient at optimizing +low-dimensional functions, scaling them to high-dimensional spaces remains an +open problem, often tackled by assuming an additive structure for $f$. By doing +so, BO algorithms typically introduce additional restrictive assumptions on the +additive structure that reduce their applicability domain. This paper contains +two main contributions: (i) we relax the restrictive assumptions on the +additive structure of $f$ without weakening the maximization guarantees of the +acquisition function, and (ii) we address the over-exploration problem for +decentralized BO algorithms. To these ends, we propose DuMBO, an asymptotically +optimal decentralized BO algorithm that achieves very competitive performance +against state-of-the-art BO algorithms, especially when the additive structure +of $f$ comprises high-dimensional factors. + +
+
+
+
+
+ + ♻ ☆ Detecting Change Intervals with Isolation Distributional Kernel + + +
+ Detecting abrupt changes in data distribution is one of the most significant +tasks in streaming data analysis. Although many unsupervised Change-Point +Detection (CPD) methods have been proposed recently to identify those changes, +they still suffer from missing subtle changes, poor scalability, and/or +sensitivity to outliers. To meet these challenges, we are the first to +generalise the CPD problem into the Change-Interval Detection +(CID) problem, of which CPD is a special case. Then we propose a CID method, named iCID, based on a recent +Isolation Distributional Kernel (IDK). iCID identifies a change interval if +there is a high dissimilarity score between two temporally adjacent, +non-homogeneous intervals. The data-dependent property and finite feature map of IDK +enable iCID to efficiently identify various types of change-points in data +streams while tolerating outliers. Moreover, the proposed online and +offline versions of iCID have the ability to optimise key parameter settings. +The effectiveness and efficiency of iCID have been systematically verified on +both synthetic and real-world datasets. + +
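A minimal sketch of the interval-dissimilarity idea: score two adjacent windows by the similarity of their kernel mean embeddings and flag a change interval when the score is high. A plain RBF kernel stands in below for the Isolation Distributional Kernel used by iCID, so this is an assumption-laden illustration rather than the authors' method.

```python
import numpy as np

def interval_dissimilarity(window_a, window_b, gamma=1.0):
    """Dissimilarity between two adjacent intervals via kernel mean embeddings.

    window_a, window_b: 2-D arrays (points as rows) for the two intervals.
    An RBF kernel is used here as a stand-in for the Isolation Distributional
    Kernel. Returns a value in [0, 1]; values near 1 suggest a change interval.
    """
    def mean_kernel(X, Y):
        d = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)   # pairwise squared distances
        return np.exp(-gamma * d).mean()                      # average kernel value

    kaa = mean_kernel(window_a, window_a)
    kbb = mean_kernel(window_b, window_b)
    kab = mean_kernel(window_a, window_b)
    similarity = kab / np.sqrt(kaa * kbb)                      # normalized embedding similarity
    return 1.0 - similarity
```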
+
+
+
+
+ + ♻ ☆ TopCoW: Benchmarking Topology-Aware Anatomical Segmentation of the + Circle of Willis (CoW) for CTA and MRA MICCAI + + +
+ The Circle of Willis (CoW) is an important network of arteries connecting +major circulations of the brain. Its vascular architecture is believed to +affect the risk, severity, and clinical outcome of serious neuro-vascular +diseases. However, characterizing the highly variable CoW anatomy is still a +manual and time-consuming expert task. The CoW is usually imaged by two +angiographic imaging modalities, magnetic resonance angiography (MRA) and +computed tomography angiography (CTA), but there exist limited public datasets +with annotations on CoW anatomy, especially for CTA. Therefore we organized the +TopCoW Challenge in 2023 with the release of an annotated CoW dataset. The +TopCoW dataset was the first public dataset with voxel-level annotations for +thirteen possible CoW vessel components, enabled by virtual-reality (VR) +technology. It was also the first large dataset with paired MRA and CTA from +the same patients. TopCoW challenge formalized the CoW characterization problem +as a multiclass anatomical segmentation task with an emphasis on topological +metrics. We invited submissions worldwide for the CoW segmentation task, which +attracted over 140 registered participants from four continents. The top +performing teams managed to segment many CoW components to Dice scores around +90%, but with lower scores for communicating arteries and rare variants. There +were also topological mistakes for predictions with high Dice scores. +Additional topological analysis revealed further areas for improvement in +detecting certain CoW components and matching CoW variant topology accurately. +TopCoW represented a first attempt at benchmarking the CoW anatomical +segmentation task for MRA and CTA, both morphologically and topologically. + +
+
+ comment: 23 pages, 11 figures, 9 tables. Summary Paper for the MICCAI TopCoW + 2023 Challenge +
+
+
+
+
+ + ♻ ☆ Training Neural Networks is NP-Hard in Fixed Dimension NeurIPS 2023 + + +
+ We study the parameterized complexity of training two-layer neural networks +with respect to the dimension of the input data and the number of hidden +neurons, considering ReLU and linear threshold activation functions. Although the +computational complexity of these problems has been studied numerous times in +recent years, several questions remain open. We answer questions by Arora et +al. [ICLR '18] and Khalife and Basu [IPCO '22] by showing that both problems are +NP-hard for two dimensions, which excludes any polynomial-time algorithm for +constant dimension. We also answer a question by Froese et al. [JAIR '22] by +proving W[1]-hardness for four ReLUs (or two linear threshold neurons) with +zero training error. Finally, in the ReLU case, we show fixed-parameter +tractability for the combined parameter number of dimensions and number of +ReLUs if the network is assumed to compute a convex map. Our results settle the +complexity status regarding these parameters almost completely. + +
+
+ comment: Paper accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Yet Another ICU Benchmark: A Flexible Multi-Center Framework for + Clinical ML + + +
+ Medical applications of machine learning (ML) have experienced a surge in +popularity in recent years. The intensive care unit (ICU) is a natural habitat +for ML given the abundance of available data from electronic health records. +Models have been proposed to address numerous ICU prediction tasks like the +early detection of complications. While authors frequently report +state-of-the-art performance, it is challenging to verify claims of +superiority. Datasets and code are not always published, and cohort +definitions, preprocessing pipelines, and training setups are difficult to +reproduce. This work introduces Yet Another ICU Benchmark (YAIB), a modular +framework that allows researchers to define reproducible and comparable +clinical ML experiments; we offer an end-to-end solution from cohort definition +to model evaluation. The framework natively supports most open-access ICU +datasets (MIMIC III/IV, eICU, HiRID, AUMCdb) and is easily adaptable to future +ICU datasets. Combined with a transparent preprocessing pipeline and extensible +training code for multiple ML and deep learning models, YAIB enables unified +model development. Our benchmark comes with five predefined established +prediction tasks (mortality, acute kidney injury, sepsis, kidney function, and +length of stay) developed in collaboration with clinicians. Adding further +tasks is straightforward by design. Using YAIB, we demonstrate that the choice +of dataset, cohort definition, and preprocessing have a major impact on the +prediction performance - often more so than model class - indicating an urgent +need for YAIB as a holistic benchmarking tool. We provide our work to the +clinical ML community to accelerate method development and enable real-world +clinical implementations. Software Repository: +https://github.com/rvandewater/YAIB. + +
+
+ comment: Main benchmark: https://github.com/rvandewater/YAIB, Cohort + generation: https://github.com/rvandewater/YAIB-cohorts, Models: + https://github.com/rvandewater/YAIB-models +
+
+
+
+
+ + ♻ ☆ A Meta-Level Learning Algorithm for Sequential Hyper-Parameter Space + Reduction in AutoML + + +
+ AutoML platforms have numerous options for the algorithms to try for each +step of the analysis, i.e., different possible algorithms for imputation, +transformations, feature selection, and modelling. Finding the optimal +combination of algorithms and hyper-parameter values is computationally +expensive, as the number of combinations to explore leads to an exponential +explosion of the space. In this paper, we present the Sequential +Hyper-parameter Space Reduction (SHSR) algorithm that reduces the space for an +AutoML tool with negligible drop in its predictive performance. SHSR is a +meta-level learning algorithm that analyzes past runs of an AutoML tool on +several datasets and learns which hyper-parameter values to filter out from +consideration on a new dataset to analyze. SHSR is evaluated on 284 +classification and 375 regression problems, showing an approximate 30% +reduction in execution time with a performance drop of less than 0.1%. + +
+
+
+
+
+ + ♻ ☆ Astroconformer: The Prospects of Analyzing Stellar Light Curves with + Transformer-Based Deep Learning Models + + +
+ Stellar light curves contain valuable information about oscillations and +granulation, offering insights into stars' internal structures and evolutionary +states. Traditional asteroseismic techniques, primarily focused on power +spectral analysis, often overlook the crucial phase information in these light +curves. Addressing this gap, recent machine learning applications, particularly +those using Convolutional Neural Networks (CNNs), have made strides in +inferring stellar properties from light curves. However, CNNs are limited by +their localized feature extraction capabilities. In response, we introduce +$\textit{Astroconformer}$, a Transformer-based deep learning framework, +specifically designed to capture long-range dependencies in stellar light +curves. Our empirical analysis centers on estimating surface gravity ($\log +g$), using a dataset derived from single-quarter Kepler light curves with $\log +g$ values ranging from 0.2 to 4.4. $\textit{Astroconformer}$ demonstrates +superior performance, achieving a root-mean-square-error (RMSE) of 0.017 dex at +$\log g\approx3$ in data-rich regimes and up to 0.1 dex in sparser areas. This +performance surpasses both K-nearest neighbor models and advanced CNNs. +Ablation studies highlight the influence of receptive field size on model +effectiveness, with larger fields correlating with improved results. +$\textit{Astroconformer}$ also excels in extracting $\nu_{\max}$ with high +precision. It achieves less than 2% relative median absolute error for 90-day +red giant light curves. Notably, the error remains under 3% for 30-day light +curves, whose oscillations are undetectable by a conventional pipeline in 30% of +cases. Furthermore, the attention mechanisms in $\textit{Astroconformer}$ align +closely with the characteristics of stellar oscillations and granulation +observed in light curves. + +
+
+ comment: 15 pages, 10 figures, Accepted by MNRAS +
+
+
+
+
+ + ♻ ☆ DAISM: Digital Approximate In-SRAM Multiplier-based Accelerator for DNN + Training and Inference + + +
+ DNNs are widely used but face significant computational costs due to matrix +multiplications, especially from data movement between the memory and +processing units. Processing-in-Memory (PIM) is therefore a promising approach, as +it greatly reduces this overhead. However, most PIM solutions rely either on +novel memory technologies that have yet to mature or on bit-serial computations +that have significant performance overhead and scalability issues. Our work +proposes an in-SRAM digital multiplier that uses conventional memory to +perform bit-parallel computations by activating multiple wordlines. We +then introduce DAISM, an architecture leveraging this multiplier, which +achieves up to two orders of magnitude higher area efficiency compared to +SOTA counterparts, with competitive energy efficiency. + +
+
+ comment: 12 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Explainable Reinforcement Learning via a Causal World Model IJCAI 2023 + + +
+ Generating explanations for reinforcement learning (RL) is challenging as +actions may produce long-term effects on the future. In this paper, we develop +a novel framework for explainable RL by learning a causal world model without +prior knowledge of the causal structure of the environment. The model captures +the influence of actions, allowing us to interpret the long-term effects of +actions through causal chains, which present how actions influence +environmental variables and finally lead to rewards. Different from most +explanatory models which suffer from low accuracy, our model remains accurate +while improving explainability, making it applicable in model-based learning. +As a result, we demonstrate that our causal model can serve as the bridge +between explainability and learning. + +
+
+ comment: Accepted by IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Unsupervised Multiple Domain Translation through Controlled + Disentanglement in Variational Autoencoder + + +
+ Unsupervised Multiple Domain Translation is the task of transforming data +from one domain to other domains without having paired data to train the +systems. Typically, methods based on Generative Adversarial Networks (GANs) are +used to address this task. However, our proposal relies exclusively on a +modified version of a Variational Autoencoder. This modification consists in +using two latent variables that are disentangled in a controlled way by design. One +of these latent variables is constrained to depend exclusively on the domain, while +the other one must depend on the rest of the variability factors of the data. +Additionally, the conditions imposed on the domain latent variable allow for +better control and understanding of the latent space. We empirically +demonstrate that our approach works on different vision datasets, improving on the +performance of other well-known methods. Finally, we prove that, indeed, one of +the latent variables stores all the information related to the domain and the +other one hardly contains any domain information. + +
+
+
+
+
+ + ♻ ☆ Unexpected Improvements to Expected Improvement for Bayesian + Optimization NeurIPS 2023 + + +
+ Expected Improvement (EI) is arguably the most popular acquisition function +in Bayesian optimization and has found countless successful applications, but +its performance is often exceeded by that of more recent methods. Notably, EI +and its variants, including for the parallel and multi-objective settings, are +challenging to optimize because their acquisition values vanish numerically in +many regions. This difficulty generally increases as the number of +observations, dimensionality of the search space, or the number of constraints +grow, resulting in performance that is inconsistent across the literature and +most often sub-optimal. Herein, we propose LogEI, a new family of acquisition +functions whose members either have identical or approximately equal optima as +their canonical counterparts, but are substantially easier to optimize +numerically. We demonstrate that numerical pathologies manifest themselves in +"classic" analytic EI, Expected Hypervolume Improvement (EHVI), as well as +their constrained, noisy, and parallel variants, and propose corresponding +reformulations that remedy these pathologies. Our empirical results show that +members of the LogEI family of acquisition functions substantially improve on +the optimization performance of their canonical counterparts and surprisingly, +are on par with or exceed the performance of recent state-of-the-art +acquisition functions, highlighting the understated role of numerical +optimization in the literature. + +
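To make the numerical pathology concrete, the sketch below evaluates the standard analytic EI for a Gaussian posterior (maximization convention) and then its naive logarithm, which underflows to -inf far from the incumbent. The paper's LogEI family replaces this naive log with numerically robust reformulations, which are not reproduced here; SciPy is assumed to be available.

```python
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, best_f):
    """Analytic EI under a Gaussian posterior N(mu, sigma^2), for maximization."""
    z = (mu - best_f) / sigma
    return sigma * (z * norm.cdf(z) + norm.pdf(z))

def naive_log_ei(mu, sigma, best_f):
    """Naive log(EI): underflows to -inf when the posterior is far below best_f,
    which is exactly the pathology the LogEI acquisition functions avoid."""
    return np.log(expected_improvement(mu, sigma, best_f))

# Example of the failure mode: naive_log_ei(0.0, 1.0, 40.0) returns -inf in
# float64, even though the true log-EI is finite (roughly -808).
```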
+
+ comment: NeurIPS 2023 Spotlight +
+
+
+
+
+ + ♻ ☆ A Constrained BA Algorithm for Rate-Distortion and Distortion-Rate + Functions + + +
+ The Blahut-Arimoto (BA) algorithm has played a fundamental role in the +numerical computation of rate-distortion (RD) functions. This algorithm +possesses a desirable monotonic convergence property by alternatively +minimizing its Lagrangian with a fixed multiplier. In this paper, we propose a +novel modification of the BA algorithm, wherein the multiplier is updated +through a one-dimensional root-finding step using a monotonic univariate +function, efficiently implemented by Newton's method in each iteration. +Consequently, the modified algorithm directly computes the RD function for a +given target distortion, without exploring the entire RD curve as in the +original BA algorithm. Moreover, this modification presents a versatile +framework, applicable to a wide range of problems, including the computation of +distortion-rate (DR) functions. Theoretical analysis shows that the outputs of +the modified algorithms still converge to the solutions of the RD and DR +functions with rate $O(1/n)$, where $n$ is the number of iterations. +Additionally, these algorithms provide $\varepsilon$-approximation solutions +with $O\left(\frac{MN\log N}{\varepsilon}(1+\log |\log \varepsilon|)\right)$ +arithmetic operations, where $M,N$ are the sizes of source and reproduced +alphabets respectively. Numerical experiments demonstrate that the modified +algorithms exhibit significant acceleration compared with the original BA +algorithms and showcase commendable performance across classical source +distributions such as discretized Gaussian, Laplacian and uniform sources. + +
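A rough sketch of the idea, assuming the classical discrete rate-distortion setting: an inner Blahut-Arimoto loop runs at a fixed multiplier, and an outer one-dimensional root-finding loop adjusts the multiplier until the expected distortion reaches the target. Bisection is used below purely for brevity where the paper uses a Newton step on a monotonic univariate function; names and default values are illustrative.

```python
import numpy as np

def ba_fixed_multiplier(p_x, dist, beta, iters=300):
    """Classical Blahut-Arimoto inner loop at a fixed multiplier beta >= 0.

    p_x: source distribution (length M); dist: M x N distortion matrix.
    Returns (rate in nats, expected distortion)."""
    q_y = np.full(dist.shape[1], 1.0 / dist.shape[1])
    for _ in range(iters):
        w = q_y[None, :] * np.exp(-beta * dist)            # unnormalized Q(y|x)
        Q = w / w.sum(axis=1, keepdims=True)
        q_y = p_x @ Q                                       # updated output marginal
    D = float((p_x[:, None] * Q * dist).sum())              # expected distortion
    log_ratio = np.log(np.maximum(Q, 1e-300) / np.maximum(q_y[None, :], 1e-300))
    R = float((p_x[:, None] * Q * log_ratio).sum())         # mutual information (nats)
    return R, D

def rate_at_target_distortion(p_x, dist, d_target, lo=1e-3, hi=1e3, steps=60):
    """Outer loop: root-find the multiplier so that the expected distortion hits
    d_target (distortion is monotone decreasing in beta). Bisection stands in
    here for the paper's Newton step."""
    for _ in range(steps):
        beta = 0.5 * (lo + hi)
        rate, d = ba_fixed_multiplier(p_x, dist, beta)
        lo, hi = (beta, hi) if d > d_target else (lo, beta)
    return rate, d, beta
```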
+
+ comment: Version_2 +
+
+
+
+
+ + ♻ ☆ LPAC: Learnable Perception-Action-Communication Loops with Applications + to Coverage Control + + +
+ Coverage control is the problem of navigating a robot swarm to +collaboratively monitor features or a phenomenon of interest not known a +priori. The problem is challenging in decentralized settings with robots that +have limited communication and sensing capabilities. We propose a learnable +Perception-Action-Communication (LPAC) architecture for the problem, wherein a +convolutional neural network (CNN) processes localized perception; a graph neural +network (GNN) facilitates robot communications; and finally, a shallow multi-layer +perceptron (MLP) computes robot actions. The GNN enables collaboration in the +robot swarm by computing what information to communicate with nearby robots and +how to incorporate received information. Evaluations show that the LPAC models +-- trained using imitation learning -- outperform standard decentralized and +centralized coverage control algorithms. The learned policy generalizes to +environments different from the training dataset, transfers to larger +environments with more robots, and is robust to noisy position estimates. The +results indicate the suitability of LPAC architectures for decentralized +navigation in robot swarms to achieve collaborative behavior. + +
+
+
+
+
+ + ♻ ☆ Towards Open Federated Learning Platforms: Survey and Vision from + Technical and Legal Perspectives + + +
+ Traditional Federated Learning (FL) follows a server-dominated cooperation +paradigm which narrows the application scenarios of FL and decreases the +enthusiasm of data holders to participate. To fully unleash the potential of +FL, we advocate rethinking the design of current FL frameworks and extending them +to a more generalized concept: Open Federated Learning Platforms. We propose +two reciprocal cooperation frameworks for FL to achieve this: query-based FL +and contract-based FL. In this survey, we conduct a comprehensive review of the +feasibility of constructing an open FL platform from both technical and legal +perspectives. We begin by reviewing the definition of FL and summarizing its +inherent limitations, including server-client coupling, low model reusability, +and lack of openness. In the query-based FL platform, which is an open model sharing +and reusing platform empowered by the community for model mining, we explore a +wide range of valuable topics, including the availability of up-to-date model +repositories for model querying, legal compliance analysis between different +model licenses, and copyright issues and intellectual property protection in +model reusing. In particular, we introduce a novel taxonomy to streamline the +analysis of model license compatibility in FL studies that involve batch model +reusing methods, including combination, amalgamation, distillation, and +generation. This taxonomy provides a systematic framework for identifying the +corresponding clauses of licenses and facilitates the identification of +potential legal implications and restrictions when reusing models. Through this +survey, we uncover the current dilemmas faced by FL and advocate for the +development of sustainable open FL platforms. We aim to provide guidance for +establishing such platforms in the future, while identifying potential problems +and challenges that need to be addressed. + +
+
+ comment: This is an ongoing work. See the latest version on + https://github.com/morningD/Model-Centric-FML +
+
+
+
+
+ + ♻ ☆ DKiS: Decay weight invertible image steganography with private key + + +
+ Image steganography, defined as the practice of concealing information within +another image, traditionally encounters security challenges when its methods +become publicly known or are under attack. To address this, a novel private +key-based image steganography technique has been introduced. This approach +ensures the security of the hidden information, as access requires a +corresponding private key, regardless of the public knowledge of the +steganography method. Experimental evidence has been presented, demonstrating +the effectiveness of our method and showcasing its real-world applicability. +Furthermore, a critical challenge in the invertible image steganography process +has been identified by us: the transfer of non-essential, or `garbage', +information from the secret to the host pipeline. To tackle this issue, the +decay weight has been introduced to control the information transfer, +effectively filtering out irrelevant data and enhancing the performance of +image steganography. The code for this technique is publicly accessible at +https://github.com/yanghangAI/DKiS, and a practical demonstration can be found +at http://yanghang.site/hidekey. + +
+
+
+
+
+ + ♻ ☆ Versatile Energy-Based Probabilistic Models for High Energy Physics NeurIPS 2023 + + +
+ As a classical generative modeling approach, energy-based models have the +natural advantage of flexibility in the form of the energy function. Recently, +energy-based models have achieved great success in modeling high-dimensional +data in computer vision and natural language processing. In line with these +advancements, we build a multi-purpose energy-based probabilistic model for +High Energy Physics events at the Large Hadron Collider. This framework builds +on a powerful generative model and describes higher-order inter-particle +interactions. It suits different encoding architectures and builds on implicit +generation. As for applicative aspects, it can serve as a powerful +parameterized event generator for physics simulation, a generic anomalous +signal detector free from spurious correlations, and an augmented event +classifier for particle identification. + +
+
+ comment: 17 pages, 9 figures. NeurIPS 2023 camera ready +
+
+
+
+
+ + ♻ ☆ ESD: Expected Squared Difference as a Tuning-Free Trainable Calibration + Measure ICLR 2023 + + +
+ Studies have shown that modern neural networks tend to be poorly calibrated +due to over-confident predictions. Traditionally, post-processing methods have +been used to calibrate the model after training. In recent years, various +trainable calibration measures have been proposed to incorporate them directly +into the training process. However, these methods all incorporate internal +hyperparameters, and the performance of these calibration objectives relies on +tuning these hyperparameters, incurring more computational costs as the size of +neural networks and datasets become larger. As such, we present Expected +Squared Difference (ESD), a tuning-free (i.e., hyperparameter-free) trainable +calibration objective loss, where we view the calibration error from the +perspective of the squared difference between the two expectations. With +extensive experiments on several architectures (CNNs, Transformers) and +datasets, we demonstrate that (1) incorporating ESD into the training improves +model calibration in various batch size settings without the need for internal +hyperparameter tuning, (2) ESD yields the best-calibrated results compared with +previous approaches, and (3) ESD drastically improves the computational costs +required for calibration during training due to the absence of internal +hyperparameter. The code is publicly accessible at +https://github.com/hee-suk-yoon/ESD. + +
+
+ comment: ICLR 2023 +
+
+
+
+
+ + ♻ ☆ Higher-order Graph Convolutional Network with Flower-Petals Laplacians + on Simplicial Complexes + + +
+ Despite the recent successes of vanilla Graph Neural Networks (GNNs) on +various tasks, their foundation on pairwise networks inherently limits their +capacity to discern latent higher-order interactions in complex systems. To +bridge this capability gap, we propose a novel approach exploiting the rich +mathematical theory of simplicial complexes (SCs) - a robust tool for modeling +higher-order interactions. Current SC-based GNNs are burdened by high +complexity and rigidity, and quantifying higher-order interaction strengths +remains challenging. Innovatively, we present a higher-order Flower-Petals (FP) +model, incorporating FP Laplacians into SCs. Further, we introduce a +Higher-order Graph Convolutional Network (HiGCN) grounded in FP Laplacians, +capable of discerning intrinsic features across varying topological scales. By +employing learnable graph filters, a parameter group within each FP Laplacian +domain, we can identify diverse patterns where the filters' weights serve as a +quantifiable measure of higher-order interaction strengths. The theoretical +underpinnings of HiGCN's advanced expressiveness are rigorously demonstrated. +Additionally, our empirical investigations reveal that the proposed model +accomplishes state-of-the-art performance on a range of graph tasks and +provides a scalable and flexible solution to explore higher-order interactions +in graphs. Codes and datasets are available at +https://github.com/Yiminghh/HiGCN. + +
+
+
+
+
+ + ♻ ☆ NODI: Out-Of-Distribution Detection with Noise from Diffusion + + +
+ Out-of-distribution (OOD) detection is a crucial part of deploying machine +learning models safely. It has been extensively studied, with a plethora of +methods developed in the literature. This problem is typically tackled by computing an OOD +score; however, previous methods compute the OOD scores with limited +usage of the in-distribution dataset. For instance, the OOD scores are computed +with information from a small portion of the in-distribution data. Furthermore, +these methods encode images with a neural image encoder. The robustness of +these methods is rarely checked with respect to image encoders of different +training methods and architectures. In this work, we introduce the diffusion +process into the OOD task. The diffusion model integrates information on the +whole training set into the predicted noise vectors. Moreover, we deduce a +closed-form solution for the noise vector (stable point). The noise vector +is then converted into our OOD score, and we test both the noise vector predicted by +the deep model and the closed-form noise vector on the OOD benchmarks \cite{openood}. +Our method outperforms previous OOD methods across all types of image encoders. +A $3.5\%$ performance gain is achieved with the MAE-based +image encoder. Moreover, we studied the robustness of OOD methods by applying +different types of image encoders. Some OOD methods failed to generalize well +when switching image encoders from ResNet to Vision Transformers, whereas our method +exhibits good robustness with all the image encoders. + +
+
+
+
+
+ + ♻ ☆ Increasing biases can be more efficient than increasing weights WACV 2024 + + +
+ We introduce a novel computational unit for neural networks that features +multiple biases, challenging the traditional perceptron structure. This unit +emphasizes the importance of preserving uncorrupted information as it is passed +from one unit to the next, applying activation functions later in the process +with specialized biases for each unit. Through both empirical and theoretical +analyses, we show that by focusing on increasing biases rather than weights, +there is potential for significant enhancement in a neural network model's +performance. This approach offers an alternative perspective on optimizing +information flow within neural networks. See source code at +https://github.com/CuriosAI/dac-dev. + +
+
+ comment: Major rewriting. Supersedes v1 and v2. Focusing on the fact that not + all parameters are born equal: biases can be more important than weights. + Accordingly, new title and new abstract, and many more experiments on fully + connected architectures. This is the extended version of the paper published + at WACV 2024 +
+
+
+
+
+ + ♻ ☆ LLM4TS: Aligning Pre-Trained LLMs as Data-Efficient Time-Series + Forecasters + + +
+ Multivariate time-series forecasting is vital in various domains, e.g., +economic planning and weather prediction. Deep train-from-scratch models have +exhibited effective performance yet require large amounts of data, which limits +real-world applicability. Recently, researchers have leveraged the +representation learning transferability of pre-trained Large Language Models +(LLMs) to handle limited non-linguistic datasets effectively. However, +incorporating LLMs with time-series data presents challenges of limited +adaptation due to different compositions between time-series and linguistic +data, and the inability to process multi-scale temporal information. To tackle +these challenges, we propose LLM4TS, a framework for time-series forecasting +with pre-trained LLMs. LLM4TS consists of a two-stage fine-tuning strategy: the +\textit{time-series alignment} stage to align LLMs with the nuances of +time-series data, and the \textit{forecasting fine-tuning} stage for downstream +time-series forecasting tasks. Furthermore, our framework features a novel +two-level aggregation method that integrates multi-scale temporal data within +pre-trained LLMs, enhancing their ability to interpret time-specific +information. In experiments across 7 time-series forecasting datasets, LLM4TS +is superior to existing state-of-the-art methods compared with +trained-from-scratch models in full-shot scenarios, and also achieves an +average improvement of 6.84% in MSE in few-shot scenarios. In addition, +evaluations compared with different self-supervised learning approaches +highlight LLM4TS's effectiveness with representation learning in forecasting +tasks. + +
+
+ comment: This paper is currently under review. The code will be made available + upon acceptance +
+
+
+
+
+ + ♻ ☆ Discovering mesoscopic descriptions of collective movement with neural + stochastic modelling + + +
+ Collective motion is a ubiquitous phenomenon in nature, inspiring engineers, +physicists and mathematicians to develop mathematical models and bio-inspired +designs. Collective motion at small to medium group sizes ($\sim$10-1000 +individuals, also called the `mesoscale') can show nontrivial features due to +stochasticity. Therefore, characterizing both the deterministic and stochastic +aspects of the dynamics is crucial in the study of mesoscale collective +phenomena. Here, we use a physics-inspired, neural-network based approach to +characterize the stochastic group dynamics of interacting individuals, through +a stochastic differential equation (SDE) that governs the collective dynamics +of the group. We apply this technique to both synthetic and real-world +datasets, and identify the deterministic and stochastic aspects of the dynamics +using drift and diffusion fields, enabling us to make novel inferences about +the nature of order in these systems. + +
+
+ comment: (v2) Minor corrections and clarifications. Added funding sources +
+
+
+
+
+ + ♻ ☆ MA2GCN: Multi Adjacency relationship Attention Graph Convolutional + Networks for Traffic Prediction using Trajectory data + + +
+ The problem of traffic congestion not only causes large economic +losses, but also seriously endangers the urban environment. Predicting traffic +congestion therefore has important practical significance. So far, most studies have been +based on historical data from sensors placed on different roads to predict +future traffic flow and speed, in order to analyze the traffic congestion conditions of +a certain road segment. However, due to the fixed position of sensors, it is +difficult to mine new information. On the other hand, vehicle trajectory data +is more flexible, and traffic information can be extracted from it as needed. Therefore, we +propose a new traffic congestion prediction model - Multi Adjacency +relationship Attention Graph Convolutional Networks (MA2GCN). This model +transforms vehicle trajectory data into graph-structured data in grid form, +and introduces a vehicle entry-and-exit matrix based on the mobility between +different grids. At the same time, in order to improve the performance of the +model, we also build a new adaptive adjacency matrix generation method +and an adjacency matrix attention module. The model mainly uses gated temporal +convolution and graph convolution to extract temporal and spatial information, +respectively. Compared with multiple baselines, our model achieves the best +performance on the Shanghai taxi GPS trajectory dataset. The code is available at +https://github.com/zachysun/Taxi_Traffic_Benchmark. + +
+
+
+
+
+ + ♻ ☆ Partial Label Learning with a Partner AAAI + + +
+ In partial label learning (PLL), each instance is associated with a set of +candidate labels among which only one is ground-truth. The majority of the +existing works focuses on constructing robust classifiers to estimate the +labeling confidence of candidate labels in order to identify the correct one. +However, these methods usually struggle to rectify mislabeled samples. To help +existing PLL methods identify and rectify mislabeled samples, in this paper, we +introduce a novel partner classifier and propose a novel ``mutual supervision'' +paradigm. Specifically, we instantiate the partner classifier predicated on the +implicit fact that non-candidate labels of a sample should not be assigned to +it, which is inherently accurate and has not been fully investigated in PLL. +Furthermore, a novel collaborative term is formulated to link the base +classifier and the partner one. During each stage of mutual supervision, both +classifiers will blur each other's predictions through a blurring mechanism to +prevent overconfidence in a specific label. Extensive experiments demonstrate +that the performance and disambiguation ability of several well-established +stand-alone and deep-learning based PLL approaches can be significantly +improved by coupling with this learning paradigm. + +
+
+ comment: 2024, AAAI oral +
+
+
+
+
+ + ♻ ☆ SpecTr: Fast Speculative Decoding via Optimal Transport NeurIPS 2023 + + +
+ Autoregressive sampling from large language models has led to +state-of-the-art results in several natural language tasks. However, +autoregressive sampling generates tokens one at a time, making it slow, and even +prohibitive in certain tasks. One way to speed up sampling is +$\textit{speculative decoding}$: use a small model to sample a $\textit{draft}$ +(block or sequence of tokens), and then score all tokens in the draft by the +large language model in parallel. A subset of the tokens in the draft are +accepted (and the rest rejected) based on a statistical method to guarantee +that the final output follows the distribution of the large model. In this +work, we provide a principled understanding of speculative decoding through the +lens of optimal transport (OT) with $\textit{membership cost}$. This framework +can be viewed as an extension of the well-known $\textit{maximal-coupling}$ +problem. This new formulation enables us to generalize the speculative decoding +method to allow for a set of $k$ candidates at the token level, which leads to +an improved optimal membership cost. We show that the optimal draft selection +algorithm (transport plan) can be computed via linear programming, whose +best-known runtime is exponential in $k$. We then propose a valid draft +selection algorithm whose acceptance probability is $(1-1/e)$-optimal +multiplicatively. Moreover, it can be computed in time almost linear in the size +of the domain of a single token. Using this new draft selection algorithm, we +develop a new autoregressive sampling algorithm called $\textit{SpecTr}$, which +provides speedup in decoding while ensuring that there is no quality +degradation in the decoded output. We experimentally demonstrate that for +state-of-the-art large language models, the proposed approach achieves a wall +clock speedup of 2.13X, a further 1.37X speedup over speculative decoding on +standard benchmarks. + +
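For context, the sketch below shows the standard single-draft, token-level accept/reject rule (the maximal-coupling baseline) that SpecTr generalizes to k draft candidates via optimal transport; it is not the paper's k-candidate transport plan, and the names are illustrative.

```python
import numpy as np

def speculative_accept(draft_token, p_large, q_small, rng=None):
    """Single-draft token-level speculative sampling step.

    p_large, q_small: probability vectors of the large and small models at the
    current position; draft_token was sampled from q_small.
    Returns a token distributed exactly according to p_large."""
    rng = rng or np.random.default_rng()
    accept_prob = min(1.0, p_large[draft_token] / q_small[draft_token])
    if rng.random() < accept_prob:
        return int(draft_token)                       # keep the drafted token
    residual = np.clip(p_large - q_small, 0.0, None)  # leftover probability mass
    residual /= residual.sum()
    return int(rng.choice(len(p_large), p=residual))  # resample from the residual
```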
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Input Convex LSTM: A Convex Approach for Fast Lyapunov-Based Model + Predictive Control + + +
+ Leveraging Input Convex Neural Networks (ICNNs), ICNN-based Model Predictive +Control (MPC) successfully attains globally optimal solutions by upholding +convexity within the MPC framework. However, current ICNN architectures +encounter the issue of vanishing/exploding gradients, which limits their +ability to serve as deep neural networks for complex tasks. Additionally, the +current neural network-based MPC, including conventional neural network-based +MPC and ICNN-based MPC, faces slower convergence speed when compared to MPC +based on first-principles models. In this study, we leverage the principles of +ICNNs to propose a novel Input Convex LSTM for Lyapunov-based MPC, with the +specific goal of reducing convergence time and mitigating the +vanishing/exploding gradient problem while ensuring closed-loop stability. From +a simulation study of a nonlinear chemical reactor, we observed a mitigation of +vanishing/exploding gradient problem and a reduction in convergence time, with +a percentage decrease of 46.7%, 31.3%, and 20.2% compared to baseline plain +RNN, plain LSTM, and Input Convex Recurrent Neural Network, respectively. + +
+
+ comment: Submitted to 6th Annual Learning for Dynamics & Control Conference + (L4DC 2024) +
+
+
+
+
+ + ♻ ☆ On Finding Bi-objective Pareto-optimal Fraud Prevention Rule Sets for + Fintech Applications + + +
+ Rules are widely used in Fintech institutions to make fraud prevention +decisions, since rules are highly interpretable thanks to their intuitive +if-then structure. In practice, a two-stage framework of fraud prevention +decision rule set mining is usually employed in large Fintech institutions. +This paper is concerned with finding high-quality rule subsets in a +bi-objective space (such as precision and recall) from an initial pool of +rules. To this end, we adopt the concept of Pareto optimality and aim to find a +set of non-dominated rule subsets, which constitutes a Pareto front. We propose +a heuristic-based framework called PORS and we identify that the core of PORS +is the problem of solution selection on the front (SSF). We provide a +systematic categorization of the SSF problem and a thorough empirical +evaluation of various SSF methods on both public and proprietary datasets. We +also introduce a novel variant of sequential covering algorithm called +SpectralRules to encourage the diversity of the initial rule set and we +empirically find that SpectralRules further improves the quality of the found +Pareto front. On two real application scenarios within Alipay, we demonstrate +the advantages of our proposed methodology compared to existing work. + +
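The Pareto-front notion used here is the usual non-domination filter in (precision, recall) space; a minimal sketch, with hypothetical inputs, follows.

```python
def pareto_front(candidates):
    """Keep the non-dominated rule subsets in (precision, recall) space.

    candidates: list of (name, precision, recall) tuples."""
    front = []
    for name, p, r in candidates:
        dominated = any(p2 >= p and r2 >= r and (p2 > p or r2 > r)
                        for _, p2, r2 in candidates)
        if not dominated:
            front.append((name, p, r))
    return front

# Example: pareto_front([("A", 0.9, 0.3), ("B", 0.7, 0.6), ("C", 0.6, 0.5)])
# keeps A and B; C is dominated by B.
```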
+
+
+
+
+ + ♻ ☆ Framework for Variable-lag Motif Following Relation Inference In Time + Series using Matrix Profile analysis + + +
+ Knowing who follows whom and what patterns they are following are crucial +steps in understanding collective behaviors (e.g., a group of humans, a school of +fish, or a stock market). Time series are one resource that can be used to +gain insight into following relations. However, the concept of following +patterns or motifs, and how to find them in time series, are not +obvious. In this work, we formalize a concept of following motifs between two +time series and present a framework to infer following patterns between two +time series. The framework utilizes the Matrix Profile, an efficient and scalable method for +retrieving motifs from time series. We compare +our proposed framework with several baselines. The framework performs better +than the baselines on the simulated datasets. On a sound-recording dataset, +the framework is able to retrieve following motifs within a pair of time +series in which two singers sing following each other. In the cryptocurrency +dataset, the framework is capable of capturing the following motifs within a +pair of time series from two digital currencies, which implies that the values +of one currency follow the value patterns of another currency. Our framework +can be applied to time series from any field to gain insight into following +patterns between time series. + +
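For illustration, the quantity the Matrix Profile computes can be written as a naive AB-join: for every subsequence of one series, the z-normalized distance to its nearest neighbour among subsequences of the other series. This brute-force sketch is only illustrative and is not the scalable algorithm the framework relies on.

```python
import numpy as np

def matrix_profile_ab(a, b, m):
    """Naive AB-join matrix profile between 1-D arrays a and b with window m.

    Low profile values indicate subsequences of a that are (approximately)
    mirrored somewhere in b, i.e. candidate following motifs."""
    def znorm(x):
        s = x.std()
        return (x - x.mean()) / (s if s > 0 else 1.0)

    subs_b = np.array([znorm(b[j:j + m]) for j in range(len(b) - m + 1)])
    profile = []
    for i in range(len(a) - m + 1):
        q = znorm(a[i:i + m])
        profile.append(float(np.min(np.linalg.norm(subs_b - q, axis=1))))
    return np.array(profile)
```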
+
+ comment: Revising based on an expert's comments in the research community +
+
+
+
+
+ + ♻ ☆ Virchow: A Million-Slide Digital Pathology Foundation Model + + +
+ The use of artificial intelligence to enable precision medicine and decision +support systems through the analysis of pathology images has the potential to +revolutionize the diagnosis and treatment of cancer. Such applications will +depend on models' abilities to capture the diverse patterns observed in +pathology images. To address this challenge, we present Virchow, a foundation +model for computational pathology. Using self-supervised learning empowered by +the DINOv2 algorithm, Virchow is a vision transformer model with 632 million +parameters trained on 1.5 million hematoxylin and eosin stained whole slide +images from diverse tissue and specimen types, which is orders of magnitude +more data than previous works. The Virchow model enables the development of a +pan-cancer detection system with 0.949 overall specimen-level AUC across 17 +different cancer types, while also achieving 0.937 AUC on 7 rare cancer types. +The Virchow model sets the state-of-the-art on the internal and external image +tile level benchmarks and slide level biomarker prediction tasks. The gains in +performance highlight the importance of training on massive pathology image +datasets, suggesting scaling up the data and network architecture can improve +the accuracy for many high-impact computational pathology applications where +limited amounts of training data are available. + +
+
+
+
+
+ + ♻ ☆ Distilling Autoregressive Models to Obtain High-Performance + Non-Autoregressive Solvers for Vehicle Routing Problems with Faster Inference + Speed AAAI24 + + +
+ Neural construction models have shown promising performance for Vehicle +Routing Problems (VRPs) by adopting either the Autoregressive (AR) or +Non-Autoregressive (NAR) learning approach. While AR models produce +high-quality solutions, they generally have a high inference latency due to +their sequential generation nature. Conversely, NAR models generate solutions +in parallel with a low inference latency but generally exhibit inferior +performance. In this paper, we propose a generic Guided Non-Autoregressive +Knowledge Distillation (GNARKD) method to obtain high-performance NAR models +having a low inference latency. GNARKD removes the constraint of sequential +generation in AR models while preserving the learned pivotal components in the +network architecture to obtain the corresponding NAR models through knowledge +distillation. We evaluate GNARKD by applying it to three widely adopted AR +models to obtain NAR VRP solvers for both synthesized and real-world instances. +The experimental results demonstrate that GNARKD significantly reduces the +inference time (4-5 times faster) with acceptable performance drop (2-3\%). To +the best of our knowledge, this study is first-of-its-kind to obtain NAR VRP +solvers from AR ones through knowledge distillation. + +
+
+ comment: 11 pages, 5 figures, accepted by AAAI24 +
+
+
+
+
+ + ♻ ☆ Masked Hard-Attention Transformers and Boolean RASP Recognize Exactly + the Star-Free Languages + + +
+ We consider transformer encoders with hard attention (in which all attention +is focused on exactly one position) and strict future masking (in which each +position only attends to positions strictly to its left), and prove that the +class of languages recognized by these networks is exactly the star-free +languages. Adding position embeddings increases the class of recognized +languages to other well-studied classes. A key technique in these proofs is +Boolean RASP, a variant of RASP that is restricted to Boolean values. Via the +star-free languages, we relate transformers to first-order logic, temporal +logic, and algebraic automata theory. + +
+
+
+
+
+ + ♻ ☆ 3D-Mol: A Novel Contrastive Learning Framework for Molecular Property + Prediction with 3D Information + + +
+ Molecular property prediction, crucial for early drug candidate screening and +optimization, has advanced considerably with deep learning-based methods. However, +these methods often fall short +in fully leveraging 3D spatial information. Specifically, current molecular +encoding techniques tend to inadequately extract spatial information, leading +to ambiguous representations where a single one might represent multiple +distinct molecules. Moreover, existing molecular modeling methods focus +predominantly on the most stable 3D conformations, neglecting other viable +conformations present in reality. To address these issues, we propose 3D-Mol, a +novel approach designed for more accurate spatial structure representation. It +deconstructs molecules into three hierarchical graphs to better extract +geometric information. Additionally, 3D-Mol leverages contrastive learning for +pretraining on 20 million unlabeled data points, treating conformations with +identical topological structures as weighted positive pairs and contrasting +ones as negatives, based on the similarity of their 3D conformation descriptors +and fingerprints. We compare 3D-Mol with various state-of-the-art baselines on +7 benchmarks and demonstrate our outstanding performance. + +
+
+
+
+
+ + ♻ ☆ Use of Prior Knowledge to Discover Causal Additive Models with + Unobserved Variables and its Application to Time Series Data + + +
+ This paper proposes two methods for causal additive models with unobserved +variables (CAM-UV). CAM-UV assumes that the causal functions take the form of +generalized additive models and that latent confounders are present. First, we +propose a method that leverages prior knowledge for efficient causal discovery. +Then, we propose an extension of this method for inferring causality in time +series data. The original CAM-UV algorithm differs from other existing causal +function models in that it does not seek the causal order between observed +variables, but rather aims to identify the causes for each observed variable. +Therefore, the first proposed method in this paper utilizes prior knowledge, +such as understanding that certain variables cannot be causes of specific +others. Moreover, by incorporating the prior knowledge that causes precede +their effects in time, we extend the first algorithm to the second method for +causal discovery in time series data. We validate the first proposed method by +using simulated data to demonstrate that the accuracy of causal discovery +increases as more prior knowledge is accumulated. Additionally, we test the +second proposed method by comparing it with existing time series causal +discovery methods, using both simulated data and real-world data. + +
+
+
+
+
+ + ♻ ☆ Invariant Random Forest: Tree-Based Model Solution for OOD + Generalization AAAI + + +
+ Out-Of-Distribution (OOD) generalization is an essential topic in machine +learning. However, recent research is only focusing on the corresponding +methods for neural networks. This paper introduces a novel and effective +solution for OOD generalization of decision tree models, named Invariant +Decision Tree (IDT). IDT enforces a penalty term with regard to the +unstable/varying behavior of a split across different environments during the +growth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is +constructed. Our proposed method is motivated by a theoretical result under +mild conditions, and validated by numerical tests with both synthetic and real +datasets. The superior performance compared to non-OOD tree models implies that +considering OOD generalization for tree models is absolutely necessary and +should be given more attention. + +
+
+ comment: AAAI Conference on Artificial Intelligence, 2024 (Oral Presentation) +
+
+
+
+
+ + ♻ ☆ Preparing Lessons for Progressive Training on Language Models + + +
+ The rapid progress of Transformers in artificial intelligence has come at the +cost of increased resource consumption and greenhouse gas emissions due to +growing model sizes. Prior work suggests using pretrained small models to +improve training efficiency, but this approach may not be suitable for new +model structures. On the other hand, training from scratch can be slow, and +progressively stacking layers often fails to achieve significant acceleration. +To address these challenges, we propose a novel method called Apollo, which +prep\textbf{a}res lessons for ex\textbf{p}anding \textbf{o}perations by +\textbf{l}earning high-\textbf{l}ayer functi\textbf{o}nality during training of +low layers. Our approach involves low-value-prioritized sampling (LVPS) to +train different depths and weight sharing to facilitate efficient expansion. We +also introduce an interpolation method for stable model depth extension. +Experiments demonstrate that Apollo achieves state-of-the-art acceleration +ratios, even rivaling methods using pretrained models, making it a universal +and efficient solution for training deep models while reducing time, financial, +and environmental costs. + +
+
+
+
+
+ + ♻ ☆ Approximate Cross-validated Mean Estimates for Bayesian Hierarchical + Regression Models + + +
+ We introduce a novel procedure for obtaining cross-validated predictive +estimates for Bayesian hierarchical regression models (BHRMs). Bayesian +hierarchical models are popular for their ability to model complex dependence +structures and provide probabilistic uncertainty estimates, but can be +computationally expensive to run. Cross-validation (CV) is therefore not a +common practice to evaluate the predictive performance of BHRMs. Our method +circumvents the need to re-run computationally costly estimation methods for +each cross-validation fold and makes CV more feasible for large BHRMs. By +conditioning on the variance-covariance parameters, we shift the CV problem +from probability-based sampling to a simple and familiar optimization problem. +In many cases, this produces estimates which are equivalent to full CV. We +provide theoretical results and demonstrate its efficacy on publicly available +data and in simulations. + +
+
+ comment: 26 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Improved DDIM Sampling with Moment Matching Gaussian Mixtures + + +
+ We propose using a Gaussian Mixture Model (GMM) as reverse transition +operator (kernel) within the Denoising Diffusion Implicit Models (DDIM) +framework, which is one of the most widely used approaches for accelerated +sampling from pre-trained Denoising Diffusion Probabilistic Models (DDPM). +Specifically we match the first and second order central moments of the DDPM +forward marginals by constraining the parameters of the GMM. We see that moment +matching is sufficient to obtain samples with equal or better quality than the +original DDIM with Gaussian kernels. We provide experimental results with +unconditional models trained on CelebAHQ and FFHQ and class-conditional models +trained on ImageNet datasets respectively. Our results suggest that using the +GMM kernel leads to significant improvements in the quality of the generated +samples when the number of sampling steps is small, as measured by FID and IS +metrics. For example on ImageNet 256x256, using 10 sampling steps, we achieve a +FID of 6.94 and IS of 207.85 with a GMM kernel compared to 10.15 and 196.73 +respectively with a Gaussian kernel. + +
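The moment-matching constraint amounts to requiring that the mixture's first two moments equal those of the corresponding DDPM forward marginal. A small sketch for the one-dimensional case follows; the example values are illustrative and not taken from the paper.

```python
import numpy as np

def gmm_moments(weights, means, variances):
    """First two moments of a 1-D Gaussian mixture.

    The DDIM-GMM kernel constrains these to match the mean and variance of the
    DDPM forward marginal (the matching targets are not shown here)."""
    weights = np.asarray(weights, dtype=float)
    means = np.asarray(means, dtype=float)
    variances = np.asarray(variances, dtype=float)
    mean = float(weights @ means)
    second = float(weights @ (variances + means ** 2))
    return mean, second - mean ** 2          # (mixture mean, mixture variance)

# Example: a two-component mixture whose moments match N(0, 1):
# gmm_moments([0.5, 0.5], [-0.6, 0.6], [0.64, 0.64]) -> (0.0, 1.0)
```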
+
+ comment: 29 pages, 14 figures; Analysis of DDIM-GMM as a multimodal denoiser; + Additional experiments on LSUN datasets and text-to-image generation with + Stable Diffusion; Comparison with DPM-Solver; Ablations on GMM parameters; + Updated equations with bold font for vectors and matrices +
+
+
+
+
+ + ♻ ☆ Language Control Diffusion: Efficiently Scaling through Space, Time, and + Tasks ICLR 2024 + + +
+ Training generalist agents is difficult across several axes, requiring us to +deal with high-dimensional inputs (space), long horizons (time), and +generalization to novel tasks. Recent advances with architectures have allowed +for improved scaling along one or two of these axes, but are still +computationally prohibitive to use. In this paper, we propose to address all +three axes by leveraging \textbf{L}anguage to \textbf{C}ontrol +\textbf{D}iffusion models as a hierarchical planner conditioned on language +(LCD). We effectively and efficiently scale diffusion models for planning in +extended temporal, state, and task dimensions to tackle long horizon control +problems conditioned on natural language instructions, as a step towards +generalist agents. Comparing LCD with other state-of-the-art models on the +CALVIN language robotics benchmark finds that LCD outperforms other SOTA +methods in multi-task success rates, whilst improving inference speed over +other comparable diffusion models by 3.3x~15x. We show that LCD can +successfully leverage the unique strength of diffusion models to produce +coherent long range plans while addressing their weakness in generating +low-level details and control. + +
+
+ comment: ICLR 2024, Project and code available at + https://github.com/ezhang7423/language-control-diffusion +
+
+
+
+
+ + ♻ ☆ Divergences induced by dual subtractive and divisive normalizations of + exponential families and their convex deformations + + +
+ Exponential families are statistical models that serve as workhorses in +statistics, information theory, and machine learning, among other fields. An +exponential family can either be normalized subtractively by its cumulant or +free energy function or equivalently normalized divisively by its partition +function. Both subtractive and divisive normalizers are strictly convex and +smooth functions inducing pairs of Bregman and Jensen divergences. It is +well-known that skewed Bhattacharyya distances between probability densities +of an exponential family amount to skewed Jensen divergences induced by the +cumulant function between their corresponding natural parameters, and in limit +cases that the sided Kullback-Leibler divergences amount to reverse-sided +Bregman divergences. In this paper, we first show that the $\alpha$-divergences +between unnormalized densities of an exponential family amount to scaled +$\alpha$-skewed Jensen divergences induced by the partition function. We then +show how comparative convexity with respect to a pair of quasi-arithmetic means +allows one to deform both convex functions and their arguments, and thereby define +dually flat spaces with corresponding divergences when ordinary convexity is +preserved. + +
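For reference, the divergences named in the abstract are induced by a strictly convex, smooth normalizer F (either the cumulant or the partition function of the family); the standard textbook definitions, restated here rather than taken from the paper's new results, are:

```latex
% Bregman divergence and alpha-skewed Jensen divergence induced by a
% strictly convex, smooth function F on natural parameters:
B_F(\theta_1 : \theta_2) = F(\theta_1) - F(\theta_2)
  - \langle \theta_1 - \theta_2, \nabla F(\theta_2) \rangle,
\qquad
J_F^{\alpha}(\theta_1 : \theta_2) = \alpha F(\theta_1) + (1-\alpha) F(\theta_2)
  - F\bigl(\alpha \theta_1 + (1-\alpha)\theta_2\bigr).
```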
+
+ comment: 19 pages, 3 figures +
+
+
+
+
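+ As background for the entry above, these are the textbook definitions of the
+Bregman divergence and the $\alpha$-skewed Jensen divergence induced by a
+strictly convex, smooth generator $F$; the notation is generic and chosen here
+for illustration rather than copied from the paper.
+
+```latex
+\[
+  B_F(\theta_1 : \theta_2)
+    = F(\theta_1) - F(\theta_2) - \langle \theta_1 - \theta_2, \nabla F(\theta_2) \rangle,
+\]
+\[
+  J_F^{\alpha}(\theta_1 : \theta_2)
+    = \alpha F(\theta_1) + (1-\alpha) F(\theta_2)
+      - F\big(\alpha\theta_1 + (1-\alpha)\theta_2\big),
+  \qquad \alpha \in (0,1).
+\]
+```
+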
+ + ♻ ☆ Thought Cloning: Learning to Think while Acting by Imitating Human + Thinking NeurIPS 2023 + + +
+ Language is often considered a key aspect of human thinking, providing us +with exceptional abilities to generalize, explore, plan, replan, and adapt to +new situations. However, Reinforcement Learning (RL) agents are far from +human-level performance in any of these abilities. We hypothesize one reason +for such cognitive deficiencies is that they lack the benefits of thinking in +language and that we can improve AI agents by training them to think like +humans do. We introduce a novel Imitation Learning framework, Thought Cloning, +where the idea is to not just clone the behaviors of human demonstrators, but +also the thoughts humans have as they perform these behaviors. While we expect +Thought Cloning to truly shine at scale on internet-sized datasets of humans +thinking out loud while acting (e.g. online videos with transcripts), here we +conduct experiments in a domain where the thinking and action data are +synthetically generated. Results reveal that Thought Cloning learns much faster +than Behavioral Cloning and its performance advantage grows the further out of +distribution test tasks are, highlighting its ability to better handle novel +situations. Thought Cloning also provides important benefits for AI Safety and +Interpretability, and makes it easier to debug and improve AI. Because we can +observe the agent's thoughts, we can (1) more easily diagnose why things are +going wrong, making it easier to fix the problem, (2) steer the agent by +correcting its thinking, or (3) prevent it from doing unsafe things it plans to +do. Overall, by training agents how to think as well as behave, Thought Cloning +creates safer, more powerful agents. + +
+
+ comment: Accepted to NeurIPS 2023 as a spotlight +
+
+
+
+
+ + ♻ ☆ Unified Uncertainty Calibration + + +
+ To build robust, fair, and safe AI systems, we would like our classifiers to
+say ``I don't know'' when facing test examples that are difficult or fall
+outside of the training classes. The ubiquitous strategy to predict under
+uncertainty is the simplistic \emph{reject-or-classify} rule: abstain from
+prediction if epistemic uncertainty is high, classify otherwise. Unfortunately,
+this recipe does not allow different sources of uncertainty to communicate with
+each other, produces miscalibrated predictions, and does not allow correcting
+for misspecifications in our uncertainty estimates. To address these three
+issues, we introduce \emph{unified uncertainty calibration (U2C)}, a holistic
+framework to combine aleatoric and epistemic uncertainties. U2C enables a clean
+learning-theoretical analysis of uncertainty estimation, and outperforms
+reject-or-classify across a variety of ImageNet benchmarks. Our code is
+available at:
+https://github.com/facebookresearch/UnifiedUncertaintyCalibration
+
+
+
+
+
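+ To make the baseline in the entry above concrete, here is a small sketch of
+the reject-or-classify rule that U2C is compared against; the threshold value
+and the way epistemic uncertainty is estimated are placeholders, not details
+taken from the paper.
+
+```python
+import numpy as np
+
+def reject_or_classify(probs, epistemic, threshold=0.5):
+    """Baseline rule: abstain when epistemic uncertainty is high,
+    otherwise predict the most probable class.
+
+    probs:      (n, k) array of class probabilities
+    epistemic:  (n,) array of epistemic uncertainty scores
+    Returns predicted class indices, with -1 meaning "abstain".
+    """
+    preds = probs.argmax(axis=1)
+    preds[epistemic > threshold] = -1  # abstain on uncertain inputs
+    return preds
+
+probs = np.array([[0.9, 0.1], [0.55, 0.45]])
+epistemic = np.array([0.2, 0.8])
+print(reject_or_classify(probs, epistemic))  # [ 0 -1]
+```
+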
+ + ♻ ☆ Active Restoration of Lost Audio Signals Using Machine Learning and + Latent Information + + +
+ Digital audio signal reconstruction of a lost or corrupt segment using deep
+learning algorithms has been explored intensively in recent years.
+Nevertheless, traditional methods based on linear interpolation, phase coding
+and tone insertion are still in vogue. However, we found no research work on
+reconstructing audio signals with the fusion of dithering, steganography, and
+machine learning regressors. Therefore, this paper proposes the combination of
+steganography, halftoning (dithering), and state-of-the-art shallow and deep
+learning methods. The results (including comparisons with SPAIN,
+autoregressive, deep learning-based, graph-based, and other methods) are
+evaluated with three different metrics. The results show that the proposed
+solution is effective and can enhance the reconstruction of audio signals using
+the side information (e.g., latent representations) that steganography
+provides. Moreover, this paper proposes a novel framework for reconstruction
+from heavily compressed embedded audio data using halftoning (i.e., dithering)
+and machine learning, which we term HCR (halftone-based compression and
+reconstruction). This work may trigger interest in optimising this approach
+and/or transferring it to different domains (i.e., image reconstruction).
+Compared to existing methods, we show improvement in the inpainting performance
+in terms of signal-to-noise ratio (SNR), the objective difference grade (ODG)
+and Hansen's audio quality metric. In particular, our proposed framework
+outperformed the learning-based methods (D2WGAN and SG) and the traditional
+statistical algorithms (e.g., SPAIN, TDC, WCP).
+
+
+ comment: 18 Pages, 2 Tables, 8 Figures +
+
+
+
+
+ + ♻ ☆ Leveraging Negative Signals with Self-Attention for Sequential Music + Recommendation RecSys 2023 + + +
+ Music streaming services heavily rely on their recommendation engines to
+continuously provide content to their consumers. Sequential recommendation has
+consequently seen considerable attention in the current literature, where
+state-of-the-art approaches focus on self-attentive models leveraging
+contextual information such as long and short-term user history and item
+features; however, most of these studies focus on long-form content domains
+(retail, movies, etc.) rather than short-form, such as music. Additionally,
+many do not explore incorporating negative session-level feedback during
+training. In this study, we investigate the use of transformer-based
+self-attentive architectures to learn implicit session-level information for
+sequential music recommendation. We additionally propose a contrastive learning
+task to incorporate negative feedback (e.g., skipped tracks) to promote
+positive hits and penalize negative hits. This task is formulated as a simple
+loss term that can be incorporated into a variety of deep learning
+architectures for sequential recommendation. Our experiments show that this
+results in consistent performance gains over baseline architectures that ignore
+negative user feedback.
+
+
+ comment: Accepted to the 1st Workshop on Music Recommender Systems, co-located + with the 17th ACM Conference on Recommender Systems (MuRS @ RecSys 2023) +
+
+
+
+
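+ The abstract above formulates the negative-feedback signal as "a simple loss
+term"; the sketch below shows one generic hinge-style way such a term could
+look, pushing skipped tracks below completed tracks in score space. It is an
+illustration only, not the loss actually proposed in the paper.
+
+```python
+import numpy as np
+
+def negative_feedback_loss(pos_scores, neg_scores, margin=1.0):
+    """Hinge penalty: every skipped (negative) track in a session should
+    score at least `margin` below every completed (positive) track."""
+    diffs = margin - (pos_scores[:, None] - neg_scores[None, :])
+    return np.maximum(diffs, 0.0).mean()
+
+pos = np.array([2.1, 1.7])   # model scores of tracks the user listened to
+neg = np.array([0.3, 1.9])   # model scores of tracks the user skipped
+print(negative_feedback_loss(pos, neg))  # 0.5
+```
+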
+ + ♻ ☆ Pre-training of Molecular GNNs via Conditional Boltzmann Generator + + +
+ Learning representations of molecular structures using deep learning is a
+fundamental problem in molecular property prediction tasks. Molecules
+inherently exist in the real world as three-dimensional structures;
+furthermore, they are not static but in continuous motion in the 3D Euclidean
+space, forming a potential energy surface. Therefore, it is desirable to
+generate multiple conformations in advance and extract molecular
+representations using a 4D-QSAR model that incorporates multiple conformations.
+However, this approach is impractical for drug and material discovery tasks
+because of the computational cost of obtaining multiple conformations. To
+address this issue, we propose a pre-training method for molecular GNNs using
+an existing dataset of molecular conformations to generate a latent vector
+universal to multiple conformations from a 2D molecular graph. Our method,
+called Boltzmann GNN, is formulated by maximizing the conditional marginal
+likelihood of a conditional generative model for conformation generation. We
+show that our model has better prediction performance for molecular properties
+than existing pre-training methods using molecular graphs and
+three-dimensional molecular structures.
+
+
+ comment: 4 pages +
+
+
+
+
+ + ♻ ☆ Prismer: A Vision-Language Model with Multi-Task Experts + + +
+ Recent vision-language models have shown impressive multi-modal generation +capabilities. However, typically they require training huge models on massive +datasets. As a more scalable alternative, we introduce Prismer, a data- and +parameter-efficient vision-language model that leverages an ensemble of +task-specific experts. Prismer only requires training of a small number of +components, with the majority of network weights inherited from multiple +readily-available, pre-trained experts, and kept frozen during training. By +leveraging experts from a wide range of domains, we show Prismer can +efficiently pool this expert knowledge and adapt it to various vision-language +reasoning tasks. In our experiments, we show that Prismer achieves fine-tuned +and few-shot learning performance which is competitive with current +state-of-the-arts, whilst requiring up to two orders of magnitude less training +data. Code is available at https://github.com/NVlabs/prismer. + +
+
+ comment: Published at TMLR 2024. Project Page: + https://shikun.io/projects/prismer Code: https://github.com/NVlabs/prismer +
+
+
+
+
+ + ♻ ☆ IPA: Inference Pipeline Adaptation to Achieve High Accuracy and + Cost-Efficiency + + +
+ Efficiently optimizing multi-model inference pipelines for fast, accurate,
+and cost-effective inference is a crucial challenge in machine learning
+production systems, given their tight end-to-end latency requirements. To
+simplify the exploration of the vast and intricate trade-off space of latency,
+accuracy, and cost in inference pipelines, providers frequently opt to consider
+one of them. However, the challenge lies in reconciling latency, accuracy, and
+cost trade-offs. To address this challenge and propose a solution to
+efficiently manage model variants in inference pipelines, we present IPA, an
+online deep learning Inference Pipeline Adaptation system that efficiently
+leverages model variants for each deep learning task. Model variants are
+different versions of pre-trained models for the same deep learning task with
+variations in resource requirements, latency, and accuracy. IPA dynamically
+configures batch size, replication, and model variants to optimize accuracy,
+minimize costs, and meet user-defined latency Service Level Agreements (SLAs)
+using Integer Programming. It supports multi-objective settings for achieving
+different trade-offs between accuracy and cost objectives while remaining
+adaptable to varying workloads and dynamic traffic patterns. Navigating a wider
+variety of configurations allows IPA to achieve better trade-offs between cost
+and accuracy objectives compared to existing methods. Extensive experiments in
+a Kubernetes implementation with five real-world inference pipelines
+demonstrate that IPA improves end-to-end accuracy by up to 21% with a minimal
+cost increase. The code and data for replications are available at
+https://github.com/reconfigurable-ml-pipeline/ipa.
+
+
+
+
+
+ + ♻ ☆ MULTISCRIPT: Multimodal Script Learning for Supporting Open Domain + Everyday Tasks AAAI 2024 + + +
+ Automatically generating scripts (i.e. sequences of key steps described in +text) from video demonstrations and reasoning about the subsequent steps are +crucial to the modern AI virtual assistants to guide humans to complete +everyday tasks, especially unfamiliar ones. However, current methods for +generative script learning rely heavily on well-structured preceding steps +described in text and/or images or are limited to a certain domain, resulting +in a disparity with real-world user scenarios. To address these limitations, we +present a new benchmark challenge -- MultiScript, with two new tasks on +task-oriented multimodal script learning: (1) multimodal script generation, and +(2) subsequent step prediction. For both tasks, the input consists of a target +task name and a video illustrating what has been done to complete the target +task, and the expected output is (1) a sequence of structured step descriptions +in text based on the demonstration video, and (2) a single text description for +the subsequent step, respectively. Built from WikiHow, MultiScript covers +multimodal scripts in videos and text descriptions for over 6,655 human +everyday tasks across 19 diverse domains. To establish baseline performance on +MultiScript, we propose two knowledge-guided multimodal generative frameworks +that incorporate the task-related knowledge prompted from large language models +such as Vicuna. Experimental results show that our proposed approaches +significantly improve over the competitive baselines. + +
+
+ comment: Accepted by AAAI 2024. 11 pages, 9 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ Are you using test log-likelihood correctly? NeurIPS 2022 + + +
+ Test log-likelihood is commonly used to compare different models of the same +data or different approximate inference algorithms for fitting the same +probabilistic model. We present simple examples demonstrating how comparisons +based on test log-likelihood can contradict comparisons according to other +objectives. Specifically, our examples show that (i) approximate Bayesian +inference algorithms that attain higher test log-likelihoods need not also +yield more accurate posterior approximations and (ii) conclusions about +forecast accuracy based on test log-likelihood comparisons may not agree with +conclusions based on root mean squared error. + +
+
+ comment: Presented at the ICBINB Workshop at NeurIPS 2022. This version + accepted at TMLR, available at https://openreview.net/forum?id=n2YifD4Dxo +
+
+
+
+
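+ Since the entry above centres on how test log-likelihood is computed and
+compared, here is a small sketch of the usual Monte Carlo estimate for a
+Bayesian predictive; the array shapes and variable names are illustrative
+assumptions rather than anything specified in the paper.
+
+```python
+import numpy as np
+from scipy.special import logsumexp
+
+def test_log_likelihood(log_p):
+    """Average per-point test log-likelihood from posterior samples.
+
+    log_p: (S, N) array with log p(y_n | x_n, theta_s) for S posterior
+    samples and N test points. Returns the mean over test points of
+    log( (1/S) * sum_s p(y_n | x_n, theta_s) ).
+    """
+    S = log_p.shape[0]
+    per_point = logsumexp(log_p, axis=0) - np.log(S)
+    return per_point.mean()
+
+# toy example: 3 posterior samples, 2 test points
+log_p = np.log(np.array([[0.2, 0.5],
+                         [0.3, 0.4],
+                         [0.1, 0.6]]))
+print(test_log_likelihood(log_p))
+```
+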
+ + ♻ ☆ Predicting breast cancer with AI for individual risk-adjusted MRI + screening and early detection + + +
+ Women with an increased life-time risk of breast cancer undergo supplemental
+annual screening MRI. We propose to predict the risk of developing breast
+cancer within one year based on the current MRI, with the objective of reducing
+screening burden and facilitating early detection. An AI algorithm was
+developed on 53,858 breasts from 12,694 patients who underwent screening or
+diagnostic MRI and accrued over 12 years, with 2,331 confirmed cancers. A first
+U-Net was trained to segment lesions and identify regions of concern. A second
+convolutional network was trained to detect malignant cancer using features
+extracted by the U-Net. This network was then fine-tuned to estimate the risk
+of developing cancer within a year in cases that radiologists considered normal
+or likely benign. Risk predictions from this AI were evaluated with a
+retrospective analysis of 9,183 breasts from a high-risk screening cohort,
+which were not used for training. Statistical analysis focused on the tradeoff
+between the number of omitted exams and negative predictive value, and the
+number of potential early detections and positive predictive value. The AI
+algorithm identified regions of concern that coincided with future tumors in
+52% of screen-detected cancers. Upon directed review, a radiologist found that
+71.3% of cancers had a visible correlate on the MRI prior to diagnosis, and 65%
+of these correlates were identified by the AI model. Reevaluating these regions
+in 10% of all cases with higher AI-predicted risk could have resulted in up to
+33% early detections by a radiologist. Additionally, screening burden could
+have been reduced in 16% of lower-risk cases by recommending a later follow-up
+without compromising the current interval cancer rate. With increasing datasets
+and improving image quality, we expect this new AI-aided, adaptive screening to
+meaningfully reduce screening burden and improve early detection.
+
+
+ comment: Major revisions and rewriting in progress +
+
+
+
+
+ + ♻ ☆ Meta-Learning with Versatile Loss Geometries for Fast Adaptation Using + Mirror Descent ICASSP-24 + + +
+ Utilizing task-invariant prior knowledge extracted from related tasks, +meta-learning is a principled framework that empowers learning a new task +especially when data records are limited. A fundamental challenge in +meta-learning is how to quickly "adapt" the extracted prior in order to train a +task-specific model within a few optimization steps. Existing approaches deal +with this challenge using a preconditioner that enhances convergence of the +per-task training process. Though effective in representing locally a quadratic +training loss, these simple linear preconditioners can hardly capture complex +loss geometries. The present contribution addresses this limitation by learning +a nonlinear mirror map, which induces a versatile distance metric to enable +capturing and optimizing a wide range of loss geometries, hence facilitating +the per-task training. Numerical tests on few-shot learning datasets +demonstrate the superior expressiveness and convergence of the advocated +approach. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Acoustics, Speech + and Signal Processing (ICASSP-24) +
+
+
+
+
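+ For context on the entry above, this is the standard mirror descent update
+that a learned nonlinear mirror map generalises; $\psi$ is the mirror map,
+$\psi^*$ its convex conjugate, and $\eta$ a step size. The notation is generic
+background, not the paper's own formulation.
+
+```latex
+\[
+  \theta_{t+1}
+    = \nabla \psi^{*}\!\big( \nabla \psi(\theta_t) - \eta \,\nabla \mathcal{L}(\theta_t) \big)
+\]
+% With \psi(\theta) = \tfrac{1}{2}\lVert\theta\rVert_2^2 this reduces to plain
+% gradient descent; richer mirror maps induce non-Euclidean loss geometries.
+```
+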
+ + ♻ ☆ Interpreting Deep Neural Networks with the Package innsight + + +
+ The R package innsight offers a general toolbox for revealing variable-wise +interpretations of deep neural networks' predictions with so-called feature +attribution methods. Aside from the unified and user-friendly framework, the +package stands out in three ways: It is generally the first R package +implementing feature attribution methods for neural networks. Secondly, it +operates independently of the deep learning library allowing the interpretation +of models from any R package, including keras, torch, neuralnet, and even +custom models. Despite its flexibility, innsight benefits internally from the +torch package's fast and efficient array calculations, which builds on LibTorch +$-$ PyTorch's C++ backend $-$ without a Python dependency. Finally, it offers a +variety of visualization tools for tabular, signal, image data or a combination +of these. Additionally, the plots can be rendered interactively using the +plotly package. + +
+
+
+
+
+ + ♻ ☆ How Deep is Your Art: An Experimental Study on the Limits of Artistic + Understanding in a Single-Task, Single-Modality Neural Network + + +
+ Computational modeling of artwork meaning is complex and difficult. This is
+because art interpretation is multidimensional and highly subjective. This
+paper experimentally investigated the degree to which a state-of-the-art Deep
+Convolutional Neural Network (DCNN), a popular Machine Learning approach, can
+correctly classify modern conceptual artwork into the galleries devised by art
+curators. Two hypotheses were proposed, stating that the DCNN model uses
+Exhibited Properties for classification, such as shape and color, but not
+Non-Exhibited Properties, such as historical context and artist intention. The
+two hypotheses were experimentally validated using a methodology designed for
+this purpose. A VGG-11 DCNN, pre-trained on the ImageNet dataset and
+discriminatively fine-tuned, was trained on handcrafted datasets designed from
+real-world conceptual photography galleries. Experimental results supported the
+two hypotheses, showing that the DCNN model ignores Non-Exhibited Properties
+and uses only Exhibited Properties for artwork classification. This work points
+to current DCNN limitations, which should be addressed by future DNN models.
+
+
+
+
+
+ + ♻ ☆ A Latent Variable Approach for Non-Hierarchical Multi-Fidelity Adaptive + Sampling + + +
+ Multi-fidelity (MF) methods are gaining popularity for enhancing surrogate
+modeling and design optimization by incorporating data from various
+low-fidelity (LF) models. While most existing MF methods assume a fixed
+dataset, adaptive sampling methods that dynamically allocate resources among
+fidelity models can achieve higher efficiency in exploring and exploiting the
+design space. However, most existing MF methods rely on the hierarchical
+assumption of fidelity levels, or fail to capture the intercorrelation between
+multiple fidelity levels and utilize it to quantify the value of future samples
+and navigate the adaptive sampling. To address this hurdle, we propose a
+framework hinged on a latent embedding for different fidelity models and the
+associated pre-posterior analysis to explicitly utilize their correlation for
+adaptive sampling. In this framework, each infill sampling iteration includes
+two steps: we first identify the location of interest with the greatest
+potential improvement using the high-fidelity (HF) model, then we search for
+the next sample across all fidelity levels that maximizes the improvement per
+unit cost at the location identified in the first step. This is made possible
+by a single Latent Variable Gaussian Process (LVGP) model that maps different
+fidelity models into an interpretable latent space to capture their
+correlations without assuming hierarchical fidelity levels. The LVGP enables us
+to assess how LF sampling candidates will affect the HF response with
+pre-posterior analysis and determine the next sample with the best
+benefit-to-cost ratio. Through test cases, we demonstrate that the proposed
+method outperforms the benchmark methods in both MF global fitting (GF) and
+Bayesian Optimization (BO) problems in convergence rate and robustness.
+Moreover, the method offers the flexibility to switch between GF and BO by
+simply changing the acquisition function.
+
+
+
+
+
+ + ♻ ☆ Improving Faithfulness of Abstractive Summarization by Controlling + Confounding Effect of Irrelevant Sentences + + +
+ Lack of factual correctness is an issue that still plagues state-of-the-art +summarization systems despite their impressive progress on generating seemingly +fluent summaries. In this paper, we show that factual inconsistency can be +caused by irrelevant parts of the input text, which act as confounders. To that +end, we leverage information-theoretic measures of causal effects to quantify +the amount of confounding and precisely quantify how they affect the +summarization performance. Based on insights derived from our theoretical +results, we design a simple multi-task model to control such confounding by +leveraging human-annotated relevant sentences when available. Crucially, we +give a principled characterization of data distributions where such confounding +can be large thereby necessitating the use of human annotated relevant +sentences to generate factual summaries. Our approach improves faithfulness +scores by 20\% over strong baselines on AnswerSumm +\citep{fabbri2021answersumm}, a conversation summarization dataset where lack +of faithfulness is a significant issue due to the subjective nature of the +task. Our best method achieves the highest faithfulness score while also +achieving state-of-the-art results on standard metrics like ROUGE and METEOR. +We corroborate these improvements through human evaluation. + +
+
+
+
+
+ + ♻ ☆ Few-shot Quality-Diversity Optimization + + +
+ In the past few years, a considerable amount of research has been dedicated
+to the exploitation of previous learning experiences and the design of Few-shot
+and Meta Learning approaches, in problem domains ranging from Computer Vision
+to Reinforcement Learning based control. A notable exception, where, to the
+best of our knowledge, little to no effort has been made in this direction, is
+Quality-Diversity (QD) optimization. QD methods have been shown to be effective
+tools in dealing with deceptive minima and sparse rewards in Reinforcement
+Learning. However, they remain costly due to their reliance on inherently
+sample-inefficient evolutionary processes. We show that, given examples from a
+task distribution, information about the paths taken by optimization in
+parameter space can be leveraged to build a prior population, which when used
+to initialize QD methods in unseen environments, allows for few-shot
+adaptation. Our proposed method does not require backpropagation. It is simple
+to implement and scale, and furthermore, it is agnostic to the underlying
+models that are being trained. Experiments carried out in both sparse and dense
+reward settings using robotic manipulation and navigation benchmarks show that
+it considerably reduces the number of generations that are required for QD
+optimization in these environments.
+
+
+ comment: Accepted for publication in the IEEE Robotics and Automation Letters + (RA-L) journal +
+
+
+
+
+ + ♻ ☆ Postprocessing of Ensemble Weather Forecasts Using Permutation-invariant + Neural Networks + + +
+ Statistical postprocessing is used to translate ensembles of raw numerical +weather forecasts into reliable probabilistic forecast distributions. In this +study, we examine the use of permutation-invariant neural networks for this +task. In contrast to previous approaches, which often operate on ensemble +summary statistics and dismiss details of the ensemble distribution, we propose +networks that treat forecast ensembles as a set of unordered member forecasts +and learn link functions that are by design invariant to permutations of the +member ordering. We evaluate the quality of the obtained forecast distributions +in terms of calibration and sharpness and compare the models against classical +and neural network-based benchmark methods. In case studies addressing the +postprocessing of surface temperature and wind gust forecasts, we demonstrate +state-of-the-art prediction quality. To deepen the understanding of the learned +inference process, we further propose a permutation-based importance analysis +for ensemble-valued predictors, which highlights specific aspects of the +ensemble forecast that are considered important by the trained postprocessing +models. Our results suggest that most of the relevant information is contained +in a few ensemble-internal degrees of freedom, which may impact the design of +future ensemble forecasting and postprocessing systems. + +
+
+ comment: in press +
+
+
+
+
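+ The entry above hinges on treating the ensemble as an unordered set; the
+following is a small deep-sets style sketch of such a permutation-invariant
+link function (shared per-member embedding followed by mean pooling). The layer
+sizes and the plain-numpy formulation are illustrative assumptions, not the
+authors' architecture.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+def permutation_invariant_forward(ensemble, w_embed, w_out):
+    """Embed each ensemble member with shared weights, pool with a symmetric
+    operation (mean), then map to forecast-distribution parameters."""
+    h = np.tanh(ensemble[:, :, None] * w_embed)  # (batch, members, d)
+    pooled = h.mean(axis=1)                      # invariant to member order
+    return pooled @ w_out                        # (batch, 2), e.g. (mu, sigma)
+
+ensemble = rng.normal(size=(4, 20))  # 4 forecast cases, 20-member ensemble
+w_embed = rng.normal(size=(8,))      # shared per-member embedding weights
+w_out = rng.normal(size=(8, 2))      # pooled representation -> parameters
+params = permutation_invariant_forward(ensemble, w_embed, w_out)
+
+# shuffling the members leaves the output unchanged
+shuffled = ensemble[:, rng.permutation(20)]
+print(np.allclose(params, permutation_invariant_forward(shuffled, w_embed, w_out)))
+```
+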
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ A Survey on Energy Consumption and Environmental Impact of Video + Streaming + + +
+ Climate change challenges require a notable decrease in worldwide greenhouse +gas (GHG) emissions across technology sectors. Digital technologies, especially +video streaming, accounting for most Internet traffic, make no exception. Video +streaming demand increases with remote working, multimedia communication +services (e.g., WhatsApp, Skype), video streaming content (e.g., YouTube, +Netflix), video resolution (4K/8K, 50 fps/60 fps), and multi-view video, making +energy consumption and environmental footprint critical. This survey +contributes to a better understanding of sustainable and efficient video +streaming technologies by providing insights into the state-of-the-art and +potential future directions for researchers, developers, and engineers, service +providers, hosting platforms, and consumers. We widen this survey's focus on +content provisioning and content consumption based on the observation that +continuously active network equipment underneath video streaming consumes +substantial energy independent of the transmitted data type. We propose a +taxonomy of factors that affect the energy consumption in video streaming, such +as encoding schemes, resource requirements, storage, content retrieval, +decoding, and display. We identify notable weaknesses in video streaming that +require further research for improved energy efficiency: (1) fixed bitrate +ladders in HTTP live streaming; (2) inefficient hardware utilization of +existing video players; (3) lack of comprehensive open energy measurement +dataset covering various device types and coding parameters for reproducible +research. + +
+
+
+
+
+ + ☆ On the Audio Hallucinations in Large Audio-Video Language Models + + +
+ Large audio-video language models can generate descriptions for both video
+and audio. However, they sometimes ignore audio content, producing audio
+descriptions solely reliant on visual information. This paper refers to this as
+audio hallucinations and analyzes them in large audio-video language models. We
+gather 1,000 sentences by inquiring about audio information and annotate
+whether they contain hallucinations. If a sentence is hallucinated, we also
+categorize the type of hallucination. The results reveal that 332 sentences are
+hallucinated, with distinct trends observed in nouns and verbs for each
+hallucination type. Based on this, we tackle the task of audio hallucination
+classification using pre-trained audio-text models in the zero-shot and
+fine-tuning settings. Our experimental results reveal that the zero-shot models
+achieve higher performance (52.2% F1) than a random baseline (40.3%), and the
+fine-tuned models achieve 87.9%, outperforming the zero-shot models.
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ SlideAVSR: A Dataset of Paper Explanation Videos for Audio-Visual Speech + Recognition + + +
+ Audio-visual speech recognition (AVSR) is a multimodal extension of automatic +speech recognition (ASR), using video as a complement to audio. In AVSR, +considerable efforts have been directed at datasets for facial features such as +lip-readings, while they often fall short in evaluating the image comprehension +capabilities in broader contexts. In this paper, we construct SlideAVSR, an +AVSR dataset using scientific paper explanation videos. SlideAVSR provides a +new benchmark where models transcribe speech utterances with texts on the +slides on the presentation recordings. As technical terminologies that are +frequent in paper explanations are notoriously challenging to transcribe +without reference texts, our SlideAVSR dataset spotlights a new aspect of AVSR +problems. As a simple yet effective baseline, we propose DocWhisper, an AVSR +model that can refer to textual information from slides, and confirm its +effectiveness on SlideAVSR. + +
+
+
+
+
+ + ☆ Enhancing Image-Text Matching with Adaptive Feature Aggregation ICASSP 2024 + + +
+ Image-text matching aims to find matched cross-modal pairs accurately. While +current methods often rely on projecting cross-modal features into a common +embedding space, they frequently suffer from imbalanced feature representations +across different modalities, leading to unreliable retrieval results. To +address these limitations, we introduce a novel Feature Enhancement Module that +adaptively aggregates single-modal features for more balanced and robust +image-text retrieval. Additionally, we propose a new loss function that +overcomes the shortcomings of original triplet ranking loss, thereby +significantly improving retrieval performance. The proposed model has been +evaluated on two public datasets and achieves competitive retrieval performance +when compared with several state-of-the-art models. Implementation codes can be +found here. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
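+ The entry above builds on the triplet ranking loss commonly used for
+image-text matching; the sketch below shows the standard hard-negative variant
+of that baseline loss for reference. It is background material, not the
+improved loss the paper proposes.
+
+```python
+import numpy as np
+
+def triplet_ranking_loss(sim, margin=0.2):
+    """Hinge-based triplet ranking loss with hardest in-batch negatives.
+
+    sim: (B, B) similarity matrix where sim[i, j] compares image i with
+    caption j, so the diagonal holds the matched (positive) pairs."""
+    pos = np.diag(sim)
+    masked = sim - np.eye(len(sim)) * 1e9  # exclude positives from negatives
+    hardest_caption = masked.max(axis=1)   # hardest caption per image
+    hardest_image = masked.max(axis=0)     # hardest image per caption
+    loss_i2t = np.maximum(0.0, margin + hardest_caption - pos)
+    loss_t2i = np.maximum(0.0, margin + hardest_image - pos)
+    return (loss_i2t + loss_t2i).mean()
+
+sim = np.array([[0.9, 0.3, 0.1],
+                [0.2, 0.8, 0.4],
+                [0.1, 0.5, 0.7]])
+print(triplet_ranking_loss(sim))
+```
+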
+ + ♻ ☆ MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and + Uni-Modal Adapter EMNLP + + +
+ Language Models (LMs) have demonstrated impressive molecule understanding +ability on various 1D text-related tasks. However, they inherently lack 2D +graph perception - a critical ability of human professionals in comprehending +molecules' topological structures. To bridge this gap, we propose MolCA: +Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal +Adapter. MolCA enables an LM (e.g., Galactica) to understand both text- and +graph-based molecular contents via the cross-modal projector. Specifically, the +cross-modal projector is implemented as a Q-Former to connect a graph encoder's +representation space and an LM's text space. Further, MolCA employs a uni-modal +adapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks. +Unlike previous studies that couple an LM with a graph encoder via cross-modal +contrastive learning, MolCA retains the LM's ability of open-ended text +generation and augments it with 2D graph information. To showcase its +effectiveness, we extensively benchmark MolCA on tasks of molecule captioning, +IUPAC name prediction, and molecule-text retrieval, on which MolCA +significantly outperforms the baselines. Our codes and checkpoints can be found +at https://github.com/acharkq/MolCA. + +
+
+ comment: EMNLP main conference. 9 pages +
+
+
+
+
+ + ♻ ☆ DKiS: Decay weight invertible image steganography with private key + + +
+ Image steganography, defined as the practice of concealing information within
+another image, traditionally encounters security challenges when its methods
+become publicly known or are under attack. To address this, we introduce a
+novel private-key-based image steganography technique. This approach ensures
+the security of the hidden information, as access requires a corresponding
+private key, regardless of the public knowledge of the steganography method. We
+present experimental evidence demonstrating the effectiveness of our method and
+showcasing its real-world applicability. Furthermore, we identify a critical
+challenge in the invertible image steganography process: the transfer of
+non-essential, or `garbage', information from the secret to the host pipeline.
+To tackle this issue, we introduce a decay weight to control the information
+transfer, effectively filtering out irrelevant data and enhancing the
+performance of image steganography. The code for this technique is publicly
+accessible at https://github.com/yanghangAI/DKiS, and a practical demonstration
+can be found at http://yanghang.site/hidekey.
+
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 64 + +
+
+
+ + ☆ Deciphering Textual Authenticity: A Generalized Strategy through the + Lens of Large Language Semantics for Detecting Human vs. Machine-Generated + Text + + +
+ With the recent proliferation of Large Language Models (LLMs), there has been
+an increasing demand for tools to detect machine-generated text. The effective
+detection of machine-generated text faces two pertinent problems: First,
+existing detectors are severely limited in generalizing to real-world
+scenarios, where machine-generated text is produced by a variety of generators,
+including but not limited to GPT-4 and Dolly, and spans diverse domains,
+ranging from academic manuscripts to social media posts. Second, existing
+detection methodologies treat texts produced by LLMs through a restrictive
+binary classification lens, neglecting the nuanced diversity of artifacts
+generated by different LLMs. In this work, we undertake a systematic study on
+the detection of machine-generated text in real-world scenarios. We first study
+the effectiveness of state-of-the-art approaches and find that they are
+severely limited against text produced by diverse generators and domains in the
+real world. Furthermore, t-SNE visualizations of the embeddings from a
+pretrained LLM's encoder show that they cannot reliably distinguish between
+human and machine-generated text. Based on our findings, we introduce a novel
+system, T5LLMCipher, for detecting machine-generated text using a pretrained T5
+encoder combined with LLM embedding sub-clustering to address the text produced
+by diverse generators and domains in the real world. We evaluate our approach
+across 9 machine-generated text systems and 9 domains and find that our
+approach provides state-of-the-art generalization ability, with an average
+increase in F1 score on machine-generated text of 19.6\% on unseen generators
+and domains compared to the top-performing existing approaches, and correctly
+attributes the generator of text with an accuracy of 93.6\%.
+
+
+
+
+
+ + ☆ Stuck in the Quicksand of Numeracy, Far from AGI Summit: Evaluating + LLMs' Mathematical Competency through Ontology-guided Perturbations + + +
+ Recent advancements in Large Language Models (LLMs) have showcased striking +results on existing logical reasoning benchmarks, with some models even +surpassing human performance. However, the true depth of their competencies and +robustness, in mathematical reasoning tasks, remains an open question. In +response, we develop (i) an ontology of perturbations of maths questions, (ii) +a semi-automatic method of perturbation, and (iii) a dataset of perturbed maths +questions to probe the limits of LLM capabilities in mathematical reasoning +tasks. These controlled perturbations span across multiple fine dimensions of +the structural and representational aspects of maths questions. Using GPT-4, we +generated the MORE dataset by perturbing randomly selected five seed questions +from GSM8K. This process was guided by our ontology and involved a thorough +automatic and manual filtering process, yielding a set of 216 maths problems. +We conducted comprehensive evaluation of both closed-source and open-source +LLMs on MORE. The results show a significant performance drop across all the +models against the perturbed questions. This strongly suggests that current +LLMs lack robust mathematical skills and deep reasoning abilities. This +research not only identifies multiple gaps in the capabilities of current +models, but also highlights multiple potential directions for future +development. Our dataset will be made publicly available at +https://huggingface.co/datasets/declare-lab/GSM8k_MORE. + +
+
+
+
+
+ + ☆ Efficient slot labelling + + +
+ Slot labelling is an essential component of any dialogue system, aiming to
+find important arguments in every user turn. Common approaches involve large
+pre-trained language models (PLMs) like BERT or RoBERTa, but they face
+challenges such as high computational requirements and dependence on
+pre-training data. In this work, we propose a lightweight method which performs
+on par with or better than the state-of-the-art PLM-based methods, while having
+almost 10x fewer trainable parameters. This makes it especially applicable for
+real-life industry scenarios.
+
+
+
+
+
+ + ☆ SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene + Understanding + + +
+ 3D vision-language grounding, which focuses on aligning language with the 3D +physical environment, stands as a cornerstone in the development of embodied +agents. In comparison to recent advancements in the 2D domain, grounding +language in 3D scenes faces several significant challenges: (i) the inherent +complexity of 3D scenes due to the diverse object configurations, their rich +attributes, and intricate relationships; (ii) the scarcity of paired 3D +vision-language data to support grounded learning; and (iii) the absence of a +unified learning framework to distill knowledge from grounded 3D data. In this +work, we aim to address these three major challenges in 3D vision-language by +examining the potential of systematically upscaling 3D vision-language learning +in indoor environments. We introduce the first million-scale 3D vision-language +dataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising +2.5M vision-language pairs derived from both human annotations and our scalable +scene-graph-based generation approach. We demonstrate that this scaling allows +for a unified pre-training framework, Grounded Pre-training for Scenes (GPS), +for 3D vision-language learning. Through extensive experiments, we showcase the +effectiveness of GPS by achieving state-of-the-art performance on all existing +3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is +unveiled through zero-shot transfer experiments in the challenging 3D +vision-language tasks. Project website: https://scene-verse.github.io . + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Large Language Models Are Neurosymbolic Reasoners AAAI 2024 + + +
+ A wide range of real-world applications is characterized by their symbolic +nature, necessitating a strong capability for symbolic reasoning. This paper +investigates the potential application of Large Language Models (LLMs) as +symbolic reasoners. We focus on text-based games, significant benchmarks for +agents with natural language capabilities, particularly in symbolic tasks like +math, map reading, sorting, and applying common sense in text-based worlds. To +facilitate these agents, we propose an LLM agent designed to tackle symbolic +challenges and achieve in-game objectives. We begin by initializing the LLM +agent and informing it of its role. The agent then receives observations and a +set of valid actions from the text-based games, along with a specific symbolic +module. With these inputs, the LLM agent chooses an action and interacts with +the game environments. Our experimental results demonstrate that our method +significantly enhances the capability of LLMs as automated agents for symbolic +reasoning, and our LLM agent is effective in text-based games involving +symbolic tasks, achieving an average performance of 88% across all tasks. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Machines Do See Color: A Guideline to Classify Different Forms of Racist + Discourse in Large Corpora + + +
+ Current methods to identify and classify racist language in text rely on +small-n qualitative approaches or large-n approaches focusing exclusively on +overt forms of racist discourse. This article provides a step-by-step +generalizable guideline to identify and classify different forms of racist +discourse in large corpora. In our approach, we start by conceptualizing racism +and its different manifestations. We then contextualize these racist +manifestations to the time and place of interest, which allows researchers to +identify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a +cross-lingual model for supervised text classification with a cutting-edge +contextual understanding of text. We show that XLM-R and XLM-R-Racismo, our +pretrained model, outperform other state-of-the-art approaches in classifying +racism in large corpora. We illustrate our approach using a corpus of tweets +relating to the Ecuadorian ind\'igena community between 2018 and 2021. + +
+
+ comment: 37 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Learning from Emotions, Demographic Information and Implicit User + Feedback in Task-Oriented Document-Grounded Dialogues + + +
+ The success of task-oriented and document-grounded dialogue systems depends +on users accepting and enjoying using them. To achieve this, recently published +work in the field of Human-Computer Interaction suggests that the combination +of considering demographic information, user emotions and learning from the +implicit feedback in their utterances, is particularly important. However, +these findings have not yet been transferred to the field of Natural Language +Processing, where these data are primarily studied separately. Accordingly, no +sufficiently annotated dataset is available. To address this gap, we introduce +FEDI, the first English dialogue dataset for task-oriented document-grounded +dialogues annotated with demographic information, user emotions and implicit +feedback. Our experiments with FLAN-T5, GPT-2 and LLaMA-2 show that these data +have the potential to improve task completion and the factual consistency of +the generated responses and user acceptance. + +
+
+
+
+
+ + ☆ Cross-lingual Offensive Language Detection: A Systematic Review of + Datasets, Transfer Approaches and Challenges + + +
+ The growing prevalence and rapid evolution of offensive language in social +media amplify the complexities of detection, particularly highlighting the +challenges in identifying such content across diverse languages. This survey +presents a systematic and comprehensive exploration of Cross-Lingual Transfer +Learning (CLTL) techniques in offensive language detection in social media. Our +study stands as the first holistic overview to focus exclusively on the +cross-lingual scenario in this domain. We analyse 67 relevant papers and +categorise these studies across various dimensions, including the +characteristics of multilingual datasets used, the cross-lingual resources +employed, and the specific CLTL strategies implemented. According to "what to +transfer", we also summarise three main CLTL transfer approaches: instance, +feature, and parameter transfer. Additionally, we shed light on the current +challenges and future research opportunities in this field. Furthermore, we +have made our survey resources available online, including two comprehensive +tables that provide accessible references to the multilingual datasets and CLTL +methods used in the reviewed literature. + +
+
+ comment: 35 pages, 7 figures +
+
+
+
+
+ + ☆ UniVIE: A Unified Label Space Approach to Visual Information Extraction + from Form-like Documents + + +
+ Existing methods for Visual Information Extraction (VIE) from form-like +documents typically fragment the process into separate subtasks, such as key +information extraction, key-value pair extraction, and choice group extraction. +However, these approaches often overlook the hierarchical structure of form +documents, including hierarchical key-value pairs and hierarchical choice +groups. To address these limitations, we present a new perspective, reframing +VIE as a relation prediction problem and unifying labels of different tasks +into a single label space. This unified approach allows for the definition of +various relation types and effectively tackles hierarchical relationships in +form-like documents. In line with this perspective, we present UniVIE, a +unified model that addresses the VIE problem comprehensively. UniVIE functions +using a coarse-to-fine strategy. It initially generates tree proposals through +a tree proposal network, which are subsequently refined into hierarchical trees +by a relation decoder module. To enhance the relation prediction capabilities +of UniVIE, we incorporate two novel tree constraints into the relation decoder: +a tree attention mask and a tree level embedding. Extensive experimental +evaluations on both our in-house dataset HierForms and a publicly available +dataset SIBR, substantiate that our method achieves state-of-the-art results, +underscoring the effectiveness and potential of our unified approach in +advancing the field of VIE. + +
+
+
+
+
+ + ☆ QAnswer: Towards Question Answering Search over Websites + + +
+ Question Answering (QA) is increasingly used by search engines to provide
+results to their end-users, yet very few websites currently use QA technologies
+for their search functionality. To illustrate the potential of QA technologies
+for the website search practitioner, we demonstrate web searches that combine
+QA over knowledge graphs and QA over free text -- each being usually tackled
+separately. We also discuss the different benefits and drawbacks of both
+approaches for website searches. We use case studies of websites hosted by the
+Wikimedia Foundation (namely Wikipedia and Wikidata). Unlike a search engine
+(e.g. Google, Bing, etc.), the data are indexed integrally, i.e. we do not
+index only a subset, and exclusively, i.e. we index only data available on the
+corresponding website.
+
+
+
+
+
+ + ☆ Fine-tuning Strategies for Domain Specific Question Answering under Low + Annotation Budget Constraints + + +
+ The progress introduced by pre-trained language models and their fine-tuning +has resulted in significant improvements in most downstream NLP tasks. The +unsupervised training of a language model combined with further target task +fine-tuning has become the standard QA fine-tuning procedure. In this work, we +demonstrate that this strategy is sub-optimal for fine-tuning QA models, +especially under a low QA annotation budget, which is a usual setting in +practice due to the extractive QA labeling cost. We draw our conclusions by +conducting an exhaustive analysis of the performance of the alternatives of the +sequential fine-tuning strategy on different QA datasets. Based on the +experiments performed, we observed that the best strategy to fine-tune the QA +model in low-budget settings is taking a pre-trained language model (PLM) and +then fine-tuning PLM with a dataset composed of the target dataset and SQuAD +dataset. With zero extra annotation effort, the best strategy outperforms the +standard strategy by 2.28% to 6.48%. Our experiments provide one of the first +investigations on how to best fine-tune a QA system under a low budget and are +therefore of the utmost practical interest to the QA practitioners. + +
+
+
+
+
+ + ☆ Bridging Research and Readers: A Multi-Modal Automated Academic Papers + Interpretation System + + +
+ In the contemporary information era, significantly accelerated by the advent +of Large-scale Language Models, the proliferation of scientific literature is +reaching unprecedented levels. Researchers urgently require efficient tools for +reading and summarizing academic papers, uncovering significant scientific +literature, and employing diverse interpretative methodologies. To address this +burgeoning demand, the role of automated scientific literature interpretation +systems has become paramount. However, prevailing models, both commercial and +open-source, confront notable challenges: they often overlook multimodal data, +grapple with summarizing over-length texts, and lack diverse user interfaces. +In response, we introduce an open-source multi-modal automated academic paper +interpretation system (MMAPIS) with three-step process stages, incorporating +LLMs to augment its functionality. Our system first employs the hybrid modality +preprocessing and alignment module to extract plain text, and tables or figures +from documents separately. It then aligns this information based on the section +names they belong to, ensuring that data with identical section names are +categorized under the same section. Following this, we introduce a hierarchical +discourse-aware summarization method. It utilizes the extracted section names +to divide the article into shorter text segments, facilitating specific +summarizations both within and between sections via LLMs with specific prompts. +Finally, we have designed four types of diversified user interfaces, including +paper recommendation, multimodal Q\&A, audio broadcasting, and interpretation +blog, which can be widely applied across various scenarios. Our qualitative and +quantitative evaluations underscore the system's superiority, especially in +scientific summarization, where it outperforms solutions relying solely on +GPT-4. + +
+
+
+
+
+ + ☆ Asynchronous Local-SGD Training for Language Modeling + + +
+ Local stochastic gradient descent (Local-SGD), also referred to as federated +averaging, is an approach to distributed optimization where each device +performs more than one SGD update per communication. This work presents an +empirical study of {\it asynchronous} Local-SGD for training language models; +that is, each worker updates the global parameters as soon as it has finished +its SGD steps. We conduct a comprehensive investigation by examining how worker +hardware heterogeneity, model size, number of workers, and optimizer could +impact the learning performance. We find that with naive implementations, +asynchronous Local-SGD takes more iterations to converge than its synchronous +counterpart despite updating the (global) model parameters more frequently. We +identify momentum acceleration on the global parameters when worker gradients +are stale as a key challenge. We propose a novel method that utilizes a delayed +Nesterov momentum update and adjusts the workers' local training steps based on +their computation speed. This approach, evaluated with models up to 150M +parameters on the C4 dataset, matches the performance of synchronous Local-SGD +in terms of perplexity per update step, and significantly surpasses it in terms +of wall clock time. + +
+
+
+
+
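+ As a rough illustration of the asynchronous Local-SGD setting described in
+the entry above, the sketch below treats each worker's finished local update as
+a pseudo-gradient and applies server-side Nesterov-style momentum. The learning
+rate, momentum handling and staleness treatment are generic assumptions; the
+paper's delayed-momentum schedule and per-worker step adjustment are not
+reproduced here.
+
+```python
+import numpy as np
+
+class AsyncLocalSGDServer:
+    """Minimal sketch: fold each arriving worker result into the global
+    parameters as a pseudo-gradient with Nesterov-style momentum."""
+
+    def __init__(self, params, lr=1.0, beta=0.9):
+        self.params = params.astype(float)
+        self.momentum = np.zeros_like(self.params)
+        self.lr, self.beta = lr, beta
+
+    def apply_worker_update(self, worker_params):
+        pseudo_grad = self.params - worker_params            # worker's local progress
+        self.momentum = self.beta * self.momentum + pseudo_grad
+        lookahead = self.beta * self.momentum + pseudo_grad  # Nesterov correction
+        self.params = self.params - self.lr * lookahead
+        return self.params
+
+server = AsyncLocalSGDServer(np.zeros(3))
+# two workers report back at different times with locally trained parameters
+print(server.apply_worker_update(np.array([0.10, -0.20, 0.05])))
+print(server.apply_worker_update(np.array([0.20, 0.10, -0.10])))
+```
+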
+ + ☆ What makes for a 'good' social actor? Using respect as a lens to + evaluate interactions with language agents + + +
+ With the growing popularity of dialogue agents based on large language models +(LLMs), urgent attention has been drawn to finding ways to ensure their +behaviour is ethical and appropriate. These are largely interpreted in terms of +the 'HHH' criteria: making outputs more helpful and honest, and avoiding +harmful (biased, toxic, or inaccurate) statements. Whilst this semantic focus +is useful from the perspective of viewing LLM agents as mere mediums for +information, it fails to account for pragmatic factors that can make the same +utterance seem more or less offensive or tactless in different social +situations. We propose an approach to ethics that is more centred on relational +and situational factors, exploring what it means for a system, as a social +actor, to treat an individual respectfully in a (series of) interaction(s). Our +work anticipates a set of largely unexplored risks at the level of situated +interaction, and offers practical suggestions to help LLM technologies behave +as 'good' social actors and treat people respectfully. + +
+
+
+
+
+ + ☆ Code Simulation Challenges for Large Language Models + + +
+ We investigate the extent to which Large Language Models (LLMs) can simulate
+the execution of computer code and algorithms. We begin by looking at
+straight-line programs, and show that current LLMs demonstrate poor performance
+even with such simple programs -- performance rapidly degrades with the length
+of code. We then investigate the ability of LLMs to simulate programs that
+contain critical paths and redundant instructions. We also go beyond
+straight-line program simulation with sorting algorithms and nested loops, and
+we show that the computational complexity of a routine directly affects the
+ability of an LLM to simulate its execution. We observe that LLMs execute
+instructions sequentially and with a low error margin only for short programs
+or standard procedures. LLMs' code simulation is in tension with their pattern
+recognition and memorisation capabilities: on tasks where memorisation is
+detrimental, we propose a novel prompting method to simulate code execution
+line by line. Empirically, our new Chain of Simulation (CoSm) method improves
+on the standard Chain of Thought prompting approach by avoiding the pitfalls of
+memorisation.
+
+
+ comment: main paper (10 pages) + Appendix (11 pages) +
+
+
+
+
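+ To give a feel for the line-by-line simulation prompting described in the
+entry above, here is an illustrative prompt builder. The wording of the
+instruction and the hypothetical `query_llm` helper are assumptions for the
+sake of the example, not the authors' exact CoSm prompt.
+
+```python
+def build_line_by_line_prompt(code: str) -> str:
+    """Assemble a code-simulation prompt that asks the model to trace the
+    program one line at a time (illustrative wording only)."""
+    numbered = "\n".join(f"{i + 1}: {line}"
+                         for i, line in enumerate(code.splitlines()))
+    return (
+        "Simulate the following program one line at a time. After each line, "
+        "report the value of every variable. Do not state the final output "
+        "before the trace is complete.\n\n"
+        f"{numbered}\n\nBegin with line 1."
+    )
+
+snippet = "x = 3\nfor i in range(4):\n    x += i\nprint(x)"
+prompt = build_line_by_line_prompt(snippet)
+print(prompt)
+# answer = query_llm(prompt)  # hypothetical call to the model under test
+```
+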
+ + ☆ LLMs for Relational Reasoning: How Far are We? ICSE 2024 + + +
+ Large language models (LLMs) have revolutionized many areas (e.g. natural +language processing, software engineering, etc.) by achieving state-of-the-art +performance on extensive downstream tasks. Aiming to achieve robust and general +artificial intelligence, there has been a surge of interest in investigating +the reasoning ability of the LLMs. Whereas the textual and numerical reasoning +benchmarks adopted by previous works are rather shallow and simple, it is hard +to conclude that the LLMs possess strong reasoning ability by merely achieving +positive results on these benchmarks. Recent efforts have demonstrated that the +LLMs are poor at solving sequential decision-making problems that require +common-sense planning by evaluating their performance on the reinforcement +learning benchmarks. In this work, we conduct an in-depth assessment of several +state-of-the-art LLMs' reasoning ability based on the inductive logic +programming (ILP) benchmark, which is broadly recognized as a representative +and challenging measurement for evaluating logic program induction/synthesis +systems as it requires inducing strict cause-effect logic to achieve robust +deduction on independent and identically distributed (IID) and +out-of-distribution (OOD) test samples. Our evaluations illustrate that +compared with the neural program induction systems which are much smaller in +model size, the state-of-the-art LLMs are much poorer in terms of reasoning +ability by achieving much lower performance and generalization using either +natural language prompting or truth-value matrix prompting. + +
+
+ comment: Accepted by The First International Workshop on Large Language Models + for Code (ICSE 2024) +
+
+
+
+
+ + ☆ Textual Summarisation of Large Sets: Towards a General Approach + + +
+ We are developing techniques to generate summary descriptions of sets of +objects. In this paper, we present and evaluate a rule-based NLG technique for +summarising sets of bibliographical references in academic papers. This extends +our previous work on summarising sets of consumer products and shows how our +model generalises across these two very different domains. + +
+
+
+
+
+ + ☆ Explain Thyself Bully: Sentiment Aided Cyberbullying Detection with + Explanation ICDAR 2023 + + +
+ Cyberbullying has become a big issue with the popularity of different social +media networks and online communication apps. While plenty of research is going +on to develop better models for cyberbullying detection in monolingual +language, there is very little research on the code-mixed languages and +explainability aspect of cyberbullying. Recent laws like "right to +explanations" of General Data Protection Regulation, have spurred research in +developing interpretable models rather than focusing on performance. Motivated +by this we develop the first interpretable multi-task model called {\em mExCB} +for automatic cyberbullying detection from code-mixed languages which can +simultaneously solve several tasks, cyberbullying detection, +explanation/rationale identification, target group detection and sentiment +analysis. We have introduced {\em BullyExplain}, the first benchmark dataset +for explainable cyberbullying detection in code-mixed language. Each post in +{\em BullyExplain} dataset is annotated with four labels, i.e., {\em bully +label, sentiment label, target and rationales (explainability)}, i.e., which +phrases are being responsible for annotating the post as a bully. The proposed +multitask framework (mExCB) based on CNN and GRU with word and sub-sentence +(SS) level attention is able to outperform several baselines and state of the +art models when applied on {\em BullyExplain} dataset. + +
+
+ comment: ICDAR 2023 +
+
+
+
+
+ + ☆ Augmenting Math Word Problems via Iterative Question Composing + + +
+ Despite recent progress in improving the mathematical reasoning ability of large language models (LLMs), solving competition-level math problems without the use of external tools remains challenging for open-source LLMs. In this work, we introduce the MMIQC dataset, a mixture of processed web data and synthetic question-response pairs, to equip base models with better mathematical reasoning skills. Mistral-7B-MMIQC, the model obtained by fine-tuning Mistral-7B (arXiv:2310.06825) on MMIQC, achieves 36.0\% accuracy on MATH (arXiv:2103.03874), 5.8\% higher than the previous (model size $\sim$7B) SOTA. Our experiments also show that a large part of the improvement is attributable to our novel augmentation method IQC (Iterative Question Composing), in which we iteratively ask an LLM to compose new questions from the given seed problems and do rejection sampling from another LLM. MMIQC has now been released on https://huggingface.co/datasets/Vivacem/MMIQC. + +
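+ A rough sketch of how such an iterative composing-plus-rejection-sampling loop could look is given below; `ask_llm`, the prompt wording, and the majority-vote acceptance rule are illustrative assumptions, not the authors' implementation.
```python
# Illustrative sketch of Iterative Question Composing (IQC) with rejection
# sampling. ask_llm() is a placeholder for any chat-completion call.
def ask_llm(prompt: str) -> str:
    raise NotImplementedError("plug in an LLM client here")

def iterative_question_composing(seed_problems, n_rounds=3, n_samples=4):
    dataset, frontier = [], list(seed_problems)      # frontier: (question, answer) pairs
    for _ in range(n_rounds):
        next_frontier = []
        for question, _answer in frontier:
            # Ask a "composer" LLM to build a new question on top of the seed.
            new_q = ask_llm(
                "Compose a new, harder math question that uses the following "
                f"problem as a sub-step:\n{question}"
            )
            # Rejection sampling: solve several times with another LLM and keep
            # the question only if a clear majority agrees on the final answer.
            candidates = [ask_llm(f"Solve step by step:\n{new_q}") for _ in range(n_samples)]
            finals = [c.strip().splitlines()[-1] if c.strip() else "" for c in candidates]
            best = max(set(finals), key=finals.count)
            if best and finals.count(best) > n_samples // 2:
                dataset.append({"question": new_q, "response": best})
                next_frontier.append((new_q, best))
        frontier = next_frontier
    return dataset
```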
+
+
+
+
+ + ☆ AttackEval: How to Evaluate the Effectiveness of Jailbreak Attacking on + Large Language Models + + +
+ In our research, we pioneer a novel approach to evaluate the effectiveness of +jailbreak attacks on Large Language Models (LLMs), such as GPT-4 and LLaMa2, +diverging from traditional robustness-focused binary evaluations. Our study +introduces two distinct evaluation frameworks: a coarse-grained evaluation and +a fine-grained evaluation. Each framework, using a scoring range from 0 to 1, +offers a unique perspective, enabling a more comprehensive and nuanced +evaluation of attack effectiveness and empowering attackers to refine their +attack prompts with greater understanding. Furthermore, we have developed a +comprehensive ground truth dataset specifically tailored for jailbreak tasks. +This dataset not only serves as a crucial benchmark for our current study but +also establishes a foundational resource for future research, enabling +consistent and comparative analyses in this evolving field. Upon meticulous +comparison with traditional evaluation methods, we discovered that our +evaluation aligns with the baseline's trend while offering a more profound and +detailed assessment. We believe that by accurately evaluating the effectiveness +of attack prompts in the Jailbreak task, our work lays a solid foundation for +assessing a wider array of similar or even more complex tasks in the realm of +prompt injection, potentially revolutionizing this field. + +
+
+
+
+
+ + ☆ Efficient Adapter Finetuning for Tail Languages in Streaming + Multilingual ASR ICASSP 2024 + + +
+ The end-to-end ASR model is often desired in the streaming multilingual scenario since it is easier to deploy and can benefit from pre-trained speech models such as powerful foundation models. Meanwhile, the heterogeneous nature and imbalanced data abundance of different languages may cause performance degradation, leading to asynchronous peak performance for different languages during training, especially on tail ones. Sometimes even the data itself may become unavailable as a result of enhanced privacy protection. Existing work tends to significantly increase the model size or learn language-specific decoders to accommodate each language separately. In this study, we explore simple yet effective Language-Dependent Adapter (LDA) finetuning under a cascaded Conformer transducer framework enhanced by teacher pseudo-labeling for tail languages in streaming multilingual ASR. The adapter only accounts for 0.4% of the full model per language. It is plugged into the frozen foundation model and is the only trainable module during the finetuning process with noisy student training. The final model merges the adapter parameters from different checkpoints for different languages. The model performance is validated on a challenging multilingual dictation dataset, which includes 39 tail languages across Latin, Greek, Arabic, and others. Our proposed method brings 12.2% word error rate reduction on average and up to 37.5% on a single locale. Furthermore, we show that our parameter-efficient LDA can match the quality of full-model finetuning, thus greatly alleviating the asynchronous peak performance issue. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ OCTO+: A Suite for Automatic Open-Vocabulary Object Placement in Mixed + Reality + + +
+ One key challenge in Augmented Reality is the placement of virtual content in +natural locations. Most existing automated techniques can only work with a +closed-vocabulary, fixed set of objects. In this paper, we introduce and +evaluate several methods for automatic object placement using recent advances +in open-vocabulary vision-language models. Through a multifaceted evaluation, +we identify a new state-of-the-art method, OCTO+. We also introduce a benchmark +for automatically evaluating the placement of virtual objects in augmented +reality, alleviating the need for costly user studies. Through this, in +addition to human evaluations, we find that OCTO+ places objects in a valid +region over 70% of the time, outperforming other methods on a range of metrics. + +
+
+ comment: 2024 IEEE International Conference on Artificial Intelligence and + eXtended and Virtual Reality (AIXVR) +
+
+
+
+
+ + ☆ ReFT: Reasoning with Reinforced Fine-Tuning + + +
+ One way to enhance the reasoning capability of Large Language Models (LLMs) is to conduct Supervised Fine-Tuning (SFT) using Chain-of-Thought (CoT) annotations. This approach does not show sufficiently strong generalization ability, however, because the training only relies on the given CoT data. In math problem-solving, for example, there is usually only one annotated reasoning path for each question in the training data. Intuitively, it would be better for the algorithm to learn from multiple annotated reasoning paths given a question. To address this issue, we propose a simple yet effective approach called Reinforced Fine-Tuning (ReFT) to enhance the generalizability of learning LLMs for reasoning, with math problem-solving as an example. ReFT first warms up the model with SFT, and then employs online reinforcement learning, specifically the PPO algorithm in this paper, to further fine-tune the model, where an abundance of reasoning paths are automatically sampled given the question and the rewards are naturally derived from the ground-truth answers. Extensive experiments on the GSM8K, MathQA, and SVAMP datasets show that ReFT significantly outperforms SFT, and the performance can potentially be further boosted by combining inference-time strategies such as majority voting and re-ranking. Note that ReFT obtains the improvement by learning from the same training questions as SFT, without relying on extra or augmented training questions. This indicates a superior generalization ability for ReFT. + +
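+ Since the RL rewards come only from ground-truth final answers, the reward for a sampled reasoning path can be as simple as the hypothetical check below; the answer-extraction regex and the binary 0/1 reward are assumptions for illustration, not the paper's exact implementation, and the resulting score would be fed to PPO on top of the SFT-warmed model.
```python
import re

def answer_reward(sampled_cot: str, gold_answer: str) -> float:
    """Toy ReFT-style reward: 1.0 if the last number in a sampled chain of
    thought matches the ground-truth answer, else 0.0. The extraction regex and
    the binary reward are illustrative assumptions, not the paper's code."""
    numbers = re.findall(r"-?\d+(?:\.\d+)?", sampled_cot)
    return 1.0 if numbers and numbers[-1] == str(gold_answer) else 0.0

# e.g. answer_reward("48 / 2 = 24, so the answer is 24", "24") -> 1.0
```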
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Partial Diacritization: A Context-Contrastive Inference Approach + + +
+ Diacritization plays a pivotal role in improving readability and disambiguating the meaning of Arabic texts. Efforts have so far focused on marking every eligible character (Full Diacritization). Comparatively overlooked, Partial Diacritization (PD) is the selection of a subset of characters to be marked to aid comprehension where needed. Research has indicated that excessive diacritic marks can hinder skilled readers--reducing reading speed and accuracy. We conduct a behavioral experiment and show that partially marked text is often easier to read than fully marked text, and sometimes easier than plain text. In this light, we introduce Context-Contrastive Partial Diacritization (CCPD)--a novel approach to PD which integrates seamlessly with existing Arabic diacritization systems. CCPD processes each word twice, once with context and once without, and diacritizes only the characters with disparities between the two inferences. Further, we introduce novel indicators for measuring partial diacritization quality (SR, PDER, HDER, ERE), essential for establishing this as a machine learning task. Lastly, we introduce TD2, a Transformer-variant of an established model which offers a markedly different performance profile on our proposed indicators compared to all other known systems. + +
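+ The context-contrastive selection rule lends itself to a very small wrapper around any existing full diacritizer; the sketch below operates at the word level for brevity (the paper compares at the character level), and `diacritize` is a placeholder for whichever system is being wrapped.
```python
def ccpd(sentence_words, diacritize):
    """Word-level sketch of Context-Contrastive Partial Diacritization:
    diacritize the sentence once with full context and each word once in
    isolation, and keep the contextual marks only where the two inferences
    disagree. `diacritize(list_of_words) -> list_of_words` is a placeholder."""
    with_context = diacritize(sentence_words)
    without_context = [diacritize([w])[0] for w in sentence_words]
    output = []
    for plain, ctx, no_ctx in zip(sentence_words, with_context, without_context):
        output.append(ctx if ctx != no_ctx else plain)  # mark only disputed words
    return output
```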
+
+ comment: 13 equations, 5 tables, 5 figures +
+
+
+
+
+ + ☆ Characterizing Online Eating Disorder Communities with Large Language + Models + + +
+ The rise in eating disorders, a dangerous mental health condition with high +mortality and morbidity, has been linked to the proliferation of idealized body +images on social media. However, the link between social media and eating +disorders is far more complex. We argue that social media platforms create a +feedback loop that amplifies the growth of content and communities that promote +eating disorders like anorexia and bulimia. Specifically, social media +platforms make it easy for vulnerable individuals to find and connect to +like-minded others, while group dynamic processes encourage them to stay +engaged within communities that promote and glorify harmful behaviors linked to +eating disorders. We characterize this dynamic empirically through a +combination of network and language analysis. We describe a novel framework +that leverages large language models to analyze the discourse within online +communities and probe their attitudes on topics related to eating disorders to +identify potentially harmful content. Our work emphasizes the need for better +social media moderation to disrupt harmful feedback loops and protect +vulnerable individuals. + +
+
+
+
+
+ + ☆ ClimateGPT: Towards AI Synthesizing Interdisciplinary Research on + Climate Change + + +
+ This paper introduces ClimateGPT, a model family of domain-specific large +language models that synthesize interdisciplinary research on climate change. +We trained two 7B models from scratch on a science-oriented dataset of 300B +tokens. For the first model, the 4.2B domain-specific tokens were included +during pre-training and the second was adapted to the climate domain after +pre-training. Additionally, ClimateGPT-7B, 13B and 70B are continuously +pre-trained from Llama~2 on a domain-specific dataset of 4.2B tokens. Each +model is instruction fine-tuned on a high-quality and human-generated +domain-specific dataset that has been created in close cooperation with climate +scientists. To reduce the number of hallucinations, we optimize the model for +retrieval augmentation and propose a hierarchical retrieval strategy. To +increase the accessibility of our model to non-English speakers, we propose to +make use of cascaded machine translation and show that this approach can +perform comparably to natively multilingual models while being easier to scale +to a large number of languages. Further, to address the intrinsic +interdisciplinary aspect of climate change we consider different research +perspectives. Therefore, the model can produce in-depth answers focusing on +different perspectives in addition to an overall answer. We propose a suite of +automatic climate-specific benchmarks to evaluate LLMs. On these benchmarks, +ClimateGPT-7B performs on par with the ten times larger Llama-2-70B Chat model +while not degrading results on general domain benchmarks. Our human evaluation +confirms the trends we saw in our benchmarks. All models were trained and +evaluated using renewable energy and are released publicly. + +
+
+
+
+
+ + ☆ Impact of Large Language Model Assistance on Patients Reading Clinical + Notes: A Mixed-Methods Study + + +
+ Patients derive numerous benefits from reading their clinical notes, +including an increased sense of control over their health and improved +understanding of their care plan. However, complex medical concepts and jargon +within clinical notes hinder patient comprehension and may lead to anxiety. We +developed a patient-facing tool to make clinical notes more readable, +leveraging large language models (LLMs) to simplify, extract information from, +and add context to notes. We prompt engineered GPT-4 to perform these +augmentation tasks on real clinical notes donated by breast cancer survivors +and synthetic notes generated by a clinician, a total of 12 notes with 3868 +words. In June 2023, 200 female-identifying US-based participants were randomly +assigned three clinical notes with varying levels of augmentations using our +tool. Participants answered questions about each note, evaluating their +understanding of follow-up actions and self-reported confidence. We found that +augmentations were associated with a significant increase in action +understanding score (0.63 $\pm$ 0.04 for select augmentations, compared to 0.54 +$\pm$ 0.02 for the control) with p=0.002. In-depth interviews of +self-identifying breast cancer patients (N=7) were also conducted via video +conferencing. Augmentations, especially definitions, elicited positive +responses among the seven participants, with some concerns about relying on +LLMs. Augmentations were evaluated for errors by clinicians, and we found +misleading errors occur, with errors more common in real donated notes than +synthetic notes, illustrating the importance of carefully written clinical +notes. Augmentations improve some but not all readability metrics. This work +demonstrates the potential of LLMs to improve patients' experience with +clinical notes at a lower burden to clinicians. However, having a human in the +loop is important to correct potential model errors. + +
+
+
+
+
+ + ☆ Learning Shortcuts: On the Misleading Promise of NLU in Language Models + + +
+ The advent of large language models (LLMs) has enabled significant +performance gains in the field of natural language processing. However, recent +studies have found that LLMs often resort to shortcuts when performing tasks, +creating an illusion of enhanced performance while lacking generalizability in +their decision rules. This phenomenon introduces challenges in accurately +assessing natural language understanding in LLMs. Our paper provides a concise +survey of relevant research in this area and puts forth a perspective on the +implications of shortcut learning in the evaluation of language models, +specifically for NLU tasks. This paper urges more research efforts to be put +towards deepening our comprehension of shortcut learning, contributing to the +development of more robust language models, and raising the standards of NLU +evaluation in real-world scenarios. + +
+
+ comment: Accepted at HICSS-SDPS 2024 +
+
+
+
+
+ + ☆ Aligning Large Language Models with Counterfactual DPO + + +
+ Advancements in large language models (LLMs) have demonstrated remarkable capabilities across a diverse range of applications. These models excel in generating text completions that are contextually coherent and cover an extensive array of subjects. However, the vast datasets required for their training make aligning response styles during the pretraining and instruction tuning phases challenging. Consequently, an additional alignment phase is typically employed, wherein the model is further trained with human preference data to better align its outputs with human expectations. While this process does not introduce new capabilities per se, it does accentuate generation styles innate to the model. This paper explores the utilization of counterfactual prompting within the framework of Direct Preference Optimization (DPO) to align the model's style without relying on human intervention. We demonstrate that this method effectively instils desirable behaviours, mitigates undesirable ones, and encourages the model to disregard inappropriate instructions. Our findings suggest that counterfactual prompting with DPO presents a low-resource way to fine-tune LLMs to meet the demands for responsible and ethically aligned AI systems. + +
+
+
+
+
+ + ☆ Improving Classification Performance With Human Feedback: Label a few, + we label the rest + + +
+ In the realm of artificial intelligence, where the vast majority of data is unstructured, obtaining substantial amounts of labeled data to train supervised machine learning models poses a significant challenge. To address this, we delve into few-shot and active learning, where our goal is to improve AI models with human feedback on a few labeled examples. This paper focuses on understanding how a continuous feedback loop can refine models, thereby enhancing their accuracy, recall, and precision through incremental human input. By employing Large Language Models (LLMs) such as GPT-3.5, BERT, and SetFit, we aim to analyze the efficacy of using a limited number of labeled examples to substantially improve model accuracy. We benchmark this approach on the Financial Phrasebank, Banking, Craigslist, TREC, and Amazon Reviews datasets to show that, with just a few labeled examples, we are able to surpass the accuracy of zero-shot large language models and provide enhanced text classification performance. We demonstrate that rather than needing to manually label millions of rows of data, we just need to label a few, and the model can effectively predict the rest. + +
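+ The "label a few, we label the rest" loop is essentially uncertainty-sampling active learning; a minimal sketch is shown below, assuming precomputed text embeddings, a logistic-regression classifier as a stand-in for the GPT-3.5/BERT/SetFit setups named above, and a seed set that covers at least two classes.
```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def label_a_few(embeddings, oracle, n_seed=20, budget=100, batch=10):
    """Minimal uncertainty-sampling loop: train on a handful of human labels,
    then repeatedly ask the human (oracle) to label the examples the model is
    least sure about. Classifier and sampling rule are illustrative stand-ins."""
    rng = np.random.default_rng(0)
    labels = {int(i): oracle(int(i)) for i in rng.choice(len(embeddings), n_seed, replace=False)}
    clf = LogisticRegression(max_iter=1000)
    while len(labels) < n_seed + budget:
        idx = list(labels)
        clf.fit(embeddings[idx], [labels[i] for i in idx])
        probs = np.sort(clf.predict_proba(embeddings), axis=1)
        margin = probs[:, -1] - probs[:, -2]            # small margin = uncertain
        queries = [int(i) for i in np.argsort(margin) if int(i) not in labels][:batch]
        labels.update({i: oracle(i) for i in queries})  # incremental human input
    idx = list(labels)
    clf.fit(embeddings[idx], [labels[i] for i in idx])
    return clf                                          # now predicts "the rest"
```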
+
+
+
+
+ + ☆ BERTologyNavigator: Advanced Question Answering with BERT-based + Semantics ISWC 2023 + + +
+ The development and integration of knowledge graphs and language models has +significance in artificial intelligence and natural language processing. In +this study, we introduce the BERTologyNavigator -- a two-phased system that +combines relation extraction techniques and BERT embeddings to navigate the +relationships within the DBLP Knowledge Graph (KG). Our approach focuses on +extracting one-hop relations and labelled candidate pairs in the first phases. +This is followed by employing BERT's CLS embeddings and additional heuristics +for relation selection in the second phase. Our system reaches an F1 score of +0.2175 on the DBLP QuAD Final test dataset for Scholarly QALD and 0.98 F1 score +on the subset of the DBLP QuAD test dataset during the QA phase. + +
+
+ comment: Accepted in Scholarly QALD Challenge @ ISWC 2023 +
+
+
+
+
+ + ♻ ☆ AUTOACT: Automatic Agent Learning from Scratch via Self-Planning + + +
+ Language agents have achieved considerable performance on various complex +tasks. Despite the incessant exploration in this field, existing language agent +systems still struggle with costly, non-reproducible data reliance and face the +challenge of compelling a single model for multiple functions. To this end, we +introduce AutoAct, an automatic agent learning framework that does not rely on +large-scale annotated data and synthetic trajectories from closed-source models +(e.g., GPT-4). Given limited data with a tool library, AutoAct first +automatically synthesizes planning trajectories without any assistance from +humans or strong closed-source models. Then, AutoAct leverages a +division-of-labor strategy to automatically differentiate based on the target +task information and synthesized trajectories, producing a sub-agent group to +complete the task. We conduct comprehensive experiments with different LLMs, +which demonstrates that AutoAct yields better or parallel performance compared +to various strong baselines. We even notice that AutoAct, when using the +Llama-2-13b model, can achieve performance comparable to that of the zero-shot +GPT-3.5-Turbo agent. Code will be available at +https://github.com/zjunlp/AutoAct. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Watch Your Language: Investigating Content Moderation with Large + Language Models + + +
+ Large language models (LLMs) have exploded in popularity due to their ability +to perform a wide array of natural language tasks. Text-based content +moderation is one LLM use case that has received recent enthusiasm, however, +there is little research investigating how LLMs perform in content moderation +settings. In this work, we evaluate a suite of commodity LLMs on two common +content moderation tasks: rule-based community moderation and toxic content +detection. For rule-based community moderation, we instantiate 95 subcommunity +specific LLMs by prompting GPT-3.5 with rules from 95 Reddit subcommunities. We +find that GPT-3.5 is effective at rule-based moderation for many communities, +achieving a median accuracy of 64% and a median precision of 83%. For toxicity +detection, we evaluate a suite of commodity LLMs (GPT-3, GPT-3.5, GPT-4, Gemini +Pro, LLAMA 2) and show that LLMs significantly outperform currently widespread +toxicity classifiers. However, recent increases in model size add only marginal +benefit to toxicity detection, suggesting a potential performance plateau for +LLMs on toxicity detection tasks. We conclude by outlining avenues for future +work in studying LLMs and content moderation. + +
+
+
+
+
+ + ♻ ☆ Fine-grained Hallucination Detection and Editing for Language Models + + +
+ Large language models (LMs) are prone to generate diverse factually incorrect +statements, which are widely called hallucinations. Current approaches +predominantly focus on coarse-grained automatic hallucination detection or +editing, overlooking nuanced error levels. In this paper, we propose a novel +task -- automatic fine-grained hallucination detection -- and present a +comprehensive taxonomy encompassing six hierarchically defined types of +hallucination. To facilitate evaluation, we introduce a new benchmark that +includes fine-grained human judgments on two LM outputs across various domains. +Our analysis reveals that ChatGPT and Llama 2-Chat exhibit hallucinations in +60% and 75% of their outputs, respectively, and a majority of these +hallucinations fall into categories that have been underexplored. As an initial +step to address this, we train FAVA, a retrieval-augmented LM by carefully +designing synthetic data generations to detect and correct fine-grained +hallucinations. On our benchmark, our automatic and human evaluations show that +FAVA significantly outperforms ChatGPT on fine-grained hallucination detection +by a large margin though a large room for future improvement still exists. +FAVA's suggested edits also improve the factuality of LM-generated text, +resulting in 5-10% FActScore improvements. + +
+
+
+
+
+ + ♻ ☆ Paralinguistics-Enhanced Large Language Modeling of Spoken Dialogue ICASSP 2024 + + +
+ Large Language Models (LLMs) have demonstrated superior abilities in tasks +such as chatting, reasoning, and question-answering. However, standard LLMs may +ignore crucial paralinguistic information, such as sentiment, emotion, and +speaking style, which are essential for achieving natural, human-like spoken +conversation, especially when such information is conveyed by acoustic cues. We +therefore propose Paralinguistics-enhanced Generative Pretrained Transformer +(ParalinGPT), an LLM that utilizes text and speech modalities to better model +the linguistic content and paralinguistic attributes of spoken dialogue. The +model takes the conversational context of text, speech embeddings, and +paralinguistic attributes as input prompts within a serialized multitasking +multimodal framework. Specifically, our framework serializes tasks in the order +of current paralinguistic attribute prediction, response paralinguistic +attribute prediction, and response text generation with autoregressive +conditioning. We utilize the Switchboard-1 corpus, including its sentiment +labels as the paralinguistic attribute, as our spoken dialogue dataset. +Experimental results indicate the proposed serialized multitasking method +outperforms typical sequence classification techniques on current and response +sentiment classification. Furthermore, leveraging conversational context and +speech embeddings significantly improves both response text generation and +sentiment prediction. Our proposed framework achieves relative improvements of +6.7%, 12.0%, and 3.5% in current sentiment accuracy, response sentiment +accuracy, and response text BLEU score, respectively. + +
+
+ comment: Accepted by ICASSP 2024. Camera-ready version +
+
+
+
+
+ + ♻ ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and +70B) demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables +aggressive quantization to sub-3 bits with only minor performance degradations. +When finetuned on a language modeling calibration dataset, LQ-LoRA can also be +used for model compression; in this setting our 2.75-bit LLaMA-2-70B model +(which has 2.85 bits on average when including the low-rank components and +requires 27GB of GPU memory) performs respectably compared to the 16-bit +baseline. + +
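+ The core alternating decomposition can be sketched in a few lines of NumPy, as below; uniform quantization and plain SVD stand in for the paper's NF-style quantizer, ILP-driven bit allocation, and Fisher weighting, so this is a conceptual illustration rather than the actual algorithm.
```python
import numpy as np

def lq_decompose(W, rank=8, n_iter=10, n_levels=16):
    """Conceptual sketch of a low-rank-plus-quantized split W ~ Q + L:
    alternately fit a quantized matrix Q to the residual W - L and a rank-r
    matrix L to the residual W - Q. W is expected to be a float NumPy array."""
    def quantize(M):
        lo, hi = M.min(), M.max()
        step = (hi - lo) / (n_levels - 1) or 1.0
        return np.round((M - lo) / step) * step + lo

    def low_rank(M, r):
        U, S, Vt = np.linalg.svd(M, full_matrices=False)
        return (U[:, :r] * S[:r]) @ Vt[:r]

    L = np.zeros_like(W)
    for _ in range(n_iter):
        Q = quantize(W - L)        # memory-efficient component, frozen at finetuning
        L = low_rank(W - Q, rank)  # high-precision component, updated as LoRA factors
    return Q, L
```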
+
+
+
+
+ + ♻ ☆ A Chat About Boring Problems: Studying GPT-based text normalization ICASSP 2024 + + +
+ Text normalization - the conversion of text from written to spoken form - is +traditionally assumed to be an ill-formed task for language models. In this +work, we argue otherwise. We empirically show the capacity of Large-Language +Models (LLM) for text normalization in few-shot scenarios. Combining +self-consistency reasoning with linguistic-informed prompt engineering, we find +LLM based text normalization to achieve error rates around 40\% lower than top +normalization systems. Further, upon error analysis, we note key limitations in +the conventional design of text normalization tasks. We create a new taxonomy +of text normalization errors and apply it to results from GPT-3.5-Turbo and +GPT-4.0. Through this new framework, we can identify strengths and weaknesses +of GPT-based TN, opening opportunities for future work. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ SummaryMixing: A Linear-Complexity Alternative to Self-Attention for + Speech Recognition and Understanding + + +
+ Modern speech processing systems rely on self-attention. Unfortunately, token +mixing with self-attention takes quadratic time in the length of the speech +utterance, slowing down inference as well as training and increasing memory +consumption. Cheaper alternatives to self-attention for ASR have been +developed, but they fail to consistently reach the same level of accuracy. This +paper, therefore, proposes a novel linear-time alternative to self-attention. +It summarises an utterance with the mean over vectors for all time steps. This +single summary is then combined with time-specific information. We call this +method "SummaryMixing". Introducing SummaryMixing in state-of-the-art ASR +models makes it feasible to preserve or exceed previous speech recognition +performance while lowering the training and inference times by up to 28$\%$ and +reducing the memory budget by a factor of two. The benefits of SummaryMixing +can also be generalized to other speech-processing tasks, such as speech +understanding. + +
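+ A minimal PyTorch module capturing the described idea (one utterance-level mean summary combined with per-frame information) might look like the following; the layer sizes, lack of nonlinearities, and the concatenation are assumptions, not the paper's exact parameterisation.
```python
import torch
import torch.nn as nn

class SummaryMixingSketch(nn.Module):
    """Linear-time token mixing: a per-frame local transform is combined with a
    single utterance-level summary (the mean over all time steps)."""
    def __init__(self, dim, hidden):
        super().__init__()
        self.local = nn.Linear(dim, hidden)      # time-specific information
        self.summary = nn.Linear(dim, hidden)    # per-frame contribution to the summary
        self.combine = nn.Linear(2 * hidden, dim)

    def forward(self, x):                        # x: (batch, time, dim)
        s = self.summary(x).mean(dim=1, keepdim=True)   # one summary vector per utterance
        s = s.expand(-1, x.size(1), -1)                 # broadcast it over time
        return self.combine(torch.cat([self.local(x), s], dim=-1))
```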
+
+
+
+
+ + ♻ ☆ CLadder: Assessing Causal Reasoning in Language Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insights into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023; updated with CLadder dataset v1.5 +
+
+
+
+
+ + ♻ ☆ Rational Decision-Making Agent with Internalized Utility Judgment ICLR 2024 + + +
+ Large language models (LLMs) have demonstrated remarkable advancements and +have attracted significant efforts to develop LLMs into agents capable of +executing intricate multi-step decision-making tasks beyond traditional NLP +applications. Existing approaches to LLM-based decision-making predominantly +build upon the manually-designed external performance metrics to guide the +decision-making process. However, reliance on the external performance metrics +as prior is problematic in real-world scenarios, where such prior may be +unavailable, flawed, or even erroneous. For genuine autonomous decision making, +it is imperative for the agent to develop its rationality from its posterior +experiences to judge decisions independently. Central to the development of +rationality is the construction of an internalized utility judgment, capable of +assigning numerical utilities to each decision. This paper proposes RadAgent +(Rational Decision-Making Agent), which fosters the development of its +rationality through an iterative framework involving Experience Exploration and +Utility Learning. Within this framework, Elo-based Utility Construction is +devised to assign Elo scores to individual decision steps to judge their +utilities via pairwise comparisons. Consequently, these Elo scores guide the +decision-making process to derive optimal outcomes. Experimental results on the +ToolBench dataset demonstrate RadAgent's superiority over baselines, achieving +over 10% improvement in Pass Rate on diverse tasks. It offers higher-quality +solutions and reduces costs (ChatGPT API calls), highlighting its effectiveness +and efficiency. + +
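+ The Elo-based utility construction reduces pairwise preferences over decision steps to numerical scores; a standard Elo update, shown below with an assumed K-factor of 32, illustrates the mechanism without claiming to match RadAgent's exact scheme.
```python
def elo_update(rating_a, rating_b, a_wins, k=32):
    """Standard Elo update, illustrating how pairwise comparisons between
    decision steps can be turned into numerical utilities."""
    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400))
    score_a = 1.0 if a_wins else 0.0
    return (rating_a + k * (score_a - expected_a),
            rating_b + k * ((1.0 - score_a) - (1.0 - expected_a)))

# Two candidate steps start at 1000; an LLM judge prefers A:
# elo_update(1000, 1000, a_wins=True) -> (1016.0, 984.0)
```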
+
+ comment: Received 8,6,6,6 scores on ICLR 2024 +
+
+
+
+
+ + ♻ ☆ MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance + + +
+ The deployment of multimodal large language models (MLLMs) has brought forth a unique vulnerability: susceptibility to malicious attacks through visual inputs. We delve into the novel challenge of defending MLLMs against such attacks. We discovered that images act as a "foreign language" that is not considered during alignment, which can make MLLMs prone to producing harmful responses. Unfortunately, unlike the discrete tokens considered in text-based LLMs, the continuous nature of image signals presents significant alignment challenges, making it difficult to cover all possible scenarios thoroughly. This vulnerability is exacerbated by the fact that open-source MLLMs are predominantly fine-tuned on a limited set of image-text pairs that is much smaller than the extensive text-based pretraining corpus, which makes the MLLMs more prone to catastrophic forgetting of their original abilities during explicit alignment tuning. To tackle these challenges, we introduce MLLM-Protector, a plug-and-play strategy combining a lightweight harm detector and a response detoxifier. The harm detector's role is to identify potentially harmful outputs from the MLLM, while the detoxifier corrects these outputs to ensure the response conforms to safety standards. This approach effectively mitigates the risks posed by malicious visual inputs without compromising the model's overall performance. Our results demonstrate that MLLM-Protector offers a robust solution to a previously unaddressed aspect of MLLM security. + +
+
+
+
+
+ + ♻ ☆ On the Hidden Mystery of OCR in Large Multimodal Models + + +
+ Large models have recently played a dominant role in natural language processing and multimodal vision-language learning. However, their effectiveness in text-related visual tasks remains relatively unexplored. In this paper, we conduct a comprehensive evaluation of Large Multimodal Models, such as GPT4V and Gemini, in various text-related visual tasks including Text Recognition, Scene Text-Centric Visual Question Answering (VQA), Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten Mathematical Expression Recognition (HMER). To facilitate the assessment of Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we propose OCRBench, a comprehensive evaluation benchmark. Our study encompasses 29 datasets, making it the most comprehensive OCR evaluation benchmark available. Furthermore, our study reveals both the strengths and weaknesses of these models, particularly in handling multilingual text, handwritten text, non-semantic text, and mathematical expression recognition. Most importantly, the baseline results showcased in this study could provide a foundational framework for the conception and assessment of innovative strategies targeted at enhancing zero-shot multimodal techniques. The evaluation pipeline and benchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR. + +
+
+
+
+
+ + ♻ ☆ Multilingual DistilWhisper: Efficient Distillation of Multi-task Speech + Models via Language-Specific Experts ICASSP 2024 + + +
+ Whisper is a multitask and multilingual speech model covering 99 languages. +It yields commendable automatic speech recognition (ASR) results in a subset of +its covered languages, but the model still underperforms on a non-negligible +number of under-represented languages, a problem exacerbated in smaller model +versions. In this work, we propose DistilWhisper, an approach able to bridge +the performance gap in ASR for these languages while retaining the advantages +of multitask and multilingual capabilities. Our approach involves two key +strategies: lightweight modular ASR fine-tuning of whisper-small using +language-specific experts, and knowledge distillation from whisper-large-v2. +This dual approach allows us to effectively boost ASR performance while keeping +the robustness inherited from the multitask and multilingual pre-training. +Results demonstrate that our approach is more effective than standard +fine-tuning or LoRA adapters, boosting performance in the targeted languages +for both in- and out-of-domain test sets, while introducing only a negligible +parameter overhead at inference. + +
+
+ comment: Accepted to IEEE ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Language Modeling on a SpiNNaker 2 Neuromorphic Chip + + +
+ As large language models continue to scale in size rapidly, so too does the +computational power required to run them. Event-based networks on neuromorphic +devices offer a potential way to reduce energy consumption for inference +significantly. However, to date, most event-based networks that can run on +neuromorphic hardware, including spiking neural networks (SNNs), have not +achieved task performance even on par with LSTM models for language modeling. +As a result, language modeling on neuromorphic devices has seemed a distant +prospect. In this work, we demonstrate the first-ever implementation of a +language model on a neuromorphic device - specifically the SpiNNaker 2 chip - +based on a recently published event-based architecture called the EGRU. +SpiNNaker 2 is a many-core neuromorphic chip designed for large-scale +asynchronous processing, while the EGRU is architected to leverage such +hardware efficiently while maintaining competitive task performance. This +implementation marks the first time a neuromorphic language model matches +LSTMs, setting the stage for taking task performance to the level of large +language models. We also demonstrate results on a gesture recognition task +based on inputs from a DVS camera. Overall, our results showcase the +feasibility of this neuro-inspired neural network in hardware, highlighting +significant gains versus conventional hardware in energy efficiency for the +common use case of single batch inference. + +
+
+
+
+
+ + ♻ ☆ Modelling prospective memory and resilient situated communications via + Wizard of Oz + + +
+ This abstract presents a scenario for human-robot interaction in a home setting involving an older adult and a robot. The scenario is designed to explore the envisioned modelling of memory for communication with a socially assistive robot (SAR). The scenario will enable the gathering of data on failures of speech technology and human-robot communication involving shared memory that may occur during daily activities such as a music-listening activity. + +
+
+ comment: In WTF Workshop Proceedings (arXiv:2401.04108) held in conjunction + with the ACM conference on Conversational User Interfaces (CUI), 19 - 21/07 + 2023, in Eindhoven, The Netherlands +
+
+
+
+
+ + ♻ ☆ Semantic similarity prediction is better than other semantic similarity + measures + + +
+ Semantic similarity between natural language texts is typically measured +either by looking at the overlap between subsequences (e.g., BLEU) or by using +embeddings (e.g., BERTScore, S-BERT). Within this paper, we argue that when we +are only interested in measuring the semantic similarity, it is better to +directly predict the similarity using a fine-tuned model for such a task. Using +a fine-tuned model for the Semantic Textual Similarity Benchmark tasks (STS-B) +from the GLUE benchmark, we define the STSScore approach and show that the +resulting similarity is better aligned with our expectations on a robust +semantic similarity measure than other approaches. + +
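+ In practice, "directly predict the similarity" amounts to scoring a sentence pair with a regression model fine-tuned on STS-B; the sketch below uses a public STS-B cross-encoder from the sentence-transformers hub, which is not necessarily the checkpoint behind STSScore, and whose output scale depends on how it was trained.
```python
from sentence_transformers import CrossEncoder

# Directly predicting semantic similarity with a model fine-tuned on STS-B.
model = CrossEncoder("cross-encoder/stsb-roberta-base")
score = model.predict([("A man is playing a guitar.", "Someone plays the guitar.")])[0]
print(f"predicted semantic similarity: {score:.3f}")
```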
+
+ comment: Accepted at TMLR: https://openreview.net/forum?id=bfsNmgN5je +
+
+
+
+
+ + ♻ ☆ TiMix: Text-aware Image Mixing for Effective Vision-Language + Pre-training AAAI2024 + + +
+ Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances modern Vision-Language Pre-training (VLP) models by aligning visual and linguistic modalities. Due to noises in web-harvested text-image pairs, however, scaling up training data volume in SMCL presents considerable obstacles in terms of computational cost and data inefficiency. To improve data efficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates mix-based data augmentation techniques into SMCL, yielding significant performance improvements without significantly increasing computational overhead. We provide a theoretical analysis of TiMix from a mutual information (MI) perspective, showing that mixed data samples for cross-modal contrastive learning implicitly serve as a regularizer for the contrastive loss. The experimental results demonstrate that TiMix exhibits comparable performance on downstream tasks, even with a reduced amount of training data and shorter training time, when benchmarked against existing methods. This work empirically and theoretically demonstrates the potential of data mixing for data-efficient and computationally viable VLP, benefiting broader VLP model adoption in practical scenarios. + +
+
+ comment: Accepted on AAAI2024 +
+
+
+
+
+ + ♻ ☆ Salute the Classic: Revisiting Challenges of Machine Translation in the + Age of Large Language Models + + +
+ The evolution of Neural Machine Translation (NMT) has been significantly +influenced by six core challenges (Koehn and Knowles, 2017), which have acted +as benchmarks for progress in this field. This study revisits these challenges, +offering insights into their ongoing relevance in the context of advanced Large +Language Models (LLMs): domain mismatch, amount of parallel data, rare word +prediction, translation of long sentences, attention model as word alignment, +and sub-optimal beam search. Our empirical findings indicate that LLMs +effectively lessen the reliance on parallel data for major languages in the +pretraining phase. Additionally, the LLM-based translation system significantly +enhances the translation of long sentences that contain approximately 80 words +and shows the capability to translate documents of up to 512 words. However, +despite these significant improvements, the challenges of domain mismatch and +prediction of rare words persist. While the challenges of word alignment and +beam search, specifically associated with NMT, may not apply to LLMs, we +identify three new challenges for LLMs in translation tasks: inference +efficiency, translation of low-resource languages in the pretraining phase, and +human-aligned evaluation. The datasets and models are released at +https://github.com/pangjh3/LLM4MT. + +
+
+ comment: 17 pages. Longyue Wang is the Corresponding Author +
+
+
+
+
+ + ♻ ☆ See the Unseen: Better Context-Consistent Knowledge-Editing by Noises + + +
+ Knowledge-editing updates knowledge of large language models (LLMs) and contributes to the interpretability and application of LLMs. However, applying knowledge is context-consistent: LLMs can recall the same knowledge in different contexts. Existing works ignore this property, so their editing lacks generalization. In this paper, we empirically find that the effects of different contexts upon LLMs in recalling the same knowledge follow a Gaussian-like distribution. We then sample Gaussian noises to simulate the effects of different contexts when updating LLMs. In this way, we can make LLMs see the unseen contexts where the edited knowledge will be applied, thereby improving the editing generalization. Experimental results on three LLMs demonstrate the effectiveness of our methods and also distinguish our methods from other approaches that fine-tune LLMs with noises. + +
+
+
+
+
+ + ♻ ☆ Dynamic Fault Characteristics Evaluation in Power Grid + + +
+ To enhance the degree of intelligence in operation and maintenance, a novel method for fault detection in power grids is proposed. The proposed GNN-based approach first identifies fault nodes through a specialized feature extraction method coupled with a knowledge graph. By incorporating temporal data, the method leverages the status of nodes from preceding and subsequent time periods to help current fault detection. To validate the effectiveness of the node features, a correlation analysis of the output features from each node was conducted. Experimental results show that this method can locate fault nodes in simulation scenarios with remarkable accuracy. Additionally, the graph neural network based feature modeling allows for a qualitative examination of how faults spread across nodes, which provides valuable insights for analyzing fault nodes. + +
+
+
+
+
+ + ♻ ☆ Dynamic Fault Analysis in Substations Based on Knowledge Graphs + + +
+ To address the challenge of identifying hidden dangers in substations from unstructured text, a novel dynamic analysis method is proposed. We first extract relevant information from the unstructured text, and then leverage a flexible distributed search engine built on Elastic-Search to handle the data. Following this, a hidden Markov model is employed to train the data within the engine. The Viterbi algorithm is integrated to decipher the hidden state sequences, facilitating the segmentation and labeling of entities related to hidden dangers. The final step involves using the Neo4j graph database to dynamically create a knowledge graph that visualizes hidden dangers in the substation. The effectiveness of the proposed method is demonstrated through a case analysis from a specific substation with hidden dangers revealed in the text records. + +
+
+
+
+
+ + ♻ ☆ LightHouse: A Survey of AGI Hallucination + + +
+ With the development of artificial intelligence, large-scale models have +become increasingly intelligent. However, numerous studies indicate that +hallucinations within these large models are a bottleneck hindering the +development of AI research. In the pursuit of achieving strong artificial +intelligence, a significant volume of research effort is being invested in the +AGI (Artificial General Intelligence) hallucination research. Previous +explorations have been conducted in researching hallucinations within LLMs +(Large Language Models). As for multimodal AGI, research on hallucinations is +still in an early stage. To further the progress of research in the domain of +hallucinatory phenomena, we present a bird's eye view of hallucinations in AGI, +summarizing the current work on AGI hallucinations and proposing some +directions for future research. + +
+
+
+
+
+ + ♻ ☆ Towards Best Practices of Activation Patching in Language Models: + Metrics and Methods ICLR 2024 + + +
+ Mechanistic interpretability seeks to understand the internal mechanisms of +machine learning models, where localization -- identifying the important model +components -- is a key step. Activation patching, also known as causal tracing +or interchange intervention, is a standard technique for this task (Vig et al., +2020), but the literature contains many variants with little consensus on the +choice of hyperparameters or methodology. In this work, we systematically +examine the impact of methodological details in activation patching, including +evaluation metrics and corruption methods. In several settings of localization +and circuit discovery in language models, we find that varying these +hyperparameters could lead to disparate interpretability results. Backed by +empirical observations, we give conceptual arguments for why certain metrics or +methods may be preferred. Finally, we provide recommendations for the best +practices of activation patching going forwards. + +
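+ At its core, activation patching caches one component's activation from a clean run and replays it during a corrupted run, then measures how much of the behaviour is restored; the hook-based sketch below assumes the hooked module returns a single tensor and leaves the metric and corruption scheme (the hyperparameters this paper studies) up to the caller.
```python
import torch

def activation_patch(model, layer, clean_inputs, corrupt_inputs, metric):
    """Bare-bones activation patching: cache one module's activation on the
    clean run and replay it during the corrupted run, then let `metric` compare
    clean, corrupted, and patched outputs."""
    cache = {}

    def save_hook(module, inputs, output):
        cache["clean"] = output.detach()

    def patch_hook(module, inputs, output):
        return cache["clean"]            # returning a value overrides the output

    handle = layer.register_forward_hook(save_hook)
    with torch.no_grad():
        clean_out = model(**clean_inputs)
    handle.remove()

    with torch.no_grad():
        corrupt_out = model(**corrupt_inputs)

    handle = layer.register_forward_hook(patch_hook)
    with torch.no_grad():
        patched_out = model(**corrupt_inputs)
    handle.remove()

    return metric(clean_out, corrupt_out, patched_out)
```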
+
+ comment: 27 pages. ICLR 2024 +
+
+
+
+
+ + ♻ ☆ On-Policy Distillation of Language Models: Learning from Self-Generated + Mistakes ICLR 2024 + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to +reduce its inference cost and memory footprint, by training a smaller student +model. However, current KD methods for auto-regressive sequence models suffer +from distribution mismatch between output sequences seen during training and +those generated by the student during inference. To address this issue, we +introduce Generalized Knowledge Distillation (GKD). Instead of solely relying +on a fixed set of output sequences, GKD trains the student on its +self-generated output sequences by leveraging feedback from the teacher on such +sequences. Unlike supervised KD approaches, GKD also offers the flexibility to +employ alternative loss functions between the student and teacher, which can be +useful when the student lacks the expressivity to mimic the teacher's +distribution. Furthermore, GKD facilitates the seamless integration of +distillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for +distilling auto-regressive language models on summarization, translation, and +arithmetic reasoning tasks, and task-agnostic distillation for +instruction-tuning. + +
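+ The key difference from standard KD is that the distillation loss is computed on sequences the student generates itself; the sketch below shows one such step for Hugging-Face-style causal LMs sharing a tokenizer, using a plain forward KL and ignoring prompt masking, padding, GKD's generalised divergences, and the RLHF integration.
```python
import torch
import torch.nn.functional as F

def on_policy_distill_step(student, teacher, prompt_batch):
    """One on-policy distillation step: the student generates its own output
    sequences and is then trained towards the teacher's token distributions on
    those very sequences (a simplified stand-in for the GKD objective)."""
    with torch.no_grad():
        generated = student.generate(**prompt_batch, max_new_tokens=64, do_sample=True)
    student_logits = student(input_ids=generated).logits[:, :-1]
    with torch.no_grad():
        teacher_logits = teacher(input_ids=generated).logits[:, :-1]
    loss = F.kl_div(
        F.log_softmax(student_logits, dim=-1),   # student is pushed to match...
        F.softmax(teacher_logits, dim=-1),       # ...the teacher on its own samples
        reduction="batchmean",
    )
    loss.backward()
    return loss.item()
```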
+
+ comment: Accepted at ICLR 2024. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ DialogueLLM: Context and Emotion Knowledge-Tuned Large Language Models + for Emotion Recognition in Conversations + + +
+ Large language models (LLMs) and their variants have shown extraordinary efficacy across numerous downstream natural language processing (NLP) tasks, which has presented a new vision for the development of NLP. Despite their remarkable performance in natural language generation (NLG), LLMs lack a distinct focus on the emotion understanding domain. As a result, using LLMs for emotion recognition may lead to suboptimal and inadequate precision. Another limitation of LLMs is that they are typically trained without leveraging multi-modal information. To overcome these limitations, we propose DialogueLLM, a context and emotion knowledge tuned LLM that is obtained by fine-tuning LLaMA models with 13,638 multi-modal (i.e., texts and videos) emotional dialogues. The visual information is considered as supplementary knowledge to construct high-quality instructions. We offer a comprehensive evaluation of our proposed model on three benchmarking emotion recognition in conversations (ERC) datasets and compare the results against the SOTA baselines and other SOTA LLMs. Additionally, DialogueLLM-7B can be easily trained using LoRA on a 40GB A100 GPU in 5 hours, facilitating reproducibility for other researchers. + +
+
+
+
+
+ + ♻ ☆ TAROT: A Hierarchical Framework with Multitask Co-Pretraining on + Semi-Structured Data towards Effective Person-Job Fit ICASSP 2024 + + +
+ Person-job fit is an essential part of online recruitment platforms in serving various downstream applications like Job Search and Candidate Recommendation. Recently, pretrained large language models have further enhanced the effectiveness by leveraging richer textual information in user profiles and job descriptions apart from user behavior features and job metadata. However, the general domain-oriented design struggles to capture the unique structural information within user profiles and job descriptions, leading to a loss of latent semantic correlations. We propose TAROT, a hierarchical multitask co-pretraining framework, to better utilize structural and semantic information for informative text embeddings. TAROT targets semi-structured text in profiles and jobs, and it is co-pretrained with multi-grained pretraining tasks to constrain the acquired semantic information at each level. Experiments on a real-world LinkedIn dataset show significant performance improvements, proving its effectiveness in person-job fit tasks. + +
+
+ comment: ICASSP 2024 camera ready. 5 pages, 1 figure, 3 tables +
+
+
+
+
+ + ♻ ☆ Circuit Component Reuse Across Tasks in Transformer Language Models ICLR 2024 + + +
+ Recent work in mechanistic interpretability has shown that behaviors in +language models can be successfully reverse-engineered through circuit +analysis. A common criticism, however, is that each circuit is task-specific, +and thus such analysis cannot contribute to understanding the models at a +higher level. In this work, we present evidence that insights (both low-level +findings about specific heads and higher-level findings about general +algorithms) can indeed generalize across tasks. Specifically, we study the +circuit discovered in Wang et al. (2022) for the Indirect Object Identification +(IOI) task and 1.) show that it reproduces on a larger GPT2 model, and 2.) that +it is mostly reused to solve a seemingly different task: Colored Objects +(Ippolito & Callison-Burch, 2023). We provide evidence that the process +underlying both tasks is functionally very similar, and contains about a 78% +overlap in in-circuit attention heads. We further present a proof-of-concept +intervention experiment, in which we adjust four attention heads in middle +layers in order to 'repair' the Colored Objects circuit and make it behave like +the IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the +Colored Objects task and explain most sources of error. The intervention +affects downstream attention heads in specific ways predicted by their +interactions in the IOI circuit, indicating that this subcircuit behavior is +invariant to the different task inputs. Overall, our results provide evidence +that it may yet be possible to explain large language models' behavior in terms +of a relatively small number of interpretable task-general algorithmic building +blocks and computational components. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Logic-Scaffolding: Personalized Aspect-Instructed Recommendation + Explanation Generation using LLMs WSDM 2024 + + +
+ The unique capabilities of Large Language Models (LLMs), such as the natural +language text generation ability, position them as strong candidates for +providing explanation for recommendations. However, despite the size of the +LLM, most existing models struggle to produce zero-shot explanations reliably. +To address this issue, we propose a framework called Logic-Scaffolding, that +combines the ideas of aspect-based explanation and chain-of-thought prompting +to generate explanations through intermediate reasoning steps. In this paper, +we share our experience in building the framework and present an interactive +demonstration for exploring our results. + +
+
+ comment: The 17th ACM International Conference on Web Search and Data Mining + (WSDM 2024) +
+
+
+
+
+ + ♻ ☆ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety + Training + + +
+ Humans are capable of strategically deceptive behavior: behaving helpfully in +most situations, but then behaving very differently in order to pursue +alternative objectives when given the opportunity. If an AI system learned such +a deceptive strategy, could we detect it and remove it using current +state-of-the-art safety training techniques? To study this question, we +construct proof-of-concept examples of deceptive behavior in large language +models (LLMs). For example, we train models that write secure code when the +prompt states that the year is 2023, but insert exploitable code when the +stated year is 2024. We find that such backdoor behavior can be made +persistent, so that it is not removed by standard safety training techniques, +including supervised fine-tuning, reinforcement learning, and adversarial +training (eliciting unsafe behavior and then training to remove it). The +backdoor behavior is most persistent in the largest models and in models +trained to produce chain-of-thought reasoning about deceiving the training +process, with the persistence remaining even when the chain-of-thought is +distilled away. Furthermore, rather than removing backdoors, we find that +adversarial training can teach models to better recognize their backdoor +triggers, effectively hiding the unsafe behavior. Our results suggest that, +once a model exhibits deceptive behavior, standard techniques could fail to +remove such deception and create a false impression of safety. + +
+
+ comment: updated to add missing acknowledgements +
+
+
+
+
+ + ♻ ☆ Are self-explanations from Large Language Models faithful? + + +
+ Instruction-tuned large language models (LLMs) excel at many tasks, and will even provide explanations for their behavior. Since these models are directly accessible to the public, there is a risk that convincing and wrong explanations can lead to unsupported confidence in LLMs. Therefore, interpretability-faithfulness of self-explanations is an important consideration for AI Safety. Assessing the interpretability-faithfulness of these explanations, termed self-explanations, is challenging as the models are too complex for humans to annotate what is a correct explanation. To address this, we propose employing self-consistency checks as a measure of faithfulness. For example, if an LLM says a set of words is important for making a prediction, then it should not be able to make the same prediction without these words. While self-consistency checks are a common approach to faithfulness, they have not previously been applied to LLMs' self-explanations. We apply self-consistency checks to three types of self-explanations: counterfactuals, importance measures, and redactions. Our work demonstrates that faithfulness is both task and model dependent, e.g., for sentiment classification, counterfactual explanations are more faithful for Llama2, importance measures for Mistral, and redaction for Falcon 40B. Finally, our findings are robust to prompt variations. + +
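+ For importance-measure explanations, the self-consistency check described above boils down to a redaction test; a toy version is sketched below, where `classify` is a placeholder for prompting the LLM and the exact redaction strategy is an assumption.
```python
def importance_is_faithful(classify, tokens, important_idx):
    """Toy redaction-style self-consistency check: if an explanation claims
    these token positions matter, removing them should change the prediction.
    `classify(list_of_tokens) -> label` stands in for prompting the LLM."""
    original = classify(tokens)
    redacted = [t for i, t in enumerate(tokens) if i not in set(important_idx)]
    return classify(redacted) != original   # True -> consistent with the explanation
```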
+
+
+
+
+ + ♻ ☆ LLM-SQL-Solver: Can LLMs Determine SQL Equivalence? + + +
+ Judging the equivalence between two SQL queries is a fundamental problem with many practical applications in data management and SQL generation (i.e., evaluating the quality of generated SQL queries in the text-to-SQL task). While the research community has reasoned about SQL equivalence for decades, it poses considerable difficulties and no complete solutions exist. Recently, Large Language Models (LLMs) have shown strong reasoning capability in conversation, question answering and solving mathematics challenges. In this paper, we study whether LLMs can be used to determine the equivalence between SQL queries under two notions of SQL equivalence (semantic equivalence and relaxed equivalence). To assist LLMs in generating high quality responses, we present two prompting techniques: Miniature & Mull and Explain & Compare. The former technique is used to evaluate semantic equivalence: it asks LLMs to execute a query on a simple database instance and then explore whether a counterexample exists by modifying the database. The latter technique is used to evaluate relaxed equivalence: it asks LLMs to explain the queries and then compare whether they contain significant logical differences. Our experiments demonstrate that, with our techniques, LLMs are a promising tool for helping data engineers write semantically equivalent SQL queries; however, challenges still persist. They also suggest that LLM-based equivalence checking is a better metric for evaluating SQL generation than the popular execution accuracy. + +
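+ As an illustration of the Miniature & Mull idea (and not the authors' actual prompt), a prompt template along these lines could be used to elicit the execute-then-perturb behaviour:
```python
# Illustrative prompt template in the spirit of "Miniature & Mull"; the wording
# is an assumption, not the paper's prompt.
MINIATURE_AND_MULL = """You are checking whether two SQL queries are equivalent.
1. Construct a tiny example database instance for the schema below.
2. Execute both queries on that instance and compare the results.
3. Try to modify the instance to find a counterexample where the results differ.
Schema: {schema}
Query A: {query_a}
Query B: {query_b}
Answer "equivalent", or show the counterexample instance and the differing results."""

prompt = MINIATURE_AND_MULL.format(
    schema="CREATE TABLE emp(id INT, dept TEXT, salary INT);",
    query_a="SELECT dept, MAX(salary) FROM emp GROUP BY dept;",
    query_b="SELECT dept, salary FROM emp WHERE salary = (SELECT MAX(salary) FROM emp);",
)
```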
+
+
+
+
+ + ♻ ☆ RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on + Agriculture + + +
+ There are two common ways in which developers incorporate proprietary +and domain-specific data when building applications of Large Language Models +(LLMs): Retrieval-Augmented Generation (RAG) and fine-tuning. RAG augments the +prompt with the external data, while fine-tuning incorporates the additional +knowledge into the model itself. However, the pros and cons of both approaches +are not well understood. In this paper, we propose a pipeline for fine-tuning +and RAG, and present the tradeoffs of both for multiple popular LLMs, including +Llama2-13B, GPT-3.5, and GPT-4. Our pipeline consists of multiple stages, +including extracting information from PDFs, generating questions and answers, +using them for fine-tuning, and leveraging GPT-4 for evaluating the results. We +propose metrics to assess the performance of different stages of the RAG and +fine-tuning pipeline. We conduct an in-depth study on an agricultural dataset. +Agriculture as an industry has not seen much penetration of AI, and we study a +potentially disruptive application - what if we could provide location-specific +insights to a farmer? Our results show the effectiveness of our dataset +generation pipeline in capturing geographic-specific knowledge, and the +quantitative and qualitative benefits of RAG and fine-tuning. We see an +accuracy increase of over 6 p.p. when fine-tuning the model, and this is +cumulative with RAG, which increases accuracy by a further 5 p.p. In one +particular experiment, we also demonstrate that the fine-tuned model leverages +information from across geographies to answer specific questions, increasing +answer similarity from 47% to 72%. Overall, the results point to how systems +built using LLMs can be adapted to respond to and incorporate knowledge across a +dimension that is critical for a specific industry, paving the way for further +applications of LLMs in other industrial domains. + +
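For readers unfamiliar with the RAG side of the comparison, the following minimal sketch shows the core retrieve-then-prompt step; TF-IDF retrieval and the sample documents are placeholders for the retrieval component and agricultural corpus described above, not the paper's actual pipeline.

```python
# Minimal retrieve-then-prompt sketch. TF-IDF retrieval and the toy documents
# are placeholders; they are not the paper's pipeline or data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

documents = [
    "Planting winter wheat in early October reduces rust risk in the valley.",
    "Drip irrigation schedules for almonds typically peak in July.",
    "Cover crops such as clover improve soil nitrogen before maize planting.",
]
vectorizer = TfidfVectorizer().fit(documents)
doc_matrix = vectorizer.transform(documents)

def build_rag_prompt(question, k=2):
    sims = cosine_similarity(vectorizer.transform([question]), doc_matrix)[0]
    context = "\n".join(documents[i] for i in sims.argsort()[::-1][:k])
    return (f"Answer the question using only the context below.\n\n"
            f"Context:\n{context}\n\nQuestion: {question}\nAnswer:")

print(build_rag_prompt("When should winter wheat be planted?"))
```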
+
+
+
+
+ + ♻ ☆ Detecting Check-Worthy Claims in Political Debates, Speeches, and + Interviews Using Audio Data + + +
+ Developing tools to automatically detect check-worthy claims in political +debates and speeches can greatly help moderators of debates, journalists, and +fact-checkers. While previous work on this problem has focused exclusively on +the text modality, here we explore the utility of the audio modality as an +additional input. We create a new multimodal dataset (text and audio in +English) containing 48 hours of speech from past political debates in the USA. +We then experimentally demonstrate that, in the case of multiple speakers, +adding the audio modality yields sizable improvements over using the text +modality alone; moreover, an audio-only model could outperform a text-only one +for a single speaker. With the aim to enable future research, we make all our +data and code publicly available at +https://github.com/petar-iv/audio-checkworthiness-detection. + +
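The abstract does not give implementation details, but a common way to add an audio modality is late fusion: concatenate text and audio embeddings and train a classifier on top. The sketch below uses synthetic features purely to illustrate that pattern, not the paper's models or data.

```python
# Illustrative late-fusion baseline for check-worthiness detection.
# Features are synthetic placeholders, not the paper's embeddings or dataset.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

rng = np.random.default_rng(0)
n = 200
text_emb = rng.normal(size=(n, 768))   # e.g. sentence-encoder features
audio_emb = rng.normal(size=(n, 256))  # e.g. pooled speech-encoder features
y = rng.integers(0, 2, size=n)         # 1 = check-worthy claim

fused = np.concatenate([text_emb, audio_emb], axis=1)
split = int(0.8 * n)
clf = LogisticRegression(max_iter=1000).fit(fused[:split], y[:split])
print("F1:", f1_score(y[split:], clf.predict(fused[split:])))
```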
+
+ comment: Check-Worthiness, Fact-Checking, Fake News, Misinformation, + Disinformation, Political Debates, Multimodality +
+
+
+
+
+ + ♻ ☆ CMMLU: Measuring massive multitask language understanding in Chinese + + +
+ As the capabilities of large language models (LLMs) continue to advance, +evaluating their performance becomes increasingly crucial and challenging. This +paper aims to bridge this gap by introducing CMMLU, a comprehensive Chinese +benchmark that covers various subjects, including natural science, social +sciences, engineering, and humanities. We conduct a thorough evaluation of 18 +advanced multilingual- and Chinese-oriented LLMs, assessing their performance +across different subjects and settings. The results reveal that most existing +LLMs struggle to achieve an average accuracy of 50%, even when provided with +in-context examples and chain-of-thought prompts, whereas the random baseline +stands at 25%. This highlights significant room for improvement in LLMs. +Additionally, we conduct extensive experiments to identify factors impacting +the models' performance and propose directions for enhancing LLMs. CMMLU fills +the gap in evaluating the knowledge and reasoning capabilities of large +language models within the Chinese context. + +
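The 25% random baseline mentioned above follows directly from four-way multiple choice; a toy evaluation harness makes the arithmetic explicit (the questions here are placeholders, not CMMLU items).

```python
# Toy multiple-choice harness illustrating the 25% random baseline.
# The questions are placeholders, not CMMLU data.
import random

questions = [{"choices": ["A", "B", "C", "D"], "answer": random.choice("ABCD")}
             for _ in range(10000)]

def accuracy(answer_fn, qs):
    return sum(answer_fn(q) == q["answer"] for q in qs) / len(qs)

print(f"{accuracy(lambda q: random.choice(q['choices']), questions):.1%}")  # ~25%
```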
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 116 + +
+
+
+ + ☆ GARField: Group Anything with Radiance Fields + + +
+ Grouping is inherently ambiguous due to the multiple levels of granularity in +which one can decompose a scene -- should the wheels of an excavator be +considered separate or part of the whole? We present Group Anything with +Radiance Fields (GARField), an approach for decomposing 3D scenes into a +hierarchy of semantically meaningful groups from posed image inputs. To do this +we embrace group ambiguity through physical scale: by optimizing a +scale-conditioned 3D affinity feature field, a point in the world can belong to +different groups of different sizes. We optimize this field from a set of 2D +masks provided by Segment Anything (SAM) in a way that respects coarse-to-fine +hierarchy, using scale to consistently fuse conflicting masks from different +viewpoints. From this field we can derive a hierarchy of possible groupings via +automatic tree construction or user interaction. We evaluate GARField on a +variety of in-the-wild scenes and find it effectively extracts groups at many +levels: clusters of objects, objects, and various subparts. GARField inherently +represents multi-view consistent groupings and produces higher fidelity groups +than the input SAM masks. GARField's hierarchical grouping could have exciting +downstream applications such as 3D asset extraction or dynamic scene +understanding. See the project website at https://www.garfield.studio/ + +
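One way to picture the scale-conditioned affinity field described above is as an MLP that maps a 3D point together with a physical scale to a unit-norm embedding, so the same points can group differently at different scales. The sketch below conveys only that picture; the layer sizes and usage are illustrative assumptions, not GARField's actual architecture or training signal.

```python
# Sketch of a scale-conditioned affinity field: (xyz, scale) -> unit-norm feature.
# Layer sizes are illustrative; this is not GARField's actual configuration.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaleConditionedField(nn.Module):
    def __init__(self, feat_dim=64):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(3 + 1, 128), nn.ReLU(),
            nn.Linear(128, 128), nn.ReLU(),
            nn.Linear(128, feat_dim),
        )

    def forward(self, xyz, scale):
        return F.normalize(self.mlp(torch.cat([xyz, scale], dim=-1)), dim=-1)

field = ScaleConditionedField()
points = torch.rand(4, 3)
feat_small = field(points, torch.full((4, 1), 0.05))
feat_large = field(points, torch.full((4, 1), 1.0))
# The same points can yield different pairwise affinities at different scales.
aff_small, aff_large = feat_small @ feat_small.T, feat_large @ feat_large.T
```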
+
+ comment: Project site: https://www.garfield.studio/ First three authors + contributed equally +
+
+
+
+
+ + ☆ Vision Mamba: Efficient Visual Representation Learning with + Bidirectional State Space Model + + +
+ Recently the state space models (SSMs) with efficient hardware-aware designs, +i.e., Mamba, have shown great potential for long sequence modeling. Building +efficient and generic vision backbones purely upon SSMs is an appealing +direction. However, representing visual data is challenging for SSMs due to the +position-sensitivity of visual data and the requirement of global context for +visual understanding. In this paper, we show that the reliance of visual +representation learning on self-attention is not necessary and propose a new +generic vision backbone with bidirectional Mamba blocks (Vim), which marks the +image sequences with position embeddings and compresses the visual +representation with bidirectional state space models. On ImageNet +classification, COCO object detection, and ADE20k semantic segmentation tasks, +Vim achieves higher performance compared to well-established vision +transformers like DeiT, while also demonstrating significantly improved +computation & memory efficiency. For example, Vim is 2.8$\times$ faster than +DeiT and saves 86.8% GPU memory when performing batch inference to extract +features on images with a resolution of 1248$\times$1248. The results +demonstrate that Vim is capable of overcoming the computation & memory +constraints on performing Transformer-style understanding for high-resolution +images and it has great potential to become the next-generation backbone for +vision foundation models. Code is available at https://github.com/hustvl/Vim. + +
+
+ comment: Work in progress. Code is available at https://github.com/hustvl/Vim +
+
+
+
+
+ + ☆ TextureDreamer: Image-guided Texture Synthesis through Geometry-aware + Diffusion + + +
+ We present TextureDreamer, a novel image-guided texture synthesis method to +transfer relightable textures from a small number of input images (3 to 5) to +target 3D shapes across arbitrary categories. Texture creation is a pivotal +challenge in vision and graphics. Industrial companies hire experienced artists +to manually craft textures for 3D assets. Classical methods require densely +sampled views and accurately aligned geometry, while learning-based methods are +confined to category-specific shapes within the dataset. In contrast, +TextureDreamer can transfer highly detailed, intricate textures from real-world +environments to arbitrary objects with only a few casually captured images, +potentially significantly democratizing texture creation. Our core idea, +personalized geometry-aware score distillation (PGSD), draws inspiration from +recent advancements in diffusion models, including personalized modeling for +texture information extraction, variational score distillation for detailed +appearance synthesis, and explicit geometry guidance with ControlNet. Our +integration and several essential modifications substantially improve the +texture quality. Experiments on real images spanning different categories show +that TextureDreamer can successfully transfer highly realistic, semantically +meaningful textures to arbitrary objects, surpassing the visual quality of +the previous state of the art. + +
+
+ comment: Project page: https://texturedreamer.github.io +
+
+
+
+
+ + ☆ Vlogger: Make Your Dream A Vlog + + +
+ In this work, we present Vlogger, a generic AI system for generating a +minute-level video blog (i.e., vlog) from user descriptions. Different from short +videos lasting only a few seconds, a vlog often contains a complex storyline with +diversified scenes, which is challenging for most existing video generation +approaches. To break through this bottleneck, our Vlogger smartly leverages a +Large Language Model (LLM) as Director and decomposes the long video generation +task of vlogs into four key stages, where we invoke various foundation models to +play the critical roles of vlog professionals, including (1) Script, (2) Actor, +(3) ShowMaker, and (4) Voicer. With such a human-mimicking design, +our Vlogger can generate vlogs through explainable cooperation of top-down +planning and bottom-up shooting. Moreover, we introduce a novel video diffusion +model, ShowMaker, which serves as a videographer in our Vlogger for generating +the video snippet of each shooting scene. By incorporating Script and Actor +attentively as textual and visual prompts, it can effectively enhance +spatial-temporal coherence in the snippet. Besides, we design a concise mixed +training paradigm for ShowMaker, boosting its capacity for both T2V generation +and prediction. Finally, the extensive experiments show that our method +achieves state-of-the-art performance on zero-shot T2V generation and +prediction tasks. More importantly, Vlogger can generate over 5-minute vlogs +from open-world descriptions, without loss of video coherence on script and +actor. The code and models are available at +https://github.com/zhuangshaobin/Vlogger. + +
+
+ comment: 16 pages, 8 figures, 11 tables +
+
+
+
+
+ + ☆ POP-3D: Open-Vocabulary 3D Occupancy Prediction from Images NeurIPS 2023 + + +
+ We describe an approach to predict open-vocabulary 3D semantic voxel +occupancy map from input 2D images with the objective of enabling 3D grounding, +segmentation and retrieval of free-form language queries. This is a challenging +problem because of the 2D-3D ambiguity and the open-vocabulary nature of the +target tasks, where obtaining annotated training data in 3D is difficult. The +contributions of this work are three-fold. First, we design a new model +architecture for open-vocabulary 3D semantic occupancy prediction. The +architecture consists of a 2D-3D encoder together with occupancy prediction and +3D-language heads. The output is a dense voxel map of 3D grounded language +embeddings enabling a range of open-vocabulary tasks. Second, we develop a +tri-modal self-supervised learning algorithm that leverages three modalities: +(i) images, (ii) language and (iii) LiDAR point clouds, and enables training +the proposed architecture using a strong pre-trained vision-language model +without the need for any 3D manual language annotations. Finally, we +demonstrate quantitatively the strengths of the proposed model on several +open-vocabulary tasks: Zero-shot 3D semantic segmentation using existing +datasets; 3D grounding and retrieval of free-form language queries, using a +small dataset that we propose as an extension of nuScenes. You can find the +project page here https://vobecant.github.io/POP3D. + +
+
+ comment: accepted to NeurIPS 2023 +
+
+
+
+
+ + ☆ Tri$^{2}$-plane: Volumetric Avatar Reconstruction with Feature Pyramid + + +
+ Recent years have witnessed considerable achievements in facial avatar +reconstruction with neural volume rendering. Despite notable advancements, the +reconstruction of complex and dynamic head movements from monocular videos +still struggles with capturing and restoring fine-grained details. In this work, +we propose a novel approach, named Tri$^2$-plane, for monocular photo-realistic +volumetric head avatar reconstructions. Distinct from the existing works that +rely on a single tri-plane deformation field for dynamic facial modeling, the +proposed Tri$^2$-plane leverages the principle of feature pyramids, with three +top-down lateral connections across tri-planes to improve detail. It samples +and renders facial details at multiple scales, transitioning from the entire +face to specific local regions and then to even more refined sub-regions. +Moreover, we incorporate a camera-based geometry-aware sliding window method as +an augmentation in training, which improves the robustness beyond the canonical +space, with a particular improvement in cross-identity generation capabilities. +Experimental outcomes indicate that the Tri$^2$-plane not only surpasses +existing methodologies but also achieves superior performance across both +quantitative metrics and qualitative assessments. + +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ☆ Diverse Part Synthesis for 3D Shape Creation + + +
+ Methods that use neural networks for synthesizing 3D shapes in the form of a +part-based representation have been introduced over the last few years. These +methods represent shapes as a graph or hierarchy of parts and enable a variety +of applications such as shape sampling and reconstruction. However, current +methods do not allow easily regenerating individual shape parts according to +user preferences. In this paper, we investigate techniques that allow the user +to generate multiple, diverse suggestions for individual parts. Specifically, +we experiment with multimodal deep generative models that allow sampling +diverse suggestions for shape parts and focus on models which have not been +considered in previous work on shape synthesis. To provide a comparative study +of these techniques, we introduce a method for synthesizing 3D shapes in a +part-based representation and evaluate all the part suggestion techniques +within this synthesis method. In our method, which is inspired by previous +work, shapes are represented as a set of parts in the form of implicit +functions which are then positioned in space to form the final shape. Synthesis +in this representation is enabled by a neural network architecture based on an +implicit decoder and a spatial transformer. We compare the various multimodal +generative models by evaluating their performance in generating part +suggestions. Our contribution is to show with qualitative and quantitative +evaluations which of the new techniques for multimodal part generation perform +the best and that a synthesis method based on the top-performing techniques +allows the user to more finely control the parts that are generated in the 3D +shapes while maintaining high shape fidelity when reconstructing shapes. + +
+
+
+
+
+ + ☆ SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene + Understanding + + +
+ 3D vision-language grounding, which focuses on aligning language with the 3D +physical environment, stands as a cornerstone in the development of embodied +agents. In comparison to recent advancements in the 2D domain, grounding +language in 3D scenes faces several significant challenges: (i) the inherent +complexity of 3D scenes due to the diverse object configurations, their rich +attributes, and intricate relationships; (ii) the scarcity of paired 3D +vision-language data to support grounded learning; and (iii) the absence of a +unified learning framework to distill knowledge from grounded 3D data. In this +work, we aim to address these three major challenges in 3D vision-language by +examining the potential of systematically upscaling 3D vision-language learning +in indoor environments. We introduce the first million-scale 3D vision-language +dataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising +2.5M vision-language pairs derived from both human annotations and our scalable +scene-graph-based generation approach. We demonstrate that this scaling allows +for a unified pre-training framework, Grounded Pre-training for Scenes (GPS), +for 3D vision-language learning. Through extensive experiments, we showcase the +effectiveness of GPS by achieving state-of-the-art performance on all existing +3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is +unveiled through zero-shot transfer experiments in the challenging 3D +vision-language tasks. Project website: https://scene-verse.github.io . + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ To deform or not: treatment-aware longitudinal registration for breast + DCE-MRI during neoadjuvant chemotherapy via unsupervised keypoints detection + + +
+ Clinicians compare breast DCE-MRI after neoadjuvant chemotherapy (NAC) with +pre-treatment scans to evaluate the response to NAC. Clinical evidence supports +that accurate longitudinal deformable registration without deforming treated +tumor regions is key to quantifying tumor changes. We propose a conditional +pyramid registration network based on unsupervised keypoint detection and +selective volume preservation to quantify changes over time. In this approach, we +extract structural and abnormal keypoints from DCE-MRI, use the +structural keypoints in the registration algorithm to restrict large +deformations, and employ a volume-preserving loss based on the abnormal keypoints to +keep the volume of the tumor unchanged after registration. We use a clinical +dataset with 1630 MRI scans from 314 patients treated with NAC. The results +demonstrate that our method achieves better registration performance and better volume +preservation of the tumors. Furthermore, a local-global-combining biomarker +based on the proposed method achieves high accuracy in pathological complete +response (pCR) prediction, indicating that predictive information exists +outside tumor regions. The biomarkers could potentially be used to avoid +unnecessary surgeries for certain patients. It may be valuable for clinicians +and/or computer systems to conduct follow-up tumor segmentation and response +prediction on images registered by our method. Our code is available on +\url{https://github.com/fiy2W/Treatment-aware-Longitudinal-Registration}. + +
+
+
+
+
+ + ☆ Event-Based Visual Odometry on Non-Holonomic Ground Vehicles 3DV 2024 + + +
+ Despite the promise of superior performance under challenging conditions, +event-based motion estimation remains a hard problem owing to the difficulty of +extracting and tracking stable features from event streams. In order to +robustify the estimation, it is generally believed that fusion with other +sensors is a requirement. In this work, we demonstrate reliable, purely +event-based visual odometry on planar ground vehicles by employing the +constrained non-holonomic motion model of Ackermann steering platforms. We +extend single feature n-linearities for regular frame-based cameras to the case +of quasi time-continuous event-tracks, and achieve a polynomial form via +variable degree Taylor expansions. Robust averaging over multiple event tracks +is simply achieved via histogram voting. As demonstrated on both simulated and +real data, our algorithm achieves accurate and robust estimates of the +vehicle's instantaneous rotational velocity, and thus results that are +comparable to the delta rotations obtained by frame-based sensors under normal +conditions. We furthermore significantly outperform the more traditional +alternatives in challenging illumination scenarios. The code is available at +\url{https://github.com/gowanting/NHEVO}. + +
+
+ comment: Accepted by 3DV 2024 +
+
+
+
+
+ + ☆ Online Stability Improvement of Groebner Basis Solvers using Deep + Learning 3DV 2019 + + +
+ Over the past decade, Gr\"obner basis theory and automatic solver +generation have led to a large number of solutions to geometric vision +problems. In practically all cases, the derived solvers apply a fixed +elimination template to calculate the Gr\"obner basis and thereby identify the +zero-dimensional variety of the original polynomial constraints. However, it is +clear that different variable or monomial orderings lead to different +elimination templates, and we show that they may present a large variability in +accuracy for a certain instance of a problem. The present paper has two +contributions. We first show that for a common class of problems in geometric +vision, variable reordering simply translates into a permutation of the columns +of the initial coefficient matrix, and that -- as a result -- one and the same +elimination template can be reused in different ways, each one leading to +potentially different accuracy. We then prove that the original set of +coefficients may contain sufficient information to train a classifier for +online selection of a good solver, most notably at the cost of only a small +computational overhead. We demonstrate wide applicability on +generic dense polynomial problem solvers, as well as on a concrete solver from +geometric vision. + +
+
+ comment: Accepted by 3DV 2019 +
+
+
+
+
+ + ☆ Siamese Meets Diffusion Network: SMDNet for Enhanced Change Detection in + High-Resolution RS Imagery + + +
+ Recently, the application of deep learning to change detection (CD) has +significantly progressed in remote sensing images. In recent years, CD tasks +have mostly used architectures such as CNN and Transformer to identify these +changes. However, these architectures have shortcomings in representing +boundary details and are prone to false alarms and missed detections under +complex lighting and weather conditions. For that, we propose a new network, +Siamese Meets Diffusion Network (SMDNet). This network combines the Siam-U2Net +Feature Differential Encoder (SU-FDE) and the denoising diffusion implicit +model to improve the accuracy of image edge change detection and enhance the +model's robustness under environmental changes. First, we propose an innovative +SU-FDE module that utilizes shared weight features to capture differences +between time series images and identify similarities between features to +enhance edge detail detection. Furthermore, we add an attention mechanism to +identify key coarse features to improve the model's sensitivity and accuracy. +Finally, the diffusion model of progressive sampling is used to fuse key coarse +features, and the noise reduction ability of the diffusion model and the +advantages of capturing the probability distribution of image data are used to +enhance the adaptability of the model in different environments. Our method's +combination of feature extraction and diffusion models demonstrates +effectiveness in change detection in remote sensing images. The performance +evaluation of SMDNet on LEVIR-CD, DSIFN-CD, and CDD datasets yields validated +F1 scores of 90.99%, 88.40%, and 88.47%, respectively. This substantiates the +advanced capabilities of our model in accurately identifying variations and +intricate details. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Tight Fusion of Events and Inertial Measurements for Direct Velocity + Estimation + + +
+ Traditional visual-inertial state estimation targets absolute camera poses +and spatial landmark locations while first-order kinematics are typically +resolved as an implicitly estimated sub-state. However, this poses a risk in +velocity-based control scenarios, as the quality of the estimation of +kinematics depends on the stability of absolute camera and landmark coordinates +estimation. To address this issue, we propose a novel solution to tight +visual-inertial fusion directly at the level of first-order kinematics by +employing a dynamic vision sensor instead of a normal camera. More +specifically, we leverage trifocal tensor geometry to establish an incidence +relation that directly depends on events and camera velocity, and demonstrate +how velocity estimates in highly dynamic situations can be obtained over short +time intervals. Noise and outliers are dealt with using a nested two-layer +RANSAC scheme. Additionally, smooth velocity signals are obtained from a tight +fusion with pre-integrated inertial signals using a sliding window optimizer. +Experiments on both simulated and real data demonstrate that the proposed tight +event-inertial fusion leads to continuous and reliable velocity estimation in +highly dynamic scenarios independently of absolute coordinates. Furthermore, in +extreme cases, it achieves more stable and more accurate estimation of +kinematics than traditional, point-position-based visual-inertial odometry. + +
+
+ comment: Accepted by IEEE Transactions on Robotics (T-RO) +
+
+
+
+
+ + ☆ A gradient-based approach to fast and accurate head motion compensation + in cone-beam CT + + +
+ Cone-beam computed tomography (CBCT) systems, with their portability, present +a promising avenue for direct point-of-care medical imaging, particularly in +critical scenarios such as acute stroke assessment. However, the integration of +CBCT into clinical workflows faces challenges, primarily linked to long scan +duration resulting in patient motion during scanning and leading to image +quality degradation in the reconstructed volumes. This paper introduces a novel +approach to CBCT motion estimation using a gradient-based optimization +algorithm, which leverages generalized derivatives of the backprojection +operator for cone-beam CT geometries. Building on that, a fully differentiable +target function is formulated which grades the quality of the current motion +estimate in reconstruction space. We drastically accelerate motion estimation +yielding a 19-fold speed-up compared to existing methods. Additionally, we +investigate the architecture of networks used for quality metric regression and +propose predicting voxel-wise quality maps, favoring autoencoder-like +architectures over contracting ones. This modification improves gradient flow, +leading to more accurate motion estimation. The presented method is evaluated +through realistic experiments on head anatomy. It achieves a reduction in +reprojection error from an initial average of 3mm to 0.61mm after motion +compensation and consistently demonstrates superior performance compared to +existing approaches. The analytic Jacobian for the backprojection operation, +which is at the core of the proposed method, is made publicly available. In +summary, this paper contributes to the advancement of CBCT integration into +clinical workflows by proposing a robust motion estimation approach that +enhances efficiency and accuracy, addressing critical challenges in +time-sensitive scenarios. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ PixelDINO: Semi-Supervised Semantic Segmentation for Detecting + Permafrost Disturbances + + +
+ Arctic Permafrost is facing significant changes due to global climate change. +As these regions are largely inaccessible, remote sensing plays a crucial role +in better understanding the underlying processes not just on a local scale, but +across the Arctic. In this study, we focus on the remote detection of +retrogressive thaw slumps (RTS), a permafrost disturbance comparable to +landslides induced by thawing. For such analyses from space, deep learning has +become an indispensable tool, but limited labelled training data remains a +challenge for training accurate models. To improve model generalization across +the Arctic without the need for additional labelled data, we present a +semi-supervised learning approach to train semantic segmentation models to +detect RTS. Our framework, called PixelDINO, is trained in parallel on labelled +data as well as unlabelled data. For the unlabelled data, the model segments +the imagery into self-taught pseudo-classes and the training procedure ensures +consistency of these pseudo-classes across strong augmentations of the input +data. Our experimental results demonstrate that PixelDINO can improve model +performance over both supervised baseline methods and existing +semi-supervised semantic segmentation approaches, highlighting its potential +for training robust models that generalize well to regions that were not +included in the training data. The project page containing code and other +materials for this study can be found at +\url{https://khdlr.github.io/PixelDINO/}. + +
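The core of the unlabelled-data branch described above is a consistency objective: pseudo-classes predicted on a weakly augmented view supervise the prediction on a strongly augmented view. The sketch below shows that generic self-training pattern; PixelDINO's actual losses and augmentations differ in detail.

```python
# Generic pseudo-class consistency loss across augmentations (the general
# self-training pattern; PixelDINO's exact formulation differs in detail).
import torch
import torch.nn.functional as F

def consistency_loss(model, weak_view, strong_view):
    with torch.no_grad():
        pseudo = model(weak_view).argmax(dim=1)   # (B, H, W) pseudo-classes
    logits_strong = model(strong_view)            # (B, C, H, W) logits
    return F.cross_entropy(logits_strong, pseudo)

# usage sketch: loss = supervised_ce + lambda_u * consistency_loss(model, weak, strong)
```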
+
+
+
+
+ + ☆ P$^2$OT: Progressive Partial Optimal Transport for Deep Imbalanced + Clustering ICLR2024 + + +
+ Deep clustering, which learns representations and semantic clusters without +label information, poses a great challenge for deep learning-based approaches. +Despite significant progress in recent years, most existing methods focus on +uniformly distributed datasets, significantly limiting their practical +applicability. In this paper, we first introduce a more +practical problem setting named deep imbalanced clustering, where the +underlying classes exhibit an imbalanced distribution. To tackle this problem, +we propose a novel pseudo-labeling-based learning framework. Our framework +formulates pseudo-label generation as a progressive partial optimal transport +problem, which progressively transports each sample to imbalanced clusters +under prior distribution constraints, thus generating imbalance-aware +pseudo-labels and learning from high-confidence samples. In addition, we +transform the initial formulation into an unbalanced optimal transport problem +with augmented constraints, which can be solved efficiently by a fast matrix +scaling algorithm. Experiments on various datasets, including a human-curated +long-tailed CIFAR100, challenging ImageNet-R, and large-scale subsets of +fine-grained iNaturalist2018 datasets, demonstrate the superiority of our +method. + +
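The "fast matrix scaling algorithm" mentioned above is, in the common entropic optimal transport setting, a Sinkhorn-style alternation between row and column normalizations. The sketch below shows that generic scaling step for producing prior-aware soft pseudo-labels; the paper's progressive partial formulation adds constraints that are not reproduced here.

```python
# Generic Sinkhorn-style matrix scaling for prior-aware soft pseudo-labels.
# This sketches the scaling idea only; the paper's progressive *partial* OT
# formulation adds further constraints.
import numpy as np

def sinkhorn_pseudo_labels(logits, cluster_prior, eps=0.05, n_iter=50):
    """logits: (n_samples, n_clusters); cluster_prior sums to 1 and may be imbalanced."""
    Q = np.exp(logits / eps)
    Q /= Q.sum()
    r = np.full(logits.shape[0], 1.0 / logits.shape[0])  # uniform mass per sample
    c = np.asarray(cluster_prior, dtype=float)           # imbalanced cluster prior
    for _ in range(n_iter):
        Q *= (r / Q.sum(axis=1))[:, None]  # match row marginals
        Q *= (c / Q.sum(axis=0))[None, :]  # match column (cluster) marginals
    return Q / Q.sum(axis=1, keepdims=True)  # per-sample soft pseudo-labels

rng = np.random.default_rng(0)
soft = sinkhorn_pseudo_labels(rng.normal(size=(100, 5)),
                              cluster_prior=[0.4, 0.3, 0.15, 0.1, 0.05])
print(soft.sum(axis=0) / 100)  # column mass roughly follows the prior
```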
+
+ comment: Accepted by ICLR2024 +
+
+
+
+
+ + ☆ An Efficient Generalizable Framework for Visuomotor Policies via + Control-aware Augmentation and Privilege-guided Distillation + + +
+ Visuomotor policies, which learn control mechanisms directly from +high-dimensional visual observations, confront challenges in adapting to new +environments with intricate visual variations. Data augmentation emerges as a +promising method for bridging these generalization gaps by enriching data +variety. However, straightforwardly augmenting the entire observation can +impose excessive burdens on policy learning and may even result in performance +degradation. In this paper, we propose to improve the generalization ability of +visuomotor policies as well as preserve training stability from two aspects: 1) +We learn a control-aware mask through a self-supervised reconstruction task +with three auxiliary losses and then apply strong augmentation only to those +control-irrelevant regions based on the mask to reduce the generalization gaps. +2) To address training instability issues prevalent in visual reinforcement +learning (RL), we distill the knowledge from a pretrained RL expert that processes +low-level environment states into the student visuomotor policy. The policy is +subsequently deployed to unseen environments without any further finetuning. We +conducted comparison and ablation studies across various benchmarks: the +DMControl Generalization Benchmark (DMC-GB), the enhanced Robot Manipulation +Distraction Benchmark (RMDB), and a specialized long-horizon drawer-opening +robotic task. The extensive experimental results clearly demonstrate the +effectiveness of our method, e.g., showing a 17\% improvement over previous +methods in the video-hard setting of DMC-GB. + +
+
+
+
+
+ + ☆ 3D Scene Geometry Estimation from 360$^\circ$ Imagery: A Survey + + +
+ This paper provides a comprehensive survey on pioneer and state-of-the-art 3D +scene geometry estimation methodologies based on single, two, or multiple +images captured under the omnidirectional optics. We first revisit the basic +concepts of the spherical camera model, and review the most common acquisition +technologies and representation formats suitable for omnidirectional (also +called 360$^\circ$, spherical or panoramic) images and videos. We then survey +monocular layout and depth inference approaches, highlighting the recent +advances in learning-based solutions suited for spherical data. The classical +stereo matching is then revised on the spherical domain, where methodologies +for detecting and describing sparse and dense features become crucial. The +stereo matching concepts are then extrapolated for multiple view camera setups, +categorizing them among light fields, multi-view stereo, and structure from +motion (or visual simultaneous localization and mapping). We also compile and +discuss commonly adopted datasets and figures of merit indicated for each +purpose and list recent results for completeness. We conclude this paper by +pointing out current and future trends. + +
+
+ comment: Published in ACM Computing Surveys +
+
+
+
+
+ + ☆ Uncertainty estimates for semantic segmentation: providing enhanced + reliability for automated motor claims handling + + +
+ Deep neural network models for image segmentation can be a powerful tool for +the automation of motor claims handling processes in the insurance industry. A +crucial aspect is the reliability of the model outputs when facing adverse +conditions, such as low quality photos taken by claimants to document damages. +We explore the use of a meta-classification model to assess the precision of +segments predicted by a model trained for the semantic segmentation of car body +parts. Different sets of features correlated with the quality of a segment are +compared, and an AUROC score of 0.915 is achieved for distinguishing between +high- and low-quality segments. By removing low-quality segments, the average +mIoU of the segmentation output is improved by 16 percentage points and the +number of wrongly predicted segments is reduced by 77%. + +
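The meta-classification step can be pictured as a second, lightweight classifier trained on per-segment features to predict segment quality and scored with AUROC. The features and labels in this sketch are synthetic placeholders, not the feature sets compared in the study.

```python
# Sketch of a meta-classifier for segment quality, scored with AUROC.
# Features and labels are synthetic placeholders, not the study's data.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
n = 500
features = np.column_stack([
    rng.uniform(0.3, 1.0, n),  # mean softmax confidence within the segment
    rng.uniform(0.0, 1.0, n),  # normalized segment area
    rng.uniform(0.0, 0.5, n),  # boundary uncertainty
])
quality = (features[:, 0] + rng.normal(0, 0.15, n) > 0.7).astype(int)  # 1 = high-quality segment

split = int(0.8 * n)
meta = GradientBoostingClassifier().fit(features[:split], quality[:split])
scores = meta.predict_proba(features[split:])[:, 1]
print("AUROC:", round(roc_auc_score(quality[split:], scores), 3))
```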
+
+ comment: 9 pages, 7 figures, 2 tables, submitted to MVAA +
+
+
+
+
+ + ☆ DaFoEs: Mixing Datasets towards the generalization of vision-state + deep-learning Force Estimation in Minimally Invasive Robotic Surgery + + +
+ Precisely determining the contact force during safe interaction in Minimally +Invasive Robotic Surgery (MIRS) is still an open research challenge. Inspired +by post-operative qualitative analysis from surgical videos, the use of +cross-modality, data-driven deep neural network models has been one of the +newest approaches to predicting sensorless force trends. However, these methods +require large and variable datasets, which are not currently available. In +this paper, we present a new vision-haptic dataset (DaFoEs) with variable soft +environments for the training of deep neural models. In order to reduce the +bias from a single dataset, we present a pipeline to generalize different +vision and state data inputs for mixed dataset training, using a previously +validated dataset with a different setup. Finally, we present a variable +encoder-decoder architecture to predict the forces exerted by the laparoscopic +tool using a single input or a sequence of inputs. For input sequences, we use a +recurrent decoder, denoted with the prefix R, and a new temporal sampling to +represent the acceleration of the tool. During our training, we demonstrate +that single dataset training tends to overfit to the training data domain, but +has difficulty translating the results to new domains. However, +dataset mixing yields good translation, with a mean relative force estimation +error of 5% and 12% for the recurrent and non-recurrent models, respectively. +Our method also marginally increases the effectiveness of transformers for +force estimation, by up to ~15%, as the volume of available data is +increased by 150%. In conclusion, we demonstrate that mixing experimental +setups for vision-state force estimation in MIRS is a possible approach towards +the general solution of the problem. + +
+
+
+
+
+ + ☆ Dynamic Relation Transformer for Contextual Text Block Detection + + +
+ Contextual Text Block Detection (CTBD) is the task of identifying coherent +text blocks within the complexity of natural scenes. Previous methodologies +have treated CTBD as either a visual relation extraction challenge within +computer vision or as a sequence modeling problem from the perspective of +natural language processing. We introduce a new framework that frames CTBD as a +graph generation problem. This methodology consists of two essential +procedures: identifying individual text units as graph nodes and discerning the +sequential reading order relationships among these units as graph edges. +Leveraging the cutting-edge capabilities of DQ-DETR for node detection, our +framework innovates further by integrating a novel mechanism, a Dynamic +Relation Transformer (DRFormer), dedicated to edge generation. DRFormer +incorporates a dual interactive transformer decoder that deftly manages a +dynamic graph structure refinement process. Through this iterative process, the +model systematically enhances the graph's fidelity, ultimately resulting in +improved precision in detecting contextual text blocks. Comprehensive +experimental evaluations conducted on both SCUT-CTW-Context and ReCTS-Context +datasets substantiate that our method achieves state-of-the-art results, +underscoring the effectiveness and potential of our graph generation framework +in advancing the field of CTBD. + +
+
+
+
+
+ + ☆ Training-Free Semantic Video Composition via Pre-trained Diffusion Model + + +
+ The video composition task aims to integrate specified foregrounds and +backgrounds from different videos into a harmonious composite. Current +approaches, predominantly trained on videos with adjusted foreground color and +lighting, struggle to address deep semantic disparities beyond superficial +adjustments, such as domain gaps. Therefore, we propose a training-free +pipeline employing a pre-trained diffusion model imbued with semantic prior +knowledge, which can process composite videos with broader semantic +disparities. Specifically, we process the video frames in a cascading manner +and handle each frame in two processes with the diffusion model. In the +inversion process, we propose Balanced Partial Inversion to obtain generation +initial points that balance reversibility and modifiability. Then, in the +generation process, we further propose Inter-Frame Augmented attention to +augment foreground continuity across frames. Experimental results reveal that +our pipeline successfully ensures the visual harmony and inter-frame coherence +of the outputs, demonstrating efficacy in managing broader semantic +disparities. + +
+
+
+
+
+ + ☆ Exploring the Role of Convolutional Neural Networks (CNN) in Dental + Radiography Segmentation: A Comprehensive Systematic Literature Review + + +
+ In the field of dentistry, there is a growing demand for increased precision +in diagnostic tools, with a specific focus on advanced imaging techniques such +as computed tomography, cone beam computed tomography, magnetic resonance +imaging, ultrasound, and traditional intra-oral periapical X-rays. Deep +learning has emerged as a pivotal tool in this context, enabling the +implementation of automated segmentation techniques crucial for extracting +essential diagnostic data. This integration of cutting-edge technology +addresses the urgent need for effective management of dental conditions, which, +if left undetected, can have a significant impact on human health. The +impressive track record of deep learning across various domains, including +dentistry, underscores its potential to revolutionize early detection and +treatment of oral health issues. Objective: Having demonstrated significant +results in diagnosis and prediction, deep convolutional neural networks (CNNs) +represent an emerging field of multidisciplinary research. The goals of this +study were to provide a concise overview of the state of the art, standardize +the current debate, and establish baselines for future research. Method: In +this study, a systematic literature review is employed as a methodology to +identify and select relevant studies that specifically investigate the deep +learning technique for dental imaging analysis. This study elucidates the +methodological approach, including the systematic collection of data, +statistical analysis, and subsequent dissemination of outcomes. Conclusion: +This work demonstrates how Convolutional Neural Networks (CNNs) can be employed +to analyze images, serving as effective tools for detecting dental pathologies. +Although this research acknowledged some limitations, CNNs utilized for +segmenting and categorizing teeth exhibited their highest level of performance +overall. + +
+
+
+
+
+ + ☆ Unsupervised Multiple Domain Translation through Controlled + Disentanglement in Variational Autoencoder + + +
+ Unsupervised Multiple Domain Translation is the task of transforming data +from one domain to other domains without having paired data to train the +systems. Typically, methods based on Generative Adversarial Networks (GANs) are +used to address this task. However, our proposal exclusively relies on a +modified version of a Variational Autoencoder. This modification consists of +using two latent variables that are disentangled in a controlled way by design. One +of these latent variables is constrained to depend exclusively on the domain, while +the other one depends on the remaining factors of variability in the data. +Additionally, the conditions imposed on the domain latent variable allow for +better control and understanding of the latent space. We empirically +demonstrate that our approach works on different vision datasets, improving on the +performance of other well-known methods. Finally, we show that one of +the latent variables indeed stores all the information related to the domain, while the +other one hardly contains any domain information. + +
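A minimal skeleton of the two-latent design reads as follows: the domain latent is produced from the domain label alone (so it cannot carry content), while a content latent is inferred from the input, and translation swaps the domain latent. Dimensions, layers, and the exact conditioning here are illustrative assumptions, not the paper's architecture.

```python
# Skeleton of a VAE with a domain latent and a content latent.
# Dimensions and layers are illustrative; this is not the paper's architecture.
import torch
import torch.nn as nn

class TwoLatentVAE(nn.Module):
    def __init__(self, x_dim=784, z_content=16, z_domain=4, n_domains=3):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(x_dim, 256), nn.ReLU())
        self.mu = nn.Linear(256, z_content)
        self.logvar = nn.Linear(256, z_content)
        # The domain latent depends only on the domain label by construction.
        self.domain_embed = nn.Embedding(n_domains, z_domain)
        self.dec = nn.Sequential(nn.Linear(z_content + z_domain, 256), nn.ReLU(),
                                 nn.Linear(256, x_dim))

    def forward(self, x, domain_id):
        h = self.enc(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z_c = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterization
        z_d = self.domain_embed(domain_id)
        return self.dec(torch.cat([z_c, z_d], dim=-1)), mu, logvar

    def translate(self, x, target_domain_id):
        # Keep the content latent, swap the domain latent.
        z_c = self.mu(self.enc(x))
        z_d = self.domain_embed(target_domain_id)
        return self.dec(torch.cat([z_c, z_d], dim=-1))
```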
+
+
+
+
+ + ☆ DK-SLAM: Monocular Visual SLAM with Deep Keypoints Adaptive Learning, + Tracking and Loop-Closing + + +
+ Unreliable extraction and matching of handcrafted features undermine +the performance of visual SLAM in complex real-world scenarios. While learned +local features, leveraging CNNs, demonstrate proficiency in capturing +high-level information and excel in matching benchmarks, they encounter +challenges in continuous motion scenes, resulting in poor generalization and +impacting loop detection accuracy. To address these issues, we present DK-SLAM, +a monocular visual SLAM system with adaptive deep local features. MAML +optimizes the training of these features, and we introduce a coarse-to-fine +feature tracking approach. Initially, a direct method approximates the relative +pose between consecutive frames, followed by a feature matching method for +refined pose estimation. To counter cumulative positioning errors, a novel +online-learned, binary-feature-based loop closure module identifies loop +nodes within a sequence. Experimental results underscore DK-SLAM's efficacy: it +outperforms representative SLAM solutions, such as ORB-SLAM3, on publicly +available datasets. + +
+
+ comment: In submission +
+
+
+
+
+ + ☆ Continuous Piecewise-Affine Based Motion Model for Image Animation + + +
+ Image animation aims to bring static images to life according to driving +videos and create engaging visual content that can be used for various purposes +such as animation, entertainment, and education. Recent unsupervised methods +utilize affine and thin-plate spline transformations based on keypoints to +transfer the motion in driving frames to the source image. However, limited by +the expressive power of the transformations used, these methods always produce +poor results when the gap between the motion in the driving frame and the +source image is large. To address this issue, we propose to model motion from +the source image to the driving frame in highly-expressive diffeomorphism +spaces. Firstly, we introduce Continuous Piecewise-Affine based (CPAB) +transformation to model the motion and present a well-designed inference +algorithm to generate CPAB transformation from control keypoints. Secondly, we +propose a SAM-guided keypoint semantic loss to further constrain the keypoint +extraction process and improve the semantic consistency between the +corresponding keypoints on the source and driving images. Finally, we design a +structure alignment loss to align the structure-related features extracted from +driving and generated images, thus helping the generator generate results that +are more consistent with the driving action. Extensive experiments on four +datasets demonstrate the effectiveness of our method against state-of-the-art +competitors quantitatively and qualitatively. Code will be publicly available +at: https://github.com/DevilPG/AAAI2024-CPABMM. + +
+
+
+
+
+ + ☆ Relative Pose for Nonrigid Multi-Perspective Cameras: The Static Case + + +
+ Multi-perspective cameras with potentially non-overlapping fields of view +have become an important exteroceptive sensing modality in a number of +applications such as intelligent vehicles, drones, and mixed reality headsets. +In this work, we challenge one of the basic assumptions made in these +scenarios, which is that the multi-camera rig is rigid. More specifically, we +are considering the problem of estimating the relative pose between a static +non-rigid rig in different spatial orientations while taking into account the +effect of gravity onto the system. The deformable physical connections between +each camera and the body center are approximated by a simple cantilever model, +and inserted into the generalized epipolar constraint. Our results lead us to +the important insight that the latent parameters of the deformation model, +meaning the gravity vector in both views, become observable. We present a +concise analysis of the observability of all variables based on noise, +outliers, and rig rigidity for two different algorithms. The first one is a +vision-only alternative, while the second one makes use of additional gravity +measurements. To conclude, we demonstrate the ability to sense gravity in a +real-world example, and discuss practical implications. + +
+
+
+
+
+ + ☆ SM$^3$: Self-Supervised Multi-task Modeling with Multi-view 2D Images + for Articulated Objects + + +
+ Reconstructing real-world objects and estimating their movable joint +structures are pivotal technologies within the field of robotics. Previous +research has predominantly focused on supervised approaches, relying on +extensively annotated datasets to model articulated objects within limited +categories. However, this approach falls short of effectively addressing the +diversity present in the real world. To tackle this issue, we propose a +self-supervised interaction perception method, referred to as SM$^3$, which +leverages multi-view RGB images captured before and after interaction to model +articulated objects, identify the movable parts, and infer the parameters of +their rotating joints. By constructing 3D geometries and textures from the +captured 2D images, SM$^3$ achieves integrated optimization of movable part and +joint parameters during the reconstruction process, obviating the need for +annotations. Furthermore, we introduce the MMArt dataset, an extension of +PartNet-Mobility, encompassing multi-view and multi-modal data of articulated +objects spanning diverse categories. Evaluations demonstrate that SM$^3$ +surpasses existing benchmarks across various categories and objects, while its +adaptability in real-world scenarios has been thoroughly validated. + +
+
+
+
+
+ + ☆ Objects With Lighting: A Real-World Dataset for Evaluating + Reconstruction and Rendering for Object Relighting 3DV 2024 + + +
+ Reconstructing an object from photos and placing it virtually in a new +environment goes beyond the standard novel view synthesis task, as the +appearance of the object has to adapt not only to the novel viewpoint but also +to the new lighting conditions. Yet evaluations of inverse rendering methods +rely on novel view synthesis data or simplistic synthetic datasets for +quantitative analysis. This work presents a real-world dataset for measuring +the reconstruction and rendering of objects for relighting. To this end, we +capture the environment lighting and ground truth images of the same objects in +multiple environments, which allows reconstructing the objects from images taken in +one environment and quantifying the quality of the rendered views for the unseen +lighting environments. Further, we introduce a simple baseline composed of +off-the-shelf methods, test several state-of-the-art methods on the +relighting task, and show that novel view synthesis is not a reliable proxy for +measuring performance. Code and dataset are available at +https://github.com/isl-org/objects-with-lighting . + +
+
+ comment: Accepted at 3DV 2024, Oral presentation. For the project page see + https://github.com/isl-org/objects-with-lighting +
+
+
+
+
+ + ☆ Stream Query Denoising for Vectorized HD Map Construction + + +
+ To enhance perception performance in complex and extensive scenarios within +the realm of autonomous driving, there has been a noteworthy focus on temporal +modeling, with a particular emphasis on streaming methods. The prevailing trend +in streaming models involves the utilization of stream queries for the +propagation of temporal information. Despite the prevalence of this approach, +the direct application of the streaming paradigm to the construction of +vectorized high-definition maps (HD-maps) fails to fully harness the inherent +potential of temporal information. This paper introduces the Stream Query +Denoising (SQD) strategy as a novel approach for temporal modeling in +high-definition map (HD-map) construction. SQD is designed to facilitate the +learning of temporal consistency among map elements within the streaming model. +The methodology involves denoising the queries that have been perturbed by the +addition of noise to the ground-truth information from the preceding frame. +This denoising process aims to reconstruct the ground-truth information for the +current frame, thereby simulating the prediction process inherent in stream +queries. The SQD strategy can be applied to those streaming methods (e.g., +StreamMapNet) to enhance the temporal modeling. The proposed SQD-MapNet is the +StreamMapNet equipped with SQD. Extensive experiments on nuScenes and +Argoverse2 show that our method is remarkably superior to other existing +methods across all settings of close range and long range. The code will be +available soon. + +
+
+
+
+
+ + ☆ Trapped in texture bias? A large scale comparison of deep instance + segmentation ECCV 2022 + + +
+ Do deep learning models for instance segmentation generalize to novel objects +in a systematic way? For classification, such behavior has been questioned. In +this study, we aim to understand if certain design decisions such as framework, +architecture or pre-training contribute to the semantic understanding of +instance segmentation. To answer this question, we consider a special case of +robustness and compare pre-trained models on a challenging benchmark for +object-centric, out-of-distribution texture. We do not introduce another method +in this work. Instead, we take a step back and evaluate a broad range of +existing literature. This includes Cascade and Mask R-CNN, Swin Transformer, +BMask, YOLACT(++), DETR, BCNet, SOTR and SOLOv2. We find that YOLACT++, SOTR +and SOLOv2 are significantly more robust to out-of-distribution texture than +other frameworks. In addition, we show that deeper and dynamic architectures +improve robustness whereas training schedules, data augmentation and +pre-training have only a minor impact. In summary we evaluate 68 models on 61 +versions of MS COCO for a total of 4148 evaluations. + +
+
+ comment: Accepted at ECCV 2022. Code: + https://github.com/JohannesTheo/trapped-in-texture-bias +
+
+
+
+
+ + ☆ PIN-SLAM: LiDAR SLAM Using a Point-Based Implicit Neural Representation + for Achieving Global Map Consistency + + +
+ Accurate and robust localization and mapping are essential components for +most autonomous robots. In this paper, we propose a SLAM system for building +globally consistent maps, called PIN-SLAM, that is based on an elastic and +compact point-based implicit neural map representation. Taking range +measurements as input, our approach alternates between incremental learning of +the local implicit signed distance field and the pose estimation given the +current local map using a correspondence-free, point-to-implicit model +registration. Our implicit map is based on sparse optimizable neural points, +which are inherently elastic and deformable with the global pose adjustment +when closing a loop. Loops are also detected using the neural point features. +Extensive experiments validate that PIN-SLAM is robust to various environments +and versatile to different range sensors such as LiDAR and RGB-D cameras. +PIN-SLAM achieves pose estimation accuracy better or on par with the +state-of-the-art LiDAR odometry or SLAM systems and outperforms the recent +neural implicit SLAM approaches while maintaining a more consistent, and highly +compact implicit map that can be reconstructed as accurate and complete meshes. +Finally, thanks to the voxel hashing for efficient neural points indexing and +the fast implicit map-based registration without closest point association, +PIN-SLAM can run at the sensor frame rate on a moderate GPU. Codes will be +available at: https://github.com/PRBonn/PIN_SLAM. + +
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ UniVG: Towards UNIfied-modal Video Generation + + +
+ Diffusion-based video generation has received extensive attention and +achieved considerable success within both the academic and industrial +communities. However, current efforts are mainly concentrated on +single-objective or single-task video generation, such as generation driven by +text, by image, or by a combination of text and image. This cannot fully meet +the needs of real-world application scenarios, as users are likely to input +images and text conditions in a flexible manner, either individually or in +combination. To address this, we propose a Unified-modal Video Generation +system that is capable of handling multiple video generation tasks across text +and image modalities. To this end, we revisit the various video generation +tasks within our system from the perspective of generative freedom, and +classify them into high-freedom and low-freedom video generation categories. +For high-freedom video generation, we employ Multi-condition Cross Attention to +generate videos that align with the semantics of the input images or text. For +low-freedom video generation, we introduce Biased Gaussian Noise to replace the +purely random Gaussian noise, which helps to better preserve the content of the +input conditions. Our method achieves the lowest Fr\'echet Video Distance (FVD) +on the public academic benchmark MSR-VTT, surpasses the current open-source +methods in human evaluations, and is on par with the current closed-source +method Gen2. For more samples, visit https://univg-baidu.github.io. + +
+
+
+
+
+ + ☆ Remote Sensing ChatGPT: Solving Remote Sensing Tasks with ChatGPT and + Visual Models + + +
+ Recently, flourishing large language models (LLMs), especially ChatGPT, +have shown exceptional performance in language understanding, reasoning, and +interaction, attracting users and researchers from multiple fields and domains. +Although LLMs have shown a great capacity for human-like task +accomplishment on natural language and natural images, their potential in +handling remote sensing interpretation tasks has not yet been fully explored. +Moreover, the lack of automation in remote sensing task planning hinders the +accessibility of remote sensing interpretation techniques, especially to +non-remote sensing experts from multiple research fields. To this end, we +present Remote Sensing ChatGPT, an LLM-powered agent that utilizes ChatGPT to +connect various AI-based remote sensing models to solve complicated +interpretation tasks. More specifically, given a user request and a remote +sensing image, we utilize ChatGPT to understand the user request, perform task +planning according to the tasks' functions, execute each subtask iteratively, +and generate the final response according to the output of each subtask. +Considering that LLMs are trained on natural language and are not capable of +directly perceiving visual concepts contained in remote sensing images, we +design visual cues that inject visual information into ChatGPT. With Remote +Sensing ChatGPT, users can simply send a remote sensing image with the +corresponding request, and get the interpretation results as well as language +feedback from Remote Sensing ChatGPT. Experiments and examples show that Remote +Sensing ChatGPT can tackle a wide range of remote sensing tasks and can be +extended to more tasks with more sophisticated models such as the remote +sensing foundation model. The code and demo of Remote Sensing ChatGPT are +publicly available at https://github.com/HaonanGuo/Remote-Sensing-ChatGPT . + +
+
+ comment: The manuscript is submitted to IEEE International Geoscience and + Remote Sensing Symposium(IGARSS2024). Looking forward to seeing you in July! +
+
+
+
+
+ + ☆ Towards Continual Learning Desiderata via HSIC-Bottleneck + Orthogonalization and Equiangular Embedding AAAI 2024 + + +
+ Deep neural networks are susceptible to catastrophic forgetting when trained on sequential
+tasks. Various continual learning (CL) methods often rely on exemplar buffers and/or network
+expansion for balancing model stability and plasticity, which, however, compromises their
+practical value due to privacy and memory concerns. Instead, this paper considers a strict yet
+realistic setting, where the training data from previous tasks is unavailable and the model size
+remains relatively constant during sequential training. To achieve such desiderata, we propose a
+conceptually simple yet effective method that attributes forgetting to layer-wise parameter
+overwriting and the resulting decision boundary distortion. This is achieved by the synergy
+between two key components: HSIC-Bottleneck Orthogonalization (HBO) implements non-overwritten
+parameter updates mediated by the Hilbert-Schmidt independence criterion in an orthogonal space,
+and EquiAngular Embedding (EAE) enhances decision boundary adaptation between old and new tasks
+with predefined basis vectors. Extensive experiments demonstrate that our method achieves
+competitive accuracy while requiring no exemplar buffer at all and only 1.02x the size of the
+base model.
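The HBO component is built around the Hilbert-Schmidt independence criterion (HSIC). For reference, a standard biased empirical HSIC estimator with RBF kernels can be sketched as below; the kernel choice and bandwidth are assumptions, and how the estimate enters the orthogonal parameter update is not covered here.

```python
import torch

def rbf_kernel(x: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    """RBF (Gaussian) kernel matrix over a batch of flattened features."""
    d2 = torch.cdist(x, x) ** 2
    return torch.exp(-d2 / (2 * sigma ** 2))

def hsic(x: torch.Tensor, y: torch.Tensor, sigma: float = 1.0) -> torch.Tensor:
    """Biased empirical HSIC estimator: tr(K H L H) / (n - 1)^2."""
    n = x.shape[0]
    k = rbf_kernel(x.flatten(1), sigma)
    l = rbf_kernel(y.flatten(1), sigma)
    h = torch.eye(n, device=x.device) - torch.ones(n, n, device=x.device) / n
    return torch.trace(k @ h @ l @ h) / (n - 1) ** 2

# toy check: dependence between a batch of activations and one-hot labels
acts = torch.randn(32, 128)
labels = torch.nn.functional.one_hot(torch.randint(0, 10, (32,)), 10).float()
print(hsic(acts, labels).item())
```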
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Autonomous Catheterization with Open-source Simulator and Expert + Trajectory + + +
+ Endovascular robots have been actively developed in both academia and +industry. However, progress toward autonomous catheterization is often hampered +by the widespread use of closed-source simulators and physical phantoms. +Additionally, the acquisition of large-scale datasets for training machine +learning algorithms with endovascular robots is usually infeasible due to +expensive medical procedures. In this chapter, we introduce CathSim, the first +open-source simulator for endovascular intervention to address these +limitations. CathSim emphasizes real-time performance to enable rapid +development and testing of learning algorithms. We validate CathSim against the +real robot and show that our simulator can successfully mimic the behavior of +the real robot. Based on CathSim, we develop a multimodal expert navigation +network and demonstrate its effectiveness in downstream endovascular navigation +tasks. The intensive experimental results suggest that CathSim has the +potential to significantly accelerate research in the autonomous +catheterization field. Our project is publicly available at +https://github.com/airvlab/cathsim. + +
+
+ comment: Code: https://github.com/airvlab/cathsim +
+
+
+
+
+ + ☆ CrossVideo: Self-supervised Cross-modal Contrastive Learning for Point + Cloud Video Understanding + + +
+ This paper introduces a novel approach named CrossVideo, which aims to +enhance self-supervised cross-modal contrastive learning in the field of point +cloud video understanding. Traditional supervised learning methods encounter +limitations due to data scarcity and challenges in label acquisition. To +address these issues, we propose a self-supervised learning method that +leverages the cross-modal relationship between point cloud videos and image +videos to acquire meaningful feature representations. Intra-modal and +cross-modal contrastive learning techniques are employed to facilitate +effective comprehension of point cloud video. We also propose a multi-level +contrastive approach for both modalities. Through extensive experiments, we +demonstrate that our method significantly surpasses previous state-of-the-art +approaches, and we conduct comprehensive ablation studies to validate the +effectiveness of our proposed designs. + +
+
+
+
+
+ + ☆ Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation + with Deterministic Sampling Prior + + +
+ Score distillation sampling (SDS) and its variants have greatly boosted the development of
+text-to-3D generation, but remain vulnerable to geometry collapse and poor textures. To solve
+this issue, we first deeply analyze the SDS and find that its distillation sampling process
+indeed corresponds to the trajectory sampling of a stochastic differential equation (SDE): SDS
+samples along an SDE trajectory to yield a less noisy sample which then serves as a guidance to
+optimize a 3D model. However, the randomness in SDE sampling often leads to a diverse and
+unpredictable sample which is not always less noisy, and thus is not a consistently correct
+guidance, explaining the vulnerability of SDS. Since for any SDE there always exists an ordinary
+differential equation (ODE) whose trajectory sampling can deterministically and consistently
+converge to the same desired target point as the SDE, we propose a novel and effective
+"Consistent3D" method that explores the ODE deterministic sampling prior for text-to-3D
+generation. Specifically, at each training iteration, given a rendered image by a 3D model, we
+first estimate its desired 3D score function by a pre-trained 2D diffusion model, and build an
+ODE for trajectory sampling. Next, we design a consistency distillation sampling loss which
+samples along the ODE trajectory to generate two adjacent samples and uses the less noisy sample
+to guide the noisier one, distilling the deterministic prior into the 3D model. Experimental
+results show the efficacy of our Consistent3D in generating high-fidelity and diverse 3D objects
+and large-scale scenes, as shown in Fig. 1. The codes are available at
+https://github.com/sail-sg/Consistent3D.
+
+
+
+
+ + ☆ Enhancing Lidar-based Object Detection in Adverse Weather using Offset + Sequences in Time CEC + + +
+ Automated vehicles require an accurate perception of their surroundings for safe and efficient
+driving. Lidar-based object detection is a widely used method for environment perception, but its
+performance is significantly affected by adverse weather conditions such as rain and fog. In this
+work, we investigate various strategies for enhancing the robustness of lidar-based object
+detection by processing sequential data samples generated by lidar sensors. Our approaches
+leverage temporal information to improve a lidar object detection model, without the need for
+additional filtering or pre-processing steps. We compare 10 different neural network
+architectures that process point cloud sequences, including a novel augmentation strategy that
+introduces a temporal offset between frames of a sequence during training, and experimentally
+evaluate the effectiveness of all strategies on lidar point clouds under adverse weather
+conditions. Our research provides a comprehensive study of effective methods for mitigating the
+effects of adverse weather on the reliability of lidar-based object detection using sequential
+data, evaluated on public datasets such as nuScenes, Dense, and the Canadian Adverse Driving
+Conditions Dataset. Our findings demonstrate that our novel method, involving temporal offset
+augmentation through randomized frame skipping in sequences, enhances object detection accuracy
+compared to both the baseline model (Pillar-based Object Detection) and no augmentation.
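A minimal sketch of the randomized frame-skipping idea behind the temporal offset augmentation, assuming lidar frames are stored as a time-ordered sequence; the function and parameter names are illustrative and not taken from the paper's code.

```python
import random
from typing import List, Sequence

def sample_offset_sequence(frames: Sequence, seq_len: int, max_skip: int = 3) -> List:
    """Sample seq_len frames with a random temporal skip between them.

    A random offset between consecutive frames exposes the detector to
    varying temporal gaps during training, which is the augmentation idea
    described in the abstract.
    """
    skip = random.randint(1, max_skip)                       # random temporal offset
    max_start = len(frames) - (seq_len - 1) * skip - 1
    start = random.randint(0, max(max_start, 0))
    last = len(frames) - 1
    return [frames[min(start + i * skip, last)] for i in range(seq_len)]

# toy usage: integer ids stand in for lidar point-cloud frames
frames = list(range(40))
print(sample_offset_sequence(frames, seq_len=4, max_skip=3))
```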
+
+ comment: Published as part of the III. International Conference on Electrical, + Computer and Energy Technologies (ICECET 2023), Cape Town, South Africa, + November 16-17, 2023 +
+
+
+
+
+ + ☆ Compose and Conquer: Diffusion-Based 3D Depth Aware Composable Image + Synthesis ICLR 2024 + + +
+ Addressing the limitations of text as a source of accurate layout +representation in text-conditional diffusion models, many works incorporate +additional signals to condition certain attributes within a generated image. +Although successful, previous works do not account for the specific +localization of said attributes extended into the three dimensional plane. In +this context, we present a conditional diffusion model that integrates control +over three-dimensional object placement with disentangled representations of +global stylistic semantics from multiple exemplar images. Specifically, we +first introduce \textit{depth disentanglement training} to leverage the +relative depth of objects as an estimator, allowing the model to identify the +absolute positions of unseen objects through the use of synthetic image +triplets. We also introduce \textit{soft guidance}, a method for imposing +global semantics onto targeted regions without the use of any additional +localization cues. Our integrated framework, \textsc{Compose and Conquer +(CnC)}, unifies these techniques to localize multiple conditions in a +disentangled manner. We demonstrate that our approach allows perception of +objects at varying depths while offering a versatile framework for composing +localized objects with different global semantics. Code: +https://github.com/tomtom1103/compose-and-conquer/ + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ VideoCrafter2: Overcoming Data Limitations for High-Quality Video + Diffusion Models + + +
+ Text-to-video generation aims to produce a video based on a given prompt. +Recently, several commercial video models have been able to generate plausible +videos with minimal noise, excellent details, and high aesthetic scores. +However, these models rely on large-scale, well-filtered, high-quality videos +that are not accessible to the community. Many existing research works, which +train models using the low-quality WebVid-10M dataset, struggle to generate +high-quality videos because the models are optimized to fit WebVid-10M. In this +work, we explore the training scheme of video models extended from Stable +Diffusion and investigate the feasibility of leveraging low-quality videos and +synthesized high-quality images to obtain a high-quality video model. We first +analyze the connection between the spatial and temporal modules of video models +and the distribution shift to low-quality videos. We observe that full training +of all modules results in a stronger coupling between spatial and temporal +modules than only training temporal modules. Based on this stronger coupling, +we shift the distribution to higher quality without motion degradation by +finetuning spatial modules with high-quality images, resulting in a generic +high-quality video model. Evaluations are conducted to demonstrate the +superiority of the proposed method, particularly in picture quality, motion, +and concept composition. + +
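A minimal sketch of the "finetune spatial modules only" step, assuming temporal layers can be identified by a name substring; the substring "temporal" and the toy model below are assumptions, not the paper's actual module naming or training recipe.

```python
import torch.nn as nn

def freeze_temporal_modules(model: nn.Module, temporal_tag: str = "temporal") -> None:
    """Freeze temporal parameters so only spatial modules receive gradients."""
    for name, param in model.named_parameters():
        param.requires_grad = temporal_tag not in name

# toy model whose parameter names distinguish spatial vs. temporal sub-modules
model = nn.ModuleDict({
    "spatial_conv": nn.Conv2d(4, 4, 3, padding=1),
    "temporal_attn": nn.Linear(4, 4),
})
freeze_temporal_modules(model)
print([n for n, p in model.named_parameters() if p.requires_grad])
# the optimizer would then be built only over parameters with requires_grad=True
```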
+
+ comment: Homepage: https://ailab-cvc.github.io/videocrafter; Github: + https://github.com/AILab-CVC/VideoCrafter +
+
+
+
+
+ + ☆ Cross-modality Guidance-aided Multi-modal Learning with Dual Attention + for MRI Brain Tumor Grading + + +
+ Brain tumors represent one of the most fatal cancers around the world and are very common in
+children and the elderly. Accurate identification of the type and grade of a tumor in the early
+stages plays an important role in choosing a precise treatment plan. The Magnetic Resonance
+Imaging (MRI) protocols of different sequences provide clinicians with important complementary
+information to identify tumor regions. However, manual assessment is time-consuming and
+error-prone due to the large amount of data and the diversity of brain tumor types. Hence, there
+is an unmet need for automated MRI brain tumor diagnosis. We observe that the predictive
+capability of uni-modality models is limited and their performance varies widely across
+modalities, and that commonly used modality fusion methods introduce potential noise, which
+results in significant performance degradation. To overcome these challenges, we propose a novel
+cross-modality guidance-aided multi-modal learning framework with dual attention for addressing
+the task of MRI brain tumor grading. To balance the tradeoff between model efficiency and
+efficacy, we employ ResNet Mix Convolution as the backbone network for feature extraction.
+Besides, dual attention is applied to capture the semantic interdependencies in the spatial and
+slice dimensions, respectively. To facilitate information interaction among modalities, we design
+a cross-modality guidance-aided module where the primary modality guides the other secondary
+modalities during training, which can effectively leverage the complementary information of
+different MRI modalities while alleviating the impact of possible noise.
+
+
+
+
+ + ☆ Change Detection Between Optical Remote Sensing Imagery and Map Data via + Segment Anything Model (SAM) + + +
+ Unsupervised multimodal change detection is pivotal for time-sensitive tasks and comprehensive
+multi-temporal Earth monitoring. In this study, we explore unsupervised multimodal change
+detection between two key remote sensing data sources: optical high-resolution imagery and
+OpenStreetMap (OSM) data. Specifically, we propose to utilize the vision foundation model, the
+Segment Anything Model (SAM), to address our task. Leveraging SAM's exceptional zero-shot
+transfer capability, high-quality segmentation maps of optical images can be obtained. Thus, we
+can directly compare these two heterogeneous data forms in the so-called segmentation domain. We
+then introduce two strategies for guiding SAM's segmentation process: the 'no-prompt' and
+'box/mask prompt' methods. The two strategies are designed to detect land-cover changes in
+general scenarios and to identify new land-cover objects within existing backgrounds,
+respectively. Experimental results on three datasets indicate that the proposed approach can
+achieve more competitive results compared to representative unsupervised multimodal change
+detection methods.
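A minimal sketch of how the two heterogeneous inputs could be compared in the "segmentation domain", assuming a SAM segmentation map of the optical image and a rasterized OSM label map on the same pixel grid are already available; the majority-vote purity rule below is an illustration, not the paper's exact decision rule.

```python
import numpy as np

def segment_level_change(sam_segments: np.ndarray, osm_labels: np.ndarray,
                         agreement_thresh: float = 0.6) -> np.ndarray:
    """Flag SAM segments whose pixels disagree with the OSM label map.

    For every SAM segment, compute the fraction of its pixels covered by the
    segment's majority OSM class; segments below agreement_thresh are marked
    as changed.
    """
    change = np.zeros(sam_segments.shape, dtype=bool)
    for seg_id in np.unique(sam_segments):
        mask = sam_segments == seg_id
        _, counts = np.unique(osm_labels[mask], return_counts=True)
        purity = counts.max() / mask.sum()
        if purity < agreement_thresh:
            change[mask] = True          # heterogeneous segment -> likely change
    return change

# toy example on a 4x4 grid: segment 1 mixes two OSM classes, so it is flagged
sam_segments = np.array([[0, 0, 1, 1]] * 4)
osm_labels = np.array([[0, 0, 1, 2]] * 4)
print(segment_level_change(sam_segments, osm_labels))
```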
+
+
+
+
+ + ☆ Hybrid of DiffStride and Spectral Pooling in Convolutional Neural + Networks + + +
+ Stride determines the distance between adjacent filter positions as the filter moves across the
+input. A fixed stride can cause important information contained in the image not to be captured,
+so that this information is not used for classification. Therefore, previous research applied the
+DiffStride method, a strided convolution method that can learn its own stride value. Max pooling
+as a downsampling method suffers from severe quantization and a constraining lower bound on the
+information it preserves. Spectral pooling reduces this lower bound on preserved information by
+cutting off the representation in the frequency domain. In this research, a CNN model is proposed
+in which the learnable-stride downsampling technique, trained via backpropagation, is combined
+with the spectral pooling technique. DiffStride and spectral pooling are expected to retain most
+of the information contained in the image. In this study, we compare the hybrid method, a
+combined implementation of spectral pooling and DiffStride, against the baseline method, which is
+the DiffStride implementation on ResNet-18. The accuracy of the DiffStride combination with
+spectral pooling improves over the DiffStride baseline by 0.0094. This shows that the hybrid
+method can retain most of the information by cutting off the representation in the frequency
+domain while determining the stride through backpropagation.
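A minimal sketch of spectral pooling, i.e. downsampling by truncating the representation in the frequency domain; the square output size and the lack of amplitude renormalization are simplifications for illustration.

```python
import torch

def spectral_pool(x: torch.Tensor, out_size: int) -> torch.Tensor:
    """Downsample (B, C, H, W) feature maps by keeping only low frequencies.

    The input is moved to the frequency domain with a centered 2D FFT, the
    central out_size x out_size block of (low-frequency) coefficients is
    kept, and the result is transformed back to the spatial domain. This is
    the frequency-domain cut-off referred to as spectral pooling.
    """
    freq = torch.fft.fftshift(torch.fft.fft2(x), dim=(-2, -1))
    h, w = x.shape[-2:]
    top, left = (h - out_size) // 2, (w - out_size) // 2
    cropped = freq[..., top:top + out_size, left:left + out_size]
    return torch.fft.ifft2(torch.fft.ifftshift(cropped, dim=(-2, -1))).real

x = torch.randn(2, 16, 32, 32)
print(spectral_pool(x, out_size=16).shape)   # torch.Size([2, 16, 16, 16])
```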
+
+
+
+
+ + ☆ Generalized Face Liveness Detection via De-spoofing Face Generator + + +
+ Previous Face Anti-spoofing (FAS) works face the challenge of generalizing to unseen domains.
+One of the major problems is that most existing FAS datasets are relatively small and lack data
+diversity. However, we find that there are numerous real faces that can be easily acquired under
+various conditions, which are neglected by previous FAS works. In this paper, we propose an
+Anomalous cue Guided FAS (AG-FAS) method, which leverages real faces for improving model
+generalization via a De-spoofing Face Generator (DFG). Specifically, the DFG trained only on the
+real faces gains the knowledge of what a real face should be like and can generate a "real"
+version of the face corresponding to any given input face. The difference between the generated
+"real" face and the input face can provide an anomalous cue for the downstream FAS task. We then
+propose an Anomalous cue Guided FAS feature extraction Network (AG-Net) to further improve the
+FAS feature generalization via a cross-attention transformer. Extensive experiments on a total of
+nine public datasets show our method achieves state-of-the-art results under cross-domain
+evaluations with unseen scenarios and unknown presentation attacks.
+
+ comment: v1 +
+
+
+
+
+ + ☆ Attack and Reset for Unlearning: Exploiting Adversarial Noise toward + Machine Unlearning through Parameter Re-initialization + + +
+ With growing concerns surrounding privacy and regulatory compliance, the +concept of machine unlearning has gained prominence, aiming to selectively +forget or erase specific learned information from a trained model. In response +to this critical need, we introduce a novel approach called Attack-and-Reset +for Unlearning (ARU). This algorithm leverages meticulously crafted adversarial +noise to generate a parameter mask, effectively resetting certain parameters +and rendering them unlearnable. ARU outperforms current state-of-the-art +results on two facial machine-unlearning benchmark datasets, MUFAC and MUCAC. +In particular, we present the steps involved in attacking and masking that +strategically filter and re-initialize network parameters biased towards the +forget set. Our work represents a significant advancement in rendering data +unexploitable to deep learning models through parameter re-initialization, +achieved by harnessing adversarial noise to craft a mask. + +
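A loose sketch of the attack-and-reset idea: compute a per-parameter saliency from gradients on perturbed forget-set samples, then re-initialize the most forget-biased entries. The random perturbation below stands in for the paper's crafted adversarial noise, and the reset ratio and re-initialization scale are assumptions, not the paper's exact algorithm.

```python
import torch
import torch.nn as nn

def gradient_saliency(model: nn.Module, loss_fn, forget_batch, eps: float = 0.01) -> dict:
    """Per-parameter |gradient| on a (randomly) perturbed forget-set batch."""
    x, y = forget_batch
    x_noisy = x + eps * torch.randn_like(x)      # stand-in for crafted adversarial noise
    model.zero_grad()
    loss_fn(model(x_noisy), y).backward()
    return {n: (p.grad.detach().abs() if p.grad is not None else torch.zeros_like(p))
            for n, p in model.named_parameters()}

def attack_and_reset(model: nn.Module, saliency: dict, reset_ratio: float = 0.05) -> None:
    """Re-initialize the top reset_ratio fraction of entries in each parameter tensor."""
    with torch.no_grad():
        for name, param in model.named_parameters():
            s = saliency[name].flatten()
            k = max(1, int(reset_ratio * s.numel()))
            idx = torch.topk(s, k).indices                   # most forget-biased entries
            param.view(-1)[idx] = 0.01 * torch.randn(k, device=param.device)

# sketch of usage:
# saliency = gradient_saliency(model, nn.CrossEntropyLoss(), (x_forget, y_forget))
# attack_and_reset(model, saliency, reset_ratio=0.05)
```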
+
+
+
+
+ + ☆ OCTO+: A Suite for Automatic Open-Vocabulary Object Placement in Mixed + Reality + + +
+ One key challenge in Augmented Reality is the placement of virtual content in +natural locations. Most existing automated techniques can only work with a +closed-vocabulary, fixed set of objects. In this paper, we introduce and +evaluate several methods for automatic object placement using recent advances +in open-vocabulary vision-language models. Through a multifaceted evaluation, +we identify a new state-of-the-art method, OCTO+. We also introduce a benchmark +for automatically evaluating the placement of virtual objects in augmented +reality, alleviating the need for costly user studies. Through this, in +addition to human evaluations, we find that OCTO+ places objects in a valid +region over 70% of the time, outperforming other methods on a range of metrics. + +
+
+ comment: 2024 IEEE International Conference on Artificial Intelligence and + eXtended and Virtual Reality (AIXVR) +
+
+
+
+
+ + ☆ Hearing Loss Detection from Facial Expressions in One-on-one + Conversations ICASSP 2024 + + +
+ Individuals with impaired hearing experience difficulty in conversations, +especially in noisy environments. This difficulty often manifests as a change +in behavior and may be captured via facial expressions, such as the expression +of discomfort or fatigue. In this work, we build on this idea and introduce the +problem of detecting hearing loss from an individual's facial expressions +during a conversation. Building machine learning models that can represent +hearing-related facial expression changes is a challenge. In addition, models +need to disentangle spurious age-related correlations from hearing-driven +expressions. To this end, we propose a self-supervised pre-training strategy +tailored for the modeling of expression variations. We also use adversarial +representation learning to mitigate the age bias. We evaluate our approach on a +large-scale egocentric dataset with real-world conversational scenarios +involving subjects with hearing loss and show that our method for hearing loss +detection achieves superior performance over baselines. + +
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ☆ COCO is "ALL'' You Need for Visual Instruction Fine-tuning + + +
+ Multi-modal Large Language Models (MLLMs) are increasingly prominent in the field of artificial
+intelligence. Visual instruction fine-tuning (IFT) is a vital process for aligning MLLMs' output
+with users' intentions. High-quality and diversified instruction following data is the key to
+this fine-tuning process. Recent studies propose to construct visual IFT datasets through a
+multifaceted approach: transforming existing datasets with rule-based templates, employing GPT-4
+for rewriting annotations, and utilizing GPT-4V for visual dataset pseudo-labeling. LLaVA-1.5
+adopted a similar approach and constructed LLaVA-mix-665k, which is one of the simplest, most
+widely used, yet most effective IFT datasets today. Notably, when properly fine-tuned with this
+dataset, MLLMs can achieve state-of-the-art performance on several benchmarks. However, we
+noticed that models trained with this dataset often struggle to follow user instructions properly
+in multi-round dialog. In addition, traditional caption and VQA evaluation benchmarks, with their
+closed-form evaluation structure, are not fully equipped to assess the capabilities of modern
+open-ended generative MLLMs. This problem is not unique to the LLaVA-mix-665k dataset, but may be
+a potential issue in all IFT datasets constructed from image captioning or VQA sources, though
+the extent of this issue may vary. We argue that datasets with diverse and high-quality detailed
+instruction following annotations are essential and adequate for MLLM IFT. In this work, we
+establish a new IFT dataset, with images sourced from the COCO dataset along with more diverse
+instructions. Our experiments show that when fine-tuned with our proposed dataset, MLLMs achieve
+better performance on open-ended evaluation benchmarks in both single-round and multi-round
+dialog settings.
+
+
+
+
+ + ☆ Dynamic DNNs and Runtime Management for Efficient Inference on + Mobile/Embedded Devices DATE + + +
+ Deep neural network (DNN) inference is increasingly being executed on mobile and embedded
+platforms due to several key advantages in latency, privacy and always-on availability. However,
+due to limited computing resources, efficient DNN deployment on mobile and embedded platforms is
+challenging. Although many hardware accelerators and static model compression methods were
+proposed by previous works, at system runtime, multiple applications are typically executed
+concurrently and compete for hardware resources. This raises two main challenges: Runtime
+Hardware Availability and Runtime Application Variability. Previous works have addressed these
+challenges through either dynamic neural networks that contain sub-networks with different
+performance trade-offs or runtime hardware resource management. In this thesis, we propose a
+combined method: a system for DNN performance trade-off management that combines the runtime
+trade-off opportunities in both algorithms and hardware to meet dynamically changing application
+performance targets and hardware constraints in real time. We co-designed novel Dynamic
+Super-Networks to maximise runtime system-level performance and energy efficiency on
+heterogeneous hardware platforms. Compared with SOTA, our experimental results using ImageNet on
+the GPU of a Jetson Xavier NX show that our model is 2.4x faster at similar ImageNet Top-1
+accuracy, or 5.1% more accurate at similar latency. We also designed a hierarchical runtime
+resource manager that tunes both dynamic neural networks and DVFS at runtime. Compared with the
+Linux DVFS governor schedutil, our runtime approach achieves up to a 19% energy reduction and a
+9% latency reduction in a single-model deployment scenario, and an 89% energy reduction and a 23%
+latency reduction in a scenario with two concurrently deployed models.
+
+ comment: Accepted at Design, Automation & Test in Europe Conference (DATE) + 2024, PhD Forum +
+
+
+
+
+ + ☆ Fluid Dynamic DNNs for Reliable and Adaptive Distributed Inference on + Edge Devices DATE + + +
+ Distributed inference is a popular approach for efficient DNN inference at +the edge. However, traditional Static and Dynamic DNNs are not +distribution-friendly, causing system reliability and adaptability issues. In +this paper, we introduce Fluid Dynamic DNNs (Fluid DyDNNs), tailored for +distributed inference. Distinct from Static and Dynamic DNNs, Fluid DyDNNs +utilize a novel nested incremental training algorithm to enable independent and +combined operation of its sub-networks, enhancing system reliability and +adaptability. Evaluation on embedded Arm CPUs with a DNN model and the MNIST +dataset, shows that in scenarios of single device failure, Fluid DyDNNs ensure +continued inference, whereas Static and Dynamic DNNs fail. When devices are +fully operational, Fluid DyDNNs can operate in either a High-Accuracy mode and +achieve comparable accuracy with Static DNNs, or in a High-Throughput mode and +achieve 2.5x and 2x throughput compared with Static and Dynamic DNNs, +respectively. + +
+
+ comment: Accepted at Design, Automation & Test in Europe Conference (DATE) + 2024 +
+
+
+
+
+ + ☆ ICON: Incremental CONfidence for Joint Pose and Radiance Field + Optimization + + +
+ Neural Radiance Fields (NeRF) exhibit remarkable performance for Novel View +Synthesis (NVS) given a set of 2D images. However, NeRF training requires +accurate camera pose for each input view, typically obtained by +Structure-from-Motion (SfM) pipelines. Recent works have attempted to relax +this constraint, but they still often rely on decent initial poses which they +can refine. Here we aim at removing the requirement for pose initialization. We +present Incremental CONfidence (ICON), an optimization procedure for training +NeRFs from 2D video frames. ICON only assumes smooth camera motion to estimate +initial guess for poses. Further, ICON introduces ``confidence": an adaptive +measure of model quality used to dynamically reweight gradients. ICON relies on +high-confidence poses to learn NeRF, and high-confidence 3D structure (as +encoded by NeRF) to learn poses. We show that ICON, without prior pose +initialization, achieves superior performance in both CO3D and HO3D versus +methods which use SfM pose. + +
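A minimal sketch of confidence-based gradient reweighting, assuming a per-frame loss and a per-frame confidence score in [0, 1]; how ICON actually computes and updates its confidence is not covered here, and the weighting rule below is only illustrative.

```python
import torch

def confidence_weighted_loss(per_frame_loss: torch.Tensor,
                             confidence: torch.Tensor) -> torch.Tensor:
    """Reweight per-frame losses by a confidence score in [0, 1].

    Frames whose current pose/NeRF estimate is trusted (high confidence)
    contribute more to the gradient, while low-confidence frames are
    downweighted so they cannot drag the optimization off course. The
    confidences are treated as fixed weights here (detached).
    """
    w = confidence.detach()
    return (w * per_frame_loss).sum() / w.sum().clamp_min(1e-8)

# toy usage
losses = torch.rand(8, requires_grad=True)
conf = torch.tensor([0.9, 0.8, 0.1, 0.7, 0.95, 0.2, 0.6, 0.5])
confidence_weighted_loss(losses, conf).backward()
```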
+
+
+
+
+ + ☆ Learning to detect cloud and snow in remote sensing images from noisy + labels + + +
+ Detecting clouds and snow in remote sensing images is an essential preprocessing task for remote
+sensing imagery. Previous works draw inspiration from semantic segmentation models in computer
+vision, with most research focusing on improving model architectures to enhance detection
+performance. However, unlike natural images, the complexity of scenes and the diversity of cloud
+types in remote sensing images result in many inaccurate labels in cloud and snow detection
+datasets, introducing unnecessary noise into the training and testing processes. By constructing
+a new dataset and proposing a novel training strategy with the curriculum learning paradigm, we
+guide the model in reducing overfitting to noisy labels. Additionally, we design a more
+appropriate model performance evaluation method that alleviates the performance assessment bias
+caused by noisy labels. By conducting experiments on models with UNet and Segformer, we have
+validated the effectiveness of our proposed method. This paper is the first to consider the
+impact of label noise on the detection of clouds and snow in remote sensing images.
+
+
+
+
+ + ☆ 3D Human Pose Analysis via Diffusion Synthesis + + +
+ Diffusion models have demonstrated remarkable success in generative modeling. +In this paper, we propose PADS (Pose Analysis by Diffusion Synthesis), a novel +framework designed to address various challenges in 3D human pose analysis +through a unified pipeline. Central to PADS are two distinctive strategies: i) +learning a task-agnostic pose prior using a diffusion synthesis process to +effectively capture the kinematic constraints in human pose data, and ii) +unifying multiple pose analysis tasks like estimation, completion, denoising, +etc, as instances of inverse problems. The learned pose prior will be treated +as a regularization imposing on task-specific constraints, guiding the +optimization process through a series of conditional denoising steps. PADS +represents the first diffusion-based framework for tackling general 3D human +pose analysis within the inverse problem framework. Its performance has been +validated on different benchmarks, signaling the adaptability and robustness of +this pipeline. + +
+
+
+
+
+ + ☆ Uncertainty-aware No-Reference Point Cloud Quality Assessment + + +
+ The evolution of compression and enhancement algorithms necessitates an +accurate quality assessment for point clouds. Previous works consistently +regard point cloud quality assessment (PCQA) as a MOS regression problem and +devise a deterministic mapping, ignoring the stochasticity in generating MOS +from subjective tests. Besides, the viewpoint switching of 3D point clouds in +subjective tests reinforces the judging stochasticity of different subjects +compared with traditional images. This work presents the first probabilistic +architecture for no-reference PCQA, motivated by the labeling process of +existing datasets. The proposed method can model the quality judging +stochasticity of subjects through a tailored conditional variational +autoencoder (CVAE) and produces multiple intermediate quality ratings. These +intermediate ratings simulate the judgments from different subjects and are +then integrated into an accurate quality prediction, mimicking the generation +process of a ground truth MOS. Specifically, our method incorporates a Prior +Module, a Posterior Module, and a Quality Rating Generator, where the former +two modules are introduced to model the judging stochasticity in subjective +tests, while the latter is developed to generate diverse quality ratings. +Extensive experiments indicate that our approach outperforms previous +cutting-edge methods by a large margin and exhibits gratifying cross-dataset +robustness. + +
+
+
+
+
+ + ☆ Subwavelength Imaging using a Solid-Immersion Diffractive Optical + Processor + + +
+ Phase imaging is widely used in biomedical imaging, sensing, and material +characterization, among other fields. However, direct imaging of phase objects +with subwavelength resolution remains a challenge. Here, we demonstrate +subwavelength imaging of phase and amplitude objects based on all-optical +diffractive encoding and decoding. To resolve subwavelength features of an +object, the diffractive imager uses a thin, high-index solid-immersion layer to +transmit high-frequency information of the object to a spatially-optimized +diffractive encoder, which converts/encodes high-frequency information of the +input into low-frequency spatial modes for transmission through air. The +subsequent diffractive decoder layers (in air) are jointly designed with the +encoder using deep-learning-based optimization, and communicate with the +encoder layer to create magnified images of input objects at its output, +revealing subwavelength features that would otherwise be washed away due to +diffraction limit. We demonstrate that this all-optical collaboration between a +diffractive solid-immersion encoder and the following decoder layers in air can +resolve subwavelength phase and amplitude features of input objects in a highly +compact design. To experimentally demonstrate its proof-of-concept, we used +terahertz radiation and developed a fabrication method for creating monolithic +multi-layer diffractive processors. Through these monolithically fabricated +diffractive encoder-decoder pairs, we demonstrated phase-to-intensity +transformations and all-optically reconstructed subwavelength phase features of +input objects by directly transforming them into magnified intensity features +at the output. This solid-immersion-based diffractive imager, with its compact +and cost-effective design, can find wide-ranging applications in bioimaging, +endoscopy, sensing and materials characterization. + +
+
+ comment: 32 Pages, 9 Figures +
+
+
+
+
+ + ☆ Idempotence and Perceptual Image Compression ICLR 2024 + + +
+ Idempotence is the stability of an image codec to re-compression. At first glance, it appears
+unrelated to perceptual image compression. However, we find that theoretically: 1) a conditional
+generative model-based perceptual codec satisfies idempotence; 2) an unconditional generative
+model with an idempotence constraint is equivalent to a conditional generative codec. Based on
+this newfound equivalence, we propose a new paradigm of perceptual image codec by inverting an
+unconditional generative model with idempotence constraints. Our codec is theoretically
+equivalent to a conditional generative codec, and it does not require training new models.
+Instead, it only requires a pre-trained mean-square-error codec and an unconditional generative
+model. Empirically, we show that our proposed approach outperforms state-of-the-art methods such
+as HiFiC and ILLM, in terms of Fr\'echet Inception Distance (FID). The source code is provided at
+https://github.com/tongdaxu/Idempotence-and-Perceptual-Image-Compression.
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ Efficient Image Super-Resolution via Symmetric Visual Attention Network + + +
+ An important development direction in Single-Image Super-Resolution (SISR) algorithms is to
+improve their efficiency. Recent efficient Super-Resolution (SR) research focuses on reducing
+model complexity and improving efficiency through improved deep small-kernel convolutions, which,
+however, lead to a small receptive field. The large receptive field obtained by large kernel
+convolution can significantly improve image quality, but the computational cost is too high. To
+improve the reconstruction details of efficient super-resolution, we propose a Symmetric Visual
+Attention Network (SVAN) that applies large receptive fields. The SVAN decomposes a large kernel
+convolution into three different combinations of convolution operations and combines them with an
+attention mechanism to form a Symmetric Large Kernel Attention Block (SLKAB), the basic component
+of the SVAN: a symmetric attention block with a bottleneck structure, in which the receptive-field
+sizes of the convolution combination are chosen to extract depth features effectively. Our
+network obtains a large receptive field while minimizing the number of parameters and improving
+the perceptual ability of the model. The experimental results show that the proposed SVAN can
+obtain high-quality super-resolution reconstruction results using only about 30% of the
+parameters of existing SOTA methods.
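One common way to approximate a large-kernel convolution with three cheaper convolutions plus attention gating, in the spirit of the SLKAB description above; the kernel sizes, dilation, and block layout are assumptions borrowed from the large-kernel-attention literature, not the paper's actual configuration.

```python
import torch
import torch.nn as nn

class LargeKernelAttention(nn.Module):
    """Large-kernel attention built from three cheaper convolutions.

    A large KxK kernel is approximated by (i) a small depthwise conv,
    (ii) a dilated depthwise conv that supplies the large receptive field,
    and (iii) a 1x1 pointwise conv; the result acts as an attention map that
    multiplicatively gates the input.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.dw = nn.Conv2d(channels, channels, 5, padding=2, groups=channels)
        self.dw_dilated = nn.Conv2d(channels, channels, 7, padding=9,
                                    dilation=3, groups=channels)
        self.pw = nn.Conv2d(channels, channels, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn = self.pw(self.dw_dilated(self.dw(x)))
        return attn * x                      # attention gating

x = torch.randn(1, 32, 64, 64)
print(LargeKernelAttention(32)(x).shape)     # torch.Size([1, 32, 64, 64])
```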
+
+ comment: 13 pages,4 figures +
+
+
+
+
+ + ☆ PPR: Enhancing Dodging Attacks while Maintaining Impersonation Attacks + on Face Recognition Systems + + +
+ Adversarial Attacks on Face Recognition (FR) encompass two types: +impersonation attacks and evasion attacks. We observe that achieving a +successful impersonation attack on FR does not necessarily ensure a successful +dodging attack on FR in the black-box setting. Introducing a novel attack +method named Pre-training Pruning Restoration Attack (PPR), we aim to enhance +the performance of dodging attacks whilst avoiding the degradation of +impersonation attacks. Our method employs adversarial example pruning, enabling +a portion of adversarial perturbations to be set to zero, while tending to +maintain the attack performance. By utilizing adversarial example pruning, we +can prune the pre-trained adversarial examples and selectively free up certain +adversarial perturbations. Thereafter, we embed adversarial perturbations in +the pruned area, which enhances the dodging performance of the adversarial face +examples. The effectiveness of our proposed attack method is demonstrated +through our experimental results, showcasing its superior performance. + +
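A minimal sketch of the perturbation-pruning step, assuming a pre-computed adversarial perturbation tensor is available; the magnitude-based pruning rule and the prune ratio are illustrative, and the full PPR pipeline (which re-optimizes the freed positions for the dodging objective) is not shown.

```python
import torch

def prune_perturbation(delta: torch.Tensor, prune_ratio: float = 0.3) -> torch.Tensor:
    """Zero out the smallest-magnitude entries of an adversarial perturbation.

    The pruned positions free up budget that can later be re-filled with
    perturbations optimized for dodging, while the kept entries preserve most
    of the impersonation behaviour.
    """
    flat = delta.abs().flatten()
    k = int(prune_ratio * flat.numel())
    if k == 0:
        return delta
    threshold = torch.kthvalue(flat, k).values
    return torch.where(delta.abs() <= threshold, torch.zeros_like(delta), delta)

delta = 0.03 * torch.randn(3, 112, 112)          # a pre-trained adversarial perturbation
pruned = prune_perturbation(delta, prune_ratio=0.3)
print((pruned == 0).float().mean())              # roughly the prune ratio
```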
+
+
+
+
+ + ☆ Uncertainty Modeling in Ultrasound Image Segmentation for Precise Fetal + Biometric Measurements + + +
+ Medical image segmentation, particularly in the context of ultrasound data, +is a crucial aspect of computer vision and medical imaging. This paper delves +into the complexities of uncertainty in the segmentation process, focusing on +fetal head and femur ultrasound images. The proposed methodology involves +extracting target contours and exploring techniques for precise parameter +measurement. Uncertainty modeling methods are employed to enhance the training +and testing processes of the segmentation network. The study reveals that the +average absolute error in fetal head circumference measurement is 8.0833mm, +with a relative error of 4.7347%. Similarly, the average absolute error in +fetal femur measurement is 2.6163mm, with a relative error of 6.3336%. +Uncertainty modeling experiments employing Test-Time Augmentation (TTA) +demonstrate effective interpretability of data uncertainty on both datasets. +This suggests that incorporating data uncertainty based on the TTA method can +support clinical practitioners in making informed decisions and obtaining more +reliable measurement results in practical clinical applications. The paper +contributes to the advancement of ultrasound image segmentation, addressing +critical challenges and improving the reliability of biometric measurements. + +
+
+
+
+
+ + ☆ Automatic 3D Multi-modal Ultrasound Segmentation of Human Placenta using + Fusion Strategies and Deep Learning + + +
+ Purpose: Ultrasound is the most commonly used medical imaging modality for diagnosis and
+screening in clinical practice. Due to its safety profile, noninvasive nature and portability,
+ultrasound is the primary imaging modality for fetal assessment in pregnancy. Current ultrasound
+processing methods are either manual or semi-automatic and are therefore laborious, time-consuming
+and prone to errors, and automation would go a long way in addressing these challenges. Automated
+identification of placental changes at earlier gestation could facilitate potential therapies for
+conditions such as fetal growth restriction and pre-eclampsia that are currently detected only at
+late gestational age, potentially preventing perinatal morbidity and mortality.
+ Methods: We propose an automatic three-dimensional multi-modal (B-mode and power Doppler)
+ultrasound segmentation of the human placenta using deep learning combined with different fusion
+strategies. We collected data containing B-mode and power Doppler ultrasound scans for 400
+studies.
+ Results: We evaluated different fusion strategies and state-of-the-art image segmentation
+networks for placenta segmentation based on standard overlap- and boundary-based metrics. We
+found that multimodal information in the form of B-mode and power Doppler scans outperforms any
+single modality. Furthermore, we found that B-mode and power Doppler input scans fused at the
+data level provide the best results with a mean Dice Similarity Coefficient (DSC) of 0.849.
+ Conclusion: We conclude that the multi-modal approach of combining B-mode and power Doppler
+scans is effective in segmenting the placenta from 3D ultrasound scans in a fully automated
+manner and is robust to quality variation of the datasets.
+
+
+
+
+ + ☆ CT Liver Segmentation via PVT-based Encoding and Refined Decoding + + +
+ Accurate liver segmentation from CT scans is essential for computer-aided +diagnosis and treatment planning. Recently, Vision Transformers achieved a +competitive performance in computer vision tasks compared to convolutional +neural networks due to their exceptional ability to learn global +representations. However, they often struggle with scalability, memory +constraints, and computational inefficiency, particularly in handling +high-resolution medical images. To overcome scalability and efficiency issues, +we propose a novel deep learning approach, \textit{\textbf{PVTFormer}}, that is +built upon a pretrained pyramid vision transformer (PVT v2) combined with +advanced residual upsampling and decoder block. By integrating a refined +feature channel approach with hierarchical decoding strategy, PVTFormer +generates high quality segmentation masks by enhancing semantic features. +Rigorous evaluation of the proposed method on Liver Tumor Segmentation +Benchmark (LiTS) 2017 demonstrates that our proposed architecture not only +achieves a high dice coefficient of 86.78\%, mIoU of 78.46\%, but also obtains +a low HD of 3.50. The results underscore PVTFormer's efficacy in setting a new +benchmark for state-of-the-art liver segmentation methods. The source code of +the proposed PVTFormer is available at +\url{https://github.com/DebeshJha/PVTFormer}. + +
+
+
+
+
+ + ☆ SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of + Lumbar Spine MRI + + +
+ Intervertebral disc disease, a prevalent ailment, frequently leads to intermittent or persistent
+low back pain, and the diagnosis and assessment of this disease rely on accurate measurement of
+vertebral bone and intervertebral disc geometries from lumbar MR images. Deep neural network
+(DNN) models may assist clinicians with more efficient image segmentation of individual instances
+(disks and vertebrae) of the lumbar spine in an automated way, which is termed instance image
+segmentation. In this work, we proposed SymTC, an innovative lumbar spine MR image segmentation
+model that combines the strengths of Transformer and Convolutional Neural Network (CNN).
+Specifically, we designed a parallel dual-path architecture to merge CNN layers and Transformer
+layers, and we integrated a novel position embedding into the self-attention module of the
+Transformer, enhancing the utilization of positional information for more accurate segmentation.
+To further improve model performance, we introduced a new data augmentation technique to create a
+synthetic yet realistic MR image dataset, named SSMSpine, which is made publicly available. We
+evaluated our SymTC and 15 other existing image segmentation models on our private in-house
+dataset and the public SSMSpine dataset, using two metrics, Dice Similarity Coefficient and 95%
+Hausdorff Distance. The results show that our SymTC has the best performance for segmenting
+vertebral bones and intervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine
+dataset are available at https://github.com/jiasongchen/SymTC.
+
+
+
+
+ + ☆ MITS-GAN: Safeguarding Medical Imaging from Tampering with Generative + Adversarial Networks + + +
+ The progress in generative models, particularly Generative Adversarial Networks (GANs), has
+opened new possibilities for image generation but raised concerns about potential malicious uses,
+especially in sensitive areas like medical imaging. This study introduces MITS-GAN, a novel
+approach to prevent tampering in medical images, with a specific focus on CT scans. The approach
+disrupts the output of the attacker's CT-GAN architecture by introducing imperceptible yet
+precise perturbations. Specifically, the proposed approach involves the introduction of
+appropriate Gaussian noise to the input as a protective measure against various attacks. Our
+method aims to enhance tamper resistance, comparing favorably to existing techniques.
+Experimental results on a CT scan dataset demonstrate MITS-GAN's superior performance,
+emphasizing its ability to generate tamper-resistant images with negligible artifacts. As image
+tampering in medical domains poses life-threatening risks, our proactive approach contributes to
+the responsible and ethical use of generative models. This work provides a foundation for future
+research in countering cyber threats in medical imaging. Models and codes are publicly available
+at the following link \url{https://iplab.dmi.unict.it/MITS-GAN-2024/}.
+
+
+
+
+ + ☆ Land Cover Image Classification + + +
+ Land Cover (LC) image classification has become increasingly significant in +understanding environmental changes, urban planning, and disaster management. +However, traditional LC methods are often labor-intensive and prone to human +error. This paper explores state-of-the-art deep learning models for enhanced +accuracy and efficiency in LC analysis. We compare convolutional neural +networks (CNN) against transformer-based methods, showcasing their applications +and advantages in LC studies. We used EuroSAT, a patch-based LC classification +data set based on Sentinel-2 satellite images and achieved state-of-the-art +results using current transformer models. + +
+
+ comment: 7 pages, 4 figures, 1 table, published in conference +
+
+
+
+
+ + ☆ Robustness Evaluation of Machine Learning Models for Robot Arm Action + Recognition in Noisy Environments ICASSP + + +
+ In the realm of robot action recognition, identifying distinct but spatially +proximate arm movements using vision systems in noisy environments poses a +significant challenge. This paper studies robot arm action recognition in noisy +environments using machine learning techniques. Specifically, a vision system +is used to track the robot's movements followed by a deep learning model to +extract the arm's key points. Through a comparative analysis of machine +learning methods, the effectiveness and robustness of this model are assessed +in noisy environments. A case study was conducted using the Tic-Tac-Toe game in +a 3-by-3 grid environment, where the focus is to accurately identify the +actions of the arms in selecting specific locations within this constrained +environment. Experimental results show that our approach can achieve precise +key point detection and action classification despite the addition of noise and +uncertainties to the dataset. + +
+
+ comment: Accepted at ICASSP +
+
+
+
+
+ + ☆ MedBlindTuner: Towards Privacy-preserving Fine-tuning on Biomedical + Images with Transformers and Fully Homomorphic Encryption AAAI + + +
+ Advancements in machine learning (ML) have significantly revolutionized +medical image analysis, prompting hospitals to rely on external ML services. +However, the exchange of sensitive patient data, such as chest X-rays, poses +inherent privacy risks when shared with third parties. Addressing this concern, +we propose MedBlindTuner, a privacy-preserving framework leveraging fully +homomorphic encryption (FHE) and a data-efficient image transformer (DEiT). +MedBlindTuner enables the training of ML models exclusively on FHE-encrypted +medical images. Our experimental evaluation demonstrates that MedBlindTuner +achieves comparable accuracy to models trained on non-encrypted images, +offering a secure solution for outsourcing ML computations while preserving +patient data privacy. To the best of our knowledge, this is the first work that +uses data-efficient image transformers and fully homomorphic encryption in this +domain. + +
+
+ comment: Accepted for the presentation at W3PHIAI, The 38th Annual AAAI + Conference on Artificial Intelligence 2024 +
+
+
+
+
+ + ☆ Efficient generative adversarial networks using linear + additive-attention Transformers + + +
+ Although the capacity of deep generative models for image generation, such as +Diffusion Models (DMs) and Generative Adversarial Networks (GANs), has +dramatically improved in recent years, much of their success can be attributed +to computationally expensive architectures. This has limited their adoption and +use to research laboratories and companies with large resources, while +significantly raising the carbon footprint for training, fine-tuning, and +inference. In this work, we present LadaGAN, an efficient generative +adversarial network that is built upon a novel Transformer block named +Ladaformer. The main component of this block is a linear additive-attention +mechanism that computes a single attention vector per head instead of the +quadratic dot-product attention. We employ Ladaformer in both the generator and +discriminator, which reduces the computational complexity and overcomes the +training instabilities often associated with Transformer GANs. LadaGAN +consistently outperforms existing convolutional and Transformer GANs on +benchmark datasets at different resolutions while being significantly more +efficient. Moreover, LadaGAN shows competitive performance compared to +state-of-the-art multi-step generative models (e.g. DMs) using orders of +magnitude less computational resources. + +
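A minimal sketch of a linear additive-attention layer that computes a single attention vector per head, in the spirit of the Ladaformer description (single-head for brevity; the projection layout and output combination are assumptions, not LadaGAN's exact block).

```python
import torch
import torch.nn as nn

class AdditiveAttention(nn.Module):
    """Linear additive attention: one global attention vector per head.

    Instead of the O(N^2) dot-product between all queries and keys, a single
    global query is formed as a softmax-weighted sum of the token queries and
    then broadcast against the keys, giving O(N) cost in sequence length.
    """

    def __init__(self, dim: int):
        super().__init__()
        self.to_q = nn.Linear(dim, dim)
        self.to_k = nn.Linear(dim, dim)
        self.to_v = nn.Linear(dim, dim)
        self.score = nn.Linear(dim, 1)          # per-token attention logits
        self.out = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:   # x: (B, N, D)
        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
        alpha = torch.softmax(self.score(q), dim=1)        # (B, N, 1)
        global_q = (alpha * q).sum(dim=1, keepdim=True)    # (B, 1, D) single vector
        mixed = global_q * k                                # broadcast, linear cost
        return self.out(mixed + v)

x = torch.randn(2, 64, 128)
print(AdditiveAttention(128)(x).shape)   # torch.Size([2, 64, 128])
```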
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Enhancing image quality prediction with self-supervised visual masking + + +
+ Full-reference image quality metrics (FR-IQMs) aim to measure the visual +differences between a pair of reference and distorted images, with the goal of +accurately predicting human judgments. However, existing FR-IQMs, including +traditional ones like PSNR and SSIM and even perceptual ones such as HDR-VDP, +LPIPS, and DISTS, still fall short in capturing the complexities and nuances of +human perception. In this work, rather than devising a novel IQM model, we seek +to improve upon the perceptual quality of existing FR-IQM methods. We achieve +this by considering visual masking, an important characteristic of the human +visual system that changes its sensitivity to distortions as a function of +local image content. Specifically, for a given FR-IQM metric, we propose to +predict a visual masking model that modulates reference and distorted images in +a way that penalizes the visual errors based on their visibility. Since the +ground truth visual masks are difficult to obtain, we demonstrate how they can +be derived in a self-supervised manner solely based on mean opinion scores +(MOS) collected from an FR-IQM dataset. Our approach results in enhanced FR-IQM +metrics that are more in line with human prediction both visually and +quantitatively. + +
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, +suffers from the inherent ambiguity arising from multiplicity and imperfect +annotations. Deterministic functions are not sufficiently powerful to capture +ambiguity, prompting the exploration of probabilistic embeddings to tackle the +challenge. However, the existing probabilistic ITM approach encounters two key +shortcomings; the burden of heavy computations due to the Monte Carlo +approximation, and the loss saturation issue in the face of abundant false +negatives. To overcome the issues, this paper presents an improved +Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new +probabilistic distance with a closed-form solution. In addition, two +optimization techniques are proposed to enhance PCME++ further: first, the +incorporation of pseudo-positives to prevent the loss saturation problem under +massive false negatives; second, mixed sample data augmentation for +probabilistic matching. Experimental results on MS-COCO Caption and two +extended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of +PCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is +also evaluated under noisy image-text correspondences. In addition, the +potential applicability of PCME++ in automatic prompt tuning for zero-shot +classification is shown. The code is available at +https://github.com/naver-ai/pcmepp. + +
+
+ comment: ICLR 2024; Code: https://github.com/naver-ai/pcmepp. Project page: + https://naver-ai.github.io/pcmepp/. 26 pages, 2.4 MB +
+
+
+
+
+ + ♻ ☆ Diffusion-Based Adversarial Sample Generation for Improved Stealthiness + and Controllability NeurIPS'2023 + + +
+ Neural networks are known to be susceptible to adversarial samples: small +variations of natural examples crafted to deliberately mislead the models. +While they can be easily generated using gradient-based techniques in digital +and physical scenarios, they often differ greatly from the actual data +distribution of natural images, resulting in a trade-off between strength and +stealthiness. In this paper, we propose a novel framework dubbed +Diffusion-Based Projected Gradient Descent (Diff-PGD) for generating realistic +adversarial samples. By exploiting a gradient guided by a diffusion model, +Diff-PGD ensures that adversarial samples remain close to the original data +distribution while maintaining their effectiveness. Moreover, our framework can +be easily customized for specific tasks such as digital attacks, physical-world +attacks, and style-based attacks. Compared with existing methods for generating +natural-style adversarial samples, our framework enables the separation of +optimizing adversarial loss from other surrogate losses (e.g., +content/smoothness/style loss), making it more stable and controllable. +Finally, we demonstrate that the samples generated using Diff-PGD have better +transferability and anti-purification power than traditional gradient-based +methods. Code will be released in https://github.com/xavihart/Diff-PGD + +
+
+ comment: Accepted as a conference paper in NeurIPS'2023. Code repo: + https://github.com/xavihart/Diff-PGD +
+
+
+
+
+ + ♻ ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Diffusion Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a machine learning
+model's training set by querying the model. MIAs on the classic classification models have been
+well-studied, and recent works have started to explore how to transplant MIA onto generative
+models. Our investigation indicates that existing MIAs designed for generative models mainly
+depend on the overfitting in target models. However, overfitting can be avoided by employing
+various regularization techniques, in which case existing MIAs demonstrate poor performance in
+practice. Unlike overfitting, memorization is essential for deep learning models to attain
+optimal performance, making it a more prevalent phenomenon. Memorization in generative models
+leads to an increasing trend in the probability distribution of generating records around the
+member record. Therefore, we propose a Probabilistic Fluctuation Assessing Membership Inference
+Attack (PFAMI), a black-box MIA that infers memberships by detecting these trends via analyzing
+the overall probabilistic fluctuations around given records. We conduct extensive experiments
+across multiple generative models and datasets, which demonstrate that PFAMI can improve the
+attack success rate (ASR) by about 27.9% when compared with the best baseline.
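A minimal sketch of a fluctuation-based membership score, assuming access to a per-sample loss (e.g., a diffusion reconstruction loss) and using simple Gaussian perturbations as a stand-in for the paper's perturbation mechanism; the function names and the toy loss are illustrative.

```python
import torch

def fluctuation_score(loss_fn, x: torch.Tensor, n_neighbors: int = 8,
                      sigma: float = 0.05) -> torch.Tensor:
    """Membership score from the local fluctuation of the model's loss.

    The loss at the query record is compared with the average loss at
    slightly perturbed copies of it. For memorized (member) records the
    model places a local probability peak on the record, so its loss is
    noticeably lower than its neighbors', giving a larger score. loss_fn is
    assumed to map a batch of inputs to per-sample losses.
    """
    center = loss_fn(x.unsqueeze(0)).mean()
    neighbors = x.unsqueeze(0) + sigma * torch.randn(n_neighbors, *x.shape)
    return loss_fn(neighbors).mean() - center   # larger -> more likely a member

# toy loss with a sharp probability peak at a "memorized" record
memorized = torch.zeros(16)
loss_fn = lambda batch: -torch.exp(-((batch - memorized) ** 2).sum(dim=1))
print(fluctuation_score(loss_fn, memorized).item())       # clearly positive (member-like)
print(fluctuation_score(loss_fn, torch.randn(16)).item())  # close to zero (non-member-like)
```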
+
+
+
+
+ + ♻ ☆ Phenotyping calcification in vascular tissues using artificial + intelligence + + +
+ Vascular calcification is implicated as an important factor in major adverse +cardiovascular events (MACE), including heart attack and stroke. A controversy +remains over how to integrate the diverse forms of vascular calcification into +clinical risk assessment tools. Even the commonly used calcium score for +coronary arteries, which assumes risk scales positively with total +calcification, has important inconsistencies. Fundamental studies are needed to +determine how risk is influenced by the diverse calcification phenotypes. +However, studies of these kinds are hindered by the lack of high-throughput, +objective, and non-destructive tools for classifying calcification in imaging +data sets. Here, we introduce a new classification system for phenotyping +calcification along with a semi-automated, non-destructive pipeline that can +distinguish these phenotypes in even atherosclerotic tissues. The pipeline +includes a deep-learning-based framework for segmenting lipid pools in noisy +micro-CT images and an unsupervised clustering framework for categorizing +calcification based on size, clustering, and topology. This approach is +illustrated for five vascular specimens, providing phenotyping for thousands of +calcification particles across as many as 3200 images in less than seven hours. +Average Dice Similarity Coefficients of 0.96 and 0.87 could be achieved for +tissue and lipid pool, respectively, with training and validation needed on +only 13 images despite the high heterogeneity in these tissues. By introducing +an efficient and comprehensive approach to phenotyping calcification, this work +enables large-scale studies to identify a more reliable indicator of the risk +of cardiovascular events, a leading cause of global mortality and morbidity. + +
+
+
+
+
+ + ♻ ☆ Generalizing Medical Image Representations via Quaternion Wavelet + Networks + + +
+ Neural network generalizability is becoming a broad research field due to the
+increasing availability of datasets from different sources and for various
+tasks. This issue is even more pronounced when processing medical data, where a
+lack of methodological standards causes large variations in the data provided
+by different imaging centers or acquired with various devices and cofactors. To
+overcome these limitations, we introduce a novel, generalizable, data- and
+task-agnostic framework able to extract salient features from medical images.
+The proposed quaternion wavelet network (QUAVE) can be easily integrated with
+any pre-existing medical image analysis or synthesis task, and it can be used
+with real-, quaternion-, or hypercomplex-valued models, generalizing their
+adoption to single-channel data. QUAVE first extracts different sub-bands
+through the quaternion wavelet transform, resulting in both
+low-frequency/approximation bands and high-frequency/fine-grained features.
+Then, it weights the most representative set of sub-bands to be used as input
+to any other neural model for image processing, replacing standard data
+samples. We conduct an extensive experimental evaluation comprising different
+datasets and diverse image analysis and synthesis tasks, including
+reconstruction, segmentation, and modality translation. We also evaluate QUAVE
+in combination with both real and quaternion-valued models. Results demonstrate
+the effectiveness and generalizability of the proposed framework, which
+improves network performance while remaining flexible enough to be adopted in
+manifold scenarios and robust to domain shifts. The full code is available at:
+https://github.com/ispamm/QWT.
+
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ Balancing stability and plasticity in continual learning: the + readout-decomposition of activation change (RDAC) framework + + +
+ Continual learning (CL) algorithms strive to acquire new knowledge while +preserving prior information. However, this stability-plasticity trade-off +remains a central challenge. This paper introduces a framework that dissects +this trade-off, offering valuable insights into CL algorithms. The +Readout-Decomposition of Activation Change (RDAC) framework first addresses the +stability-plasticity dilemma and its relation to catastrophic forgetting. It +relates learning-induced activation changes in the range of prior readouts to +the degree of stability and changes in the null space to the degree of +plasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the +framework clarifies the stability-plasticity trade-offs of the popular +regularization algorithms Synaptic intelligence (SI), Elastic-weight +consolidation (EWC), and learning without Forgetting (LwF), and replay-based +algorithms Gradient episodic memory (GEM), and data replay. GEM and data replay +preserved stability and plasticity, while SI, EWC, and LwF traded off +plasticity for stability. The inability of the regularization algorithms to +maintain plasticity was linked to them restricting the change of activations in +the null space of the prior readout. Additionally, for one-hidden-layer linear +neural networks, we derived a gradient decomposition algorithm to restrict +activation change only in the range of the prior readouts, to maintain high +stability while not further sacrificing plasticity. Results demonstrate that +the algorithm maintained stability without significant plasticity loss. The +RDAC framework informs the behavior of existing CL algorithms and paves the way +for novel CL approaches. Finally, it sheds light on the connection between +learning-induced activation/representation changes and the stability-plasticity +dilemma, also offering insights into representational drift in biological +systems. + +
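+ The range/null-space decomposition at the heart of the framework described
+above can be written in a few lines of NumPy (a generic sketch, not the
+authors' implementation): an activation change is split into the component
+lying in the row space of a prior linear readout (tied to stability) and the
+component in its null space (tied to plasticity).
+
+import numpy as np
+
+def rdac_decompose(delta_h, W_readout, tol=1e-10):
+    # delta_h: (d,) learning-induced activation change.
+    # W_readout: (c, d) prior linear readout.
+    _, s, Vt = np.linalg.svd(W_readout, full_matrices=False)
+    V = Vt[s > tol]                        # orthonormal basis of the readout's row space
+    range_part = V.T @ (V @ delta_h)       # change seen by the prior readout (stability)
+    null_part = delta_h - range_part       # change invisible to the readout (plasticity)
+    return range_part, null_part
+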
+
+ comment: 15 pages, 5 figures, Revision +
+
+
+
+
+ + ♻ ☆ FairTune: Optimizing Parameter Efficient Fine Tuning for Fairness in + Medical Image Analysis ICLR 2024 + + +
+ Training models with robust group fairness properties is crucial in ethically
+sensitive application areas such as medical diagnosis. Despite the growing body
+of work aiming to minimise demographic bias in AI, this problem remains
+challenging. A key reason for this challenge is the fairness generalisation
+gap: high-capacity deep learning models can fit all training data nearly
+perfectly, and thus also exhibit perfect fairness during training. In this
+case, bias emerges only during testing, when generalisation performance differs
+across subgroups. This motivates us to take a bi-level optimisation perspective
+on fair learning: optimising the learning strategy based on validation
+fairness. Specifically, we consider the highly effective workflow of adapting
+pre-trained models to downstream medical imaging tasks using
+parameter-efficient fine-tuning (PEFT) techniques. There is a trade-off between
+updating more parameters, which enables a better fit to the task of interest,
+and updating fewer parameters, which potentially reduces the generalisation
+gap. To manage this trade-off, we propose FairTune, a framework to optimise the
+choice of PEFT parameters with respect to fairness. We demonstrate empirically
+that FairTune leads to improved fairness on a range of medical imaging
+datasets. The code is available at https://github.com/Raman1121/FairTune
+
+
+ comment: Accepted in ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Medical Images? + + +
+ The Segment Anything Model (SAM) is the first foundation model for general +image segmentation. It has achieved impressive results on various natural image +segmentation tasks. However, medical image segmentation (MIS) is more +challenging because of the complex modalities, fine anatomical structures, +uncertain and complex object boundaries, and wide-range object scales. To fully +validate SAM's performance on medical data, we collected and sorted 53 +open-source datasets and built a large medical segmentation dataset with 18 +modalities, 84 objects, 125 object-modality paired targets, 1050K 2D images, +and 6033K masks. We comprehensively analyzed different models and strategies on +the so-called COSMOS 1050K dataset. Our findings mainly include the following: +1) SAM showed remarkable performance in some specific objects but was unstable, +imperfect, or even totally failed in other situations. 2) SAM with the large +ViT-H showed better overall performance than that with the small ViT-B. 3) SAM +performed better with manual hints, especially box, than the Everything mode. +4) SAM could help human annotation with high labeling quality and less time. 5) +SAM was sensitive to the randomness in the center point and tight box prompts, +and may suffer from a serious performance drop. 6) SAM performed better than +interactive methods with one or a few points, but will be outpaced as the +number of points increases. 7) SAM's performance correlated to different +factors, including boundary complexity, intensity differences, etc. 8) +Finetuning the SAM on specific medical tasks could improve its average DICE +performance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that +this comprehensive report can help researchers explore the potential of SAM +applications in MIS, and guide how to appropriately use and develop SAM. + +
+
+ comment: Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Adversarial Examples are Misaligned in Diffusion Model Manifolds + + +
+ In recent years, diffusion models (DMs) have drawn significant attention for +their success in approximating data distributions, yielding state-of-the-art +generative results. Nevertheless, the versatility of these models extends +beyond their generative capabilities to encompass various vision applications, +such as image inpainting, segmentation, adversarial robustness, among others. +This study is dedicated to the investigation of adversarial attacks through the +lens of diffusion models. However, our objective does not involve enhancing the +adversarial robustness of image classifiers. Instead, our focus lies in +utilizing the diffusion model to detect and analyze the anomalies introduced by +these attacks on images. To that end, we systematically examine the alignment +of the distributions of adversarial examples when subjected to the process of +transformation using diffusion models. The efficacy of this approach is +assessed across CIFAR-10 and ImageNet datasets, including varying image sizes +in the latter. The results demonstrate a notable capacity to discriminate +effectively between benign and attacked images, providing compelling evidence +that adversarial instances do not align with the learned manifold of the DMs. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ PPT: Token Pruning and Pooling for Efficient Vision Transformers + + +
+ Vision Transformers (ViTs) have emerged as powerful models in the field of +computer vision, delivering superior performance across various vision tasks. +However, the high computational complexity poses a significant barrier to their +practical applications in real-world scenarios. Motivated by the fact that not +all tokens contribute equally to the final predictions and fewer tokens bring +less computational cost, reducing redundant tokens has become a prevailing +paradigm for accelerating vision transformers. However, we argue that it is not +optimal to either only reduce inattentive redundancy by token pruning, or only +reduce duplicative redundancy by token merging. To this end, in this paper we +propose a novel acceleration framework, namely token Pruning & Pooling +Transformers (PPT), to adaptively tackle these two types of redundancy in +different layers. By heuristically integrating both token pruning and token +pooling techniques in ViTs without additional trainable parameters, PPT +effectively reduces the model complexity while maintaining its predictive +accuracy. For example, PPT reduces over 37% FLOPs and improves the throughput +by over 45% for DeiT-S without any accuracy drop on the ImageNet dataset. The +code is available at https://github.com/xjwu1024/PPT and +https://github.com/mindspore-lab/models/ + +
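+ A simplified sketch of the prune-and-pool idea described above (not the
+released implementation; the keep ratio and the use of CLS attention as the
+token score are assumptions): the most-attended tokens are kept (pruning) and
+the remaining ones are merged into a single attention-weighted token (pooling),
+so no extra trainable parameters are introduced.
+
+import torch
+
+def prune_and_pool(tokens, cls_attn, keep_ratio=0.7):
+    # tokens: (B, N, D) patch tokens; cls_attn: (B, N) attention from the CLS token.
+    B, N, D = tokens.shape
+    k = max(1, int(N * keep_ratio))
+    idx = cls_attn.topk(k, dim=1).indices                                  # kept tokens
+    keep = torch.gather(tokens, 1, idx.unsqueeze(-1).expand(-1, -1, D))
+    dropped = torch.ones_like(cls_attn, dtype=torch.bool).scatter_(1, idx, False)
+    w = (cls_attn * dropped).unsqueeze(-1)                                 # weights of dropped tokens
+    pooled = (tokens * w).sum(1, keepdim=True) / w.sum(1, keepdim=True).clamp_min(1e-6)
+    return torch.cat([keep, pooled], dim=1)                                # (B, k+1, D)
+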
+
+
+
+
+ + ♻ ☆ Robust and Large-Payload DNN Watermarking via Fixed, + Distribution-Optimized, Weights + + +
+ The design of an effective multi-bit watermarking algorithm hinges upon
+finding a good trade-off between the three fundamental requirements forming the
+watermarking trade-off triangle, namely robustness against network
+modifications, payload, and unobtrusiveness, the latter ensuring minimal impact
+on the performance of the watermarked network. In this paper, we first revisit
+the nature of the watermarking trade-off triangle for the DNN case, then we
+exploit our findings to propose a white-box, multi-bit watermarking method
+achieving a very large payload and strong robustness against network
+modification. In the proposed system, the weights hosting the watermark are set
+prior to training, making sure that their amplitude is large enough to bear the
+target payload and survive network modifications, notably retraining, and are
+left unchanged throughout the training process. The distribution of the weights
+carrying the watermark is theoretically optimised to ensure the secrecy of the
+watermark and make sure that the watermarked weights are indistinguishable from
+the non-watermarked ones. The proposed method achieves outstanding performance,
+including robustness against network modifications, retraining, and transfer
+learning, with no significant impact on network accuracy, while ensuring a
+payload that is out of reach of state-of-the-art methods, which achieve lower,
+or at most comparable, robustness.
+
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ MIMIR: Masked Image Modeling for Mutual Information-based Adversarial + Robustness + + +
+ Vision Transformers (ViTs) achieve superior performance on various tasks
+compared to convolutional neural networks (CNNs), but ViTs are also vulnerable
+to adversarial attacks. Adversarial training is one of the most successful
+methods to build robust CNN models. Thus, recent works have explored new
+methodologies for adversarial training of ViTs based on the differences between
+ViTs and CNNs, such as better training strategies, preventing attention from
+focusing on a single block, or discarding low-attention embeddings. However,
+these methods still follow the design of traditional supervised adversarial
+training, limiting the potential of adversarial training on ViTs. This paper
+proposes a novel defense method, MIMIR, which aims to build a different
+adversarial training methodology by utilizing Masked Image Modeling at
+pre-training. We create an autoencoder that accepts adversarial examples as
+input but takes the clean examples as the modeling target. Then, we create a
+mutual information (MI) penalty following the idea of the Information
+Bottleneck. Of the two information sources, the clean input and the
+corresponding adversarial perturbation, the perturbation information is
+eliminated due to the constraint imposed by the modeling target. Next, we
+provide a theoretical analysis of MIMIR using the bounds of the MI penalty. We
+also design two adaptive attacks for the case where the adversary is aware of
+the MIMIR defense and show that MIMIR still performs well. The experimental
+results show that MIMIR improves (natural and adversarial) accuracy on average
+by 4.19% on CIFAR-10 and 5.52% on ImageNet-1K, compared to baselines. On
+Tiny-ImageNet, we obtained an average natural-accuracy improvement of 2.99% and
+comparable adversarial accuracy. Our code and trained models are publicly
+available at https://github.com/xiaoyunxxy/MIMIR.
+
+
+
+
+
+ + ♻ ☆ MLLM-Protector: Ensuring MLLM's Safety without Hurting Performance + + +
+ The deployment of multimodal large language models (MLLMs) has brought forth
+a unique vulnerability: susceptibility to malicious attacks through visual
+inputs. We delve into the novel challenge of defending MLLMs against such
+attacks. We discovered that images act as a "foreign language" that is not
+considered during alignment, which can make MLLMs prone to producing harmful
+responses. Unfortunately, unlike the discrete tokens considered in text-based
+LLMs, the continuous nature of image signals presents significant alignment
+challenges, which makes it difficult to cover all possible scenarios
+thoroughly. This vulnerability is exacerbated by the fact that open-source
+MLLMs are predominantly fine-tuned on a limited set of image-text pairs, far
+smaller than the extensive text-based pretraining corpus, which makes the MLLMs
+more prone to catastrophic forgetting of their original abilities during
+explicit alignment tuning. To tackle these challenges, we introduce
+MLLM-Protector, a plug-and-play strategy combining a lightweight harm detector
+and a response detoxifier. The harm detector's role is to identify potentially
+harmful outputs from the MLLM, while the detoxifier corrects these outputs to
+ensure the response conforms to safety standards. This approach effectively
+mitigates the risks posed by malicious visual inputs without compromising the
+model's overall performance. Our results demonstrate that MLLM-Protector offers
+a robust solution to a previously unaddressed aspect of MLLM security.
+
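+ The plug-and-play pipeline described above reduces to a simple guard around
+the MLLM's output; a minimal sketch follows (harm_detector and detoxifier are
+placeholder callables standing in for the paper's lightweight models, and the
+threshold is an assumption):
+
+def protect(response: str, harm_detector, detoxifier, threshold: float = 0.5) -> str:
+    # If the response is flagged as potentially harmful, rewrite it with the
+    # detoxifier; otherwise return it unchanged.
+    if harm_detector(response) > threshold:
+        return detoxifier(response)
+    return response
+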
+
+
+
+
+ + ♻ ☆ Alleviating Exposure Bias in Diffusion Models through Sampling with + Shifted Time Steps ICLR2024 + + +
+ Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the
+synthesis of high-quality images. However, their inference process
+characteristically requires numerous iterative steps, potentially hundreds,
+which could exacerbate the problem of exposure bias caused by the discrepancy
+between training and inference. Previous work has attempted to mitigate this
+issue by perturbing inputs during training, which consequently mandates
+retraining of the DPM. In this work, we conduct a systematic study of exposure
+bias in DPM and, intriguingly, we find that the exposure bias could be
+alleviated with a novel sampling method that we propose, without retraining the
+model. We empirically and theoretically show that, during inference, for each
+backward time step $t$ and corresponding state $\hat{x}_t$, there might exist
+another time step $t_s$ which exhibits superior coupling with $\hat{x}_t$.
+Based on this finding, we introduce a sampling method named Time-Shift Sampler.
+Our framework can be seamlessly integrated into existing sampling algorithms,
+such as DDPM, DDIM, and other high-order solvers, incurring only minimal
+additional computation. Experimental results show our method brings significant
+and consistent improvements in FID scores across different datasets and
+sampling methods. For example, integrating the Time-Shift Sampler into F-PNDM
+yields an FID of 3.88 on CIFAR-10 with 10 sampling steps, a 44.49% improvement
+over F-PNDM and better than vanilla DDIM with 100 sampling steps. Our code is
+available at https://github.com/Mingxiao-Li/TS-DPM.
+
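+ One simple way to realize the time-step shift described above is to pick, in
+a small window around the scheduled step t, the step whose scheduled marginal
+variance best matches the empirical variance of the current state; the sketch
+below uses this variance-matching proxy as the coupling criterion (an
+assumption for illustration, not necessarily the paper's exact criterion):
+
+import torch
+
+def time_shift(x_t, t, alphas_cumprod, data_var=0.5, window=10):
+    # x_t: current sample (torch tensor); alphas_cumprod: cumulative alpha schedule.
+    # Scheduled variance of x_t is roughly a_t * Var(x_0) + (1 - a_t).
+    var_x = float(torch.var(x_t))
+    lo, hi = max(0, t - window), min(len(alphas_cumprod) - 1, t + window)
+    best_s, best_gap = t, float("inf")
+    for s in range(lo, hi + 1):
+        a = float(alphas_cumprod[s])
+        gap = abs(a * data_var + (1.0 - a) - var_x)
+        if gap < best_gap:
+            best_s, best_gap = s, gap
+    return best_s   # query the denoiser with this shifted step instead of t
+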
+
+ comment: Accepted at International Conference on Learning Representations + (ICLR2024) +
+
+
+
+
+ + ♻ ☆ Exploring Vulnerabilities of No-Reference Image Quality Assessment + Models: A Query-Based Black-Box Method + + +
+ No-Reference Image Quality Assessment (NR-IQA) aims to predict image quality +scores consistent with human perception without relying on pristine reference +images, serving as a crucial component in various visual tasks. Ensuring the +robustness of NR-IQA methods is vital for reliable comparisons of different +image processing techniques and consistent user experiences in recommendations. +The attack methods for NR-IQA provide a powerful instrument to test the +robustness of NR-IQA. However, current attack methods of NR-IQA heavily rely on +the gradient of the NR-IQA model, leading to limitations when the gradient +information is unavailable. In this paper, we present a pioneering query-based +black box attack against NR-IQA methods. We propose the concept of score +boundary and leverage an adaptive iterative approach with multiple score +boundaries. Meanwhile, the initial attack directions are also designed to +leverage the characteristics of the Human Visual System (HVS). Experiments show +our method outperforms all compared state-of-the-art attack methods and is far +ahead of previous black-box methods. The effective NR-IQA model DBCNN suffers a +Spearman's rank-order correlation coefficient (SROCC) decline of 0.6381 +attacked by our method, revealing the vulnerability of NR-IQA models to +black-box attacks. The proposed attack method also provides a potent tool for +further exploration into NR-IQA robustness. + +
+
+
+
+
+ + ♻ ☆ On the Hidden Mystery of OCR in Large Multimodal Models + + +
+ Large models have recently played a dominant role in natural language
+processing and multimodal vision-language learning. However, their
+effectiveness in text-related visual tasks remains relatively unexplored. In
+this paper, we conducted a comprehensive evaluation of Large Multimodal Models,
+such as GPT4V and Gemini, in various text-related visual tasks including Text
+Recognition, Scene Text-Centric Visual Question Answering (VQA),
+Document-Oriented VQA, Key Information Extraction (KIE), and Handwritten
+Mathematical Expression Recognition (HMER). To facilitate the assessment of
+Optical Character Recognition (OCR) capabilities in Large Multimodal Models, we
+propose OCRBench, a comprehensive evaluation benchmark. Our study encompasses
+29 datasets, making it the most comprehensive OCR evaluation benchmark
+available. Furthermore, our study reveals both the strengths and weaknesses of
+these models, particularly in handling multilingual text, handwritten text,
+non-semantic text, and mathematical expression recognition. Most importantly,
+the baseline results showcased in this study could provide a foundational
+framework for the conception and assessment of innovative strategies targeted
+at enhancing zero-shot multimodal techniques. The evaluation pipeline and
+benchmark are available at https://github.com/Yuliang-Liu/MultimodalOCR.
+
+
+
+
+
+ + ♻ ☆ PoseFix: Correcting 3D Human Poses with Natural Language ICCV 2023 + + +
+ Automatically producing instructions to modify one's posture could open the +door to endless applications, such as personalized coaching and in-home +physical therapy. Tackling the reverse problem (i.e., refining a 3D pose based +on some natural language feedback) could help for assisted 3D character +animation or robot teaching, for instance. Although a few recent works explore +the connections between natural language and 3D human pose, none focus on +describing 3D body pose differences. In this paper, we tackle the problem of +correcting 3D human poses with natural language. To this end, we introduce the +PoseFix dataset, which consists of several thousand paired 3D poses and their +corresponding text feedback, that describe how the source pose needs to be +modified to obtain the target pose. We demonstrate the potential of this +dataset on two tasks: (1) text-based pose editing, that aims at generating +corrected 3D body poses given a query pose and a text modifier; and (2) +correctional text generation, where instructions are generated based on the +differences between two body poses. + +
+
+ comment: Published in ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic + Play + + +
+ Infants' ability to recognize and categorize objects develops gradually. The
+second year of life is marked by both the emergence of more semantic visual
+representations and a better understanding of word meaning. This suggests that
+language input may play an important role in shaping visual representations.
+However, even in suitable contexts for word learning like dyadic play sessions,
+caregivers' utterances are sparse and ambiguous, often referring to objects
+that are different from the one to which the child attends. Here, we
+systematically investigate to what extent caregivers' utterances can
+nevertheless enhance visual representations. For this, we propose a
+computational model of visual representation learning during dyadic play. We
+introduce a synthetic dataset of ego-centric images perceived by a toddler-agent
+that moves and rotates toy objects in different parts of its home environment
+while hearing caregivers' utterances, modeled as captions. We propose to model
+toddlers' learning as simultaneously aligning representations for 1)
+close-in-time images and 2) co-occurring images and utterances. We show that
+utterances with statistics matching those of real caregivers give rise to
+representations supporting improved category recognition. Our analysis reveals
+that a small decrease/increase in object-relevant naming frequencies can
+drastically impact the learned representations. This affects the attention on
+object names within an utterance, which is required for efficient
+visuo-linguistic alignment. Overall, our results support the hypothesis that
+caregivers' naming utterances can improve toddlers' visual representations.
+
+
+ comment: Proceedings of the 2023 IEEE International Conference on Development + and Learning (ICDL) +
+
+
+
+
+ + ♻ ☆ A Dempster-Shafer approach to trustworthy AI with application to fetal + brain MRI segmentation + + +
+ Deep learning models for medical image segmentation can fail unexpectedly and +spectacularly for pathological cases and images acquired at different centers +than training images, with labeling errors that violate expert knowledge. Such +errors undermine the trustworthiness of deep learning models for medical image +segmentation. Mechanisms for detecting and correcting such failures are +essential for safely translating this technology into clinics and are likely to +be a requirement of future regulations on artificial intelligence (AI). In this +work, we propose a trustworthy AI theoretical framework and a practical system +that can augment any backbone AI system using a fallback method and a fail-safe +mechanism based on Dempster-Shafer theory. Our approach relies on an actionable +definition of trustworthy AI. Our method automatically discards the voxel-level +labeling predicted by the backbone AI that violate expert knowledge and relies +on a fallback for those voxels. We demonstrate the effectiveness of the +proposed trustworthy AI approach on the largest reported annotated dataset of +fetal MRI consisting of 540 manually annotated fetal brain 3D T2w MRIs from 13 +centers. Our trustworthy AI method improves the robustness of a +state-of-the-art backbone AI for fetal brain MRIs acquired across various +centers and for fetuses with various brain abnormalities. + +
+
+ comment: Published in IEEE TPAMI. Minor revision compared to the previous + version +
+
+
+
+
+ + ♻ ☆ Self Expanding Convolutional Neural Networks + + +
+ In this paper, we present a novel method for dynamically expanding
+Convolutional Neural Networks (CNNs) during training, aimed at meeting the
+increasing demand for efficient and sustainable deep learning models. Our
+approach, drawing from the seminal work on Self-Expanding Neural Networks
+(SENN), employs a natural expansion score as an expansion criterion to address
+the common issue of over-parameterization in deep convolutional neural
+networks, thereby ensuring that the model's complexity is finely tuned to the
+task's specific needs. A significant benefit of this method is its eco-friendly
+nature, as it obviates the necessity of training multiple models of different
+sizes. We employ a strategy where a single model is dynamically expanded,
+facilitating the extraction of checkpoints at various complexity levels,
+effectively reducing computational resource use and energy consumption while
+also expediting the development cycle by offering diverse model complexities
+from a single training session. We evaluate our method on the CIFAR-10 dataset
+and our experimental results validate this approach, demonstrating that
+dynamically adding layers not only maintains but also improves CNN performance,
+underscoring the effectiveness of our expansion criterion. This approach marks
+a considerable advancement in developing adaptive, scalable, and
+environmentally considerate neural network architectures, addressing key
+challenges in the field of deep learning.
+
+
+
+
+
+ + ♻ ☆ IVIM-Morph: Motion-compensated quantitative Intra-voxel Incoherent + Motion (IVIM) analysis for functional fetal lung maturity assessment from + diffusion-weighted MRI data + + +
+ Quantitative analysis of pseudo-diffusion in diffusion-weighted magnetic +resonance imaging (DWI) data shows potential for assessing fetal lung +maturation and generating valuable imaging biomarkers. Yet, the clinical +utility of DWI data is hindered by unavoidable fetal motion during acquisition. +We present IVIM-morph, a self-supervised deep neural network model for +motion-corrected quantitative analysis of DWI data using the Intra-voxel +Incoherent Motion (IVIM) model. IVIM-morph combines two sub-networks, a +registration sub-network, and an IVIM model fitting sub-network, enabling +simultaneous estimation of IVIM model parameters and motion. To promote +physically plausible image registration, we introduce a biophysically informed +loss function that effectively balances registration and model-fitting quality. +We validated the efficacy of IVIM-morph by establishing a correlation between +the predicted IVIM model parameters of the lung and gestational age (GA) using +fetal DWI data of 39 subjects. IVIM-morph exhibited a notably improved +correlation with gestational age (GA) when performing in-vivo quantitative +analysis of fetal lung DWI data during the canalicular phase. IVIM-morph shows +potential in developing valuable biomarkers for non-invasive assessment of +fetal lung maturity with DWI data. Moreover, its adaptability opens the door to +potential applications in other clinical contexts where motion compensation is +essential for quantitative DWI analysis. The IVIM-morph code is readily +available at: https://github.com/TechnionComputationalMRILab/qDWI-Morph. + +
+
+
+
+
+ + ♻ ☆ Pixel-wise Gradient Uncertainty for Convolutional Neural Networks + applied to Out-of-Distribution Segmentation + + +
+ In recent years, deep neural networks have defined the state-of-the-art in +semantic segmentation where their predictions are constrained to a predefined +set of semantic classes. They are to be deployed in applications such as +automated driving, although their categorically confined expressive power runs +contrary to such open world scenarios. Thus, the detection and segmentation of +objects from outside their predefined semantic space, i.e., out-of-distribution +(OoD) objects, is of highest interest. Since uncertainty estimation methods +like softmax entropy or Bayesian models are sensitive to erroneous predictions, +these methods are a natural baseline for OoD detection. Here, we present a +method for obtaining uncertainty scores from pixel-wise loss gradients which +can be computed efficiently during inference. Our approach is simple to +implement for a large class of models, does not require any additional training +or auxiliary data and can be readily used on pre-trained segmentation models. +Our experiments show the ability of our method to identify wrong pixel +classifications and to estimate prediction quality at negligible computational +overhead. In particular, we observe superior performance in terms of OoD +segmentation to comparable baselines on the SegmentMeIfYouCan benchmark, +clearly outperforming other methods. + +
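+ For the common case of a cross-entropy loss with the predicted class used as
+a pseudo-label, the per-pixel loss gradient with respect to the logits has a
+closed form (softmax minus one-hot), so a gradient-based score can be computed
+at inference without autograd; the sketch below illustrates this special case
+and is not necessarily the authors' exact scoring function:
+
+import torch
+import torch.nn.functional as F
+
+def pixelwise_gradient_score(logits):
+    # logits: (B, C, H, W) segmentation logits.
+    probs = logits.softmax(dim=1)
+    pred = probs.argmax(dim=1)                                        # pseudo-labels
+    one_hot = F.one_hot(pred, num_classes=logits.shape[1]).permute(0, 3, 1, 2).float()
+    grad = probs - one_hot                     # d(cross-entropy)/d(logits), per pixel
+    return grad.norm(p=2, dim=1)               # (B, H, W); larger = more uncertain
+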
+
+
+
+
+ + ♻ ☆ TiMix: Text-aware Image Mixing for Effective Vision-Language + Pre-training AAAI2024 + + +
+ Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances
+modern Vision-Language Pre-training (VLP) models by aligning visual and
+linguistic modalities. Due to noise in web-harvested text-image pairs, however,
+scaling up training data volume in SMCL presents considerable obstacles in
+terms of computational cost and data inefficiency. To improve data efficiency
+in VLP, we propose Text-aware Image Mixing (TiMix), which integrates mix-based
+data augmentation techniques into SMCL, yielding significant performance
+improvements without noticeably increasing computational overhead. We provide a
+theoretical analysis of TiMix from a mutual information (MI) perspective,
+showing that mixed data samples for cross-modal contrastive learning implicitly
+serve as a regularizer for the contrastive loss. The experimental results
+demonstrate that TiMix exhibits comparable performance on downstream tasks,
+even with a reduced amount of training data and shorter training time, when
+benchmarked against existing methods. This work empirically and theoretically
+demonstrates the potential of data mixing for data-efficient and
+computationally viable VLP, benefiting broader VLP model adoption in practical
+scenarios.
+
+
+ comment: Accepted on AAAI2024 +
+
+
+
+
+ + ♻ ☆ Improved Dense Nested Attention Network Based on Transformer for + Infrared Small Target Detection + + +
+ Infrared small target detection based on deep learning offers unique
+advantages in separating small targets from complex and dynamic backgrounds.
+However, the features of infrared small targets gradually weaken as the depth
+of the convolutional neural network (CNN) increases. To address this issue, we
+propose a novel method for detecting infrared small targets called improved
+dense nested attention network (IDNANet), which is based on the transformer
+architecture. We preserve the dense nested structure of the dense nested
+attention network (DNANet) and introduce the Swin Transformer during the
+feature extraction stage to enhance the continuity of features. Furthermore, we
+integrate the ACmix attention structure into the dense nested structure to
+enhance the features of intermediate layers. Additionally, we design a weighted
+dice binary cross-entropy (WD-BCE) loss function to mitigate the negative
+impact of foreground-background imbalance in the samples. Moreover, we develop
+a dataset specifically for infrared small targets, called BIT-SIRST. The
+dataset comprises a significant number of real-world targets and manually
+annotated labels, as well as synthetic data and corresponding labels. We have
+evaluated the effectiveness of our method through experiments conducted on
+public datasets. In comparison to other state-of-the-art methods, our approach
+performs better in terms of probability of detection ($P_d$), false-alarm rate
+($F_a$), and mean intersection over union ($mIoU$). The $mIoU$ reaches 90.89\%
+on the NUDT-SIRST dataset and 79.72\% on the SIRST dataset. The BIT-SIRST
+dataset and codes are openly available at
+\href{https://github.com/EdwardBao1006/bit\_sirst}{\color[HTML]{B22222}{https://github.com/EdwardBao1006/bit\_sirst}}.
+
+
+
+
+
+ + ♻ ☆ Online Unsupervised Video Object Segmentation via Contrastive Motion + Clustering + + +
+ Online unsupervised video object segmentation (UVOS) uses the previous frames
+as its input to automatically separate the primary object(s) from a streaming
+video without using any further manual annotation. A major challenge is that
+the model has no access to the future and must rely solely on the history,
+i.e., the segmentation mask is predicted from the current frame as soon as it
+is captured. In this work, a novel contrastive motion clustering algorithm with
+optical flow as its input is proposed for online UVOS by exploiting the common
+fate principle that visual elements tend to be perceived as a group if they
+possess the same motion pattern. We build a simple and effective auto-encoder
+to iteratively summarize non-learnable prototypical bases for the motion
+pattern, while the bases in turn help learn the representation of the embedding
+network. Further, a contrastive learning strategy based on a boundary prior is
+developed to improve foreground and background feature discrimination in the
+representation learning stage. The proposed algorithm can be optimized on data
+of arbitrary scale (i.e., frame, clip, or dataset) and performed in an online
+fashion. Experiments on the $\textit{DAVIS}_{\textit{16}}$, $\textit{FBMS}$,
+and $\textit{SegTrackV2}$ datasets show that the accuracy of our method
+surpasses the previous state-of-the-art (SoTA) online UVOS method by a margin
+of 0.8%, 2.9%, and 1.1%, respectively. Furthermore, by using an online deep
+subspace clustering to tackle the motion grouping, our method is able to
+achieve higher accuracy at $3\times$ faster inference time compared to the SoTA
+online UVOS method, making a good trade-off between effectiveness and
+efficiency. Our code is available at https://github.com/xilin1991/ClusterNet.
+
+
+ comment: Accepted by IEEE Transactions on Circuits and Systems for Video + Technology (TCSVT) +
+
+
+
+
+ + ♻ ☆ Frequency Masking for Universal Deepfake Detection ICASSP-2024 + + +
+ We study universal deepfake detection. Our goal is to detect synthetic images +from a range of generative AI approaches, particularly from emerging ones which +are unseen during training of the deepfake detector. Universal deepfake +detection requires outstanding generalization capability. Motivated by recently +proposed masked image modeling which has demonstrated excellent generalization +in self-supervised pre-training, we make the first attempt to explore masked +image modeling for universal deepfake detection. We study spatial and frequency +domain masking in training deepfake detectors. Based on empirical analysis, we +propose a novel deepfake detector via frequency masking. Our focus on frequency +domain is different from the majority, which primarily target spatial domain +detection. Our comparative analyses reveal substantial performance gains over +existing methods. Code and models are publicly available. + +
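+ A generic frequency-domain masking augmentation in the spirit of the paper
+(the masking pattern and ratio below are illustrative assumptions, not the
+paper's exact scheme) can be written as: transform the image with a 2D FFT,
+zero out a random subset of frequency coefficients, and transform back.
+
+import torch
+
+def frequency_mask(img, mask_ratio=0.3, generator=None):
+    # img: (C, H, W) tensor in [0, 1]; the same random mask is shared across channels.
+    C, H, W = img.shape
+    spec = torch.fft.fft2(img)                               # complex spectrum per channel
+    keep = (torch.rand(H, W, generator=generator) > mask_ratio).float()
+    return torch.fft.ifft2(spec * keep).real.clamp(0, 1)
+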
+
+ comment: Accepted to IEEE ICASSP-2024 +
+
+
+
+
+ + ♻ ☆ Synergy between 3DMM and 3D Landmarks for Accurate 3D Facial Geometry 3DV 2021 + + +
+ This work studies learning from a synergy process of 3D Morphable Models +(3DMM) and 3D facial landmarks to predict complete 3D facial geometry, +including 3D alignment, face orientation, and 3D face modeling. Our synergy +process leverages a representation cycle for 3DMM parameters and 3D landmarks. +3D landmarks can be extracted and refined from face meshes built by 3DMM +parameters. We next reverse the representation direction and show that +predicting 3DMM parameters from sparse 3D landmarks improves the information +flow. Together we create a synergy process that utilizes the relation between +3D landmarks and 3DMM parameters, and they collaboratively contribute to better +performance. We extensively validate our contribution on full tasks of facial +geometry prediction and show our superior and robust performance on these tasks +for various scenarios. Particularly, we adopt only simple and widely-used +network operations to attain fast and accurate facial geometry prediction. +Codes and data: https://choyingw.github.io/works/SynergyNet/ + +
+
+ comment: Accepted at 3DV 2021. This conference version supersedes + arXiv:2104.08403 +
+
+
+
+
+ + ♻ ☆ Evaluate Geometry of Radiance Fields with Low-frequency Color Prior AAAI 2024 + + +
+ A radiance field is an effective representation of 3D scenes, which has been +widely adopted in novel-view synthesis and 3D reconstruction. It is still an +open and challenging problem to evaluate the geometry, i.e., the density field, +as the ground-truth is almost impossible to obtain. One alternative indirect +solution is to transform the density field into a point-cloud and compute its +Chamfer Distance with the scanned ground-truth. However, many widely-used +datasets have no point-cloud ground-truth since the scanning process along with +the equipment is expensive and complicated. To this end, we propose a novel +metric, named Inverse Mean Residual Color (IMRC), which can evaluate the +geometry only with the observation images. Our key insight is that the better +the geometry, the lower-frequency the computed color field. From this insight, +given a reconstructed density field and observation images, we design a +closed-form method to approximate the color field with low-frequency spherical +harmonics, and compute the inverse mean residual color. Then the higher the +IMRC, the better the geometry. Qualitative and quantitative experimental +results verify the effectiveness of our proposed IMRC metric. We also benchmark +several state-of-the-art methods using IMRC to promote future related research. +Our code is available at https://github.com/qihangGH/IMRC. + +
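+ The closed-form fit at the core of the metric described above is an ordinary
+weighted least-squares problem per 3D point: fit low-order spherical-harmonic
+coefficients to the colors observed along different view directions and
+measure the residual. A generic NumPy sketch follows (the weighting and basis
+are assumptions for illustration, not the exact released formulation):
+
+import numpy as np
+
+def sh_residual(view_dirs, colors, weights, sh_basis):
+    # view_dirs: (M, 3) unit view directions; colors: (M, 3) observed colors;
+    # weights: (M,) per-observation weights (e.g., blending weights along rays);
+    # sh_basis: callable mapping (M, 3) directions to (M, K) low-order SH values.
+    B = sh_basis(view_dirs)
+    sw = np.sqrt(weights)[:, None]
+    coeffs, *_ = np.linalg.lstsq(sw * B, sw * colors, rcond=None)   # (K, 3)
+    residual = np.linalg.norm(colors - B @ coeffs, axis=1)
+    return float(np.average(residual, weights=weights))   # lower residual -> higher IMRC
+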
+
+ comment: This paper has been accepted by AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Generative Denoise Distillation: Simple Stochastic Noises Induce + Efficient Knowledge Transfer for Dense Prediction + + +
+ Knowledge distillation is the process of transferring knowledge from a more
+powerful large model (teacher) to a simpler counterpart (student). Numerous
+current approaches involve the student imitating the knowledge of the teacher
+directly. However, redundancy still exists in the representations learned by
+these prevalent methods, which tend to learn each spatial location's features
+indiscriminately. To derive a more compact representation (concept feature)
+from the teacher, inspired by human cognition, we suggest an innovative method,
+termed Generative Denoise Distillation (GDD), where stochastic noise is added
+to the student's concept feature to embed it into an instance feature generated
+by a shallow network. Then, the generated instance feature is aligned with the
+knowledge of the instance from the teacher. We extensively experiment with
+object detection, instance segmentation, and semantic segmentation to
+demonstrate the versatility and effectiveness of our method. Notably, GDD
+achieves new state-of-the-art performance in the tasks mentioned above. We have
+achieved substantial improvements in semantic segmentation by enhancing PspNet
+and DeepLabV3, both of which are based on ResNet-18, resulting in mIoU scores
+of 74.67 and 77.69, respectively, surpassing their previous scores of 69.85 and
+73.20 on the Cityscapes dataset with 20 categories. The source code is
+available at https://github.com/ZhgLiu/GDD.
+
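+ The noise-then-generate alignment described above can be sketched as a small
+distillation loss module (channel sizes, generator depth, and the noise scale
+are placeholders; this is an illustration of the idea, not the released code):
+
+import torch
+import torch.nn as nn
+
+class GenerativeDenoiseDistillLoss(nn.Module):
+    def __init__(self, student_ch, teacher_ch, noise_std=0.1):
+        super().__init__()
+        self.noise_std = noise_std
+        # Shallow generator mapping the noisy student concept feature to the
+        # teacher's instance-feature space.
+        self.generator = nn.Sequential(
+            nn.Conv2d(student_ch, teacher_ch, 3, padding=1),
+            nn.ReLU(inplace=True),
+            nn.Conv2d(teacher_ch, teacher_ch, 3, padding=1),
+        )
+
+    def forward(self, student_feat, teacher_feat):
+        # Add stochastic noise to the student's concept feature, generate an
+        # instance feature, and align it with the (frozen) teacher feature.
+        noisy = student_feat + self.noise_std * torch.randn_like(student_feat)
+        return nn.functional.mse_loss(self.generator(noisy), teacher_feat.detach())
+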
+
+
+
+
+ + ♻ ☆ Self-supervised Feature Adaptation for 3D Industrial Anomaly Detection + + +
+ Industrial anomaly detection is generally addressed as an unsupervised task
+that aims at locating defects with only normal training samples. Recently,
+numerous 2D anomaly detection methods have been proposed and have achieved
+promising results; however, using only 2D RGB data as input is not sufficient
+to identify imperceptible geometric surface anomalies. Hence, in this work, we
+focus on multi-modal anomaly detection. Specifically, we investigate early
+multi-modal approaches that attempted to utilize models pre-trained on
+large-scale visual datasets, i.e., ImageNet, to construct feature databases. We
+empirically find that directly using these pre-trained models is not optimal:
+they can either fail to detect subtle defects or mistake abnormal features for
+normal ones. This may be attributed to the domain gap between the target
+industrial data and the source data. To address this problem, we propose a
+Local-to-global Self-supervised Feature Adaptation (LSFA) method to finetune
+the adaptors and learn a task-oriented representation for anomaly detection.
+Both intra-modal adaptation and cross-modal alignment are optimized from a
+local-to-global perspective in LSFA to ensure the representation quality and
+consistency in the inference stage. Extensive experiments demonstrate that our
+method not only brings a significant performance boost to feature-embedding
+based approaches, but also markedly outperforms previous state-of-the-art
+(SoTA) methods on both the MVTec-3D AD and Eyecandies datasets; e.g., LSFA
+achieves 97.1% I-AUROC on MVTec-3D, surpassing the previous SoTA by +3.4%.
+
+
+
+
+
+ + ♻ ☆ Deep Attention Unet: A Network Model with Global Feature Perception + Ability + + +
+ Remote sensing image segmentation is a specific task of remote sensing image
+interpretation. A good remote sensing image segmentation algorithm can provide
+guidance for environmental protection, agricultural production, and urban
+construction. This paper proposes a new type of UNet image segmentation
+algorithm based on a channel self-attention mechanism and residual connections,
+called . In my experiment, the new network model improved mIoU by 2.48%
+compared to the traditional UNet on the FoodNet dataset. The image segmentation
+algorithm proposed in this article enhances the internal connections between
+different items in the image, thus achieving better segmentation results for
+remote sensing images with occlusion.
+
+
+ comment: The experiment was inadequate and the experimental method needed + major changes +
+
+
+
+
+ + ♻ ☆ Geometry-Aware Instance Segmentation with Disparity Maps CVPR 2020 + + +
+ Most previous works on outdoor instance segmentation for images use only
+color information. We explore a novel direction of sensor fusion to exploit
+stereo cameras. Geometric information from disparities helps separate
+overlapping objects of the same or different classes. Moreover, geometric
+information penalizes region proposals with unlikely 3D shapes, thus
+suppressing false positive detections. Mask regression is based on 2D, 2.5D,
+and 3D ROI using the pseudo-lidar and image-based representations. These mask
+predictions are fused by a mask scoring process. However, public datasets only
+adopt stereo systems with shorter baseline and focal length, which limits the
+measuring range of stereo cameras. We collect and utilize the High-Quality
+Driving Stereo (HQDS) dataset, using a much longer baseline and focal length
+with higher resolution. Our performance attains the state of the art. Please
+refer to our project page. The full paper is available here.
+
+
+ comment: CVPR 2020 Workshop of Scalability in Autonomous Driving (WSAD). + Please refer to WSAD site for details; fix typos +
+
+
+
+
+ + ♻ ☆ Adjacent-Level Feature Cross-Fusion With 3-D CNN for Remote Sensing + Image Change Detection + + +
+ Deep learning-based change detection (CD) using remote sensing images has +received increasing attention in recent years. However, how to effectively +extract and fuse the deep features of bi-temporal images for improving the +accuracy of CD is still a challenge. To address that, a novel adjacent-level +feature fusion network with 3D convolution (named AFCF3D-Net) is proposed in +this article. First, through the inner fusion property of 3D convolution, we +design a new feature fusion way that can simultaneously extract and fuse the +feature information from bi-temporal images. Then, to alleviate the semantic +gap between low-level features and high-level features, we propose an +adjacent-level feature cross-fusion (AFCF) module to aggregate complementary +feature information between the adjacent levels. Furthermore, the full-scale +skip connection strategy is introduced to improve the capability of pixel-wise +prediction and the compactness of changed objects in the results. Finally, the +proposed AFCF3D-Net has been validated on the three challenging remote sensing +CD datasets: the Wuhan building dataset (WHU-CD), the LEVIR building dataset +(LEVIR-CD), and the Sun Yat-Sen University dataset (SYSU-CD). The results of +quantitative analysis and qualitative comparison demonstrate that the proposed +AFCF3D-Net achieves better performance compared to other state-of-the-art +methods. The code for this work is available at +https://github.com/wm-Githuber/AFCF3D-Net. + +
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Scene Completeness-Aware Lidar Depth Completion for Driving Scenario ICASSP 2021 + + +
+ This paper introduces Scene Completeness-Aware Depth Completion (SCADC) to
+complete raw lidar scans into dense depth maps with fine and complete scene
+structures. Recent sparse depth completion for lidars focuses only on the lower
+part of scenes and produces irregular estimations in the upper part because
+existing datasets, such as KITTI, do not provide ground truth for upper areas.
+These areas are considered less important since they are usually sky or trees,
+which are of less scene-understanding interest. However, we argue that in
+several driving scenarios, such as large trucks or cars with loads, objects
+could extend into the upper parts of scenes. Thus, depth maps with structured
+upper-scene estimation are important for RGBD algorithms. SCADC adopts stereo
+images, which produce disparities with better scene completeness but are
+generally less precise than lidars, to help sparse lidar depth completion. To
+our knowledge, we are the first to focus on the scene completeness of sparse
+depth completion. We validate SCADC on both depth estimation precision and
+scene completeness on KITTI. Moreover, we experiment on less-explored outdoor
+RGBD semantic segmentation with scene completeness-aware D-input to validate
+our method.
+
+
+ comment: Present at ICASSP 2021; fix typos +
+
+
+
+
+ + ♻ ☆ Learned Image Compression with ROI-Weighted Distortion and Bit + Allocation + + +
+ This one-page paper describes our method for the image compression track. To
+achieve better perceptual quality, we use an adversarial loss to generate
+realistic textures and a region-of-interest (ROI) mask to guide bit allocation
+across different regions. Our team name is TLIC.
+
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ Diff-PCR: Diffusion-Based Correspondence Searching in Doubly Stochastic + Matrix Space for Point Cloud Registration + + +
+ Efficiently finding optimal correspondences between point clouds is crucial +for solving both rigid and non-rigid point cloud registration problems. +Existing methods often rely on geometric or semantic feature embedding to +establish correspondences and estimate transformations or flow fields. +Recently, state-of-the-art methods have employed RAFT-like iterative updates to +refine the solution. However, these methods have certain limitations. Firstly, +their iterative refinement design lacks transparency, and their iterative +updates follow a fixed path during the refinement process, which can lead to +suboptimal results. Secondly, these methods overlook the importance of refining +or optimizing correspondences (or matching matrices) as a precursor to solving +transformations or flow fields. They typically compute candidate +correspondences based on distances in the point feature space. However, they +only project the candidate matching matrix into some matrix space once with +Sinkhorn or dual softmax operations to obtain final correspondences. This +one-shot projected matching matrix may be far from the globally optimal one, +and these approaches do not consider the distribution of the target matching +matrix. In this paper, we propose a novel approach that exploits the Denoising +Diffusion Model to predict a searching gradient for the optimal matching matrix +within the Doubly Stochastic Matrix Space. During the reverse denoising +process, our method iteratively searches for better solutions along this +denoising gradient, which points towards the maximum likelihood direction of +the target matching matrix. Our method offers flexibility by allowing the +search to start from any initial matching matrix provided by the online +backbone or white noise. Experimental evaluations on the 3DMatch/3DLoMatch and +4DMatch/4DLoMatch datasets demonstrate the effectiveness of our newly designed +framework. + +
+
+
+
+
+ + ♻ ☆ Image Background Serves as Good Proxy for Out-of-distribution Data ICLR 2024 + + +
+ Out-of-distribution (OOD) detection empowers the model trained on the closed +image set to identify unknown data in the open world. Though many prior +techniques have yielded considerable improvements in this research direction, +two crucial obstacles still remain. Firstly, a unified perspective has yet to +be presented to view the developed arts with individual designs, which is vital +for providing insights into future work. Secondly, we expect sufficient natural +OOD supervision to promote the generation of compact boundaries between the +in-distribution (ID) and OOD data without collecting explicit OOD samples. To +tackle these issues, we propose a general probabilistic framework to interpret +many existing methods and an OOD-data-free model, namely +\textbf{S}elf-supervised \textbf{S}ampling for \textbf{O}OD \textbf{D}etection +(SSOD). SSOD efficiently exploits natural OOD signals from the ID data based on +the local property of convolution. With these supervisions, it jointly +optimizes the OOD detection and conventional ID classification in an end-to-end +manner. Extensive experiments reveal that SSOD establishes competitive +state-of-the-art performance on many large-scale benchmarks, outperforming the +best previous method by a large margin, \eg, reporting \textbf{-6.28\%} FPR95 +and \textbf{+0.77\%} AUROC on ImageNet, \textbf{-19.01\%} FPR95 and +\textbf{+3.04\%} AUROC on CIFAR-10, and top-ranked performance on hard OOD +datasets, \ie, ImageNet-O and OpenImage-O. + +
+
+ comment: ICLR 2024. arXiv admin note: text overlap with arXiv:2301.06657 +
+
+
+
+
+ + ♻ ☆ Region-Enhanced Feature Learning for Scene Semantic Segmentation + + +
+ Semantic segmentation in complex scenes relies not only on object appearance +but also on object location and the surrounding environment. Nonetheless, it is +difficult to model long-range context in the format of pairwise point +correlations due to the huge computational cost for large-scale point clouds. +In this paper, we propose using regions as the intermediate representation of +point clouds instead of fine-grained points or voxels to reduce the +computational burden. We introduce a novel Region-Enhanced Feature Learning +Network (REFL-Net) that leverages region correlations to enhance point feature +learning. We design a region-based feature enhancement (RFE) module, which +consists of a Semantic-Spatial Region Extraction stage and a Region Dependency +Modeling stage. In the first stage, the input points are grouped into a set of +regions based on their semantic and spatial proximity. In the second stage, we +explore inter-region semantic and spatial relationships by employing a +self-attention block on region features and then fuse point features with the +region features to obtain more discriminative representations. Our proposed RFE +module is plug-and-play and can be integrated with common semantic segmentation +backbones. We conduct extensive experiments on ScanNetV2 and S3DIS datasets and +evaluate our RFE module with different segmentation backbones. Our REFL-Net +achieves 1.8% mIoU gain on ScanNetV2 and 1.7% mIoU gain on S3DIS with +negligible computational cost compared with backbone models. Both quantitative +and qualitative results show the powerful long-range context modeling ability +and strong generalization ability of our REFL-Net. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia 2023 +
+
+
+
+
+ + ♻ ☆ ScrollTimes: Tracing the Provenance of Paintings as a Window into + History + + +
+ The study of cultural artifact provenance, tracing ownership and +preservation, holds significant importance in archaeology and art history. +Modern technology has advanced this field, yet challenges persist, including +recognizing evidence from diverse sources, integrating sociocultural context, +and enhancing interactive automation for comprehensive provenance analysis. In +collaboration with art historians, we examined the handscroll, a traditional +Chinese painting form that provides a rich source of historical data and a +unique opportunity to explore history through cultural artifacts. We present a +three-tiered methodology encompassing artifact, contextual, and provenance +levels, designed to create a "Biography" for handscroll. Our approach +incorporates the application of image processing techniques and language models +to extract, validate, and augment elements within handscroll using various +cultural heritage databases. To facilitate efficient analysis of non-contiguous +extracted elements, we have developed a distinctive layout. Additionally, we +introduce ScrollTimes, a visual analysis system tailored to support the +three-tiered analysis of handscroll, allowing art historians to interactively +create biographies tailored to their interests. Validated through case studies +and expert interviews, our approach offers a window into history, fostering a +holistic understanding of handscroll provenance and historical significance. + +
+
+ comment: Accepted by IEEE Transactions on Visualization and Computer Graphics + (TVCG) +
+
+
+
+
+ + ♻ ☆ Diagonal Hierarchical Consistency Learning for Semi-supervised Medical + Image Segmentation + + +
+ Medical image segmentation, which is essential for many clinical
+applications, has achieved almost human-level performance via data-driven deep
+learning technologies. Nevertheless, its performance is predicated upon the
+costly process of manually annotating a vast number of medical images. To this
+end, we propose a novel framework for robust semi-supervised medical image
+segmentation using diagonal hierarchical consistency learning (DiHC-Net).
+First, it is composed of multiple sub-models with identical multi-scale
+architecture but with distinct sub-layers, such as up-sampling and
+normalisation layers. Second, with mutual consistency, a novel consistency
+regularisation is enforced between one model's intermediate and final
+predictions and soft pseudo labels from other models in a diagonal hierarchical
+fashion. A series of experiments verifies the efficacy of our simple framework,
+outperforming all previous approaches on public benchmark datasets for organ
+and tumour segmentation.
+
+
+ comment: 4 pages, 2 figures, and 2 tables +
+
+
+
+
+ + ♻ ☆ NEURO HAND: A weakly supervised Hierarchical Attention Network for + interpretable neuroimaging abnormality Detection + + +
+ Clinical neuroimaging data is naturally hierarchical. Different magnetic +resonance imaging (MRI) sequences within a series, different slices covering +the head, and different regions within each slice all confer different +information. In this work we present a hierarchical attention network for +abnormality detection using MRI scans obtained in a clinical hospital setting. +The proposed network is suitable for non-volumetric data (i.e. stacks of +high-resolution MRI slices), and can be trained from binary examination-level +labels. We show that this hierarchical approach leads to improved +classification, while providing interpretability through either coarse inter- +and intra-slice abnormality localisation, or giving importance scores for +different slices and sequences, making our model suitable for use as an +automated triaging system in radiology departments. + +
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: 14 pages, 3 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ Fast and accurate sparse-view CBCT reconstruction using meta-learned + neural attenuation field and hash-encoding regularization + + +
+ Cone beam computed tomography (CBCT) is an emerging medical imaging technique
+to visualize the internal anatomical structures of patients. During a CBCT
+scan, several projection images of different angles or views are collectively
+utilized to reconstruct a tomographic image. However, reducing the number of
+projections in a CBCT scan while preserving the quality of a reconstructed
+image is challenging due to the nature of an ill-posed inverse problem.
+Recently, a neural attenuation field (NAF) method was proposed by adopting a
+neural radiance field algorithm as a new way for CBCT reconstruction,
+demonstrating fast and promising results using only 50 views. However,
+decreasing the number of projections is still preferable to reduce potential
+radiation exposure, and a faster reconstruction time is required considering a
+typical scan time. In this work, we propose a fast and accurate sparse-view
+CBCT reconstruction (FACT) method to provide better reconstruction quality and
+faster optimization speed with a minimal number of view acquisitions ($<$ 50
+views). In the FACT method, we meta-train a neural network and a hash-encoder
+using a few scans (= 15), and a new regularization technique is utilized to
+reconstruct the details of an anatomical structure. In conclusion, we show
+that the FACT method produces better and faster reconstruction results than
+other conventional algorithms on CBCT scans of different body parts (chest,
+head, and abdomen) and CT vendors (Siemens, Philips, and GE).
+ 
+
+
+
+
+ + ♻ ☆ PPEA-Depth: Progressive Parameter-Efficient Adaptation for + Self-Supervised Monocular Depth Estimation AAAI 2024 + + +
+ Self-supervised monocular depth estimation is of significant importance with +applications spanning across autonomous driving and robotics. However, the +reliance on self-supervision introduces a strong static-scene assumption, +thereby posing challenges in achieving optimal performance in dynamic scenes, +which are prevalent in most real-world situations. To address these issues, we +propose PPEA-Depth, a Progressive Parameter-Efficient Adaptation approach to +transfer a pre-trained image model for self-supervised depth estimation. The +training comprises two sequential stages: an initial phase trained on a dataset +primarily composed of static scenes, succeeded by an expansion to more +intricate datasets involving dynamic scenes. To facilitate this process, we +design compact encoder and decoder adapters to enable parameter-efficient +tuning, allowing the network to adapt effectively. They not only uphold +generalized patterns from pre-trained image models but also retain knowledge +gained from the preceding phase into the subsequent one. Extensive experiments +demonstrate that PPEA-Depth achieves state-of-the-art performance on KITTI, +CityScapes and DDAD datasets. + +
+
+ comment: Accepted by AAAI 2024 Project homepage: + https://yuejiangdong.github.io/PPEADepth/ +
+
+
+
+
+ + ♻ ☆ Stanford-ORB: A Real-World 3D Object Inverse Rendering Benchmark NeurIPS 2023 + + +
+ We introduce Stanford-ORB, a new real-world 3D Object inverse Rendering +Benchmark. Recent advances in inverse rendering have enabled a wide range of +real-world applications in 3D content generation, moving rapidly from research +and commercial use cases to consumer devices. While the results continue to +improve, there is no real-world benchmark that can quantitatively assess and +compare the performance of various inverse rendering methods. Existing +real-world datasets typically only consist of the shape and multi-view images +of objects, which are not sufficient for evaluating the quality of material +recovery and object relighting. Methods capable of recovering material and +lighting often resort to synthetic data for quantitative evaluation, which on +the other hand does not guarantee generalization to complex real-world +environments. We introduce a new dataset of real-world objects captured under a +variety of natural scenes with ground-truth 3D scans, multi-view images, and +environment lighting. Using this dataset, we establish the first comprehensive +real-world evaluation benchmark for object inverse rendering tasks from +in-the-wild scenes, and compare the performance of various existing methods. + +
+
+ comment: NeurIPS 2023 Datasets and Benchmarks Track. The first two authors + contributed equally to this work. Project page: + https://stanfordorb.github.io/ +
+
+
+
+
+ + ♻ ☆ Efficient Adaptation of Large Vision Transformer via Adapter + Re-Composing NeurIPS 2023 + + +
+ The advent of high-capacity pre-trained models has revolutionized +problem-solving in computer vision, shifting the focus from training +task-specific models to adapting pre-trained models. Consequently, effectively +adapting large pre-trained models to downstream tasks in an efficient manner +has become a prominent research area. Existing solutions primarily concentrate +on designing lightweight adapters and their interaction with pre-trained +models, with the goal of minimizing the number of parameters requiring updates. +In this study, we propose a novel Adapter Re-Composing (ARC) strategy that +addresses efficient pre-trained model adaptation from a fresh perspective. Our +approach considers the reusability of adaptation parameters and introduces a +parameter-sharing scheme. Specifically, we leverage symmetric +down-/up-projections to construct bottleneck operations, which are shared +across layers. By learning low-dimensional re-scaling coefficients, we can +effectively re-compose layer-adaptive adapters. This parameter-sharing strategy +in adapter design allows us to significantly reduce the number of new +parameters while maintaining satisfactory performance, thereby offering a +promising approach to compress the adaptation cost. We conduct experiments on +24 downstream image classification tasks using various Vision Transformer +variants to evaluate our method. The results demonstrate that our approach +achieves compelling transfer learning performance with a reduced parameter +count. Our code is available at +\href{https://github.com/DavidYanAnDe/ARC}{https://github.com/DavidYanAnDe/ARC}. + +
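+ As a rough illustration of the parameter-sharing idea described above (not
+the authors' code), the sketch below shares one down-/up-projection pair across
+all adapters and learns only a low-dimensional re-scaling vector per layer; the
+dimensions, the residual form, and the lack of weight tying are assumptions.

```python
import torch
import torch.nn as nn

class SharedBottleneck(nn.Module):
    """Down- and up-projection matrices shared by every adapter in the network."""
    def __init__(self, dim: int, bottleneck: int):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck, bias=False)
        self.up = nn.Linear(bottleneck, dim, bias=False)

class ARCStyleAdapter(nn.Module):
    """Per-layer adapter: only a small re-scaling vector is layer-specific."""
    def __init__(self, shared: SharedBottleneck, bottleneck: int):
        super().__init__()
        self.shared = shared
        self.scale = nn.Parameter(torch.ones(bottleneck))  # layer-adaptive coefficients

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # residual adapter: x + up(scale * down(x))
        return x + self.shared.up(self.scale * self.shared.down(x))

shared = SharedBottleneck(dim=768, bottleneck=32)
adapters = nn.ModuleList([ARCStyleAdapter(shared, 32) for _ in range(12)])  # one per block
tokens = torch.randn(4, 197, 768)   # ViT-style activations: (batch, tokens, dim)
print(adapters[0](tokens).shape)    # torch.Size([4, 197, 768])
```
+ In this toy setup only the twelve 32-dimensional scaling vectors are
+layer-specific, which is where the parameter saving would come from.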
+
+ comment: Paper is accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Pseudo-Generalized Dynamic View Synthesis from a Video ICLR 2024 + + +
+ Rendering scenes observed in a monocular video from novel viewpoints is a +challenging problem. For static scenes the community has studied both +scene-specific optimization techniques, which optimize on every test scene, and +generalized techniques, which only run a deep net forward pass on a test scene. +In contrast, for dynamic scenes, scene-specific optimization techniques exist, +but, to our best knowledge, there is currently no generalized method for +dynamic novel view synthesis from a given monocular video. To answer whether +generalized dynamic novel view synthesis from monocular videos is possible +today, we establish an analysis framework based on existing techniques and work +toward the generalized approach. We find a pseudo-generalized process without +scene-specific appearance optimization is possible, but geometrically and +temporally consistent depth estimates are needed. Despite no scene-specific +appearance optimization, the pseudo-generalized approach improves upon some +scene-specific methods. + +
+
+ comment: ICLR 2024; Originally titled as "Is Generalized Dynamic Novel View + Synthesis from Monocular Videos Possible Today?"; Project page: + https://xiaoming-zhao.github.io/projects/pgdvs +
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ Foundations of Vector Retrieval + + +
+ Vectors are universal mathematical objects that can represent text, images,
+speech, or a mix of these data modalities. That happens regardless of whether
+data is represented by hand-crafted features or learnt embeddings. Collect a
+large enough quantity of such vectors and the question of retrieval becomes
+urgently relevant: finding the vectors that are most similar to a query vector.
+This monograph is concerned with the question above and covers fundamental
+concepts along with advanced data structures and algorithms for vector
+retrieval. In doing so, it recaps this fascinating topic and lowers barriers of
+entry into this rich area of research.
+ 
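+ As a point of reference for newcomers (this snippet is not from the
+monograph), the underlying retrieval question can be stated as a brute-force
+nearest-neighbour search; everything beyond that is about doing the same thing
+faster and at scale.

```python
import numpy as np

def top_k_cosine(corpus: np.ndarray, query: np.ndarray, k: int = 5) -> np.ndarray:
    """Return the indices of the k corpus vectors most similar to the query."""
    corpus_norm = corpus / np.linalg.norm(corpus, axis=1, keepdims=True)
    query_norm = query / np.linalg.norm(query)
    scores = corpus_norm @ query_norm      # cosine similarity per corpus vector
    return np.argsort(-scores)[:k]         # indices of the k highest scores

# toy usage: 10,000 random 128-d "embeddings" and one query
rng = np.random.default_rng(0)
corpus = rng.normal(size=(10_000, 128))
query = rng.normal(size=128)
print(top_k_cosine(corpus, query, k=3))
```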
+
+
+
+
+ + ☆ BibSonomy Meets ChatLLMs for Publication Management: From Chat to + Publication Management: Organizing your related work using BibSonomy & LLMs SIGIR + + +
+ The ever-growing corpus of scientific literature presents significant +challenges for researchers with respect to discovery, management, and +annotation of relevant publications. Traditional platforms like Semantic +Scholar, BibSonomy, and Zotero offer tools for literature management, but +largely require manual laborious and error-prone input of tags and metadata. +Here, we introduce a novel retrieval augmented generation system that leverages +chat-based large language models (LLMs) to streamline and enhance the process +of publication management. It provides a unified chat-based interface, enabling +intuitive interactions with various backends, including Semantic Scholar, +BibSonomy, and the Zotero Webscraper. It supports two main use-cases: (1) +Explorative Search & Retrieval - leveraging LLMs to search for and retrieve +both specific and general scientific publications, while addressing the +challenges of content hallucination and data obsolescence; and (2) Cataloguing +& Management - aiding in the organization of personal publication libraries, in +this case BibSonomy, by automating the addition of metadata and tags, while +facilitating manual edits and updates. We compare our system to different LLM +models in three different settings, including a user study, and we can show its +advantages in different metrics. + +
+
+ comment: Accepted at 2024 ACM SIGIR CHIIR, For a demo see here + http://professor-x.de/demos/bibsonomy-chatgpt/demo.mp4 +
+
+
+
+
+ + ☆ Knowledge Pyramid: A Novel Hierarchical Reasoning Structure for + Generalized Knowledge Augmentation and Inference + + +
+ Knowledge graph (KG) based reasoning has been regarded as an effective means
+for the analysis of semantic networks and is widely useful in areas such as
+information retrieval, recommendation, decision-making, question-answering,
+search, and man-machine interaction. However, previous studies mainly used
+low-level knowledge in the KG for reasoning, which may result in insufficient
+generalization and poor robustness of reasoning. To this end, this paper
+proposes a new inference approach that uses a novel knowledge augmentation
+strategy to improve the generalization capability of the KG. The framework
+extracts high-level pyramidal knowledge from low-level knowledge and applies it
+to reasoning in a multi-level hierarchical KG, called a knowledge pyramid in
+this paper. We evaluated the proposed approach on several medical datasets, and
+the experimental results show that the proposed knowledge pyramid improves
+knowledge inference performance with better generalization. In particular, when
+there are fewer training samples, the inference accuracy can be improved
+significantly.
+ 
+
+ comment: 10 pages,8 figures +
+
+
+
+
+ + ☆ Algorithmic amplification of biases on Google Search + + +
+ The evolution of information-seeking processes, driven by search engines like +Google, has transformed the access to information people have. This paper +investigates how individuals' preexisting attitudes influence the modern +information-seeking process, specifically the results presented by Google +Search. Through a comprehensive study involving surveys and information-seeking +tasks focusing on the topic of abortion, the paper provides four crucial +insights: 1) Individuals with opposing attitudes on abortion receive different +search results. 2) Individuals express their beliefs in their choice of +vocabulary used in formulating the search queries, shaping the outcome of the +search. 3) Additionally, the user's search history contributes to divergent +results among those with opposing attitudes. 4) Google Search engine reinforces +preexisting beliefs in search results. Overall, this study provides insights +into the interplay between human biases and algorithmic processes, highlighting +the potential for information polarization in modern information-seeking +processes. + +
+
+
+
+
+ + ☆ UOEP: User-Oriented Exploration Policy for Enhancing Long-Term User + Experiences in Recommender Systems + + +
+ Reinforcement learning (RL) has gained traction for enhancing user long-term +experiences in recommender systems by effectively exploring users' interests. +However, modern recommender systems exhibit distinct user behavioral patterns +among tens of millions of items, which increases the difficulty of exploration. +For example, user behaviors with different activity levels require varying +intensity of exploration, while previous studies often overlook this aspect and +apply a uniform exploration strategy to all users, which ultimately hurts user +experiences in the long run. To address these challenges, we propose +User-Oriented Exploration Policy (UOEP), a novel approach facilitating +fine-grained exploration among user groups. We first construct a distributional +critic which allows policy optimization under varying quantile levels of +cumulative reward feedbacks from users, representing user groups with varying +activity levels. Guided by this critic, we devise a population of distinct +actors aimed at effective and fine-grained exploration within its respective +user group. To simultaneously enhance diversity and stability during the +exploration process, we further introduce a population-level diversity +regularization term and a supervision module. Experimental results on public +recommendation datasets demonstrate that our approach outperforms all other +baselines in terms of long-term performance, validating its user-oriented +exploration effectiveness. Meanwhile, further analyses reveal our approach's +benefits of improved performance for low-activity users as well as increased +fairness among users. + +
+
+
+
+
+ + ☆ Estimating Gender Completeness in Wikipedia + + +
+ Gender imbalance in Wikipedia content is a known challenge which the editor
+community is actively addressing. The aim of this paper is to provide the
+Wikipedia community with instruments to estimate the magnitude of the problem
+for different entity types (also known as classes) in Wikipedia. To this end,
+we apply class completeness estimation methods based on the gender attribute.
+Our results show not only which gender is more prevalent in Wikipedia for
+different sub-classes of Person, but also give an idea of how complete the
+coverage is for different genders and sub-classes of Person.
+ 
+
+
+
+
+ + ☆ Similar but Faster: Manipulation of Tempo in Music Audio Embeddings for + Tempo Prediction and Search ICASSP + + +
+ Audio embeddings enable large scale comparisons of the similarity of audio +files for applications such as search and recommendation. Due to the +subjectivity of audio similarity, it can be desirable to design systems that +answer not only whether audio is similar, but similar in what way (e.g., wrt. +tempo, mood or genre). Previous works have proposed disentangled embedding +spaces where subspaces representing specific, yet possibly correlated, +attributes can be weighted to emphasize those attributes in downstream tasks. +However, no research has been conducted into the independence of these +subspaces, nor their manipulation, in order to retrieve tracks that are similar +but different in a specific way. Here, we explore the manipulation of tempo in +embedding spaces as a case-study towards this goal. We propose tempo +translation functions that allow for efficient manipulation of tempo within a +pre-existing embedding space whilst maintaining other properties such as genre. +As this translation is specific to tempo it enables retrieval of tracks that +are similar but have specifically different tempi. We show that such a function +can be used as an efficient data augmentation strategy for both training of +downstream tempo predictors, and improved nearest neighbor retrieval of +properties largely independent of tempo. + +
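+ A heavily simplified sketch of a tempo translation in embedding space is
+given below; it assumes tempo can be shifted linearly along a known direction,
+whereas the paper learns the translation function, so both the function and the
+tempo_direction vector here are purely hypothetical.

```python
import numpy as np

def translate_tempo(embedding: np.ndarray, tempo_direction: np.ndarray,
                    source_bpm: float, target_bpm: float) -> np.ndarray:
    """Shift an audio embedding along a (hypothetical) tempo axis.

    Assumes tempo is encoded roughly linearly in log2(BPM) along
    tempo_direction; the paper learns this mapping instead of assuming it.
    """
    step = np.log2(target_bpm / source_bpm)
    return embedding + step * tempo_direction

# toy usage with random vectors standing in for a learned embedding space
rng = np.random.default_rng(0)
emb = rng.normal(size=256)
direction = rng.normal(size=256)
direction /= np.linalg.norm(direction)
faster = translate_tempo(emb, direction, source_bpm=100, target_bpm=120)
```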
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ On the Effect of Data-Augmentation on Local Embedding Properties in the + Contrastive Learning of Music Audio Representations ICASSP + + +
+ Audio embeddings are crucial tools in understanding large catalogs of music. +Typically embeddings are evaluated on the basis of the performance they provide +in a wide range of downstream tasks, however few studies have investigated the +local properties of the embedding spaces themselves which are important in +nearest neighbor algorithms, commonly used in music search and recommendation. +In this work we show that when learning audio representations on music datasets +via contrastive learning, musical properties that are typically homogeneous +within a track (e.g., key and tempo) are reflected in the locality of +neighborhoods in the resulting embedding space. By applying appropriate data +augmentation strategies, localisation of such properties can not only be +reduced but the localisation of other attributes is increased. For example, +locality of features such as pitch and tempo that are less relevant to +non-expert listeners, may be mitigated while improving the locality of more +salient features such as genre and mood, achieving state-of-the-art performance +in nearest neighbor retrieval accuracy. Similarly, we show that the optimal +selection of data augmentation strategies for contrastive learning of music +audio embeddings is dependent on the downstream task, highlighting this as an +important embedding design decision. + +
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ Handling Large-scale Cardinality in building recommendation systems + + +
+ Effective recommendation systems rely on capturing user preferences, often +requiring incorporating numerous features such as universally unique +identifiers (UUIDs) of entities. However, the exceptionally high cardinality of +UUIDs poses a significant challenge in terms of model degradation and increased +model size due to sparsity. This paper presents two innovative techniques to +address the challenge of high cardinality in recommendation systems. +Specifically, we propose a bag-of-words approach, combined with layer sharing, +to substantially decrease the model size while improving performance. Our +techniques were evaluated through offline and online experiments on Uber use +cases, resulting in promising results demonstrating our approach's +effectiveness in optimizing recommendation systems and enhancing their overall +performance. + +
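+ One common way to realise a bag-of-words treatment of high-cardinality UUID
+features is to hash identifiers into a fixed-size embedding table and mean-pool
+each bag; the sketch below illustrates that pattern only and is not Uber's
+implementation (bucket count, pooling mode, and dimensions are assumptions).

```python
import hashlib
import torch
import torch.nn as nn

class HashedBagEmbedding(nn.Module):
    """Embed a variable-length bag of UUIDs by hashing into a fixed table."""
    def __init__(self, num_buckets: int = 100_000, dim: int = 64):
        super().__init__()
        self.num_buckets = num_buckets
        self.table = nn.EmbeddingBag(num_buckets, dim, mode="mean")

    def bucket(self, uuid: str) -> int:
        # stable hash of the identifier into one of num_buckets slots
        return int(hashlib.md5(uuid.encode()).hexdigest(), 16) % self.num_buckets

    def forward(self, uuid_bags: list) -> torch.Tensor:
        flat, offsets = [], []
        for bag in uuid_bags:
            offsets.append(len(flat))
            flat.extend(self.bucket(u) for u in bag)
        return self.table(torch.tensor(flat), torch.tensor(offsets))

embedder = HashedBagEmbedding()
vectors = embedder([["item-a1b2", "item-c3d4"], ["item-e5f6"]])
print(vectors.shape)  # torch.Size([2, 64])
```
+ The embedding table can then be shared across the towers or layers that
+consume these features, which is one reading of the layer-sharing idea above.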
+
+
+
+
+ + ☆ Machine Unlearning for Recommendation Systems: An Insight + + +
+ This review explores machine unlearning (MUL) in recommendation systems, +addressing adaptability, personalization, privacy, and bias challenges. Unlike +traditional models, MUL dynamically adjusts system knowledge based on shifts in +user preferences and ethical considerations. The paper critically examines +MUL's basics, real-world applications, and challenges like algorithmic +transparency. It sifts through literature, offering insights into how MUL could +transform recommendations, discussing user trust, and suggesting paths for +future research in responsible and user-focused artificial intelligence (AI). +The document guides researchers through challenges involving the trade-off +between personalization and privacy, encouraging contributions to meet +practical demands for targeted data removal. Emphasizing MUL's role in secure +and adaptive machine learning, the paper proposes ways to push its boundaries. +The novelty of this paper lies in its exploration of the limitations of the +methods, which highlights exciting prospects for advancing the field. + +
+
+ comment: In Proceedings of 7th INTERNATIONAL CONFERENCE ON INNOVATIVE + COMPUTING AND COMMUNICATION 2024 (https://icicc-conf.com/) +
+
+
+
+
+ + ☆ RELIANCE: Reliable Ensemble Learning for Information and News + Credibility Evaluation + + +
+ In the era of information proliferation, discerning the credibility of news
+content poses an ever-growing challenge. This paper introduces RELIANCE, a
+pioneering ensemble learning system designed for robust information and fake
+news credibility evaluation. Comprising five diverse base models, including
+Support Vector Machine (SVM), naive Bayes, logistic regression, random forest,
+and Bidirectional Long Short Term Memory Networks (BiLSTMs), RELIANCE employs
+an innovative approach to integrate their strengths, harnessing the collective
+intelligence of the ensemble for enhanced accuracy. Experiments demonstrate the
+superiority of RELIANCE over individual models, indicating its efficacy in
+distinguishing between credible and non-credible information sources. RELIANCE
+also surpasses baseline models in information and news credibility assessment,
+establishing itself as an effective solution for evaluating the reliability of
+information sources.
+ 
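+ A minimal sketch of an ensemble over the four scikit-learn base models named
+above is shown below; the BiLSTM member and RELIANCE's actual integration
+scheme are omitted, and plain majority voting on toy data is used purely for
+illustration.

```python
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# toy data: 1 = credible, 0 = not credible
texts = ["ministry releases official statistics", "shocking cure doctors hide",
         "court publishes full ruling", "you won a prize, click now",
         "agency confirms quarterly figures", "secret trick makes you rich"]
labels = [1, 0, 1, 0, 1, 0]

ensemble = make_pipeline(
    TfidfVectorizer(),
    VotingClassifier(
        estimators=[("svm", LinearSVC()),
                    ("nb", MultinomialNB()),
                    ("lr", LogisticRegression(max_iter=1000)),
                    ("rf", RandomForestClassifier(n_estimators=100))],
        voting="hard",   # majority vote over the four base models
    ),
)
ensemble.fit(texts, labels)
print(ensemble.predict(["new report released by the statistics office"]))
```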
+
+
+
+
+ + ☆ A New Creative Generation Pipeline for Click-Through Rate with Stable + Diffusion Model + + +
+ In online advertising scenarios, sellers often create multiple creatives to
+provide comprehensive demonstrations, making it essential to present the most
+appealing design to maximize the Click-Through Rate (CTR). However, sellers
+generally struggle to account for users' preferences in creative design,
+leading to lower aesthetic quality and smaller quantities compared to
+Artificial Intelligence (AI)-based approaches. Traditional AI-based approaches
+still face the same problem of not considering user information, while also
+having limited aesthetic knowledge from designers. In fact, by fusing user
+information, the generated creatives can be more attractive, because different
+users may have different preferences. To optimize the results, the generated
+creatives in traditional methods are then ranked by another module named the
+creative ranking model. The ranking model can predict the CTR score for each
+creative considering user features. However, the two above stages are regarded
+as two different tasks and are optimized separately. In this paper, we propose
+a new automated Creative Generation pipeline for Click-Through Rate (CG4CTR)
+with the goal of improving CTR during the creative generation stage. Our
+contributions are fourfold: 1) The inpainting mode of stable diffusion is
+applied to the creative generation task in the online advertising scene for the
+first time. A self-cyclic generation pipeline is proposed to ensure the
+convergence of training. 2) A prompt model is designed to generate
+individualized creatives for different user groups, which can further improve
+diversity and quality. 3) A reward model comprehensively considers the
+multimodal features of image and text to improve the effectiveness of the
+creative ranking task, and it is also critical in the self-cyclic pipeline. 4)
+The significant benefits obtained in online and offline experiments verify the
+effectiveness of our proposed method.
+ 
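+ The paper's full self-cyclic pipeline is not reproduced here, but the
+inpainting primitive it builds on is available in the diffusers library; the
+hedged example below only shows that primitive (the checkpoint name, image
+files, and prompt are placeholders, and the prompt and reward models of CG4CTR
+are not shown).

```python
import torch
from diffusers import StableDiffusionInpaintPipeline
from PIL import Image

# Load a public inpainting checkpoint (a placeholder, not the paper's model).
pipe = StableDiffusionInpaintPipeline.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
).to("cuda")

product = Image.open("product_photo.png").convert("RGB").resize((512, 512))
mask = Image.open("background_mask.png").convert("RGB").resize((512, 512))  # white = repaint

# Inpaint a new background around the masked-out product for one user group.
creative = pipe(
    prompt="clean studio background, warm lighting, minimalist ad layout",
    image=product,
    mask_image=mask,
).images[0]
creative.save("creative_candidate.png")
```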
+
+
+
+
+ + ♻ ☆ Denoising Diffusion Recommender Model + + +
+ Recommender systems often grapple with noisy implicit feedback. Most studies +alleviate the noise issues from data cleaning perspective such as data +resampling and reweighting, but they are constrained by heuristic assumptions. +Another denoising avenue is from model perspective, which proactively injects +noises into user-item interactions and enhance the intrinsic denoising ability +of models. However, this kind of denoising process poses significant challenges +to the recommender model's representation capacity to capture noise patterns. +To address this issue, we propose Denoising Diffusion Recommender Model (DDRM), +which leverages multi-step denoising process based on diffusion models to +robustify user and item embeddings from any recommender models. DDRM injects +controlled Gaussian noises in the forward process and iteratively removes +noises in the reverse denoising process, thereby improving embedding robustness +against noisy feedback. To achieve this target, the key lies in offering +appropriate guidance to steer the reverse denoising process and providing a +proper starting point to start the forward-reverse process during inference. In +particular, we propose a dedicated denoising module that encodes collaborative +information as denoising guidance. Besides, in the inference stage, DDRM +utilizes the average embeddings of users' historically liked items as the +starting point rather than using pure noise since pure noise lacks +personalization, which increases the difficulty of the denoising process. +Extensive experiments on three datasets with three representative backend +recommender models demonstrate the effectiveness of DDRM. + +
+
+
+
+
+ + ♻ ☆ Deep Evolutional Instant Interest Network for CTR Prediction in + Trigger-Induced Recommendation WSDM'2024 + + +
+ The recommendation has been playing a key role in many industries, e.g., +e-commerce, streaming media, social media, etc. Recently, a new recommendation +scenario, called Trigger-Induced Recommendation (TIR), where users are able to +explicitly express their instant interests via trigger items, is emerging as an +essential role in many e-commerce platforms, e.g., Alibaba.com and Amazon. +Without explicitly modeling the user's instant interest, traditional +recommendation methods usually obtain sub-optimal results in TIR. Even though +there are a few methods considering the trigger and target items simultaneously +to solve this problem, they still haven't taken into account temporal +information of user behaviors, the dynamic change of user instant interest when +the user scrolls down and the interactions between the trigger and target +items. To tackle these problems, we propose a novel method -- Deep Evolutional +Instant Interest Network (DEI2N), for click-through rate prediction in TIR +scenarios. Specifically, we design a User Instant Interest Modeling Layer to +predict the dynamic change of the intensity of instant interest when the user +scrolls down. Temporal information is utilized in user behavior modeling. +Moreover, an Interaction Layer is introduced to learn better interactions +between the trigger and target items. We evaluate our method on several offline +and real-world industrial datasets. Experimental results show that our proposed +DEI2N outperforms state-of-the-art baselines. In addition, online A/B testing +demonstrates the superiority over the existing baseline in real-world +production environments. + +
+
+ comment: 7 pages, 3 figures, accepted by the 17th ACM International Conference + on Web Search and Data Mining(WSDM'2024) +
+
+
+
+
+ + ♻ ☆ Detecting Check-Worthy Claims in Political Debates, Speeches, and + Interviews Using Audio Data + + +
+ Developing tools to automatically detect check-worthy claims in political +debates and speeches can greatly help moderators of debates, journalists, and +fact-checkers. While previous work on this problem has focused exclusively on +the text modality, here we explore the utility of the audio modality as an +additional input. We create a new multimodal dataset (text and audio in +English) containing 48 hours of speech from past political debates in the USA. +We then experimentally demonstrate that, in the case of multiple speakers, +adding the audio modality yields sizable improvements over using the text +modality alone; moreover, an audio-only model could outperform a text-only one +for a single speaker. With the aim to enable future research, we make all our +data and code publicly available at +https://github.com/petar-iv/audio-checkworthiness-detection. + +
+
+ comment: Check-Worthiness, Fact-Checking, Fake News, Misinformation, + Disinformation, Political Debates, Multimodality +
+
+
+
+
+
+
+
+ + Machine Learning 156 + +
+
+
+ + ☆ Vision Mamba: Efficient Visual Representation Learning with + Bidirectional State Space Model + + +
+ Recently the state space models (SSMs) with efficient hardware-aware designs, +i.e., Mamba, have shown great potential for long sequence modeling. Building +efficient and generic vision backbones purely upon SSMs is an appealing +direction. However, representing visual data is challenging for SSMs due to the +position-sensitivity of visual data and the requirement of global context for +visual understanding. In this paper, we show that the reliance of visual +representation learning on self-attention is not necessary and propose a new +generic vision backbone with bidirectional Mamba blocks (Vim), which marks the +image sequences with position embeddings and compresses the visual +representation with bidirectional state space models. On ImageNet +classification, COCO object detection, and ADE20k semantic segmentation tasks, +Vim achieves higher performance compared to well-established vision +transformers like DeiT, while also demonstrating significantly improved +computation & memory efficiency. For example, Vim is 2.8$\times$ faster than +DeiT and saves 86.8% GPU memory when performing batch inference to extract +features on images with a resolution of 1248$\times$1248. The results +demonstrate that Vim is capable of overcoming the computation & memory +constraints on performing Transformer-style understanding for high-resolution +images and it has great potential to become the next-generation backbone for +vision foundation models. Code is available at https://github.com/hustvl/Vim. + +
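+ To make the bidirectional-scan idea concrete, here is a schematic block that
+processes the patch-token sequence in both directions and merges the results; a
+GRU stands in for the selective state space model, so this shows the scanning
+pattern only and is not the Vim block.

```python
import torch
import torch.nn as nn

class BidirectionalScanBlock(nn.Module):
    """Run a sequence model over patch tokens forward and backward, then merge.

    A GRU is used as a stand-in for the selective SSM; only the bidirectional
    scan over the 1-D token sequence is illustrated here.
    """
    def __init__(self, dim: int):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fwd = nn.GRU(dim, dim, batch_first=True)
        self.bwd = nn.GRU(dim, dim, batch_first=True)

    def forward(self, tokens: torch.Tensor) -> torch.Tensor:  # (B, L, D)
        h = self.norm(tokens)
        out_f, _ = self.fwd(h)
        out_b, _ = self.bwd(torch.flip(h, dims=[1]))
        out_b = torch.flip(out_b, dims=[1])   # re-align to the original order
        return tokens + out_f + out_b         # residual merge of both scans

block = BidirectionalScanBlock(dim=192)
patch_tokens = torch.randn(2, 196, 192)        # 14x14 patches, tiny width
print(block(patch_tokens).shape)               # torch.Size([2, 196, 192])
```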
+
+ comment: Work in progress. Code is available at https://github.com/hustvl/Vim +
+
+
+
+
+ + ☆ Vlogger: Make Your Dream A Vlog + + +
+ In this work, we present Vlogger, a generic AI system for generating a
+minute-level video blog (i.e., vlog) from user descriptions. Different from
+short videos of a few seconds, a vlog often contains a complex storyline with
+diversified scenes, which is challenging for most existing video generation
+approaches. To break through this bottleneck, our Vlogger smartly leverages a
+Large Language Model (LLM) as Director and decomposes the long video generation
+task of a vlog into four key stages, where we invoke various foundation models
+to play the critical roles of vlog professionals, including (1) Script, (2)
+Actor, (3) ShowMaker, and (4) Voicer. With such a design of mimicking human
+beings, our Vlogger can generate vlogs through explainable cooperation of
+top-down planning and bottom-up shooting. Moreover, we introduce a novel video
+diffusion model, ShowMaker, which serves as a videographer in our Vlogger for
+generating the video snippet of each shooting scene. By incorporating Script
+and Actor attentively as textual and visual prompts, it can effectively enhance
+spatial-temporal coherence in the snippet. Besides, we design a concise mixed
+training paradigm for ShowMaker, boosting its capacity for both T2V generation
+and prediction. Finally, extensive experiments show that our method achieves
+state-of-the-art performance on zero-shot T2V generation and prediction tasks.
+More importantly, Vlogger can generate over 5-minute vlogs from open-world
+descriptions, without loss of video coherence on script and actor. The code and
+models are all available at https://github.com/zhuangshaobin/Vlogger.
+ 
+
+ comment: 16 pages, 8 figures, 11 tables +
+
+
+
+
+ + ☆ Deciphering Textual Authenticity: A Generalized Strategy through the + Lens of Large Language Semantics for Detecting Human vs. Machine-Generated + Text + + +
+ With the recent proliferation of Large Language Models (LLMs), there has been
+an increasing demand for tools to detect machine-generated text. The effective
+detection of machine-generated text faces two pertinent problems: first,
+existing detectors are severely limited in generalizing to real-world
+scenarios, where machine-generated text is produced by a variety of generators,
+including but not limited to GPT-4 and Dolly, and spans diverse domains,
+ranging from academic manuscripts to social media posts. Second, existing
+detection methodologies treat texts produced by LLMs through a restrictive
+binary classification lens, neglecting the nuanced diversity of artifacts
+generated by different LLMs. In this work, we undertake a systematic study on
+the detection of machine-generated text in real-world scenarios. We first study
+the effectiveness of state-of-the-art approaches and find that they are
+severely limited against text produced by diverse generators and domains in the
+real world. Furthermore, t-SNE visualizations of the embeddings from a
+pretrained LLM's encoder show that they cannot reliably distinguish between
+human and machine-generated text. Based on our findings, we introduce a novel
+system, T5LLMCipher, for detecting machine-generated text using a pretrained T5
+encoder combined with LLM embedding sub-clustering to address the text produced
+by diverse generators and domains in the real world. We evaluate our approach
+across 9 machine-generated text systems and 9 domains and find that it provides
+state-of-the-art generalization ability, with an average increase in F1 score
+on machine-generated text of 19.6\% on unseen generators and domains compared
+to the top-performing existing approaches, and that it correctly attributes the
+generator of a text with an accuracy of 93.6\%.
+ 
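+ The two ingredients named above, pretrained T5 encoder embeddings and
+per-class sub-clustering, can be sketched as follows; the toy texts, the
+mean-pooling, the number of clusters, and the nearest-centroid decision rule
+are assumptions for illustration, not the actual T5LLMCipher system.

```python
import numpy as np
import torch
from sklearn.cluster import KMeans
from transformers import AutoTokenizer, T5EncoderModel

tok = AutoTokenizer.from_pretrained("t5-base")
enc = T5EncoderModel.from_pretrained("t5-base").eval()

def embed(texts):
    """Mean-pool T5 encoder states into one vector per text."""
    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        states = enc(**batch).last_hidden_state           # (B, T, 768)
    mask = batch["attention_mask"].unsqueeze(-1)
    return ((states * mask).sum(1) / mask.sum(1)).numpy()

human_texts = ["I walked to the market this morning.",
               "Our results were mixed at best.",
               "She fixed the bug right after lunch.",
               "The committee met twice last week."]
machine_texts = ["As an AI language model, I can provide a summary.",
                 "In conclusion, the aforementioned factors collectively demonstrate growth.",
                 "Certainly! Here are five key points to consider.",
                 "The data unequivocally illustrates a paradigm of innovation."]

# sub-cluster each class separately, then label a new text by its nearest centroid
human_c = KMeans(n_clusters=2, n_init=10).fit(embed(human_texts)).cluster_centers_
machine_c = KMeans(n_clusters=2, n_init=10).fit(embed(machine_texts)).cluster_centers_

def predict(text):
    v = embed([text])[0]
    d_human = np.linalg.norm(human_c - v, axis=1).min()
    d_machine = np.linalg.norm(machine_c - v, axis=1).min()
    return "human" if d_human < d_machine else "machine"

print(predict("Certainly! Below is a concise overview of the topic."))
```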
+
+
+
+
+ + ☆ Élivágar: Efficient Quantum Circuit Search for Classification ASPLOS 2024 + + +
+ Designing performant and noise-robust circuits for Quantum Machine Learning +(QML) is challenging -- the design space scales exponentially with circuit +size, and there are few well-supported guiding principles for QML circuit +design. Although recent Quantum Circuit Search (QCS) methods attempt to search +for performant QML circuits that are also robust to hardware noise, they +directly adopt designs from classical Neural Architecture Search (NAS) that are +misaligned with the unique constraints of quantum hardware, resulting in high +search overheads and severe performance bottlenecks. + We present \'Eliv\'agar, a novel resource-efficient, noise-guided QCS +framework. \'Eliv\'agar innovates in all three major aspects of QCS -- search +space, search algorithm and candidate evaluation strategy -- to address the +design flaws in current classically-inspired QCS methods. \'Eliv\'agar achieves +hardware-efficiency and avoids an expensive circuit-mapping co-search via +noise- and device topology-aware candidate generation. By introducing two +cheap-to-compute predictors, Clifford noise resilience and Representational +capacity, \'Eliv\'agar decouples the evaluation of noise robustness and +performance, enabling early rejection of low-fidelity circuits and reducing +circuit evaluation costs. Due to its resource-efficiency, \'Eliv\'agar can +further search for data embeddings, significantly improving performance. + Based on a comprehensive evaluation of \'Eliv\'agar on 12 real quantum +devices and 9 QML applications, \'Eliv\'agar achieves 5.3% higher accuracy and +a 271$\times$ speedup compared to state-of-the-art QCS methods. + +
+
+ comment: 13 pages, 11 figures. To appear in ASPLOS 2024 +
+
+
+
+
+ + ☆ Diverse Part Synthesis for 3D Shape Creation + + +
+ Methods that use neural networks for synthesizing 3D shapes in the form of a +part-based representation have been introduced over the last few years. These +methods represent shapes as a graph or hierarchy of parts and enable a variety +of applications such as shape sampling and reconstruction. However, current +methods do not allow easily regenerating individual shape parts according to +user preferences. In this paper, we investigate techniques that allow the user +to generate multiple, diverse suggestions for individual parts. Specifically, +we experiment with multimodal deep generative models that allow sampling +diverse suggestions for shape parts and focus on models which have not been +considered in previous work on shape synthesis. To provide a comparative study +of these techniques, we introduce a method for synthesizing 3D shapes in a +part-based representation and evaluate all the part suggestion techniques +within this synthesis method. In our method, which is inspired by previous +work, shapes are represented as a set of parts in the form of implicit +functions which are then positioned in space to form the final shape. Synthesis +in this representation is enabled by a neural network architecture based on an +implicit decoder and a spatial transformer. We compare the various multimodal +generative models by evaluating their performance in generating part +suggestions. Our contribution is to show with qualitative and quantitative +evaluations which of the new techniques for multimodal part generation perform +the best and that a synthesis method based on the top-performing techniques +allows the user to more finely control the parts that are generated in the 3D +shapes while maintaining high shape fidelity when reconstructing shapes. + +
+
+
+
+
+ + ☆ Unlocking Unlabeled Data: Ensemble Learning with the Hui- Walter + Paradigm for Performance Estimation in Online and Static Settings + + +
+ In the realm of machine learning and statistical modeling, practitioners
+often work under the assumption of accessible, static, labeled data for
+evaluation and training. However, this assumption often deviates from reality,
+where data may be private, encrypted, difficult-to-measure, or unlabeled. In
+this paper, we bridge this gap by adapting the Hui-Walter paradigm, a method
+traditionally applied in epidemiology and medicine, to the field of machine
+learning. This approach enables us to estimate key performance metrics such as
+false positive rate, false negative rate, and priors in scenarios where no
+ground truth is available. We further extend this paradigm for handling online
+data, opening up new possibilities for dynamic data environments. Our
+methodology involves partitioning data into latent classes to simulate multiple
+data populations (if natural populations are unavailable) and independently
+training models to replicate multiple tests. By cross-tabulating binary
+outcomes across ensemble categorizers and multiple populations, we are able to
+estimate unknown parameters through Gibbs sampling, eliminating the need for
+ground-truth or labeled data. This paper showcases the potential of our
+methodology to transform machine learning practices by allowing for accurate
+model assessment under dynamic and uncertain data conditions.
+ 
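+ For readers unfamiliar with the setup, the sketch below implements the
+classical two-test, two-population Hui-Walter latent class model with a
+conjugate Gibbs sampler (sensitivity = 1 - false negative rate, specificity =
+1 - false positive rate, prevalence = the prior). It follows the textbook
+formulation rather than the paper's code, uses flat Beta(1,1) priors, and
+ignores the label-switching issue that informative priors normally guard
+against.

```python
import numpy as np

def hui_walter_gibbs(y, pop, n_iter=2000, burn=500, seed=0):
    """y: (N, 2) binary outputs of two classifiers; pop: (N,) population id in {0, 1}.
    Returns posterior means of [prev_0, prev_1, sens_0, sens_1, spec_0, spec_1]."""
    rng = np.random.default_rng(seed)
    N = len(pop)
    pi = np.full(2, 0.5)   # prevalence per population (the unknown class prior)
    se = np.full(2, 0.8)   # sensitivity per classifier (1 - false negative rate)
    sp = np.full(2, 0.8)   # specificity per classifier (1 - false positive rate)
    draws = []
    for it in range(n_iter):
        # 1) impute latent true labels z given the current parameters
        like1 = pi[pop] * np.prod(np.where(y == 1, se, 1 - se), axis=1)
        like0 = (1 - pi[pop]) * np.prod(np.where(y == 1, 1 - sp, sp), axis=1)
        z = rng.random(N) < like1 / (like1 + like0)
        # 2) conjugate Beta(1, 1) updates given the imputed labels
        for g in range(2):
            in_g = pop == g
            pi[g] = rng.beta(1 + z[in_g].sum(), 1 + (~z[in_g]).sum())
        for t in range(2):
            se[t] = rng.beta(1 + y[z, t].sum(), 1 + (1 - y[z, t]).sum())
            sp[t] = rng.beta(1 + (1 - y[~z, t]).sum(), 1 + y[~z, t].sum())
        if it >= burn:
            draws.append(np.concatenate([pi, se, sp]))
    return np.mean(draws, axis=0)

# synthetic check: two populations with different prevalence, two imperfect classifiers
rng = np.random.default_rng(1)
pop = rng.integers(0, 2, size=5000)
truth = rng.random(5000) < np.where(pop == 0, 0.2, 0.6)
y = np.column_stack([
    np.where(truth, rng.random(5000) < 0.85, rng.random(5000) < 0.10),
    np.where(truth, rng.random(5000) < 0.75, rng.random(5000) < 0.05),
]).astype(int)
print(hui_walter_gibbs(y, pop).round(3))
```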
+
+
+
+
+ + ☆ Swing: Short-cutting Rings for Higher Bandwidth Allreduce + + +
+ The allreduce collective operation accounts for a significant fraction of the +runtime of workloads running on distributed systems. One factor determining its +performance is the distance between communicating nodes, especially on networks +like torus, where a higher distance implies multiple messages being forwarded +on the same link, thus reducing the allreduce bandwidth. Torus networks are +widely used on systems optimized for machine learning workloads (e.g., Google +TPUs and Amazon Trainium devices), as well as on some of the Top500 +supercomputers. To improve allreduce performance on torus networks we introduce +Swing, a new algorithm that keeps a low distance between communicating nodes by +swinging between torus directions. Our analysis and experimental evaluation +show that Swing outperforms by up to 3x existing allreduce algorithms for +vectors ranging from 32B to 128MiB, on different types of torus and torus-like +topologies, regardless of their shape and size. + +
+
+
+
+
+ + ☆ Neural Contractive Dynamical Systems + + +
+ Stability guarantees are crucial when ensuring a fully autonomous robot does +not take undesirable or potentially harmful actions. Unfortunately, global +stability guarantees are hard to provide in dynamical systems learned from +data, especially when the learned dynamics are governed by neural networks. We +propose a novel methodology to learn neural contractive dynamical systems, +where our neural architecture ensures contraction, and hence, global stability. +To efficiently scale the method to high-dimensional dynamical systems, we +develop a variant of the variational autoencoder that learns dynamics in a +low-dimensional latent representation space while retaining contractive +stability after decoding. We further extend our approach to learning +contractive systems on the Lie group of rotations to account for full-pose +end-effector dynamic motions. The result is the first highly flexible learning +architecture that provides contractive stability guarantees with capability to +perform obstacle avoidance. Empirically, we demonstrate that our approach +encodes the desired dynamics more accurately than the current state-of-the-art, +which provides less strong stability guarantees. + +
+
+
+
+
+ + ☆ High Confidence Level Inference is Almost Free using Parallel Stochastic + Optimization + + +
+ Uncertainty quantification for estimation through stochastic optimization +solutions in an online setting has gained popularity recently. This paper +introduces a novel inference method focused on constructing confidence +intervals with efficient computation and fast convergence to the nominal level. +Specifically, we propose to use a small number of independent multi-runs to +acquire distribution information and construct a t-based confidence interval. +Our method requires minimal additional computation and memory beyond the +standard updating of estimates, making the inference process almost cost-free. +We provide a rigorous theoretical guarantee for the confidence interval, +demonstrating that the coverage is approximately exact with an explicit +convergence rate and allowing for high confidence level inference. In +particular, a new Gaussian approximation result is developed for the online +estimators to characterize the coverage properties of our confidence intervals +in terms of relative errors. Additionally, our method also allows for +leveraging parallel computing to further accelerate calculations using multiple +cores. It is easy to implement and can be integrated with existing stochastic +algorithms without the need for complicated modifications. + +
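+ The gist of the multi-run, t-based interval can be conveyed in a few lines;
+the toy problem below is plain mean estimation with SGD rather than the general
+setting analysed in the paper, and the number of runs K = 8 is an arbitrary
+choice.

```python
import numpy as np
from scipy import stats

def sgd_mean_estimate(stream: np.ndarray) -> float:
    """Online estimate of a mean via SGD with Robbins-Monro step sizes 1/n."""
    theta = 0.0
    for n, x in enumerate(stream, start=1):
        theta -= (1.0 / n) * (theta - x)   # gradient step on 0.5 * (theta - x)^2
    return theta

rng = np.random.default_rng(0)
K = 8                                       # small number of independent runs
estimates = np.array([sgd_mean_estimate(rng.normal(loc=2.0, size=50_000))
                      for _ in range(K)])

center = estimates.mean()
half_width = stats.t.ppf(0.975, df=K - 1) * estimates.std(ddof=1) / np.sqrt(K)
print(f"95% CI: [{center - half_width:.4f}, {center + half_width:.4f}]")
```
+ The runs are independent, so they can be executed on separate cores; the only
+extra memory is the K scalar estimates from which the interval is formed.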
+
+
+
+
+ + ☆ SceneVerse: Scaling 3D Vision-Language Learning for Grounded Scene + Understanding + + +
+ 3D vision-language grounding, which focuses on aligning language with the 3D +physical environment, stands as a cornerstone in the development of embodied +agents. In comparison to recent advancements in the 2D domain, grounding +language in 3D scenes faces several significant challenges: (i) the inherent +complexity of 3D scenes due to the diverse object configurations, their rich +attributes, and intricate relationships; (ii) the scarcity of paired 3D +vision-language data to support grounded learning; and (iii) the absence of a +unified learning framework to distill knowledge from grounded 3D data. In this +work, we aim to address these three major challenges in 3D vision-language by +examining the potential of systematically upscaling 3D vision-language learning +in indoor environments. We introduce the first million-scale 3D vision-language +dataset, SceneVerse, encompassing about 68K 3D indoor scenes and comprising +2.5M vision-language pairs derived from both human annotations and our scalable +scene-graph-based generation approach. We demonstrate that this scaling allows +for a unified pre-training framework, Grounded Pre-training for Scenes (GPS), +for 3D vision-language learning. Through extensive experiments, we showcase the +effectiveness of GPS by achieving state-of-the-art performance on all existing +3D visual grounding benchmarks. The vast potential of SceneVerse and GPS is +unveiled through zero-shot transfer experiments in the challenging 3D +vision-language tasks. Project website: https://scene-verse.github.io . + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Central Limit Theorem for Two-Timescale Stochastic Approximation with + Markovian Noise: Theory and Applications + + +
+ Two-timescale stochastic approximation (TTSA) is among the most general +frameworks for iterative stochastic algorithms. This includes well-known +stochastic optimization methods such as SGD variants and those designed for +bilevel or minimax problems, as well as reinforcement learning like the family +of gradient-based temporal difference (GTD) algorithms. In this paper, we +conduct an in-depth asymptotic analysis of TTSA under controlled Markovian +noise via central limit theorem (CLT), uncovering the coupled dynamics of TTSA +influenced by the underlying Markov chain, which has not been addressed by +previous CLT results of TTSA only with Martingale difference noise. Building +upon our CLT, we expand its application horizon of efficient sampling +strategies from vanilla SGD to a wider TTSA context in distributed learning, +thus broadening the scope of Hu et al. (2022). In addition, we leverage our CLT +result to deduce the statistical properties of GTD algorithms with nonlinear +function approximation using Markovian samples and show their identical +asymptotic performance, a perspective not evident from current finite-time +bounds. + +
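+ For orientation (standard notation, not the paper's exact statement), the
+generic two-timescale recursion that such an analysis concerns is

```latex
\begin{aligned}
x_{n+1} &= x_n + \alpha_n\, f(x_n, y_n, \xi_{n+1}), \\
y_{n+1} &= y_n + \beta_n\, g(x_n, y_n, \xi_{n+1}), \qquad \beta_n/\alpha_n \to 0,
\end{aligned}
```
+ where $\{\xi_n\}$ is the driving (here Markovian) noise and the condition
+$\beta_n/\alpha_n \to 0$ makes $y_n$ the slow iterate relative to the fast
+iterate $x_n$; the CLT then characterizes the fluctuations of both iterates
+around their limits at their respective timescales.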
+
+
+
+
+ + ☆ Machines Do See Color: A Guideline to Classify Different Forms of Racist + Discourse in Large Corpora + + +
+ Current methods to identify and classify racist language in text rely on +small-n qualitative approaches or large-n approaches focusing exclusively on +overt forms of racist discourse. This article provides a step-by-step +generalizable guideline to identify and classify different forms of racist +discourse in large corpora. In our approach, we start by conceptualizing racism +and its different manifestations. We then contextualize these racist +manifestations to the time and place of interest, which allows researchers to +identify their discursive form. Finally, we apply XLM-RoBERTa (XLM-R), a +cross-lingual model for supervised text classification with a cutting-edge +contextual understanding of text. We show that XLM-R and XLM-R-Racismo, our +pretrained model, outperform other state-of-the-art approaches in classifying +racism in large corpora. We illustrate our approach using a corpus of tweets +relating to the Ecuadorian ind\'igena community between 2018 and 2021. + +
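+ A minimal supervised fine-tuning setup for XLM-R with the Hugging Face
+libraries is sketched below; the label scheme, example texts, and
+hyperparameters are placeholders, and this is not the XLM-R-Racismo checkpoint
+or training recipe.

```python
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

labels = ["none", "overt", "covert"]         # placeholder label scheme
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base", num_labels=len(labels))

train = Dataset.from_dict({
    "text": ["ejemplo de tuit uno", "example tweet two"],   # placeholder tweets
    "label": [0, 1],
})
train = train.map(lambda batch: tok(batch["text"], truncation=True,
                                    padding="max_length", max_length=128),
                  batched=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="xlmr-discourse", num_train_epochs=3,
                           per_device_train_batch_size=16),
    train_dataset=train,
)
trainer.train()
print(trainer.predict(train).predictions.argmax(axis=-1))
```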
+
+ comment: 37 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ BENO: Boundary-embedded Neural Operators for Elliptic PDEs ICLR 2024 + + +
+ Elliptic partial differential equations (PDEs) are a major class of +time-independent PDEs that play a key role in many scientific and engineering +domains such as fluid dynamics, plasma physics, and solid mechanics. Recently, +neural operators have emerged as a promising technique to solve elliptic PDEs +more efficiently by directly mapping the input to solutions. However, existing +networks typically cannot handle complex geometries and inhomogeneous boundary +values present in the real world. Here we introduce Boundary-Embedded Neural +Operators (BENO), a novel neural operator architecture that embeds the complex +geometries and inhomogeneous boundary values into the solving of elliptic PDEs. +Inspired by classical Green's function, BENO consists of two branches of Graph +Neural Networks (GNNs) for interior source term and boundary values, +respectively. Furthermore, a Transformer encoder maps the global boundary +geometry into a latent vector which influences each message passing layer of +the GNNs. We test our model extensively in elliptic PDEs with various boundary +conditions. We show that all existing baseline methods fail to learn the +solution operator. In contrast, our model, endowed with boundary-embedded +architecture, outperforms state-of-the-art neural operators and strong +baselines by an average of 60.96\%. Our source code can be found +https://github.com/AI4Science-WestlakeU/beno.git. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ T-FOLEY: A Controllable Waveform-Domain Diffusion Model for + Temporal-Event-Guided Foley Sound Synthesis + + +
+ Foley sound, audio content inserted synchronously with videos, plays a +critical role in the user experience of multimedia content. Recently, there has +been active research in Foley sound synthesis, leveraging the advancements in +deep generative models. However, such works mainly focus on replicating a +single sound class or a textual sound description, neglecting temporal +information, which is crucial in the practical applications of Foley sound. We +present T-Foley, a Temporal-event-guided waveform generation model for Foley +sound synthesis. T-Foley generates high-quality audio using two conditions: the +sound class and temporal event feature. For temporal conditioning, we devise a +temporal event feature and a novel conditioning technique named Block-FiLM. +T-Foley achieves superior performance in both objective and subjective +evaluation metrics and generates Foley sound well-synchronized with the +temporal events. Additionally, we showcase T-Foley's practical applications, +particularly in scenarios involving vocal mimicry for temporal event control. +We show the demo on our companion website. + +
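+ The Block-FiLM variant is not spelled out in this listing, but plain FiLM
+conditioning, per-channel scale and shift predicted from a conditioning vector,
+can be sketched as follows; the shapes and the (1 + scale) residual form are
+assumptions.

```python
import torch
import torch.nn as nn

class FiLM(nn.Module):
    """Feature-wise linear modulation: scale and shift features from a condition."""
    def __init__(self, cond_dim: int, num_channels: int):
        super().__init__()
        self.to_scale_shift = nn.Linear(cond_dim, 2 * num_channels)

    def forward(self, feats: torch.Tensor, cond: torch.Tensor) -> torch.Tensor:
        # feats: (B, C, T) waveform-domain features, cond: (B, cond_dim)
        scale, shift = self.to_scale_shift(cond).chunk(2, dim=-1)
        return feats * (1 + scale.unsqueeze(-1)) + shift.unsqueeze(-1)

film = FiLM(cond_dim=128, num_channels=64)
feats = torch.randn(2, 64, 16000)    # one second of features at 16 kHz (toy)
cond = torch.randn(2, 128)           # e.g. sound class + temporal-event embedding
print(film(feats, cond).shape)       # torch.Size([2, 64, 16000])
```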
+
+
+
+
+ + ☆ Adaptive Regret for Bandits Made Possible: Two Queries Suffice ICLR2024 + + +
+ Fast-changing states or volatile environments pose a significant challenge to
+online optimization, which needs to perform rapid adaptation under limited
+observation. In this paper, we give query and regret optimal bandit algorithms
+under the strict notion of strongly adaptive regret, which measures the maximum
+regret over any contiguous interval $I$. Due to its worst-case nature, there is
+an almost-linear $\Omega(|I|^{1-\epsilon})$ regret lower bound, when only one
+query per round is allowed [Daniely et al., ICML 2015]. Surprisingly, with just
+two queries per round, we give Strongly Adaptive Bandit Learner (StABL) that
+achieves $\tilde{O}(\sqrt{n|I|})$ adaptive regret for multi-armed bandits with
+$n$ arms. The bound is tight and cannot be improved in general. Our algorithm
+leverages a multiplicative update scheme of varying stepsizes and a carefully
+chosen observation distribution to control the variance. Furthermore, we extend
+our results and provide optimal algorithms in the bandit convex optimization
+setting. Finally, we empirically demonstrate the superior performance of our
+algorithms under volatile environments and for downstream tasks, such as
+algorithm selection for hyperparameter optimization.
+ 
+
+ comment: ICLR2024 +
+
+
+
+
+ + ☆ Avoiding strict saddle points of nonconvex regularized problems + + +
+ We introduce a strict saddle property for $\ell_p$ regularized functions, and +propose an iterative reweighted $\ell_1$ algorithm to solve the $\ell_p$ +regularized problems. The algorithm is guaranteed to converge only to local +minimizers when randomly initialized. The strict saddle property is shown +generic on these sparse optimization problems. Those analyses as well as the +proposed algorithm can be easily extended to general nonconvex regularized +problems. + +
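+ A generic iteratively reweighted $\ell_1$ loop for an $\ell_p$-regularized
+least-squares problem looks like the sketch below (ISTA inner solver, fixed
+smoothing $\epsilon$); it illustrates the reweighting scheme only and is not
+the paper's algorithm or analysis.

```python
import numpy as np

def soft_threshold(v, tau):
    return np.sign(v) * np.maximum(np.abs(v) - tau, 0.0)

def irl1_lp(A, b, lam=0.1, p=0.5, eps=1e-3, outer=20, inner=200):
    """min_x 0.5 * ||Ax - b||^2 + lam * sum_i |x_i|^p via reweighted l1 surrogates."""
    n = A.shape[1]
    x = np.zeros(n)
    L = np.linalg.norm(A, 2) ** 2              # Lipschitz constant of the smooth part
    for _ in range(outer):
        w = p * (np.abs(x) + eps) ** (p - 1)   # weights from the current iterate
        for _ in range(inner):                 # ISTA on the weighted-l1 surrogate
            grad = A.T @ (A @ x - b)
            x = soft_threshold(x - grad / L, lam * w / L)
    return x

# toy sparse recovery problem
rng = np.random.default_rng(0)
A = rng.normal(size=(60, 200))
x_true = np.zeros(200)
x_true[[5, 50, 120]] = [1.0, -2.0, 1.5]
b = A @ x_true + 0.01 * rng.normal(size=60)
x_hat = irl1_lp(A, b)
print(np.flatnonzero(np.abs(x_hat) > 0.1))
```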
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Risk-Aware Accelerated Wireless Federated Learning with Heterogeneous + Clients + + +
+ Wireless Federated Learning (FL) is an emerging distributed machine learning +paradigm, particularly gaining momentum in domains with confidential and +private data on mobile clients. However, the location-dependent performance, in +terms of transmission rates and susceptibility to transmission errors, poses +major challenges for wireless FL's convergence speed and accuracy. The +challenge is more acute for hostile environments without a metric that +authenticates the data quality and security profile of the clients. In this +context, this paper proposes a novel risk-aware accelerated FL framework that +accounts for the clients heterogeneity in the amount of possessed data, +transmission rates, transmission errors, and trustworthiness. Classifying +clients according to their location-dependent performance and trustworthiness +profiles, we propose a dynamic risk-aware global model aggregation scheme that +allows clients to participate in descending order of their transmission rates +and an ascending trustworthiness constraint. In particular, the transmission +rate is the dominant participation criterion for initial rounds to accelerate +the convergence speed. Our model then progressively relaxes the transmission +rate restriction to explore more training data at cell-edge clients. The +aggregation rounds incorporate a debiasing factor that accounts for +transmission errors. Risk-awareness is enabled by a validation set, where the +base station eliminates non-trustworthy clients at the fine-tuning stage. The +proposed scheme is benchmarked against a conservative scheme (i.e., only +allowing trustworthy devices) and an aggressive scheme (i.e., oblivious to the +trust metric). The numerical results highlight the superiority of the proposed +scheme in terms of accuracy and convergence speed when compared to both +benchmarks. + +
+
+
+
+
+ + ☆ MSHyper: Multi-Scale Hypergraph Transformer for Long-Range Time Series + Forecasting + + +
+ Demystifying interactions between temporal patterns of different scales is +fundamental to precise long-range time series forecasting. However, previous +works lack the ability to model high-order interactions. To promote more +comprehensive pattern interaction modeling for long-range time series +forecasting, we propose a Multi-Scale Hypergraph Transformer (MSHyper) +framework. Specifically, a multi-scale hypergraph is introduced to provide +foundations for modeling high-order pattern interactions. Then by treating +hyperedges as nodes, we also build a hyperedge graph to enhance hypergraph +modeling. In addition, a tri-stage message passing mechanism is introduced to +aggregate pattern information and learn the interaction strength between +temporal patterns of different scales. Extensive experiments on five real-world +datasets demonstrate that MSHyper achieves state-of-the-art performance, +reducing prediction errors by an average of 8.73% and 7.15% over the best +baseline in MSE and MAE, respectively. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ A First-Order Multi-Gradient Algorithm for Multi-Objective Bi-Level + Optimization + + +
+ In this paper, we study the Multi-Objective Bi-Level Optimization (MOBLO)
+problem, where the upper-level subproblem is a multi-objective optimization
+problem and the lower-level subproblem is a scalar optimization problem.
+Existing gradient-based MOBLO algorithms need to compute the Hessian matrix,
+which causes computational inefficiency. To address this, we propose an
+efficient first-order multi-gradient method for MOBLO, called FORUM.
+Specifically, we reformulate the MOBLO problem as a constrained multi-objective
+optimization (MOO) problem via the value-function approach. Then we propose a
+novel multi-gradient aggregation method to solve the challenging constrained
+MOO problem. Theoretically, we provide a complexity analysis to show the
+efficiency of the proposed method and a non-asymptotic convergence result.
+Empirically, extensive experiments demonstrate the effectiveness and efficiency
+of the proposed FORUM method on different learning problems. In particular, it
+achieves state-of-the-art performance on three multi-task learning benchmark
+datasets.
+ 
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ 3D Scene Geometry Estimation from 360$^\circ$ Imagery: A Survey + + +
+ This paper provides a comprehensive survey on pioneer and state-of-the-art 3D +scene geometry estimation methodologies based on single, two, or multiple +images captured under the omnidirectional optics. We first revisit the basic +concepts of the spherical camera model, and review the most common acquisition +technologies and representation formats suitable for omnidirectional (also +called 360$^\circ$, spherical or panoramic) images and videos. We then survey +monocular layout and depth inference approaches, highlighting the recent +advances in learning-based solutions suited for spherical data. The classical +stereo matching is then revised on the spherical domain, where methodologies +for detecting and describing sparse and dense features become crucial. The +stereo matching concepts are then extrapolated for multiple view camera setups, +categorizing them among light fields, multi-view stereo, and structure from +motion (or visual simultaneous localization and mapping). We also compile and +discuss commonly adopted datasets and figures of merit indicated for each +purpose and list recent results for completeness. We conclude this paper by +pointing out current and future trends. + +
+
+ comment: Published in ACM Computing Surveys +
+
+
+
+
+ + ☆ Bridging the Gap Between General and Down-Closed Convex Sets in + Submodular Maximization + + +
+ Optimization of DR-submodular functions has experienced a notable surge in +significance in recent times, marking a pivotal development within the domain +of non-convex optimization. Motivated by real-world scenarios, some recent +works have delved into the maximization of non-monotone DR-submodular functions +over general (not necessarily down-closed) convex set constraints. Up to this +point, these works have all used the minimum $\ell_\infty$ norm of any feasible +solution as a parameter. Unfortunately, a recent hardness result due to Mualem +\& Feldman~\cite{mualem2023resolving} shows that this approach cannot yield a +smooth interpolation between down-closed and non-down-closed constraints. In +this work, we suggest novel offline and online algorithms that provably provide +such an interpolation based on a natural decomposition of the convex body +constraint into two distinct convex bodies: a down-closed convex body and a +general convex body. We also empirically demonstrate the superiority of our +proposed algorithms across three offline and two online applications. + +
+
+
+
+
+ + ☆ DiffClone: Enhanced Behaviour Cloning in Robotics with Diffusion-Driven + Policy Learning NeurIPS 2023 + + +
+ Robot learning tasks are extremely compute-intensive and hardware-specific. +Thus, tackling these challenges with a diverse dataset of +offline demonstrations that can be used to train robot manipulation agents is +very appealing. The Train-Offline-Test-Online (TOTO) Benchmark provides a +well-curated open-source dataset for offline training, comprised mostly of +expert data, and also benchmark scores of common offline-RL and behaviour +cloning agents. In this paper, we introduce DiffClone, an offline algorithm that +enhances behaviour cloning agents with diffusion-based policy learning, and +measure the efficacy of our method on real online physical robots at test +time. This is also our official submission to the Train-Offline-Test-Online +(TOTO) Benchmark Challenge organized at NeurIPS 2023. We experimented with both +pre-trained visual representations and agent policies. In our experiments, we +find that a MoCo-finetuned ResNet50 performs the best in comparison to other +finetuned representations. Goal state conditioning and mapping to transitions +resulted in a minute increase in the success rate and mean reward. As for the +agent policy, we developed DiffClone, a behaviour cloning agent improved using +conditional diffusion. +
+
+ comment: NeurIPS 2023 Train Offline Test Online Workshop and Competition +
+
+
+
+
+ + ☆ Classification and Reconstruction Processes in Deep Predictive Coding + Networks: Antagonists or Allies? + + +
+ Predictive coding-inspired deep networks for visual computing integrate +classification and reconstruction processes in shared intermediate layers. +Although synergy between these processes is commonly assumed, it has yet to be +convincingly demonstrated. In this study, we take a critical look at how +classifying and reconstructing interact in deep learning architectures. Our +approach utilizes a purposefully designed family of model architectures +reminiscent of autoencoders, each equipped with an encoder, a decoder, and a +classification head featuring varying modules and complexities. We meticulously +analyze the extent to which classification- and reconstruction-driven +information can seamlessly coexist within the shared latent layer of the model +architectures. Our findings underscore a significant challenge: +Classification-driven information diminishes reconstruction-driven information +in intermediate layers' shared representations and vice versa. While expanding +the shared representation's dimensions or increasing the network's complexity +can alleviate this trade-off effect, our results challenge prevailing +assumptions in predictive coding and offer guidance for future iterations of +predictive coding concepts in deep networks. + +
+
+
+
+
+ + ☆ A Characterization Theorem for Equivariant Networks with Point-wise + Activations ICLR 2024 + + +
+ Equivariant neural networks have shown improved performance, expressiveness +and sample complexity on symmetrical domains. But for some specific symmetries, +representations, and choice of coordinates, the most common point-wise +activations, such as ReLU, are not equivariant, hence they cannot be employed +in the design of equivariant neural networks. The theorem we present in this +paper describes all possible combinations of finite-dimensional +representations, choice of coordinates and point-wise activations to obtain an +exactly equivariant layer, generalizing and strengthening existing +characterizations. Notable cases of practical relevance are discussed as +corollaries. Indeed, we prove that rotation-equivariant networks can only be +invariant, as it happens for any network which is equivariant with respect to +connected compact groups. Then, we discuss implications of our findings when +applied to important instances of exactly equivariant networks. First, we +completely characterize permutation equivariant networks such as Invariant +Graph Networks with point-wise nonlinearities and their geometric counterparts, +highlighting a plethora of models whose expressive power and performance are +still unknown. Second, we show that feature spaces of disentangled steerable +convolutional neural networks are trivial representations. + +
+
+ comment: Accepted at the 12th International Conference on Learning + Representations (ICLR 2024) +
+
+
+
+
+ + ☆ A Real-Time Lyrics Alignment System Using Chroma And Phonetic Features + For Classical Vocal Performance ICASSP 2024 + + +
+ The goal of real-time lyrics alignment is to take live singing audio as input +and to pinpoint the exact position within given lyrics on the fly. The task can +benefit real-world applications such as the automatic subtitling of live +concerts or operas. However, designing a real-time model poses a great +challenge due to the constraints of only using past input and operating within +a minimal latency. Furthermore, due to the lack of datasets for real-time +lyrics alignment models, previous studies have mostly been evaluated on +private in-house datasets, resulting in a lack of standard evaluation methods. +This paper presents a real-time lyrics alignment system for classical vocal +performances with two contributions. First, we improve the lyrics alignment +algorithm by finding an optimal combination of the chromagram and phonetic +posteriorgram (PPG), which capture melodic and phonetic features of the singing +voice, respectively. Second, we recast the Schubert Winterreise Dataset (SWD), +which contains multiple performance renditions of the same pieces, as an +evaluation set for real-time lyrics alignment. +
+
+ comment: To Appear IEEE ICASSP 2024 +
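+ The chroma half of the feature combination mentioned above can be computed with standard tooling; the snippet below is an illustrative offline example only (the file name and hop length are placeholders, and the PPG extraction, which requires an acoustic model, is omitted):
+
+     import librosa
+
+     # 12-dimensional chromagram of the singing audio; a real-time system would
+     # compute this frame by frame over the most recent audio buffer only.
+     y, sr = librosa.load("performance.wav", sr=22050, mono=True)
+     chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=512)
+     print(chroma.shape)  # (12, n_frames): energy per pitch class per frame
+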
+
+
+
+
+ + ☆ Space and Time Continuous Physics Simulation From Partial Observations + + +
+ Modern techniques for physical simulations rely on numerical schemes and +mesh-refinement methods to address trade-offs between precision and complexity, +but these handcrafted solutions are tedious and require high computational +power. Data-driven methods based on large-scale machine learning promise high +adaptivity by integrating long-range dependencies more directly and +efficiently. In this work, we focus on fluid dynamics and address the +shortcomings of a large part of the literature, which is based on fixed +support for computations and predictions in the form of regular or irregular +grids. We propose a novel setup to perform predictions in a continuous spatial +and temporal domain while being trained on sparse observations. We formulate +the task as a double observation problem and propose a solution with two +interlinked dynamical systems defined on, respectively, the sparse positions +and the continuous domain, which allows us to forecast and interpolate a solution +from the initial condition. Our practical implementation involves recurrent +GNNs and a spatio-temporal attention observer capable of interpolating the +solution at arbitrary locations. Our model not only generalizes to new initial +conditions (as standard auto-regressive models do) but also performs evaluation +at arbitrary space and time locations. We evaluate on three standard datasets +in fluid dynamics and compare to strong baselines, which are outperformed both +in classical settings and in the extended new task requiring continuous +predictions. +
+
+ comment: Project Page: https://continuous-pde.github.io/ +
+
+
+
+
+ + ☆ GNN-LoFI: a Novel Graph Neural Network through Localized Feature-based + Histogram Intersection + + +
+ Graph neural networks are increasingly becoming the framework of choice for +graph-based machine learning. In this paper, we propose a new graph neural +network architecture that substitutes classical message passing with an +analysis of the local distribution of node features. To this end, we extract +the distribution of features in the egonet for each local neighbourhood and +compare them against a set of learned label distributions by taking the +histogram intersection kernel. The similarity information is then propagated to +other nodes in the network, effectively creating a message passing-like +mechanism where the message is determined by the ensemble of the features. We +perform an ablation study to evaluate the network's performance under different +choices of its hyper-parameters. Finally, we test our model on standard graph +classification and regression benchmarks, and we find that it outperforms +widely used alternative approaches, including both graph kernels and graph +neural networks. + +
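+ As a concrete illustration of the comparison step described above, the following is a minimal sketch (our own illustrative Python, not the authors' released code) of the histogram intersection kernel applied to an egonet feature histogram and a learned label histogram; the histograms are toy values:
+
+     import numpy as np
+
+     def histogram_intersection(h1, h2):
+         # Similarity of two (normalised) histograms: sum of element-wise minima.
+         return np.minimum(h1, h2).sum()
+
+     egonet_hist = np.array([0.2, 0.5, 0.3])   # feature histogram of a node's egonet
+     label_hist = np.array([0.1, 0.6, 0.3])    # one learned label distribution
+     print(histogram_intersection(egonet_hist, label_hist))  # 0.9
+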
+
+
+
+
+ + ☆ Preparing Lessons for Progressive Training on Language Models + + +
+ The rapid progress of Transformers in artificial intelligence has come at the +cost of increased resource consumption and greenhouse gas emissions due to +growing model sizes. Prior work suggests using pretrained small models to +improve training efficiency, but this approach may not be suitable for new +model structures. On the other hand, training from scratch can be slow, and +progressively stacking layers often fails to achieve significant acceleration. +To address these challenges, we propose a novel method called Apollo, which +prep\textbf{a}res lessons for ex\textbf{p}anding \textbf{o}perations by +\textbf{l}earning high-\textbf{l}ayer functi\textbf{o}nality during training of +low layers. Our approach involves low-value-prioritized sampling (LVPS) to +train different depths and weight sharing to facilitate efficient expansion. We +also introduce an interpolation method for stable model depth extension. +Experiments demonstrate that Apollo achieves state-of-the-art acceleration +ratios, even rivaling methods using pretrained models, making it a universal +and efficient solution for training deep models while reducing time, financial, +and environmental costs. + +
+
+
+
+
+ + ☆ An Optimal Transport Approach for Computing Adversarial Training Lower + Bounds in Multiclass Classification + + +
+ Despite the success of deep learning-based algorithms, it is widely known +that neural networks may fail to be robust. A popular paradigm to enforce +robustness is adversarial training (AT), however, this introduces many +computational and theoretical difficulties. Recent works have developed a +connection between AT in the multiclass classification setting and +multimarginal optimal transport (MOT), unlocking a new set of tools to study +this problem. In this paper, we leverage the MOT connection to propose +computationally tractable numerical algorithms for computing universal lower +bounds on the optimal adversarial risk and identifying optimal classifiers. We +propose two main algorithms based on linear programming (LP) and entropic +regularization (Sinkhorn). Our key insight is that one can harmlessly truncate +the higher order interactions between classes, preventing the combinatorial run +times typically encountered in MOT problems. We validate these results with +experiments on MNIST and CIFAR-$10$, which demonstrate the tractability of our +approach. + +
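+ For reference, the entropic-regularization route mentioned above builds on standard Sinkhorn iterations. The sketch below shows only the classical two-marginal case (the paper works with multimarginal transport and truncated higher-order interactions, which are not reproduced here); the cost matrix and histograms are toy values:
+
+     import numpy as np
+
+     def sinkhorn(a, b, C, eps=0.1, n_iters=200):
+         # Entropic-regularised OT between histograms a and b with cost matrix C.
+         K = np.exp(-C / eps)
+         u = np.ones_like(a)
+         for _ in range(n_iters):
+             v = b / (K.T @ u)
+             u = a / (K @ v)
+         P = u[:, None] * K * v[None, :]   # transport plan
+         return (P * C).sum()              # transport cost under the plan
+
+     a = np.array([0.5, 0.5])
+     b = np.array([0.5, 0.5])
+     C = np.array([[0.0, 1.0], [1.0, 0.0]])
+     print(sinkhorn(a, b, C))
+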
+
+
+
+
+ + ☆ Exploring the Role of Convolutional Neural Networks (CNN) in Dental + Radiography Segmentation: A Comprehensive Systematic Literature Review + + +
+ In the field of dentistry, there is a growing demand for increased precision +in diagnostic tools, with a specific focus on advanced imaging techniques such +as computed tomography, cone beam computed tomography, magnetic resonance +imaging, ultrasound, and traditional intra-oral periapical X-rays. Deep +learning has emerged as a pivotal tool in this context, enabling the +implementation of automated segmentation techniques crucial for extracting +essential diagnostic data. This integration of cutting-edge technology +addresses the urgent need for effective management of dental conditions, which, +if left undetected, can have a significant impact on human health. The +impressive track record of deep learning across various domains, including +dentistry, underscores its potential to revolutionize early detection and +treatment of oral health issues. Objective: Having demonstrated significant +results in diagnosis and prediction, deep convolutional neural networks (CNNs) +represent an emerging field of multidisciplinary research. The goals of this +study were to provide a concise overview of the state of the art, standardize +the current debate, and establish baselines for future research. Method: In +this study, a systematic literature review is employed as a methodology to +identify and select relevant studies that specifically investigate the deep +learning technique for dental imaging analysis. This study elucidates the +methodological approach, including the systematic collection of data, +statistical analysis, and subsequent dissemination of outcomes. Conclusion: +This work demonstrates how Convolutional Neural Networks (CNNs) can be employed +to analyze images, serving as effective tools for detecting dental pathologies. +Although this research acknowledged some limitations, CNNs utilized for +segmenting and categorizing teeth exhibited their highest level of performance +overall. + +
+
+
+
+
+ + ☆ A Two-Scale Complexity Measure for Deep Learning Models + + +
+ We introduce a novel capacity measure 2sED for statistical models based on +the effective dimension. The new quantity provably bounds the generalization +error under mild assumptions on the model. Furthermore, simulations on standard +data sets and popular model architectures show that 2sED correlates well with +the training error. For Markovian models, we show how to efficiently +approximate 2sED from below through a layerwise iterative approach, which +allows us to tackle deep learning models with a large number of parameters. +Simulation results suggest that the approximation is good for different +prominent models and data sets. + +
+
+
+
+
+ + ☆ Beyond Anti-Forgetting: Multimodal Continual Instruction Tuning with + Positive Forward Transfer + + +
+ Multimodal Continual Instruction Tuning (MCIT) enables Multimodal Large +Language Models (MLLMs) to meet continuously emerging requirements without +expensive retraining. MCIT faces two major obstacles: catastrophic forgetting +(where old knowledge is forgotten) and negative forward transfer (where the +performance of future tasks is degraded). Although existing methods have +greatly alleviated catastrophic forgetting, they still suffer from negative +forward transfer. By performing singular value decomposition (SVD) on input +embeddings, we discover a large discrepancy in different input embeddings. The +discrepancy results in the model learning irrelevant information for old and +pre-trained tasks, which leads to catastrophic forgetting and negative forward +transfer. To address these issues, we propose Fwd-Prompt, a prompt-based method +projecting prompt gradient to the residual space to minimize the interference +between tasks and to the pre-trained subspace for reusing pre-trained +knowledge. Our experiments demonstrate that Fwd-Prompt achieves +state-of-the-art performance while updating fewer parameters and requiring no +old samples. Our research sheds light on the potential of continuously adapting +MLLMs to new tasks under the instruction tuning paradigm and encourages future +studies to explore MCIT. The code will soon be publicly available. + +
+
+
+
+
+ + ☆ Unsupervised Multiple Domain Translation through Controlled + Disentanglement in Variational Autoencoder + + +
+ Unsupervised Multiple Domain Translation is the task of transforming data +from one domain to other domains without having paired data to train the +systems. Typically, methods based on Generative Adversarial Networks (GANs) are +used to address this task. However, our proposal exclusively relies on a +modified version of a Variational Autoencoder. This modification consists of +the use of two latent variables disentangled in a controlled way by design. One +of these latent variables is constrained to depend exclusively on the domain, while +the other one must depend on the rest of the variability factors of the data. +Additionally, the conditions imposed over the domain latent variable allow for +better control and understanding of the latent space. We empirically +demonstrate that our approach works on different vision datasets, improving over +other well-known methods. Finally, we prove that, indeed, one of +the latent variables stores all the information related to the domain and the +other one hardly contains any domain information. +
+
+
+
+
+ + ☆ ADCNet: a unified framework for predicting the activity of antibody-drug + conjugates + + +
+ Antibody-drug conjugates (ADCs) have revolutionized the field of cancer +treatment in the era of precision medicine due to their ability to precisely +target cancer cells and release highly effective drugs. Nevertheless, the +realization of rational design of ADCs is very difficult because the +relationship between their structures and activities is difficult to +understand. In the present study, we introduce a unified deep learning +framework called ADCNet to help design potential ADCs. The ADCNet highly +integrates the protein representation learning language model ESM-2 and +small-molecule representation learning language model FG-BERT models to achieve +activity prediction through learning meaningful features from antigen and +antibody protein sequences of ADC, SMILES strings of linker and payload, and +drug-antibody ratio (DAR) value. Based on a carefully designed and manually +tailored ADC data set, extensive evaluation results reveal that ADCNet performs +best on the test set compared to baseline machine learning models across all +evaluation metrics. For example, it achieves an average prediction accuracy of +87.12%, a balanced accuracy of 0.8689, and an area under the receiver operating +characteristic curve of 0.9293 on the test set. In addition, cross-validation, +ablation experiments, and external independent testing results further prove +the stability, advancement, and robustness of the ADCNet architecture. For the +convenience of the community, we develop the first online platform +(https://ADCNet.idruglab.cn) for the prediction of ADC activity based on the +optimal ADCNet model, and the source code is publicly available at +https://github.com/idrugLab/ADCNet. +
+
+
+
+
+ + ☆ Asynchronous Local-SGD Training for Language Modeling + + +
+ Local stochastic gradient descent (Local-SGD), also referred to as federated +averaging, is an approach to distributed optimization where each device +performs more than one SGD update per communication. This work presents an +empirical study of {\it asynchronous} Local-SGD for training language models; +that is, each worker updates the global parameters as soon as it has finished +its SGD steps. We conduct a comprehensive investigation by examining how worker +hardware heterogeneity, model size, number of workers, and optimizer could +impact the learning performance. We find that with naive implementations, +asynchronous Local-SGD takes more iterations to converge than its synchronous +counterpart despite updating the (global) model parameters more frequently. We +identify momentum acceleration on the global parameters when worker gradients +are stale as a key challenge. We propose a novel method that utilizes a delayed +Nesterov momentum update and adjusts the workers' local training steps based on +their computation speed. This approach, evaluated with models up to 150M +parameters on the C4 dataset, matches the performance of synchronous Local-SGD +in terms of perplexity per update step, and significantly surpasses it in terms +of wall clock time. + +
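+ A bare-bones sketch of the server-side view described above (our simplification: a single NumPy parameter vector and a plain Nesterov-style momentum step applied to each arriving pseudo-gradient; the paper's delay-aware momentum schedule and per-worker step adjustment are not reproduced):
+
+     import numpy as np
+
+     class AsyncServer:
+         def __init__(self, params, lr=0.1, beta=0.9):
+             self.params = params
+             self.m = np.zeros_like(params)
+             self.lr = lr
+             self.beta = beta
+
+         def apply(self, pseudo_grad):
+             # pseudo_grad = global_params_sent_to_worker - params_after_local_sgd
+             self.m = self.beta * self.m + pseudo_grad
+             lookahead = self.beta * self.m + pseudo_grad   # Nesterov-style correction
+             self.params = self.params - self.lr * lookahead
+             return self.params
+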
+
+
+
+
+ + ☆ Understanding Heterophily for Graph Neural Networks + + +
+ Graphs with heterophily have been regarded as challenging scenarios for Graph +Neural Networks (GNNs), where nodes are connected with dissimilar neighbors +through various patterns. In this paper, we present theoretical understandings +of the impacts of different heterophily patterns for GNNs by incorporating the +graph convolution (GC) operations into fully connected networks via the +proposed Heterophilous Stochastic Block Models (HSBM), a general random graph +model that can accommodate diverse heterophily patterns. Firstly, we show that +by applying a GC operation, the separability gains are determined by two +factors, i.e., the Euclidean distance of the neighborhood distributions and +$\sqrt{\mathbb{E}\left[\operatorname{deg}\right]}$, where +$\mathbb{E}\left[\operatorname{deg}\right]$ is the averaged node degree. It +reveals that the impact of heterophily on classification needs to be evaluated +alongside the averaged node degree. Secondly, we show that the topological +noise has a detrimental impact on separability, which is equivalent to +degrading $\mathbb{E}\left[\operatorname{deg}\right]$. Finally, when applying +multiple GC operations, we show that the separability gains are determined by +the normalized distance of the $l$-powered neighborhood distributions. It +indicates that the nodes still possess separability as $l$ goes to infinity in +a wide range of regimes. Extensive experiments on both synthetic and real-world +data verify the effectiveness of our theory. + +
+
+
+
+
+ + ☆ RWKV-TS: Beyond Traditional Recurrent Neural Network for Time Series + Tasks + + +
+ Traditional Recurrent Neural Network (RNN) architectures, such as LSTM and +GRU, have historically held prominence in time series tasks. However, they have +recently seen a decline in their dominant position across various time series +tasks. As a result, recent advancements in time series forecasting have seen a +notable shift away from RNNs towards alternative architectures such as +Transformers, MLPs, and CNNs. To go beyond the limitations of traditional RNNs, +we design an efficient RNN-based model for time series tasks, named RWKV-TS, +with three distinctive features: (i) A novel RNN architecture characterized by +$O(L)$ time complexity and memory usage. (ii) An enhanced ability to capture +long-term sequence information compared to traditional RNNs. (iii) High +computational efficiency coupled with the capacity to scale up effectively. +Through extensive experimentation, our proposed RWKV-TS model demonstrates +competitive performance when compared to state-of-the-art Transformer-based or +CNN-based models. Notably, RWKV-TS exhibits not only comparable performance but +also demonstrates reduced latency and memory utilization. The success of +RWKV-TS encourages further exploration and innovation in leveraging RNN-based +approaches within the domain of Time Series. The combination of competitive +performance, low latency, and efficient memory usage positions RWKV-TS as a +promising avenue for future research in time series tasks. Code is available +at:\href{https://github.com/howard-hou/RWKV-TS}{ +https://github.com/howard-hou/RWKV-TS} + +
+
+ comment: 13 pages. 2 figures, 14 tables +
+
+
+
+
+ + ☆ Code Simulation Challenges for Large Language Models + + +
+ We investigate the extent to which Large Language Models (LLMs) can simulate +the execution of computer code and algorithms. We begin by looking at straight-line +programs, and show that current LLMs demonstrate poor performance even +with such simple programs -- performance rapidly degrades with the length of +code. We then investigate the ability of LLMs to simulate programs that contain +critical paths and redundant instructions. We also go beyond straight-line +program simulation with sorting algorithms and nested loops, and we show that the +computational complexity of a routine directly affects the ability of an LLM to +simulate its execution. We observe that LLMs execute instructions sequentially +and with a low error margin only for short programs or standard procedures. +LLMs' code simulation is in tension with their pattern recognition and +memorisation capabilities: on tasks where memorisation is detrimental, we +propose a novel prompting method to simulate code execution line by line. +Empirically, our new Chain of Simulation (CoSm) method improves on the standard +Chain of Thought prompting approach by avoiding the pitfalls of memorisation. +
+
+ comment: main paper (10 pages) + Appendix (11 pages) +
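+ The line-by-line simulation idea can be illustrated with a toy prompt builder (a hypothetical prompt of our own in the spirit of the description above, not the paper's CoSm template):
+
+     def line_by_line_simulation_prompt(code: str) -> str:
+         # Ask the model to track program state after every line instead of
+         # pattern-matching the whole snippet to a memorised answer.
+         return (
+             "Simulate the following program line by line.\n"
+             "After executing each line, list the current value of every variable.\n"
+             "Only after the final line, state the program's output.\n\n"
+             + code
+         )
+
+     print(line_by_line_simulation_prompt("x = 2\nfor i in range(3):\n    x += i\nprint(x)"))
+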
+
+
+
+
+ + ☆ Fixed-Budget Differentially Private Best Arm Identification ICLR 2024 + + +
+ We study best arm identification (BAI) in linear bandits in the fixed-budget +regime under differential privacy constraints, when the arm rewards are +supported on the unit interval. Given a finite budget $T$ and a privacy +parameter $\varepsilon>0$, the goal is to minimise the error probability in +finding the arm with the largest mean after $T$ sampling rounds, subject to the +constraint that the policy of the decision maker satisfies a certain {\em +$\varepsilon$-differential privacy} ($\varepsilon$-DP) constraint. We construct +a policy satisfying the $\varepsilon$-DP constraint (called {\sc DP-BAI}) by +proposing the principle of {\em maximum absolute determinants}, and derive an +upper bound on its error probability. Furthermore, we derive a minimax lower +bound on the error probability, and demonstrate that the lower and the upper +bounds decay exponentially in $T$, with exponents in the two bounds matching +order-wise in (a) the sub-optimality gaps of the arms, (b) $\varepsilon$, and +(c) the problem complexity that is expressible as the sum of two terms, one +characterising the complexity of standard fixed-budget BAI (without privacy +constraints), and the other accounting for the $\varepsilon$-DP constraint. +Additionally, we present some auxiliary results that contribute to the +derivation of the lower bound on the error probability. These results, we +posit, may be of independent interest and could prove instrumental in proving +lower bounds on error probabilities in several other bandit problems. Whereas +prior works provide results for BAI in the fixed-budget regime without privacy +constraints or in the fixed-confidence regime with privacy constraints, our +work fills the gap in the literature by providing the results for BAI in the +fixed-budget regime under the $\varepsilon$-DP constraint. + +
+
+ comment: Accepted to ICLR 2024 +
+
+
+
+
+ + ☆ Rethinking Spectral Graph Neural Networks with Spatially Adaptive + Filtering + + +
+ Whilst spectral Graph Neural Networks (GNNs) are theoretically well-founded +in the spectral domain, their practical reliance on polynomial approximation +implies a profound linkage to the spatial domain. As previous studies rarely +examine spectral GNNs from the spatial perspective, their spatial-domain +interpretability remains elusive, e.g., what information is essentially encoded +by spectral GNNs in the spatial domain? In this paper, to answer this question, +we establish a theoretical connection between spectral filtering and spatial +aggregation, unveiling an intrinsic interaction that spectral filtering +implicitly leads the original graph to an adapted new graph, explicitly +computed for spatial aggregation. Both theoretical and empirical investigations +reveal that the adapted new graph not only exhibits non-locality but also +accommodates signed edge weights to reflect label consistency between nodes. +These findings thus highlight the interpretable role of spectral GNNs in the +spatial domain and inspire us to rethink graph spectral filters beyond the +fixed-order polynomials, which neglect global information. Built upon the +theoretical findings, we revisit the state-of-the-art spectral GNNs and propose +a novel Spatially Adaptive Filtering (SAF) framework, which leverages the +adapted new graph by spectral filtering for an auxiliary non-local aggregation. +Notably, our proposed SAF comprehensively models both node similarity and +dissimilarity from a global perspective, therefore alleviating persistent +deficiencies of GNNs related to long-range dependencies and graph heterophily. +Extensive experiments over 13 node classification benchmarks demonstrate the +superiority of our proposed framework to the state-of-the-art models. + +
+
+
+
+
+ + ☆ DTMM: Deploying TinyML Models on Extremely Weak IoT Devices with Pruning + + +
+ DTMM is a library designed for efficient deployment and execution of machine +learning models on weak IoT devices such as microcontroller units (MCUs). The +motivation for designing DTMM comes from the emerging field of tiny machine +learning (TinyML), which explores extending the reach of machine learning to +many low-end IoT devices to achieve ubiquitous intelligence. Due to the weak +capability of embedded devices, it is necessary to compress models by pruning +enough weights before deploying. Although pruning has been studied extensively +on many computing platforms, two key issues with pruning methods are +exacerbated on MCUs: models need to be deeply compressed without significantly +compromising accuracy, and they should perform efficiently after pruning. +Current solutions only achieve one of these objectives, but not both. In this +paper, we find that pruned models have great potential for efficient deployment +and execution on MCUs. Therefore, we propose DTMM with pruning unit selection, +pre-execution pruning optimizations, runtime acceleration, and post-execution +low-cost storage to fill the gap for efficient deployment and execution of +pruned models. It can be integrated into commercial ML frameworks for practical +deployment, and a prototype system has been developed. Extensive experiments on +various models show promising gains compared to state-of-the-art methods. + +
+
+
+
+
+ + ☆ Towards Continual Learning Desiderata via HSIC-Bottleneck + Orthogonalization and Equiangular Embedding AAAI 2024 + + +
+ Deep neural networks are susceptible to catastrophic forgetting when trained +on sequential tasks. Various continual learning (CL) methods often rely on +exemplar buffers or/and network expansion for balancing model stability and +plasticity, which, however, compromises their practical value due to privacy +and memory concerns. Instead, this paper considers a strict yet realistic +setting, where the training data from previous tasks is unavailable and the +model size remains relatively constant during sequential training. To achieve +such desiderata, we propose a conceptually simple yet effective method that +attributes forgetting to layer-wise parameter overwriting and the resulting +decision boundary distortion. This is achieved by the synergy between two key +components: HSIC-Bottleneck Orthogonalization (HBO) implements non-overwritten +parameter updates mediated by Hilbert-Schmidt independence criterion in an +orthogonal space and EquiAngular Embedding (EAE) enhances decision boundary +adaptation between old and new tasks with predefined basis vectors. Extensive +experiments demonstrate that our method achieves competitive accuracy +performance, even with absolute superiority of zero exemplar buffer and 1.02x +the base model. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ Consistent3D: Towards Consistent High-Fidelity Text-to-3D Generation + with Deterministic Sampling Prior + + +
+ Score distillation sampling (SDS) and its variants have greatly boosted the +development of text-to-3D generation, but remain vulnerable to geometry collapse +and poor textures. To solve this issue, we first deeply analyze SDS and +find that its distillation sampling process indeed corresponds to the +trajectory sampling of a stochastic differential equation (SDE): SDS samples +along an SDE trajectory to yield a less noisy sample, which then serves as +guidance to optimize a 3D model. However, the randomness in SDE sampling often +leads to a diverse and unpredictable sample which is not always less noisy, and +thus is not consistently correct guidance, explaining the vulnerability of +SDS. Since for any SDE, there always exists an ordinary differential equation +(ODE) whose trajectory sampling can deterministically and consistently converge +to the desired target point as the SDE, we propose a novel and effective +"Consistent3D" method that explores the ODE deterministic sampling prior for +text-to-3D generation. Specifically, at each training iteration, given an image +rendered by a 3D model, we first estimate its desired 3D score function +by a pre-trained 2D diffusion model, and build an ODE for trajectory sampling. +Next, we design a consistency distillation sampling loss which samples along +the ODE trajectory to generate two adjacent samples and uses the less noisy +sample to guide another more noisy one for distilling the deterministic prior +into the 3D model. Experimental results show the efficacy of our Consistent3D +in generating high-fidelity and diverse 3D objects and large-scale scenes, as +shown in Fig. 1. The codes are available at +https://github.com/sail-sg/Consistent3D. +
+
+
+
+
+ + ☆ Data Attribution for Diffusion Models: Timestep-induced Bias in + Influence Estimation + + +
+ Data attribution methods trace model behavior back to its training dataset, +offering an effective approach to better understand ``black-box'' neural +networks. While prior research has established quantifiable links between model +output and training data in diverse settings, interpreting diffusion model +outputs in relation to training samples remains underexplored. In particular, +diffusion models operate over a sequence of timesteps instead of instantaneous +input-output relationships in previous contexts, posing a significant challenge +to extend existing frameworks to diffusion models directly. Notably, we present +Diffusion-TracIn that incorporates this temporal dynamics and observe that +samples' loss gradient norms are highly dependent on timestep. This trend leads +to a prominent bias in influence estimation, and is particularly noticeable for +samples trained on large-norm-inducing timesteps, causing them to be generally +influential. To mitigate this effect, we introduce Diffusion-ReTrac as a +re-normalized adaptation that enables the retrieval of training samples more +targeted to the test sample of interest, facilitating a localized measurement +of influence and considerably more intuitive visualization. We demonstrate the +efficacy of our approach through various evaluation metrics and auxiliary +tasks, reducing the amount of generally influential samples to $\frac{1}{3}$ of +its original quantity. + +
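+ For orientation, the first-order influence idea that Diffusion-TracIn builds on can be sketched as a single-checkpoint TracIn-style score (the timestep-aware handling and the re-normalisation of Diffusion-ReTrac are not reproduced; model, loss_fn and the batches are placeholders):
+
+     import torch
+
+     def tracin_influence(model, loss_fn, train_batch, test_batch, lr):
+         # Influence of one training example on one test example at a checkpoint:
+         # lr * <grad of training loss, grad of test loss>.
+         def flat_grad(batch):
+             x, y = batch
+             loss = loss_fn(model(x), y)
+             grads = torch.autograd.grad(loss, [p for p in model.parameters() if p.requires_grad])
+             return torch.cat([g.reshape(-1) for g in grads])
+
+         return lr * torch.dot(flat_grad(train_batch), flat_grad(test_batch)).item()
+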
+
+
+
+
+ + ☆ Residual Alignment: Uncovering the Mechanisms of Residual Networks NeurIPS 2023 + + +
+ The ResNet architecture has been widely adopted in deep learning due to its +significant boost to performance through the use of simple skip connections, +yet the underlying mechanisms leading to its success remain largely unknown. In +this paper, we conduct a thorough empirical study of the ResNet architecture in +classification tasks by linearizing its constituent residual blocks using +Residual Jacobians and measuring their singular value decompositions. Our +measurements reveal a process called Residual Alignment (RA) characterized by +four properties: + (RA1) intermediate representations of a given input are equispaced on a line, +embedded in high dimensional space, as observed by Gai and Zhang [2021]; + (RA2) top left and right singular vectors of Residual Jacobians align with +each other and across different depths; + (RA3) Residual Jacobians are at most rank C for fully-connected ResNets, +where C is the number of classes; and + (RA4) top singular values of Residual Jacobians scale inversely with depth. + RA consistently occurs in models that generalize well, in both +fully-connected and convolutional architectures, across various depths and +widths, for varying numbers of classes, on all tested benchmark datasets, but +ceases to occur once the skip connections are removed. It also provably occurs +in a novel mathematical model we propose. This phenomenon reveals a strong +alignment between residual branches of a ResNet (RA2+4), imparting a highly +rigid geometric structure to the intermediate representations as they progress +linearly through the network (RA1) up to the final layer, where they undergo +Neural Collapse. + +
+
+ comment: Accepted at NeurIPS 2023 as a Poster paper +
+
+
+
+
+ + ☆ Inductive Models for Artificial Intelligence Systems are Insufficient + without Good Explanations + + +
+ This paper discusses the limitations of machine learning (ML), particularly +deep artificial neural networks (ANNs), which are effective at approximating +complex functions but often lack transparency and explanatory power. It +highlights the `problem of induction' : the philosophical issue that past +observations may not necessarily predict future events, a challenge that ML +models face when encountering new, unseen data. The paper argues for the +importance of not just making predictions but also providing good explanations, +a feature that current models often fail to deliver. It suggests that for AI to +progress, we must seek models that offer insights and explanations, not just +predictions. + +
+
+
+
+
+ + ☆ Augmenting Math Word Problems via Iterative Question Composing + + +
+ Despite recent progress in improving the mathematical reasoning ability of +large language models (LLMs), solving competition-level math problems without +the use of external tools remains challenging for open-source LLMs. In this +work, we introduce the MMIQC dataset, a mixture of processed web data and +synthetic question-response pairs, to equip base models with better +mathematical reasoning skills. Mistral-7B-MMIQC, the model obtained by +fine-tuning Mistral-7B (arXiv:2310.06825) on MMIQC, achieves 36.0\% accuracy on +MATH (arXiv:2103.03874), 5.8\% higher than the previous (model size $\sim$7B) +SOTA. Our experiments also show that a large part of the improvement is attributable +to our novel augmentation method IQC (Iterative Question Composing), where we +iteratively ask an LLM to compose new questions from the given seed problems +and do rejection sampling from another LLM. MMIQC has now been released on +https://huggingface.co/datasets/Vivacem/MMIQC. +
+
+
+
+
+ + ☆ Continuous Time Continuous Space Homeostatic Reinforcement Learning + (CTCS-HRRL) : Towards Biological Self-Autonomous Agent + + +
+ Homeostasis is a biological process by which living beings maintain their +internal balance. Previous research suggests that homeostasis is a learned +behaviour. The recently introduced Homeostatic Regulated Reinforcement Learning +(HRRL) framework attempts to explain this learned homeostatic behaviour by +linking Drive Reduction Theory and Reinforcement Learning. This linkage has +been proven in discrete time-space, but not in continuous time-space. +In this work, we advance the HRRL framework to a continuous time-space +environment and validate the CTCS-HRRL (Continuous Time Continuous Space HRRL) +framework. We achieve this by designing a model that mimics the homeostatic +mechanisms of a real-world biological agent. This model uses the +Hamilton-Jacobi-Bellman equation, and function approximation based on neural +networks and Reinforcement Learning. Through a simulation-based experiment we +demonstrate the efficacy of this model and uncover evidence linked to the +agent's ability to dynamically choose policies that favor homeostasis in a +continuously changing internal-state milieu. The results of our experiments +demonstrate that the agent learns homeostatic behaviour in a CTCS environment, +making CTCS-HRRL a promising framework for modelling animal dynamics and +decision-making. +
+
+ comment: This work is a result of the ongoing collaboration between Cognitive + Neuroscience Lab, BITS Pilani K K Birla Goa Campus and Ecole Normale + Superieure, Paris France. This work is jointly supervised by Prof. Boris + Gutkin and Prof. Veeky Baths. arXiv admin note: substantial text overlap with + arXiv:2109.06580 +
+
+
+
+
+ + ☆ Attack and Reset for Unlearning: Exploiting Adversarial Noise toward + Machine Unlearning through Parameter Re-initialization + + +
+ With growing concerns surrounding privacy and regulatory compliance, the +concept of machine unlearning has gained prominence, aiming to selectively +forget or erase specific learned information from a trained model. In response +to this critical need, we introduce a novel approach called Attack-and-Reset +for Unlearning (ARU). This algorithm leverages meticulously crafted adversarial +noise to generate a parameter mask, effectively resetting certain parameters +and rendering them unlearnable. ARU outperforms current state-of-the-art +results on two facial machine-unlearning benchmark datasets, MUFAC and MUCAC. +In particular, we present the steps involved in attacking and masking that +strategically filter and re-initialize network parameters biased towards the +forget set. Our work represents a significant advancement in rendering data +unexploitable to deep learning models through parameter re-initialization, +achieved by harnessing adversarial noise to craft a mask. + +
+
+
+
+
+ + ☆ MicroNAS: Zero-Shot Neural Architecture Search for MCUs + + +
+ Neural Architecture Search (NAS) effectively discovers new Convolutional +Neural Network (CNN) architectures, particularly for accuracy optimization. +However, prior approaches often require resource-intensive training on super +networks or extensive architecture evaluations, limiting practical +applications. To address these challenges, we propose MicroNAS, a +hardware-aware zero-shot NAS framework designed for microcontroller units +(MCUs) in edge computing. MicroNAS considers target hardware optimality during +the search, utilizing specialized performance indicators to identify optimal +neural architectures without high computational costs. Compared to previous +works, MicroNAS achieves up to 1104x improvement in search efficiency and +discovers models with over 3.23x faster MCU inference while maintaining similar +accuracy. +
+
+
+
+
+ + ☆ Efficient Adapter Finetuning for Tail Languages in Streaming + Multilingual ASR ICASSP 2024 + + +
+ The end-to-end ASR model is often desired in the streaming multilingual +scenario since it is easier to deploy and can benefit from pre-trained speech +models such as powerful foundation models. Meanwhile, the heterogeneous nature +and imbalanced data abundance of different languages may cause performance +degradation, leading to asynchronous peak performance for different languages +during training, especially on tail ones. Sometimes even the data itself may +become unavailable as a result of enhanced privacy protection. Existing +work tends to significantly increase the model size or learn language-specific +decoders to accommodate each language separately. In this study, we explore +simple yet effective Language-Dependent Adapter (LDA) finetuning under a +cascaded Conformer transducer framework enhanced by teacher pseudo-labeling for +tail languages in streaming multilingual ASR. The adapter only accounts for +0.4% of the full model per language. It is plugged into the frozen foundation +model and is the only trainable module during the finetuning process with noisy +student training. The final model merges the adapter parameters from different +checkpoints for different languages. The model performance is validated on a +challenging multilingual dictation dataset, which includes 39 tail languages +across Latin, Greek, Arabic, etc. Our proposed method brings a 12.2% word error +rate reduction on average and up to 37.5% on a single locale. Furthermore, we +show that our parameter-efficient LDA can match the quality of full model +finetuning, thus greatly alleviating the asynchronous peak performance issue. +
+
+ comment: Accepted to ICASSP 2024 +
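+ A generic residual bottleneck adapter, of the kind the abstract describes plugging into a frozen foundation model, can be sketched as follows (layer sizes are illustrative; the exact LDA placement inside the cascaded Conformer transducer is not shown):
+
+     import torch.nn as nn
+
+     class LanguageAdapter(nn.Module):
+         # Small trainable module added per tail language; the backbone stays frozen.
+         def __init__(self, d_model=512, bottleneck=32):
+             super().__init__()
+             self.down = nn.Linear(d_model, bottleneck)
+             self.up = nn.Linear(bottleneck, d_model)
+             self.act = nn.ReLU()
+
+         def forward(self, hidden):
+             return hidden + self.up(self.act(self.down(hidden)))
+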
+
+
+
+
+ + ☆ Rigid Protein-Protein Docking via Equivariant Elliptic-Paraboloid + Interface Prediction ICLR 2024 + + +
+ The study of rigid protein-protein docking plays an essential role in a +variety of tasks such as drug design and protein engineering. Recently, several +learning-based methods have been proposed for the task, exhibiting much faster +docking speed than those computational methods. In this paper, we propose a +novel learning-based method called ElliDock, which predicts an elliptic +paraboloid to represent the protein-protein docking interface. To be specific, +our model estimates elliptic paraboloid interfaces for the two input proteins +respectively, and obtains the roto-translation transformation for docking by +making two interfaces coincide. By its design, ElliDock is independently +equivariant with respect to arbitrary rotations/translations of the proteins, +which is an indispensable property to ensure the generalization of the docking +process. Experimental evaluations show that ElliDock achieves the fastest +inference time among all compared methods and is strongly competitive with +current state-of-the-art learning-based models such as DiffDock-PP and Multimer +particularly for antibody-antigen docking. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ☆ A GAN-based data poisoning framework against anomaly detection in + vertical federated learning + + +
+ In vertical federated learning (VFL), commercial entities collaboratively +train a model while preserving data privacy. However, a malicious participant's +poisoning attack may degrade the performance of this collaborative model. The +main challenge in achieving the poisoning attack is the absence of access to +the server-side top model, leaving the malicious participant without a clear +target model. To address this challenge, we introduce an innovative end-to-end +poisoning framework P-GAN. Specifically, the malicious participant initially +employs semi-supervised learning to train a surrogate target model. +Subsequently, this participant employs a GAN-based method to produce +adversarial perturbations to degrade the surrogate target model's performance. +Finally, the generator is obtained and tailored for VFL poisoning. Besides, we +develop an anomaly detection algorithm based on a deep auto-encoder (DAE), +offering a robust defense mechanism to VFL scenarios. Through extensive +experiments, we evaluate the efficacy of P-GAN and DAE, and further analyze the +factors that influence their performance. + +
+
+ comment: 6 pages, 7 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
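+ The defence side of the abstract, flagging anomalous inputs by their reconstruction error under a deep auto-encoder, can be sketched as follows (dimensions and the threshold are placeholders; the paper's training procedure is not reproduced):
+
+     import torch
+     import torch.nn as nn
+
+     class DeepAutoEncoder(nn.Module):
+         def __init__(self, dim_in=64, dim_latent=8):
+             super().__init__()
+             self.enc = nn.Sequential(nn.Linear(dim_in, 32), nn.ReLU(), nn.Linear(32, dim_latent))
+             self.dec = nn.Sequential(nn.Linear(dim_latent, 32), nn.ReLU(), nn.Linear(32, dim_in))
+
+         def forward(self, x):
+             return self.dec(self.enc(x))
+
+     def is_anomalous(model, x, threshold):
+         # Inputs the auto-encoder reconstructs poorly are flagged as anomalous.
+         with torch.no_grad():
+             err = ((model(x) - x) ** 2).mean(dim=-1)
+         return err > threshold
+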
+
+
+
+
+ + ☆ FedLoGe: Joint Local and Generic Federated Learning under Long-tailed + Data ICLR 2024 + + +
+ Federated Long-Tailed Learning (Fed-LT), a paradigm wherein data collected +from decentralized local clients manifests a globally prevalent long-tailed +distribution, has garnered considerable attention in recent times. In the +context of Fed-LT, existing works have predominantly centered on addressing the +data imbalance issue to enhance the efficacy of the generic global model while +neglecting the performance at the local level. In contrast, conventional +Personalized Federated Learning (pFL) techniques are primarily devised to +optimize personalized local models under the presumption of a balanced global +data distribution. This paper introduces an approach termed Federated Local and +Generic Model Training in Fed-LT (FedLoGe), which enhances both local and +generic model performance through the integration of representation learning +and classifier alignment within a neural collapse framework. Our investigation +reveals the feasibility of employing a shared backbone as a foundational +framework for capturing overarching global trends, while concurrently employing +individualized classifiers to encapsulate distinct refinements stemming from +each client's local features. Building upon this discovery, we establish the +Static Sparse Equiangular Tight Frame Classifier (SSE-C), inspired by neural +collapse principles that naturally prune extraneous noisy features and foster +the acquisition of potent data representations. Furthermore, leveraging +insights from imbalance neural collapse's classifier norm patterns, we develop +Global and Local Adaptive Feature Realignment (GLA-FR) via an auxiliary global +classifier and personalized Euclidean norm transfer to align global features +with client preferences. Extensive experimental results on CIFAR-10/100-LT, +ImageNet, and iNaturalist demonstrate the advantage of our method over +state-of-the-art pFL and Fed-LT approaches. + +
+
+ comment: Accepted by ICLR 2024 +
+
+
+
+
+ + ☆ ACT-GAN: Radio map construction based on generative adversarial networks + with ACT blocks + + +
+ The radio map, serving as a visual representation of electromagnetic spatial +characteristics, plays a pivotal role in assessment of wireless communication +networks and radio monitoring coverage. Addressing the issue of low accuracy +existing in the current radio map construction, this paper presents a novel +radio map construction method based on generative adversarial network (GAN) in +which the Aggregated Contextual-Transformation (AOT) block, Convolutional Block +Attention Module (CBAM), and Transposed Convolution (T-Conv) block are applied +to the generator, and we name it as ACT-GAN. It significantly improves the +reconstruction accuracy and local texture of the radio maps. The performance of +ACT-GAN across three different scenarios is demonstrated. Experiment results +reveal that in the scenario without sparse discrete observations, the proposed +method reduces the root mean square error (RMSE) by 14.6% in comparison to the +state-of-the-art models. In the scenario with sparse discrete observations, the +RMSE is diminished by 13.2%. Furthermore, the predictive results of the +proposed model show a more lucid representation of electromagnetic spatial +field distribution. To verify the universality of this model in radio map +construction tasks, the scenario of unknown radio emission source is +investigated. The results indicate that the proposed model is robust radio map +construction and accurate in predicting the location of the emission source. + +
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ DOO-RE: A dataset of ambient sensors in a meeting room for activity + recognition + + +
+ With the advancement of IoT technology, recognizing user activities with +machine learning methods is a promising way to provide various smart services +to users. High-quality data with privacy protection is essential for deploying +such services in the real world. Data streams from surrounding ambient sensors +are well suited to the requirement. Existing ambient sensor datasets only +support constrained private spaces and those for public spaces have yet to be +explored despite growing interest in research on them. To meet this need, we +build a dataset collected from a meeting room equipped with ambient sensors. +The dataset, DOO-RE, includes data streams from various ambient sensor types +such as Sound and Projector. Each sensor data stream is segmented into activity +units and multiple annotators provide activity labels through a +cross-validation annotation process to improve annotation quality. We finally +obtain 9 types of activities. To our best knowledge, DOO-RE is the first +dataset to support the recognition of both single and group activities in a +real meeting room with reliable annotations. + +
+
+
+
+
+ + ☆ Cascading Reinforcement Learning + + +
+ Cascading bandits have gained popularity in recent years due to their +applicability to recommendation systems and online advertising. In the +cascading bandit model, at each timestep, an agent recommends an ordered subset +of items (called an item list) from a pool of items, each associated with an +unknown attraction probability. Then, the user examines the list, and clicks +the first attractive item (if any), and after that, the agent receives a +reward. The goal of the agent is to maximize the expected cumulative reward. +However, the prior literature on cascading bandits ignores the influences of +user states (e.g., historical behaviors) on recommendations and the change of +states as the session proceeds. Motivated by this fact, we propose a +generalized cascading RL framework, which considers the impact of user states +and state transition into decisions. In cascading RL, we need to select items +not only with large attraction probabilities but also leading to good successor +states. This imposes a huge computational challenge due to the combinatorial +action space. To tackle this challenge, we delve into the properties of value +functions, and design an oracle BestPerm to efficiently find the optimal item +list. Equipped with BestPerm, we develop two algorithms CascadingVI and +CascadingBPI, which are both computationally-efficient and sample-efficient, +and provide near-optimal regret and sample complexity guarantees. Furthermore, +we present experiments to show the improved computational and sample +efficiencies of our algorithms compared to straightforward adaptations of +existing RL algorithms in practice. + +
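+ The click model described at the start of the abstract can be simulated in a few lines (a textbook cascading-bandit feedback loop; the user-state transitions that cascading RL adds on top are not modelled here, and the items and probabilities are toy values):
+
+     import random
+
+     def cascade_feedback(item_list, attraction_prob):
+         # The user scans the list top-down and clicks the first attractive item.
+         for pos, item in enumerate(item_list):
+             if random.random() < attraction_prob[item]:
+                 return pos, 1.0   # position of the click and reward
+         return None, 0.0          # no item was attractive
+
+     probs = {"a": 0.1, "b": 0.6, "c": 0.3}
+     print(cascade_feedback(["b", "a", "c"], probs))
+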
+
+
+
+
+ + ☆ Towards Off-Policy Reinforcement Learning for Ranking Policies with + Human Feedback + + +
+ Probabilistic learning to rank (LTR) has been the dominant approach for +optimizing the ranking metric, but cannot maximize long-term rewards. +Reinforcement learning models have been proposed to maximize user long-term +rewards by formulating the recommendation as a sequential decision-making +problem, but could only achieve inferior accuracy compared to LTR counterparts, +primarily due to the lack of online interactions and the characteristics of +ranking. In this paper, we propose a new off-policy value ranking (VR) +algorithm that can simultaneously maximize user long-term rewards and optimize +the ranking metric offline for improved sample efficiency in a unified +Expectation-Maximization (EM) framework. We theoretically and empirically show +that the EM process guides the learned policy to enjoy the benefit of +integrating the future reward and the ranking metric, and to learn without any +online interactions. Extensive offline and online experiments demonstrate the +effectiveness of our methods. +
+
+
+
+
+ + ☆ AntiPhishStack: LSTM-based Stacked Generalization Model for Optimized + Phishing URLs Detection + + +
+ The escalating reliance on revolutionary online web services has introduced +heightened security risks, with persistent challenges posed by phishing despite +extensive security measures. Traditional phishing systems, reliant on machine +learning and manual features, struggle with evolving tactics. Recent advances +in deep learning offer promising avenues for tackling novel phishing challenges +and malicious URLs. This paper introduces a two-phase stack generalized model +named AntiPhishStack, designed to detect phishing sites. The model leverages +the learning of URLs and character-level TF-IDF features symmetrically, +enhancing its ability to combat emerging phishing threats. In Phase I, features +are trained on a base machine learning classifier, employing K-fold +cross-validation for robust mean prediction. Phase II employs a two-layered +stacked-based LSTM network with five adaptive optimizers for dynamic +compilation, ensuring premier prediction on these features. Additionally, the +symmetrical predictions from both phases are optimized and integrated to train +a meta-XGBoost classifier, contributing to a final robust prediction. The +significance of this work lies in advancing phishing detection with +AntiPhishStack, operating without prior phishing-specific feature knowledge. +Experimental validation on two benchmark datasets, comprising benign and +phishing or malicious URLs, demonstrates the model's exceptional performance, +achieving a notable 96.04% accuracy compared to existing studies. This research +adds value to the ongoing discourse on symmetry and asymmetry in information +security and provides a forward-thinking solution for enhancing network +security in the face of evolving cyber threats. + +
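+ As a hedged illustration of the stacked-generalization idea described above (not the authors' exact AntiPhishStack architecture: a logistic-regression base learner and a gradient-boosting meta-learner stand in for the paper's base classifiers, stacked LSTM, and meta-XGBoost):
+
+  import numpy as np
+  from sklearn.feature_extraction.text import TfidfVectorizer
+  from sklearn.linear_model import LogisticRegression
+  from sklearn.ensemble import GradientBoostingClassifier
+  from sklearn.model_selection import cross_val_predict
+
+  # Toy URLs labelled benign (0) or phishing (1); real data would be much larger.
+  urls = ["http://example.com/login", "http://paypa1-secure.xyz/verify",
+          "https://github.com/explore", "http://free-prizes.win/claim"]
+  labels = np.array([0, 1, 0, 1])
+
+  # Character-level TF-IDF features of the raw URLs, as in the abstract.
+  X = TfidfVectorizer(analyzer="char", ngram_range=(1, 3)).fit_transform(urls)
+
+  # Phase I stand-in: K-fold out-of-fold probabilities from a base learner.
+  phase1 = cross_val_predict(LogisticRegression(max_iter=1000), X, labels,
+                             cv=2, method="predict_proba")
+
+  # Phase II in the paper is a stacked LSTM; here the meta-learner simply
+  # combines the Phase I predictions with the original features.
+  meta_features = np.hstack([phase1, X.toarray()])
+  meta = GradientBoostingClassifier().fit(meta_features, labels)
+  print(meta.predict(meta_features))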
+
+
+
+
+ + ☆ CEL: A Continual Learning Model for Disease Outbreak Prediction by + Leveraging Domain Adaptation via Elastic Weight Consolidation + + +
+ Continual learning, the ability of a model to learn over time without +forgetting previous knowledge and, therefore, be adaptive to new data, is +paramount in dynamic fields such as disease outbreak prediction. Deep neural +networks, i.e., LSTM, are prone to error due to catastrophic forgetting. This +study introduces a novel CEL model for continual learning by leveraging domain +adaptation via Elastic Weight Consolidation (EWC). This model aims to mitigate +the catastrophic forgetting phenomenon in a domain incremental setting. The +Fisher Information Matrix (FIM) is constructed with EWC to develop a +regularization term that penalizes changes to important parameters, namely, the +important previous knowledge. CEL's performance is evaluated on three distinct +diseases, Influenza, Mpox, and Measles, with different metrics. The high +R-squared values during evaluation and reevaluation outperform the other +state-of-the-art models in several contexts, indicating that CEL adapts to +incremental data well. CEL's robustness and reliability are underscored by its +minimal 65% forgetting rate and 18% higher memory stability compared to +existing benchmark studies. This study highlights CEL's versatility in disease +outbreak prediction, addressing evolving data with temporal patterns. It offers +a valuable model for proactive disease control with accurate, timely +predictions. + +
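+ A minimal PyTorch sketch of the EWC regularization term described above; the diagonal Fisher estimate and stored old-task parameters are assumed to be dictionaries keyed by parameter name, and the weighting constant is a placeholder rather than the paper's setting.
+
+  import torch
+
+  def ewc_penalty(model, fisher_diag, old_params, lam=1000.0):
+      # EWC: (lam / 2) * sum_i F_i * (theta_i - theta_old_i)^2, where F_i is
+      # the diagonal Fisher information estimated on previously seen data.
+      penalty = 0.0
+      for name, p in model.named_parameters():
+          penalty = penalty + (fisher_diag[name] * (p - old_params[name]) ** 2).sum()
+      return 0.5 * lam * penalty
+
+  # Training-loop usage sketch:
+  #   loss = task_loss(model(x), y) + ewc_penalty(model, fisher_diag, old_params)
+  #   loss.backward(); optimizer.step()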
+
+
+
+
+ + ☆ DeLF: Designing Learning Environments with Foundation Models AAAI 2024 + + +
+ Reinforcement learning (RL) offers a capable and intuitive structure for the +fundamental sequential decision-making problem. Despite impressive +breakthroughs, it can still be difficult to employ RL in practice in many +simple applications. In this paper, we try to address this issue by introducing +a method for designing the components of the RL environment for a given, +user-intended application. We provide an initial formalization for the problem +of RL component design, that concentrates on designing a good representation +for observation and action space. We propose a method named DeLF: Designing +Learning Environments with Foundation Models, that employs large language +models to design and codify the user's intended learning scenario. By testing +our method on four different learning environments, we demonstrate that DeLF +can obtain executable environment codes for the corresponding RL problems. + +
+
+ comment: AAAI 2024 Workshop on Synergy of Reinforcement Learning and Large + Language Models +
+
+
+
+
+ + ☆ Partial Diacritization: A Context-Contrastive Inference Approach + + +
+ Diacritization plays a pivotal role in improving readability and +disambiguating the meaning of Arabic texts. Efforts have so far focused on +marking every eligible character (Full Diacritization). Comparatively +overlooked, Partial Diacritization (PD) is the selection of a subset of +characters to be marked to aid comprehension where needed. Research has +indicated that excessive diacritic marks can hinder skilled readers--reducing +reading speed and accuracy. We conduct a behavioral experiment and show that +partially marked text is often easier to read than fully marked text, and +sometimes easier than plain text. In this light, we introduce +Context-Contrastive Partial Diacritization (CCPD)--a novel approach to PD which +integrates seamlessly with existing Arabic diacritization systems. CCPD +processes each word twice, once with context and once without, and diacritizes +only the characters with disparities between the two inferences. Further, we +introduce novel indicators for measuring partial diacritization quality (SR, +PDER, HDER, ERE), essential for establishing this as a machine learning task. +Lastly, we introduce TD2, a Transformer-variant of an established model which +offers a markedly different performance profile on our proposed indicators +compared to all other known systems. + +
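+ A simplified, word-level sketch of the context-contrastive rule described above; the paper operates at the character level, and `diacritize` here is a placeholder for any existing Arabic diacritization system, so this is an illustration rather than the released CCPD implementation.
+
+  def ccpd(words, diacritize):
+      # Run the underlying diacritizer twice: once on the full sentence
+      # (with context) and once on each word in isolation (without context).
+      with_context = diacritize(words)
+      without_context = [diacritize([w])[0] for w in words]
+      output = []
+      for plain, ctx_form, iso_form in zip(words, with_context, without_context):
+          # Keep the marks only where the two inferences disagree; elsewhere
+          # the undiacritized word is assumed to be readable as-is.
+          output.append(ctx_form if ctx_form != iso_form else plain)
+      return output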
+
+ comment: 13 equations, 5 tables, 5 figures +
+
+
+
+
+ + ☆ Characterising Gradients for Unsupervised Accuracy Estimation under + Distribution Shift + + +
+ Estimating test accuracy without access to the ground-truth test labels under +varying test environments is a challenging, yet extremely important problem in +the safe deployment of machine learning algorithms. Existing works rely on the +information from either the outputs or the extracted features of neural +networks to formulate an estimation score correlating with the ground-truth +test accuracy. In this paper, we investigate--both empirically and +theoretically--how the information provided by the gradients can be predictive +of the ground-truth test accuracy even under a distribution shift. +Specifically, we use the norm of classification-layer gradients, backpropagated +from the cross-entropy loss after only one gradient step over test data. Our +key idea is that the model should be adjusted with a higher magnitude of +gradients when it does not generalize to the test dataset with a distribution +shift. We provide theoretical insights highlighting the main ingredients of +such an approach ensuring its empirical success. Extensive experiments +conducted on diverse distribution shifts and model structures demonstrate that +our method significantly outperforms state-of-the-art algorithms. + +
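+ A hedged PyTorch sketch of the gradient-based score described above; using the model's own hard predictions as targets and indexing the final linear layer this way are assumptions of the illustration, not necessarily the authors' exact protocol.
+
+  import torch
+  import torch.nn.functional as F
+
+  def grad_norm_score(model, test_batch):
+      model.zero_grad()
+      logits = model(test_batch)
+      # Ground-truth test labels are unavailable, so back-propagate the
+      # cross-entropy loss against the model's own predictions (assumption).
+      F.cross_entropy(logits, logits.argmax(dim=1)).backward()
+      # Norm of the classification-layer gradients after this single step.
+      w = list(model.parameters())[-2]   # assumed to be the final linear weight
+      return w.grad.norm().item()
+
+  # Larger scores suggest the model would need bigger adjustments to fit the
+  # shifted test distribution, i.e. lower expected test accuracy.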
+
+
+
+
+ + ☆ Herding LLaMaS: Using LLMs as an OS Module ASPLOS 2023 + + +
+ Computer systems are becoming increasingly heterogeneous with the emergence +of new memory technologies and compute devices. GPUs alongside CPUs have become +commonplace and CXL is poised to be a mainstay of cloud systems. The operating +system is responsible for managing these hardware resources, requiring +modification every time a new device is released. Years of research and +development are sunk into tuning the OS for high performance with each new +heterogeneous device. With the recent explosion in memory technologies and +domain-specific accelerators, it would be beneficial to have an OS that could +provide high performance for new devices without significant effort. + We propose LLaMaS, which can adapt to new devices easily. LLaMaS uses Large +Language Models (LLMs) to extract the useful features of new devices from their +textual description and uses these features to make operating system decisions +at runtime. Adding support to LLaMaS for a new device is as simple as +describing the system and new device properties in plaintext. + LLaMaS reduces the burden on system administrators to enable easy integration +of new devices into production systems. + Preliminary evaluation using ChatGPT shows that LLMs are capable of +extracting device features from text and making correct OS decisions based on +those features. + +
+
+ comment: ASPLOS 2023, Wild and Crazy Ideas session +
+
+
+
+
+ + ☆ PPR: Enhancing Dodging Attacks while Maintaining Impersonation Attacks + on Face Recognition Systems + + +
+ Adversarial Attacks on Face Recognition (FR) encompass two types: +impersonation attacks and evasion attacks. We observe that achieving a +successful impersonation attack on FR does not necessarily ensure a successful +dodging attack on FR in the black-box setting. Introducing a novel attack +method named Pre-training Pruning Restoration Attack (PPR), we aim to enhance +the performance of dodging attacks whilst avoiding the degradation of +impersonation attacks. Our method employs adversarial example pruning, enabling +a portion of adversarial perturbations to be set to zero, while tending to +maintain the attack performance. By utilizing adversarial example pruning, we +can prune the pre-trained adversarial examples and selectively free up certain +adversarial perturbations. Thereafter, we embed adversarial perturbations in +the pruned area, which enhances the dodging performance of the adversarial face +examples. The effectiveness of our proposed attack method is demonstrated +through our experimental results, showcasing its superior performance. + +
+
+
+
+
+ + ☆ Similar but Faster: Manipulation of Tempo in Music Audio Embeddings for + Tempo Prediction and Search ICASSP + + +
+ Audio embeddings enable large scale comparisons of the similarity of audio +files for applications such as search and recommendation. Due to the +subjectivity of audio similarity, it can be desirable to design systems that +answer not only whether audio is similar, but similar in what way (e.g., wrt. +tempo, mood or genre). Previous works have proposed disentangled embedding +spaces where subspaces representing specific, yet possibly correlated, +attributes can be weighted to emphasize those attributes in downstream tasks. +However, no research has been conducted into the independence of these +subspaces, nor their manipulation, in order to retrieve tracks that are similar +but different in a specific way. Here, we explore the manipulation of tempo in +embedding spaces as a case-study towards this goal. We propose tempo +translation functions that allow for efficient manipulation of tempo within a +pre-existing embedding space whilst maintaining other properties such as genre. +As this translation is specific to tempo it enables retrieval of tracks that +are similar but have specifically different tempi. We show that such a function +can be used as an efficient data augmentation strategy for both training of +downstream tempo predictors, and improved nearest neighbor retrieval of +properties largely independent of tempo. + +
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ Bridging State and History Representations: Understanding + Self-Predictive RL ICLR 2024 + + +
+ Representations are at the core of all deep reinforcement learning (RL) +methods for both Markov decision processes (MDPs) and partially observable +Markov decision processes (POMDPs). Many representation learning methods and +theoretical frameworks have been developed to understand what constitutes an +effective representation. However, the relationships between these methods and +the shared properties among them remain unclear. In this paper, we show that +many of these seemingly distinct methods and frameworks for state and history +abstractions are, in fact, based on a common idea of self-predictive +abstraction. Furthermore, we provide theoretical insights into the widely +adopted objectives and optimization, such as the stop-gradient technique, in +learning self-predictive representations. These findings together yield a +minimalist algorithm to learn self-predictive representations for states and +histories. We validate our theories by applying our algorithm to standard MDPs, +MDPs with distractors, and POMDPs with sparse rewards. These findings culminate +in a set of practical guidelines for RL practitioners. + +
+
+ comment: ICLR 2024 (Poster). Code is available at + https://github.com/twni2016/self-predictive-rl +
+
+
+
+
+ + ☆ CFASL: Composite Factor-Aligned Symmetry Learning for Disentanglement in + Variational AutoEncoder + + +
+ Symmetries of input and latent vectors have provided valuable insights for +disentanglement learning in VAEs. However, only a few works have been proposed as +unsupervised methods, and even these works require known factor information in +training data. We propose a novel method, Composite Factor-Aligned Symmetry +Learning (CFASL), which is integrated into VAEs for learning symmetry-based +disentanglement in unsupervised learning without any knowledge of the dataset +factor information. CFASL incorporates three novel features for learning +symmetry-based disentanglement: 1) Injecting inductive bias to align latent +vector dimensions to factor-aligned symmetries within an explicit learnable +symmetry codebook; 2) Learning a composite symmetry to express unknown factor +changes between two random samples by learning factor-aligned symmetries within +the codebook; and 3) Inducing group equivariant encoder and decoder in training VAEs +with the two conditions. In addition, we propose an extended evaluation metric +for multi-factor changes in comparison to disentanglement evaluation in VAEs. +In quantitative and in-depth qualitative analysis, CFASL demonstrates a +significant improvement of disentanglement in single-factor change and +multi-factor change conditions compared to state-of-the-art methods. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ☆ cedar: Composable and Optimized Machine Learning Input Data Pipelines + + +
+ The input data pipeline is an essential component of each machine learning +(ML) training job. It is responsible for reading massive amounts of training +data, processing batches of samples using complex transformations, and +loading them onto training nodes at low latency and high throughput. Performant +input data systems are becoming increasingly critical, driven by skyrocketing +data volumes and training throughput demands. Unfortunately, current input data +systems cannot fully leverage key performance optimizations, resulting in +hugely inefficient infrastructures that require significant resources -- or +worse -- underutilize expensive accelerators. + To address these demands, we present cedar, a programming model and framework +that allows users to easily build, optimize, and execute input data pipelines. +cedar presents an easy-to-use programming interface, allowing users to define +input data pipelines using composable operators that support arbitrary ML +frameworks and libraries. Meanwhile, cedar transparently applies a complex and +extensible set of optimization techniques (e.g., offloading, caching, +prefetching, fusion, and reordering). It then orchestrates processing across a +customizable set of local and distributed compute resources in order to +maximize processing performance and efficiency, all without user input. On +average across six diverse input data pipelines, cedar achieves 2.49x, 1.87x, +2.18x, and 2.74x higher performance compared to tf.data, tf.data service, Ray +Data, and PyTorch's DataLoader, respectively. + +
+
+
+
+
+ + ☆ MADA: Meta-Adaptive Optimizers through hyper-gradient Descent + + +
+ Since Adam was introduced, several novel adaptive optimizers for deep +learning have been proposed. These optimizers typically excel in some tasks but +may not outperform Adam uniformly across all tasks. In this work, we introduce +Meta-Adaptive Optimizers (MADA), a unified optimizer framework that can +generalize several known optimizers and dynamically learn the most suitable one +during training. The key idea in MADA is to parameterize the space of +optimizers and search through it using hyper-gradient descent. Numerical +results suggest that MADA is robust against sub-optimally tuned +hyper-parameters, and outperforms Adam, Lion, and Adan with their default +hyper-parameters, often even with optimized hyper-parameters. We also propose +AVGrad, a variant of AMSGrad where the maximum operator is replaced with +averaging, and observe that it performs better within MADA. Finally, we provide +a convergence analysis to show that interpolation of optimizers (specifically, +AVGrad and Adam) can improve their error bounds (up to constants), hinting at +an advantage for meta-optimizers. + +
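+ A small NumPy sketch of the AVGrad variant mentioned above (AMSGrad with the running maximum of the second-moment estimate replaced by a running average); the bias-correction details and constants are simplified assumptions rather than the paper's exact update.
+
+  import numpy as np
+
+  def avgrad_step(p, g, state, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
+      state["t"] += 1
+      state["m"] = beta1 * state["m"] + (1 - beta1) * g
+      state["v"] = beta2 * state["v"] + (1 - beta2) * g * g
+      # AMSGrad would take a running max of v; AVGrad keeps a running average.
+      state["v_avg"] += (state["v"] - state["v_avg"]) / state["t"]
+      m_hat = state["m"] / (1 - beta1 ** state["t"])
+      return p - lr * m_hat / (np.sqrt(state["v_avg"]) + eps)
+
+  state = {"t": 0, "m": 0.0, "v": 0.0, "v_avg": 0.0}
+  p = np.array([1.0, -2.0])
+  for _ in range(500):
+      p = avgrad_step(p, 2 * p, state)   # gradient of ||p||^2
+  print(p)                               # moves towards the minimizer at zero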
+
+
+
+
+ + ☆ Tempo estimation as fully self-supervised binary classification ICASSP + + +
+ This paper addresses the problem of global tempo estimation in musical audio. +Given that annotating tempo is time-consuming and requires certain musical +expertise, few publicly available data sources exist to train machine learning +models for this task. Towards alleviating this issue, we propose a fully +self-supervised approach that does not rely on any human labeled data. Our +method builds on the fact that generic (music) audio embeddings already encode +a variety of properties, including information about tempo, making them easily +adaptable for downstream tasks. While recent work in self-supervised tempo +estimation aimed to learn a tempo specific representation that was subsequently +used to train a supervised classifier, we reformulate the task into the binary +classification problem of predicting whether a target track has the same or a +different tempo compared to a reference. While the former still requires +labeled training data for the final classification model, our approach uses +arbitrary unlabeled music data in combination with time-stretching for model +training as well as a small set of synthetically created reference samples for +predicting the final tempo. Evaluation of our approach in comparison with the +state-of-the-art reveals highly competitive performance when the constraint of +finding the precise tempo octave is relaxed. + +
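+ A hedged sketch of the pair-construction step described above, using librosa's phase-vocoder time stretching (assumes librosa >= 0.10 with its keyword-only `rate` argument); the synthetic tone, stretch range, and labelling scheme are illustrative choices, not the paper's training setup.
+
+  import numpy as np
+  import librosa
+
+  def make_pair(y, same_tempo, max_stretch=1.4, rng=np.random.default_rng(0)):
+      # Build an (anchor, candidate, label) pair from unlabeled audio by
+      # time-stretching the candidate to keep or change its tempo.
+      if same_tempo:
+          rate = 1.0
+      else:
+          rate = rng.uniform(1.05, max_stretch)
+          if rng.random() < 0.5:
+              rate = 1.0 / rate
+      candidate = librosa.effects.time_stretch(y, rate=rate)
+      return y, candidate, int(same_tempo)
+
+  sr = 22050
+  y = np.sin(2 * np.pi * 220 * np.arange(2 * sr) / sr).astype(np.float32)  # 2 s tone
+  anchor, candidate, label = make_pair(y, same_tempo=False)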
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ On the Effect of Data-Augmentation on Local Embedding Properties in the + Contrastive Learning of Music Audio Representations ICASSP + + +
+ Audio embeddings are crucial tools in understanding large catalogs of music. +Typically embeddings are evaluated on the basis of the performance they provide +in a wide range of downstream tasks; however, few studies have investigated the +local properties of the embedding spaces themselves, which are important in +nearest neighbor algorithms, commonly used in music search and recommendation. +In this work we show that when learning audio representations on music datasets +via contrastive learning, musical properties that are typically homogeneous +within a track (e.g., key and tempo) are reflected in the locality of +neighborhoods in the resulting embedding space. By applying appropriate data +augmentation strategies, not only can the localisation of such properties be +reduced, but the localisation of other attributes can be increased. For example, +the locality of features such as pitch and tempo that are less relevant to +non-expert listeners may be mitigated while improving the locality of more +salient features such as genre and mood, achieving state-of-the-art performance +in nearest neighbor retrieval accuracy. Similarly, we show that the optimal +selection of data augmentation strategies for contrastive learning of music +audio embeddings is dependent on the downstream task, highlighting this as an +important embedding design decision. + +
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ Convex and Bilevel Optimization for Neuro-Symbolic Inference and + Learning + + +
+ We address a key challenge for neuro-symbolic (NeSy) systems by leveraging +convex and bilevel optimization techniques to develop a general gradient-based +framework for end-to-end neural and symbolic parameter learning. The +applicability of our framework is demonstrated with NeuPSL, a state-of-the-art +NeSy architecture. To achieve this, we propose a smooth primal and dual +formulation of NeuPSL inference and show learning gradients are functions of +the optimal dual variables. Additionally, we develop a dual block coordinate +descent algorithm for the new formulation that naturally exploits warm-starts. +This leads to over 100x learning runtime improvements over the current best +NeuPSL inference method. Finally, we provide extensive empirical evaluations +across $8$ datasets covering a range of tasks and demonstrate our learning +framework achieves up to a 16% point prediction performance improvement over +alternative learning methods. + +
+
+
+
+
+ + ☆ ClimateGPT: Towards AI Synthesizing Interdisciplinary Research on + Climate Change + + +
+ This paper introduces ClimateGPT, a model family of domain-specific large +language models that synthesize interdisciplinary research on climate change. +We trained two 7B models from scratch on a science-oriented dataset of 300B +tokens. For the first model, the 4.2B domain-specific tokens were included +during pre-training and the second was adapted to the climate domain after +pre-training. Additionally, ClimateGPT-7B, 13B and 70B are continuously +pre-trained from Llama~2 on a domain-specific dataset of 4.2B tokens. Each +model is instruction fine-tuned on a high-quality and human-generated +domain-specific dataset that has been created in close cooperation with climate +scientists. To reduce the number of hallucinations, we optimize the model for +retrieval augmentation and propose a hierarchical retrieval strategy. To +increase the accessibility of our model to non-English speakers, we propose to +make use of cascaded machine translation and show that this approach can +perform comparably to natively multilingual models while being easier to scale +to a large number of languages. Further, to address the intrinsic +interdisciplinary aspect of climate change we consider different research +perspectives. Therefore, the model can produce in-depth answers focusing on +different perspectives in addition to an overall answer. We propose a suite of +automatic climate-specific benchmarks to evaluate LLMs. On these benchmarks, +ClimateGPT-7B performs on par with the ten times larger Llama-2-70B Chat model +while not degrading results on general domain benchmarks. Our human evaluation +confirms the trends we saw in our benchmarks. All models were trained and +evaluated using renewable energy and are released publicly. + +
+
+
+
+
+ + ☆ Functional Linear Non-Gaussian Acyclic Model for Causal Discovery + + +
+ In causal discovery, non-Gaussianity has been used to characterize the +complete configuration of a Linear Non-Gaussian Acyclic Model (LiNGAM), +encompassing both the causal ordering of variables and their respective +connection strengths. However, LiNGAM can only deal with the finite-dimensional +case. To expand this concept, we extend the notion of variables to encompass +vectors and even functions, leading to the Functional Linear Non-Gaussian +Acyclic Model (Func-LiNGAM). Our motivation stems from the desire to identify +causal relationships in brain-effective connectivity tasks involving, for +example, fMRI and EEG datasets. We demonstrate why the original LiNGAM fails to +handle these inherently infinite-dimensional datasets and explain the +availability of functional data analysis from both empirical and theoretical +perspectives. We establish theoretical guarantees of the identifiability of +the causal relationship among non-Gaussian random vectors and even random +functions in infinite-dimensional Hilbert spaces. To address the issue of +sparsity in discrete time points within intrinsic infinite-dimensional +functional data, we propose optimizing the coordinates of the vectors using +functional principal component analysis. Experimental results on synthetic data +verify the ability of the proposed framework to identify causal relationships +among multivariate functions using the observed samples. For real data, we +focus on analyzing the brain connectivity patterns derived from fMRI data. + +
+
+
+
+
+ + ☆ Automatic 3D Multi-modal Ultrasound Segmentation of Human Placenta using + Fusion Strategies and Deep Learning + + +
+ Purpose: Ultrasound is the most commonly used medical imaging modality for +diagnosis and screening in clinical practice. Due to its safety profile, +noninvasive nature and portability, ultrasound is the primary imaging modality +for fetal assessment in pregnancy. Current ultrasound processing methods are +either manual or semi-automatic and are therefore laborious, time-consuming and +prone to errors, and automation would go a long way in addressing these +challenges. Automated identification of placental changes at earlier gestation +could facilitate potential therapies for conditions such as fetal growth +restriction and pre-eclampsia that are currently detected only at late +gestational age, potentially preventing perinatal morbidity and mortality. + Methods: We propose an automatic three-dimensional multi-modal (B-mode and +power Doppler) ultrasound segmentation of the human placenta using deep +learning combined with different fusion strategies. We collected data containing +B-mode and power Doppler ultrasound scans for 400 studies. + Results: We evaluated different fusion strategies and state-of-the-art image +segmentation networks for placenta segmentation based on standard overlap- and +boundary-based metrics. We found that multimodal information in the form of +B-mode and power Doppler scans outperforms any single modality. Furthermore, we +found that B-mode and power Doppler input scans fused at the data level provide +the best results with a mean Dice Similarity Coefficient (DSC) of 0.849. + Conclusion: We conclude that the multi-modal approach of combining B-mode and +power Doppler scans is effective in segmenting the placenta from 3D ultrasound +scans in a fully automated manner and is robust to quality variation of the +datasets. + +
+
+
+
+
+ + ☆ Physics-Informed Calibration of Aeromagnetic Compensation in Magnetic + Navigation Systems using Liquid Time-Constant Networks NeurIPS 2023 + + +
+ Magnetic navigation (MagNav) is a rising alternative to the Global +Positioning System (GPS) and has proven useful for aircraft navigation. +Traditional aircraft navigation systems, while effective, face limitations in +precision and reliability in certain environments and against attacks. Airborne +MagNav leverages the Earth's magnetic field to provide accurate positional +information. However, external magnetic fields induced by aircraft electronics +and Earth's large-scale magnetic fields disrupt the weaker signal of interest. +We introduce a physics-informed approach using Tolles-Lawson coefficients for +compensation and Liquid Time-Constant Networks (LTCs) to remove complex, noisy +signals derived from the aircraft's magnetic sources. Using real flight data +with magnetometer measurements and aircraft measurements, we observe up to a +64% reduction in aeromagnetic compensation error (RMSE nT), outperforming +conventional models. This significant improvement underscores the potential of +a physics-informed, machine learning approach for extracting clean, reliable, +and accurate magnetic signals for MagNav positional estimation. + +
+
+ comment: Accepted at the NeurIPS 2023 Machine Learning and the Physical + Sciences workshop, 7 pages, 4 figures, see code here: + https://github.com/fnerrise/LNNs_MagNav/ +
+
+
+
+
+ + ☆ Multiple Locally Linear Kernel Machines ICML'15 + + +
+ In this paper we propose a new non-linear classifier based on a combination +of locally linear classifiers. A well known optimization formulation is given +as we cast the problem in a $\ell_1$ Multiple Kernel Learning (MKL) problem +using many locally linear kernels. Since the number of such kernels is huge, we +provide a scalable generic MKL training algorithm handling streaming kernels. +With respect to the inference time, the resulting classifier fits the gap +between high accuracy but slow non-linear classifiers (such as classical MKL) +and fast but low accuracy linear classifiers. + +
+
+ comment: This paper was written in 2014 and was originally submitted but + rejected at ICML'15 +
+
+
+
+
+ + ☆ SymTC: A Symbiotic Transformer-CNN Net for Instance Segmentation of + Lumbar Spine MRI + + +
+ Intervertebral disc disease, a prevalent ailment, frequently leads to +intermittent or persistent low back pain, and diagnosing and assessing this +disease rely on accurate measurement of vertebral bone and intervertebral disc +geometries from lumbar MR images. Deep neural network (DNN) models may assist +clinicians with more efficient image segmentation of individual instances +(disks and vertebrae) of the lumbar spine in an automated way, which is termed +instance image segmentation. In this work, we proposed SymTC, an innovative +lumbar spine MR image segmentation model that combines the strengths of +Transformer and Convolutional Neural Network (CNN). Specifically, we designed a +parallel dual-path architecture to merge CNN layers and Transformer layers, and +we integrated a novel position embedding into the self-attention module of +Transformer, enhancing the utilization of positional information for more +accurate segmentation. To further improve model performance, we introduced a +new data augmentation technique to create a synthetic yet realistic MR image +dataset, named SSMSpine, which is made publicly available. We evaluated our +SymTC and 15 other existing image segmentation models on our private +in-house dataset and the public SSMSpine dataset, using two metrics, Dice +Similarity Coefficient and 95% Hausdorff Distance. The results show that our +SymTC has the best performance for segmenting vertebral bones and +intervertebral discs in lumbar spine MR images. The SymTC code and SSMSpine +dataset are available at https://github.com/jiasongchen/SymTC. + +
+
+
+
+
+ + ☆ MITS-GAN: Safeguarding Medical Imaging from Tampering with Generative + Adversarial Networks + + +
+ The progress in generative models, particularly Generative Adversarial +Networks (GANs), has opened new possibilities for image generation but raised +concerns about potential malicious uses, especially in sensitive areas like +medical imaging. This study introduces MITS-GAN, a novel approach to prevent +tampering in medical images, with a specific focus on CT scans. The approach +disrupts the output of the attacker's CT-GAN architecture by introducing +imperceptible yet precise perturbations. Specifically, the proposed +approach involves the introduction of appropriate Gaussian noise to the input +as a protective measure against various attacks. Our method aims to enhance +tamper resistance, comparing favorably to existing techniques. Experimental +results on a CT scan dataset demonstrate MITS-GAN's superior performance, +emphasizing its ability to generate tamper-resistant images with negligible +artifacts. As image tampering in medical domains poses life-threatening risks, +our proactive approach contributes to the responsible and ethical use of +generative models. This work provides a foundation for future research in +countering cyber threats in medical imaging. Models and code are publicly +available at the following link: +https://iplab.dmi.unict.it/MITS-GAN-2024/. + +
+
+
+
+
+ + ☆ SMOOTHIE: A Theory of Hyper-parameter Optimization for Software + Analytics + + +
+ Hyper-parameter optimization is the black art of tuning a learner's control +parameters. In software analytics, a repeated result is that such tuning can +yield dramatic performance improvements. Despite this, hyper-parameter +optimization is often applied rarely or poorly in software analytics--perhaps +because the CPU cost of exploring all those parameter options can be +prohibitive. + We theorize that learners generalize better when the loss landscape is +``smooth''. This theory is useful since the influence on ``smoothness'' of +different hyper-parameter choices can be tested very quickly (e.g. for a deep +learner, after just one epoch). + To test this theory, this paper implements and tests SMOOTHIE, a novel +hyper-parameter optimizer that guides its optimizations via considerations of +``smoothness''. The experiments of this paper test SMOOTHIE on numerous SE tasks +including (a) GitHub issue lifetime prediction; (b) detecting false alarms in +static code warnings; (c) defect prediction; and (d) a set of standard ML +datasets. In all these experiments, SMOOTHIE outperformed state-of-the-art +optimizers. Better yet, SMOOTHIE ran 300% faster than the prior state of the +art. We hence conclude that this theory (that hyper-parameter optimization is +best viewed as a ``smoothing'' function for the decision landscape) is both +theoretically interesting and practically very useful. + To support open science and other researchers working in this area, all our +scripts and datasets are available on-line at +https://github.com/yrahul3910/smoothness-hpo/. + +
+
+ comment: v1 +
+
+
+
+
+ + ☆ Land Cover Image Classification + + +
+ Land Cover (LC) image classification has become increasingly significant in +understanding environmental changes, urban planning, and disaster management. +However, traditional LC methods are often labor-intensive and prone to human +error. This paper explores state-of-the-art deep learning models for enhanced +accuracy and efficiency in LC analysis. We compare convolutional neural +networks (CNN) against transformer-based methods, showcasing their applications +and advantages in LC studies. We used EuroSAT, a patch-based LC classification +data set based on Sentinel-2 satellite images and achieved state-of-the-art +results using current transformer models. + +
+
+ comment: 7 pages, 4 figures, 1 table, published in conference +
+
+
+
+
+ + ☆ Robustness Evaluation of Machine Learning Models for Robot Arm Action + Recognition in Noisy Environments ICASSP + + +
+ In the realm of robot action recognition, identifying distinct but spatially +proximate arm movements using vision systems in noisy environments poses a +significant challenge. This paper studies robot arm action recognition in noisy +environments using machine learning techniques. Specifically, a vision system +is used to track the robot's movements followed by a deep learning model to +extract the arm's key points. Through a comparative analysis of machine +learning methods, the effectiveness and robustness of this model are assessed +in noisy environments. A case study was conducted using the Tic-Tac-Toe game in +a 3-by-3 grid environment, where the focus is to accurately identify the +actions of the arms in selecting specific locations within this constrained +environment. Experimental results show that our approach can achieve precise +key point detection and action classification despite the addition of noise and +uncertainties to the dataset. + +
+
+ comment: Accepted at ICASSP +
+
+
+
+
+ + ☆ MedBlindTuner: Towards Privacy-preserving Fine-tuning on Biomedical + Images with Transformers and Fully Homomorphic Encryption AAAI + + +
+ Advancements in machine learning (ML) have significantly revolutionized +medical image analysis, prompting hospitals to rely on external ML services. +However, the exchange of sensitive patient data, such as chest X-rays, poses +inherent privacy risks when shared with third parties. Addressing this concern, +we propose MedBlindTuner, a privacy-preserving framework leveraging fully +homomorphic encryption (FHE) and a data-efficient image transformer (DEiT). +MedBlindTuner enables the training of ML models exclusively on FHE-encrypted +medical images. Our experimental evaluation demonstrates that MedBlindTuner +achieves comparable accuracy to models trained on non-encrypted images, +offering a secure solution for outsourcing ML computations while preserving +patient data privacy. To the best of our knowledge, this is the first work that +uses data-efficient image transformers and fully homomorphic encryption in this +domain. + +
+
+ comment: Accepted for the presentation at W3PHIAI, The 38th Annual AAAI + Conference on Artificial Intelligence 2024 +
+
+
+
+
+ + ☆ Efficient generative adversarial networks using linear + additive-attention Transformers + + +
+ Although the capacity of deep generative models for image generation, such as +Diffusion Models (DMs) and Generative Adversarial Networks (GANs), has +dramatically improved in recent years, much of their success can be attributed +to computationally expensive architectures. This has limited their adoption and +use to research laboratories and companies with large resources, while +significantly raising the carbon footprint for training, fine-tuning, and +inference. In this work, we present LadaGAN, an efficient generative +adversarial network that is built upon a novel Transformer block named +Ladaformer. The main component of this block is a linear additive-attention +mechanism that computes a single attention vector per head instead of the +quadratic dot-product attention. We employ Ladaformer in both the generator and +discriminator, which reduces the computational complexity and overcomes the +training instabilities often associated with Transformer GANs. LadaGAN +consistently outperforms existing convolutional and Transformer GANs on +benchmark datasets at different resolutions while being significantly more +efficient. Moreover, LadaGAN shows competitive performance compared to +state-of-the-art multi-step generative models (e.g. DMs) using orders of +magnitude less computational resources. + +
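+ A hedged PyTorch sketch of a linear additive-attention head in the spirit described above, where a single global attention vector replaces the quadratic dot-product map; the exact mixing used by Ladaformer is not specified here, so the single-head combination step is an illustrative choice.
+
+  import torch
+  import torch.nn as nn
+  import torch.nn.functional as F
+
+  class AdditiveAttention(nn.Module):
+      # Single-head sketch: one global attention vector summarizes the queries,
+      # then modulates keys and values element-wise, keeping the cost linear in
+      # the sequence length instead of quadratic.
+      def __init__(self, dim):
+          super().__init__()
+          self.q = nn.Linear(dim, dim)
+          self.k = nn.Linear(dim, dim)
+          self.v = nn.Linear(dim, dim)
+          self.score = nn.Linear(dim, 1)   # scalar score per query token
+
+      def forward(self, x):                # x: (batch, seq_len, dim)
+          q, k, v = self.q(x), self.k(x), self.v(x)
+          alpha = F.softmax(self.score(q), dim=1)            # (B, T, 1)
+          global_q = (alpha * q).sum(dim=1, keepdim=True)    # (B, 1, dim)
+          mixed = global_q * k                               # broadcast, O(T)
+          return mixed * v + q                               # illustrative combination
+
+  x = torch.randn(2, 16, 64)
+  print(AdditiveAttention(64)(x).shape)    # torch.Size([2, 16, 64])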
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Bilevel Optimization under Unbounded Smoothness: A New Algorithm and + Convergence Analysis ICLR 2024 + + +
+ Bilevel optimization is an important formulation for many machine learning +problems. Current bilevel optimization algorithms assume that the gradient of +the upper-level function is Lipschitz. However, recent studies reveal that +certain neural networks such as recurrent neural networks (RNNs) and +long-short-term memory networks (LSTMs) exhibit potential unbounded smoothness, +rendering conventional bilevel optimization algorithms unsuitable. In this +paper, we design a new bilevel optimization algorithm, namely BO-REP, to +address this challenge. This algorithm updates the upper-level variable using +normalized momentum and incorporates two novel techniques for updating the +lower-level variable: \textit{initialization refinement} and \textit{periodic +updates}. Specifically, once the upper-level variable is initialized, a +subroutine is invoked to obtain a refined estimate of the corresponding optimal +lower-level variable, and the lower-level variable is updated only after every +specific period instead of each iteration. When the upper-level problem is +nonconvex and unbounded smooth, and the lower-level problem is strongly convex, +we prove that our algorithm requires $\widetilde{\mathcal{O}}(1/\epsilon^4)$ +iterations to find an $\epsilon$-stationary point in the stochastic setting, +where each iteration involves calling a stochastic gradient or Hessian-vector +product oracle. Notably, this result matches the state-of-the-art complexity +results under the bounded smoothness setting and without mean-squared +smoothness of the stochastic gradient, up to logarithmic factors. Our proof +relies on novel technical lemmas for the periodically updated lower-level +variable, which are of independent interest. Our experiments on +hyper-representation learning, hyperparameter optimization, and data +hyper-cleaning for text classification tasks demonstrate the effectiveness of +our proposed algorithm. + +
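+ A minimal NumPy sketch of the normalized-momentum upper-level update mentioned above; the lower-level initialization refinement and periodic updates are omitted, and the step size and momentum constant are placeholders.
+
+  import numpy as np
+
+  def normalized_momentum_step(x, grad, m, lr=0.01, beta=0.9, eps=1e-12):
+      # Momentum direction followed by a unit-norm step: the step length stays
+      # bounded even when the gradient itself can be arbitrarily large.
+      m = beta * m + (1 - beta) * grad
+      x = x - lr * m / (np.linalg.norm(m) + eps)
+      return x, m
+
+  x, m = np.array([3.0, -1.0]), np.zeros(2)
+  for _ in range(400):
+      x, m = normalized_momentum_step(x, 2 * x, m)   # gradient of ||x||^2
+  print(x)                                           # approaches the origin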
+
+ comment: Accepted by ICLR 2024, Spotlight +
+
+
+
+
+ + ♻ ☆ HomPINNs: homotopy physics-informed neural networks for solving the + inverse problems of nonlinear differential equations with multiple solutions + + +
+ Due to the complex behavior arising from non-uniqueness, symmetry, and +bifurcations in the solution space, solving inverse problems of nonlinear +differential equations (DEs) with multiple solutions is a challenging task. To +address this, we propose homotopy physics-informed neural networks (HomPINNs), +a novel framework that leverages homotopy continuation and neural networks +(NNs) to solve inverse problems. The proposed framework begins with the use of +NNs to simultaneously approximate unlabeled observations across diverse +solutions while adhering to DE constraints. Through homotopy continuation, the +proposed method solves the inverse problem by tracing the observations and +identifying multiple solutions. The experiments involve testing the performance +of the proposed method on one-dimensional DEs and applying it to solve a +two-dimensional Gray-Scott simulation. Our findings demonstrate that the +proposed method is scalable and adaptable, providing an effective solution for +solving DEs with multiple solutions and unknown parameters. Moreover, it has +significant potential for various applications in scientific computing, such as +modeling complex systems and solving inverse problems in physics, chemistry, +biology, etc. + +
+
+ comment: 20 pages, 15 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ AUTOACT: Automatic Agent Learning from Scratch via Self-Planning + + +
+ Language agents have achieved considerable performance on various complex +tasks. Despite the incessant exploration in this field, existing language agent +systems still struggle with costly, non-reproducible data reliance and face the +challenge of compelling a single model for multiple functions. To this end, we +introduce AutoAct, an automatic agent learning framework that does not rely on +large-scale annotated data and synthetic trajectories from closed-source models +(e.g., GPT-4). Given limited data with a tool library, AutoAct first +automatically synthesizes planning trajectories without any assistance from +humans or strong closed-source models. Then, AutoAct leverages a +division-of-labor strategy to automatically differentiate based on the target +task information and synthesized trajectories, producing a sub-agent group to +complete the task. We conduct comprehensive experiments with different LLMs, +which demonstrate that AutoAct yields better or comparable performance compared +to various strong baselines. We even notice that AutoAct, when using the +Llama-2-13b model, can achieve performance comparable to that of the zero-shot +GPT-3.5-Turbo agent. Code will be available at +https://github.com/zjunlp/AutoAct. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Flame: Simplifying Topology Extension in Federated Learning + + +
+ Distributed machine learning approaches, including a broad class of federated +learning (FL) techniques, present a number of benefits when deploying machine +learning applications over widely distributed infrastructures. The benefits are +highly dependent on the details of the underlying machine learning topology, +which specifies the functionality executed by the participating nodes, their +dependencies and interconnections. Current systems lack the flexibility and +extensibility necessary to customize the topology of a machine learning +deployment. We present Flame, a new system that provides flexibility of the +topology configuration of distributed FL applications around the specifics of a +particular deployment context, and is easily extensible to support new FL +architectures. Flame achieves this via a new high-level abstraction Topology +Abstraction Graphs (TAGs). TAGs decouple the ML application logic from the +underlying deployment details, making it possible to specialize the application +deployment with reduced development effort. Flame is released as an open source +project, and its flexibility and extensibility support a variety of topologies +and mechanisms, and can facilitate the development of new FL methodologies. + +
+
+
+
+
+ + ♻ ☆ Intensity Profile Projection: A Framework for Continuous-Time + Representation Learning for Dynamic Networks + + +
+ We present a new representation learning framework, Intensity Profile +Projection, for continuous-time dynamic network data. Given triples $(i,j,t)$, +each representing a time-stamped ($t$) interaction between two entities +($i,j$), our procedure returns a continuous-time trajectory for each node, +representing its behaviour over time. The framework consists of three stages: +estimating pairwise intensity functions, e.g. via kernel smoothing; learning a +projection which minimises a notion of intensity reconstruction error; and +constructing evolving node representations via the learned projection. The +trajectories satisfy two properties, known as structural and temporal +coherence, which we see as fundamental for reliable inference. Moreover, we +develop estimation theory providing tight control on the error of any estimated +trajectory, indicating that the representations could even be used in quite +noise-sensitive follow-on analyses. The theory also elucidates the role of +smoothing as a bias-variance trade-off, and shows how we can reduce the level +of smoothing as the signal-to-noise ratio increases on account of the algorithm +`borrowing strength' across the network. + +
+
+ comment: 38 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on finetuning RoBERTa and LLaMA-2 (7B and +70B) demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and enables +aggressive quantization to sub-3 bits with only minor performance degradations. +When finetuned on a language modeling calibration dataset, LQ-LoRA can also be +used for model compression; in this setting our 2.75-bit LLaMA-2-70B model +(which has 2.85 bits on average when including the low-rank components and +requires 27GB of GPU memory) performs respectably compared to the 16-bit +baseline. + +
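+ A hedged NumPy sketch of the alternating decomposition idea described above; the uniform quantizer, rank, and iteration count are stand-ins, since the paper uses a more sophisticated quantization scheme and an integer-programming bit allocation.
+
+  import numpy as np
+
+  def quantize(w, bits=3):
+      # Toy uniform quantizer (a stand-in for the paper's quantization scheme).
+      levels = 2 ** bits - 1
+      lo, hi = w.min(), w.max()
+      step = (hi - lo) / levels
+      return np.round((w - lo) / step) * step + lo
+
+  def lowrank(w, r=8):
+      u, s, vt = np.linalg.svd(w, full_matrices=False)
+      return (u[:, :r] * s[:r]) @ vt[:r]
+
+  def decompose(W, r=8, bits=3, iters=10):
+      # Alternate between the quantized and low-rank components so that
+      # Q + L approximates the pretrained matrix W.
+      L = np.zeros_like(W)
+      for _ in range(iters):
+          Q = quantize(W - L, bits)
+          L = lowrank(W - Q, r)
+      return Q, L
+
+  W = np.random.randn(256, 256)
+  Q, L = decompose(W)
+  print(np.linalg.norm(W - (Q + L)) / np.linalg.norm(W))   # relative error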
+
+
+
+
+ + ♻ ☆ AIRI: Predicting Retention Indices and their Uncertainties using + Artificial Intelligence + + +
+ The Kov\'ats Retention index (RI) is a quantity measured using gas +chromatography and commonly used in the identification of chemical structures. +Creating libraries of observed RI values is a laborious task, so we explore the +use of a deep neural network for predicting RI values from structure for +standard semipolar columns. This network generated predictions with a mean +absolute error of 15.1 and, in a quantification of the tail of the error +distribution, a 95th percentile absolute error of 46.5. Because of the +Artificial Intelligence Retention Indices (AIRI) network's accuracy, it was +used to predict RI values for the NIST EI-MS spectral libraries. These RI +values are used to improve chemical identification methods and the quality of +the library. Estimating uncertainty is an important practical need when using +prediction models. To quantify the uncertainty of our network for each +individual prediction, we used the outputs of an ensemble of 8 networks to +calculate a predicted standard deviation for each RI value prediction. This +predicted standard deviation was corrected to follow the error between observed +and predicted RI values. The Z scores using these predicted standard deviations +had a standard deviation of 1.52 and a 95th percentile absolute Z score +corresponding to a mean RI value of 42.6. + +
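+ A small NumPy sketch of the ensemble-based uncertainty estimate described above; the numbers are synthetic, and the calibration factor is a placeholder for the correction fitted against observed errors in the paper.
+
+  import numpy as np
+
+  # Illustrative RI predictions for 3 compounds from an ensemble of 8 networks.
+  preds = np.random.normal(loc=[1500.0, 2210.0, 985.0], scale=12.0, size=(8, 3))
+  observed = np.array([1512.0, 2190.0, 990.0])
+
+  mean_pred = preds.mean(axis=0)
+  pred_std = preds.std(axis=0, ddof=1)    # per-prediction uncertainty
+  # A calibration factor (fit on held-out data in practice) rescales the raw
+  # ensemble spread so that it tracks observed errors; 1.0 is a placeholder.
+  calibration = 1.0
+  z = (observed - mean_pred) / (calibration * pred_std)
+  print(mean_pred, pred_std, z)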
+
+
+
+
+ + ♻ ☆ Improved Probabilistic Image-Text Representations ICLR 2024 + + +
+ Image-Text Matching (ITM) task, a fundamental vision-language (VL) task, +suffers from the inherent ambiguity arising from multiplicity and imperfect +annotations. Deterministic functions are not sufficiently powerful to capture +ambiguity, prompting the exploration of probabilistic embeddings to tackle the +challenge. However, the existing probabilistic ITM approach encounters two key +shortcomings; the burden of heavy computations due to the Monte Carlo +approximation, and the loss saturation issue in the face of abundant false +negatives. To overcome the issues, this paper presents an improved +Probabilistic Cross-Modal Embeddings (named PCME++) by introducing a new +probabilistic distance with a closed-form solution. In addition, two +optimization techniques are proposed to enhance PCME++ further: first, the +incorporation of pseudo-positives to prevent the loss saturation problem under +massive false negatives; second, mixed sample data augmentation for +probabilistic matching. Experimental results on MS-COCO Caption and two +extended benchmarks, CxC and ECCV Caption, demonstrate the effectiveness of +PCME++ compared to state-of-the-art ITM methods. The robustness of PCME++ is +also evaluated under noisy image-text correspondences. In addition, the +potential applicability of PCME++ in automatic prompt tuning for zero-shot +classification is shown. The code is available at +https://github.com/naver-ai/pcmepp. + +
+
+ comment: ICLR 2024; Code: https://github.com/naver-ai/pcmepp. Project page: + https://naver-ai.github.io/pcmepp/. 26 pages, 2.4 MB +
+
+
+
+
+ + ♻ ☆ ID-MixGCL: Identity Mixup for Graph Contrastive Learning + + +
+ Graph contrastive learning (GCL) has recently achieved substantial +advancements. Existing GCL approaches compare two different ``views'' of the +same graph in order to learn node/graph representations. The underlying +assumption of these studies is that the graph augmentation strategy is capable +of generating several different graph views such that the graph views are +structurally different but semantically similar to the original graphs, and +thus the ground-truth labels of the original and augmented graph/nodes can be +regarded identical in contrastive learning. However, we observe that this +assumption does not always hold. For instance, the deletion of a super-node +within a social network can exert a substantial influence on the partitioning +of communities for other nodes. Similarly, any perturbation to nodes or edges +in a molecular graph will change the labels of the graph. Therefore, we believe +that augmenting the graph, accompanied by an adaptation of the labels used for +the contrastive loss, will facilitate the encoder to learn a better +representation. Based on this idea, we propose ID-MixGCL, which allows the +simultaneous interpolation of input nodes and corresponding identity labels to +obtain soft-confidence samples, with a controllable degree of change, leading +to the capture of fine-grained representations from self-supervised training on +unlabeled graphs. Experimental results demonstrate that ID-MixGCL improves +performance on graph classification and node classification tasks, as +demonstrated by significant improvements on the Cora, IMDB-B, IMDB-M, and +PROTEINS datasets compared to state-of-the-art techniques, by 3-29% absolute +points. + +
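+ A hedged NumPy sketch of the identity-mixup step described above, interpolating node features together with their identity labels; the Beta parameter and one-hot identity encoding are illustrative assumptions, and the surrounding GCL encoder and contrastive loss are omitted.
+
+  import numpy as np
+
+  def id_mixup(x, alpha=0.2, rng=np.random.default_rng(0)):
+      # Mix node features and their one-hot identity labels with the same
+      # coefficient, producing soft-confidence targets for the contrastive loss.
+      lam = rng.beta(alpha, alpha)
+      perm = rng.permutation(len(x))
+      identity = np.eye(len(x))              # each node is its own identity class
+      x_mix = lam * x + (1 - lam) * x[perm]
+      y_mix = lam * identity + (1 - lam) * identity[perm]
+      return x_mix, y_mix, lam
+
+  x = np.random.randn(5, 16)                 # 5 nodes, 16-dim features
+  x_mix, y_mix, lam = id_mixup(x)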
+
+ comment: 10 pages, 7 figures, accepted by IEEE BigData 2023 +
+
+
+
+
+ + ♻ ☆ Tiny Time Mixers (TTMs): Fast Pre-trained Models for Enhanced + Zero/Few-Shot Forecasting of Multivariate Time Series + + +
+ Large pre-trained models for zero/few-shot learning excel in language and +vision domains but encounter challenges in multivariate time series (TS) due to +the diverse nature and scarcity of publicly available pre-training data. +Consequently, there has been a recent surge in utilizing pre-trained large +language models (LLMs) with token adaptations for TS forecasting. These +approaches employ cross-domain transfer learning and surprisingly yield +impressive results. However, these models are typically very slow and large +(~billion parameters) and do not consider cross-channel correlations. To +address this, we present Tiny Time Mixers (TTM), a significantly small model +based on the lightweight TSMixer architecture. TTM marks the first success in +developing fast and tiny general pre-trained models (<1M parameters), +exclusively trained on public TS datasets, with effective transfer learning +capabilities for forecasting. To tackle the complexity of pre-training on +multiple datasets with varied temporal resolutions, we introduce several novel +enhancements such as adaptive patching, dataset augmentation via downsampling, +and resolution prefix tuning. Moreover, we employ a multi-level modeling +strategy to effectively model channel correlations and infuse exogenous signals +during fine-tuning, a crucial capability lacking in existing benchmarks. TTM +shows significant accuracy gains (12-38\%) over popular benchmarks in +few/zero-shot forecasting. It also drastically reduces the compute needs as +compared to LLM-TS methods, with a 14X cut in learnable parameters, 106X less +total parameters, and substantial reductions in fine-tuning (65X) and inference +time (54X). In fact, TTM's zero-shot often surpasses the few-shot results in +many popular benchmarks, highlighting the efficacy of our approach. Code and +pre-trained models will be open-sourced. + +
+
+
+
+
+ + ♻ ☆ Carrying over algorithm in transformers + + +
+ Addition is perhaps one of the simplest arithmetic tasks one can think of and +is usually performed using the carrying over algorithm. This algorithm consists +of two tasks: adding digits in the same position and carrying over a one +whenever necessary. We study how transformer models implement this algorithm +and how the two aforementioned tasks are allocated to different parts of the +network. We first focus on two-layer encoder-only models and show that the +carrying over algorithm is implemented in a modular fashion. The first layer is +mostly responsible for adding digits in the same position. The second layer +first decides, in the attention, which positions need a carried one or not, and +then performs the carrying of the one in the final MLP. We provide a simple way +of precisely identifying which neurons are responsible for that task. This +implementation of the carrying over algorithm occurs across a range of +hyperparameters for two as well as three-layer models. For small decoder-only +models, we observe the same implementation and provide suggestive evidence for +its existence in three 7B large language models. + +
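+ For reference, the two sub-tasks of the carrying over algorithm studied above can be written out in a few lines (base-10, most-significant digit first; equal-length inputs assumed for brevity):
+
+  def carry_over_add(a_digits, b_digits):
+      # Sub-task 1: add digits in the same position.
+      # Sub-task 2: carry a one to the next position whenever the sum exceeds 9.
+      result, carry = [], 0
+      for a, b in zip(reversed(a_digits), reversed(b_digits)):
+          s = a + b + carry
+          result.append(s % 10)    # positional sum
+          carry = s // 10          # decide whether to carry
+      if carry:
+          result.append(carry)
+      return list(reversed(result))
+
+  print(carry_over_add([3, 8, 7], [5, 4, 6]))   # 387 + 546 -> [9, 3, 3]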
+
+ comment: Comments welcome! +
+
+
+
+
+ + ♻ ☆ Supporting Safety Analysis of Image-processing DNNs through + Clustering-based Approaches + + +
+ The adoption of deep neural networks (DNNs) in safety-critical contexts is +often prevented by the lack of effective means to explain their results, +especially when they are erroneous. In our previous work, we proposed a +white-box approach (HUDD) and a black-box approach (SAFE) to automatically +characterize DNN failures. They both identify clusters of similar images from a +potentially large set of images leading to DNN failures. However, the analysis +pipelines for HUDD and SAFE were instantiated in specific ways according to +common practices, deferring the analysis of other pipelines to future work. In +this paper, we report on an empirical evaluation of 99 different pipelines for +root cause analysis of DNN failures. They combine transfer learning, +autoencoders, heatmaps of neuron relevance, dimensionality reduction +techniques, and different clustering algorithms. Our results show that the best +pipeline combines transfer learning, DBSCAN, and UMAP. It leads to clusters +almost exclusively capturing images of the same failure scenario, thus +facilitating root cause analysis. Further, it generates distinct clusters for +each root cause of failure, thus enabling engineers to detect all the unsafe +scenarios. Interestingly, these results hold even for failure scenarios that +are only observed in a small percentage of the failing images. + +
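+ A hedged sketch of the best-performing pipeline reported above (transfer learning, then UMAP, then DBSCAN); it assumes the umap-learn and scikit-learn packages, and random vectors stand in for the pretrained-backbone embeddings of failure-inducing images.
+
+  import numpy as np
+  from sklearn.cluster import DBSCAN
+  import umap   # umap-learn package (an assumption of this sketch)
+
+  # Stand-in for transfer-learning features: in practice these would be
+  # embeddings of the failure-inducing images from a pretrained CNN backbone.
+  features = np.random.randn(200, 512)
+
+  # Reduce dimensionality with UMAP, then cluster with DBSCAN.
+  embedded = umap.UMAP(n_components=2, random_state=0).fit_transform(features)
+  cluster_ids = DBSCAN(eps=0.5, min_samples=5).fit_predict(embedded)
+  print(set(cluster_ids))   # -1 marks images DBSCAN leaves unclustered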
+
+ comment: 16 Tables, 15 Figures +
+
+
+
+
+ + ♻ ☆ A Probabilistic Fluctuation based Membership Inference Attack for + Diffusion Models + + +
+ Membership Inference Attack (MIA) identifies whether a record exists in a
+machine learning model's training set by querying the model. MIAs on classic
+classification models have been well studied, and recent works have
+started to explore how to transplant MIA onto generative models. Our
+investigation indicates that existing MIAs designed for generative models
+mainly depend on the overfitting in target models. However, overfitting can be
+avoided by employing various regularization techniques, in which case existing
+MIAs demonstrate poor performance in practice. Unlike overfitting, memorization
+is essential for deep learning models to attain optimal performance, making it
+a more prevalent phenomenon. Memorization in generative models leads to an
+increasing trend in the probability distribution of generating records around
+the member record. Therefore, we propose a Probabilistic Fluctuation Assessing
+Membership Inference Attack (PFAMI), a black-box MIA that infers memberships by
+detecting these trends via analyzing the overall probabilistic fluctuations
+around given records. We conduct extensive experiments across multiple
+generative models and datasets, which demonstrate PFAMI can improve the attack
+success rate (ASR) by about 27.9% when compared with the best baseline.
+
+
+
+
+ + ♻ ☆ Generalizing Medical Image Representations via Quaternion Wavelet + Networks + + +
+ Neural network generalizability is becoming a broad research field due to the
+increasing availability of datasets from different sources and for various
+tasks. This issue is even more pronounced when processing medical data, where a
+lack of methodological standards causes large variations in the data provided
+by different imaging centers or acquired with various devices and cofactors. To
+overcome these limitations, we introduce a novel, generalizable, data- and
+task-agnostic framework able to extract salient features from medical images.
+The proposed quaternion wavelet network (QUAVE) can be easily integrated with
+any pre-existing medical image analysis or synthesis task, and it can be used
+with real-, quaternion-, or hypercomplex-valued models, generalizing their
+adoption to single-channel data. QUAVE first extracts different sub-bands
+through the quaternion wavelet transform, resulting in both
+low-frequency/approximation bands and high-frequency/fine-grained features.
+Then, it weights the most representative set of sub-bands to serve as input to
+any other neural model for image processing, replacing standard data samples.
+We conduct an extensive experimental evaluation comprising different datasets
+and diverse image analysis and synthesis tasks, including reconstruction,
+segmentation, and modality translation. We also evaluate QUAVE in combination
+with both real and quaternion-valued models. Results demonstrate the
+effectiveness and generalizability of the proposed framework, which improves
+network performance while being flexible enough to be adopted in manifold
+scenarios and robust to domain shifts. The full code is available at:
+https://github.com/ispamm/QWT.
+
+ comment: This paper is currently under review +
+
+
+
+
+ + ♻ ☆ Balancing stability and plasticity in continual learning: the + readout-decomposition of activation change (RDAC) framework + + +
+ Continual learning (CL) algorithms strive to acquire new knowledge while
+preserving prior information. However, this stability-plasticity trade-off
+remains a central challenge. This paper introduces a framework that dissects
+this trade-off, offering valuable insights into CL algorithms. The
+Readout-Decomposition of Activation Change (RDAC) framework first addresses the
+stability-plasticity dilemma and its relation to catastrophic forgetting. It
+relates learning-induced activation changes in the range of prior readouts to
+the degree of stability and changes in the null space to the degree of
+plasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the
+framework clarifies the stability-plasticity trade-offs of the popular
+regularization algorithms Synaptic Intelligence (SI), Elastic Weight
+Consolidation (EWC), and Learning without Forgetting (LwF), and the
+replay-based algorithms Gradient Episodic Memory (GEM) and data replay. GEM and
+data replay preserved stability and plasticity, while SI, EWC, and LwF traded
+off plasticity for stability. The inability of the regularization algorithms to
+maintain plasticity was linked to them restricting the change of activations in
+the null space of the prior readout. Additionally, for one-hidden-layer linear
+neural networks, we derived a gradient decomposition algorithm to restrict
+activation change only in the range of the prior readouts, to maintain high
+stability while not further sacrificing plasticity. Results demonstrate that
+the algorithm maintained stability without significant plasticity loss. The
+RDAC framework informs the behavior of existing CL algorithms and paves the way
+for novel CL approaches. Finally, it sheds light on the connection between
+learning-induced activation/representation changes and the stability-plasticity
+dilemma, also offering insights into representational drift in biological
+systems.
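The range/null-space decomposition at the heart of the framework can be illustrated in a few lines of NumPy; the shapes and the pseudo-inverse projector below are an assumed, simplified rendering of the idea rather than the paper's exact procedure.

```python
# Split an activation change into the part in the row space of the prior readout
# (visible to old outputs -> stability) and the part in its null space (plasticity).
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(10, 128))            # prior readout: 10 classes, 128 hidden units
dh = rng.normal(size=128)                 # learning-induced activation change

P_range = np.linalg.pinv(W) @ W           # projector onto the row space of W
dh_range = P_range @ dh                   # changes the prior readout's outputs
dh_null = dh - dh_range                   # leaves prior outputs untouched
print(np.allclose(W @ dh, W @ dh_range))  # True: only the range part matters to old readouts
```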
+
+ comment: 15 pages, 5 figures, Revision +
+
+
+
+
+ + ♻ ☆ Creating Multi-Level Skill Hierarchies in Reinforcement Learning NeurIPS 2023 + + +
+ What is a useful skill hierarchy for an autonomous agent? We propose an +answer based on a graphical representation of how the interaction between an +agent and its environment may unfold. Our approach uses modularity maximisation +as a central organising principle to expose the structure of the interaction +graph at multiple levels of abstraction. The result is a collection of skills +that operate at varying time scales, organised into a hierarchy, where skills +that operate over longer time scales are composed of skills that operate over +shorter time scales. The entire skill hierarchy is generated automatically, +with no human intervention, including the skills themselves (their behaviour, +when they can be called, and when they terminate) as well as the hierarchical +dependency structure between them. In a wide range of environments, this +approach generates skill hierarchies that are intuitively appealing and that +considerably improve the learning performance of the agent. + +
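A toy illustration of modularity maximisation as an organising principle is given below, using NetworkX community detection on a small interaction graph; the paper's multi-level construction and the extraction of actual skills are not reproduced here.

```python
# Partition a toy agent-environment interaction graph into candidate skill regions.
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

G = nx.Graph()
G.add_edges_from([(0, 1), (1, 2), (2, 0),   # "room" A of states
                  (3, 4), (4, 5), (5, 3),   # "room" B of states
                  (2, 3)])                   # doorway connecting the rooms
communities = greedy_modularity_communities(G)
print([sorted(c) for c in communities])      # e.g. [[0, 1, 2], [3, 4, 5]]
```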
+
+ comment: 20 pages, 10 figures. Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Score-based Source Separation with Applications to Digital Communication + Signals + + +
+ We propose a new method for separating superimposed sources using +diffusion-based generative models. Our method relies only on separately trained +statistical priors of independent sources to establish a new objective function +guided by maximum a posteriori estimation with an $\alpha$-posterior, across +multiple levels of Gaussian smoothing. Motivated by applications in +radio-frequency (RF) systems, we are interested in sources with underlying +discrete nature and the recovery of encoded bits from a signal of interest, as +measured by the bit error rate (BER). Experimental results with RF mixtures +demonstrate that our method results in a BER reduction of 95% over classical +and existing learning-based methods. Our analysis demonstrates that our +proposed method yields solutions that asymptotically approach the modes of an +underlying discrete distribution. Furthermore, our method can be viewed as a +multi-source extension to the recently proposed score distillation sampling +scheme, shedding additional light on its use beyond conditional sampling. The +project webpage is available at https://alpha-rgs.github.io + +
+
+ comment: 34 pages, 18 figures, for associated project webpage see + https://alpha-rgs.github.io +
+
+
+
+
+ + ♻ ☆ Stochastic Thermodynamics of Learning Parametric Probabilistic Models + + +
+ We have formulated a family of machine learning problems as the time +evolution of Parametric Probabilistic Models (PPMs), inherently rendering a +thermodynamic process. Our primary motivation is to leverage the rich toolbox +of thermodynamics of information to assess the information-theoretic content of +learning a probabilistic model. We first introduce two information-theoretic +metrics: Memorized-information (M-info) and Learned-information (L-info), which +trace the flow of information during the learning process of PPMs. Then, we +demonstrate that the accumulation of L-info during the learning process is +associated with entropy production, and parameters serve as a heat reservoir in +this process, capturing learned information in the form of M-info. + +
+
+
+
+
+ + ♻ ☆ Segment Anything Model for Medical Images? + + +
+ The Segment Anything Model (SAM) is the first foundation model for general +image segmentation. It has achieved impressive results on various natural image +segmentation tasks. However, medical image segmentation (MIS) is more +challenging because of the complex modalities, fine anatomical structures, +uncertain and complex object boundaries, and wide-range object scales. To fully +validate SAM's performance on medical data, we collected and sorted 53 +open-source datasets and built a large medical segmentation dataset with 18 +modalities, 84 objects, 125 object-modality paired targets, 1050K 2D images, +and 6033K masks. We comprehensively analyzed different models and strategies on +the so-called COSMOS 1050K dataset. Our findings mainly include the following: +1) SAM showed remarkable performance in some specific objects but was unstable, +imperfect, or even totally failed in other situations. 2) SAM with the large +ViT-H showed better overall performance than that with the small ViT-B. 3) SAM +performed better with manual hints, especially box, than the Everything mode. +4) SAM could help human annotation with high labeling quality and less time. 5) +SAM was sensitive to the randomness in the center point and tight box prompts, +and may suffer from a serious performance drop. 6) SAM performed better than +interactive methods with one or a few points, but will be outpaced as the +number of points increases. 7) SAM's performance correlated to different +factors, including boundary complexity, intensity differences, etc. 8) +Finetuning the SAM on specific medical tasks could improve its average DICE +performance by 4.39% and 6.68% for ViT-B and ViT-H, respectively. We hope that +this comprehensive report can help researchers explore the potential of SAM +applications in MIS, and guide how to appropriately use and develop SAM. + +
+
+ comment: Accepted by Medical Image Analysis. 23 pages, 18 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ CLadder: Assessing Causal Reasoning in Language Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insights into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023; updated with CLadder dataset v1.5 +
+
+
+
+
+ + ♻ ☆ Causal Component Analysis NeurIPS 2023 + + +
+ Independent Component Analysis (ICA) aims to recover independent latent +variables from observed mixtures thereof. Causal Representation Learning (CRL) +aims instead to infer causally related (thus often statistically dependent) +latent variables, together with the unknown graph encoding their causal +relationships. We introduce an intermediate problem termed Causal Component +Analysis (CauCA). CauCA can be viewed as a generalization of ICA, modelling the +causal dependence among the latent components, and as a special case of CRL. In +contrast to CRL, it presupposes knowledge of the causal graph, focusing solely +on learning the unmixing function and the causal mechanisms. Any impossibility +results regarding the recovery of the ground truth in CauCA also apply for CRL, +while possibility results may serve as a stepping stone for extensions to CRL. +We characterize CauCA identifiability from multiple datasets generated through +different types of interventions on the latent causal variables. As a +corollary, this interventional perspective also leads to new identifiability +results for nonlinear ICA -- a special case of CauCA with an empty graph -- +requiring strictly fewer datasets than previous results. We introduce a +likelihood-based approach using normalizing flows to estimate both the unmixing +function and the causal mechanisms, and demonstrate its effectiveness through +extensive synthetic experiments in the CauCA and ICA setting. + +
+
+ comment: NeurIPS 2023 final camera-ready version +
+
+
+
+
+ + ♻ ☆ Multi-Lattice Sampling of Quantum Field Theories via Neural + Operator-based Flows + + +
+ We consider the problem of sampling discrete field configurations $\phi$ from
+the Boltzmann distribution $[d\phi] Z^{-1} e^{-S[\phi]}$, where $S$ is the
+lattice-discretization of the continuous Euclidean action $\mathcal S$ of some
+quantum field theory. Since such densities arise as the approximation of the
+underlying functional density $[\mathcal D\phi(x)] \mathcal Z^{-1} e^{-\mathcal
+S[\phi(x)]}$, we frame the task as an instance of operator learning. In
+particular, we propose to approximate a time-dependent operator $\mathcal V_t$
+whose time integral provides a mapping between the functional distributions of
+the free theory $[\mathcal D\phi(x)] \mathcal Z_0^{-1} e^{-\mathcal
+S_{0}[\phi(x)]}$ and of the target theory $[\mathcal D\phi(x)]\mathcal
+Z^{-1}e^{-\mathcal S[\phi(x)]}$. Whenever a particular lattice is chosen, the
+operator $\mathcal V_t$ can be discretized to a finite dimensional,
+time-dependent vector field $V_t$ which in turn induces a continuous
+normalizing flow between finite dimensional distributions over the chosen
+lattice. This flow can then be trained to be a diffeomorphism between the
+discretized free and target theories $[d\phi] Z_0^{-1} e^{-S_{0}[\phi]}$,
+$[d\phi] Z^{-1}e^{-S[\phi]}$. We run experiments on the $\phi^4$-theory to
+explore to what extent such operator-based flow architectures generalize to
+lattice sizes they were not trained on and show that pretraining on smaller
+lattices can lead to a speedup over training only on the target lattice size.
+
+
+
+
+ + ♻ Combining Spatial and Temporal Abstraction in Planning for Better + Generalization ICLR 2024 + + +
+ Inspired by human conscious planning, we propose Skipper, a model-based +reinforcement learning agent utilizing spatio-temporal abstractions to +generalize learned skills in novel situations. It automatically decomposes the +given task into smaller, more manageable subtasks, and hence enables sparse +decision-making and focused computation on the relevant parts of the +environment. This relies on the extraction of an abstracted proxy problem +represented as a directed graph, in which vertices and edges are learned +end-to-end from hindsight. Our theoretical analyses provide performance +guarantees under appropriate assumptions and establish where our approach is +expected to be helpful. Generalization-focused experiments validate Skipper's +significant advantage in zero-shot generalization, compared to existing +state-of-the-art hierarchical planning methods. + +
+
+ comment: accepted version for ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Wasserstein Distributionally Robust Policy Evaluation and Learning for + Contextual Bandits + + +
+ Off-policy evaluation and learning are concerned with assessing a given
+policy and learning an optimal policy from offline data without direct
+interaction with the environment. Often, the environment in which the data are
+collected differs from the environment in which the learned policy is applied.
+To account for the effect of different environments during learning and
+execution, distributionally robust optimization (DRO) methods have been
+developed that compute worst-case bounds on the policy values assuming that the
+distribution of the new environment lies within an uncertainty set. Typically,
+this uncertainty set is defined based on the KL divergence around the empirical
+distribution computed from the logging dataset. However, the KL uncertainty set
+fails to encompass distributions with varying support and lacks awareness of
+the geometry of the distribution support. As a result, KL approaches fall short
+in addressing practical environment mismatches and lead to over-fitting to
+worst-case scenarios. To overcome these limitations, we propose a novel DRO
+approach that employs the Wasserstein distance instead. While Wasserstein DRO
+is generally computationally more expensive compared to KL DRO, we present a
+regularized method and a practical (biased) stochastic gradient descent method
+to optimize the policy efficiently. We also provide a theoretical analysis of
+the finite sample complexity and iteration complexity for our proposed method.
+We further validate our approach using a public dataset that was recorded in a
+randomized stroke trial.
+
+
+
+
+ + ♻ ☆ The Rise of Diffusion Models in Time-Series Forecasting + + +
+ This survey delves into the application of diffusion models in time-series
+forecasting. Diffusion models are demonstrating state-of-the-art results in
+various fields of generative AI. The paper includes comprehensive background
+information on diffusion models, detailing their conditioning methods and
+reviewing their use in time-series forecasting. The analysis covers 11 specific
+time-series implementations, the intuition and theory behind them, their
+effectiveness on different datasets, and a comparison between them. Key
+contributions of this work are the thorough exploration of diffusion models'
+applications in time-series forecasting and a chronologically ordered overview
+of these models. Additionally, the paper offers an insightful discussion on the
+current state-of-the-art in this domain and outlines potential future research
+directions. This serves as a valuable resource for researchers in AI and
+time-series analysis, offering a clear view of the latest advancements and
+future potential of diffusion models.
+
+ comment: Version 2, 24 pages, 10 figures, 12 tables, For complete LuaTeX + source: + https://github.com/Capsar/The-Rise-of-Diffusion-Models-in-Time-Series-Forecasting + , Written by: Caspar Meijer, Supervised by: Lydia Y. Chen +
+
+
+
+
+ + ♻ ☆ CLSA-CIM: A Cross-Layer Scheduling Approach for Computing-in-Memory + Architectures + + +
+ The demand for efficient machine learning (ML) accelerators is growing
+rapidly, driving the development of novel computing concepts such as resistive
+random access memory (RRAM)-based tiled computing-in-memory (CIM)
+architectures. CIM allows computation within the memory unit, resulting in
+faster data processing and reduced power consumption. Efficient compiler
+algorithms are essential to exploit the potential of tiled CIM architectures.
+While conventional ML compilers focus on code generation for CPUs, GPUs, and
+other von Neumann architectures, adaptations are needed to cover CIM
+architectures. Cross-layer scheduling is a promising approach, as it enhances
+the utilization of CIM cores, thereby accelerating computations. Although
+similar concepts are implicitly used in previous work, there is a lack of clear
+and quantifiable algorithmic definitions for cross-layer scheduling for tiled
+CIM architectures. To close this gap, we present CLSA-CIM, a cross-layer
+scheduling algorithm for tiled CIM architectures. We integrate CLSA-CIM with
+existing weight-mapping strategies and compare performance against
+state-of-the-art (SOTA) scheduling algorithms. CLSA-CIM improves utilization by
+up to 17.9x, resulting in an overall speedup of up to 29.2x compared to SOTA.
+
+
+
+
+
+            ♻ ☆ Matrix Completion with Hypergraphs: Sharp Thresholds and Efficient
+            Algorithms
+
+
+ This paper considers the problem of completing a rating matrix based on +sub-sampled matrix entries as well as observed social graphs and hypergraphs. +We show that there exists a \emph{sharp threshold} on the sample probability +for the task of exactly completing the rating matrix -- the task is achievable +when the sample probability is above the threshold, and is impossible otherwise +-- demonstrating a phase transition phenomenon. The threshold can be expressed +as a function of the ``quality'' of hypergraphs, enabling us to \emph{quantify} +the amount of reduction in sample probability due to the exploitation of +hypergraphs. This also highlights the usefulness of hypergraphs in the matrix +completion problem. En route to discovering the sharp threshold, we develop a +computationally efficient matrix completion algorithm that effectively exploits +the observed graphs and hypergraphs. Theoretical analyses show that our +algorithm succeeds with high probability as long as the sample probability +exceeds the aforementioned threshold, and this theoretical result is further +validated by synthetic experiments. Moreover, our experiments on a real social +network dataset (with both graphs and hypergraphs) show that our algorithm +outperforms other state-of-the-art matrix completion algorithms. + +
+
+ comment: Submitted to IEEE for possible publication +
+
+
+
+
+
+            ♻ ☆ An Explainable Proxy Model for Multilabel Audio Segmentation ICASSP 2024
+
+
+ Audio signal segmentation is a key task for automatic audio indexing. It
+consists of detecting the boundaries of class-homogeneous segments in the
+signal. In many applications, explainable AI is a vital process for
+transparency of decision-making with machine learning. In this paper, we
+propose an explainable multilabel segmentation model that solves speech
+activity detection (SAD), music detection (MD), noise detection (ND), and
+overlapped speech detection (OSD) simultaneously. This proxy uses non-negative
+matrix factorization (NMF) to map the embedding used for segmentation to the
+frequency domain. Experiments conducted on two datasets show performance
+similar to the pre-trained black-box model while showing strong explainability
+features. Specifically, the frequency bins used for the decision can be easily
+identified at both the segment level (local explanations) and global level
+(class prototypes).
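To make the NMF idea concrete, the sketch below factorizes a random non-negative spectrogram-like matrix into frequency profiles and per-frame activations; in the proposed proxy the factorization is tied to the segmentation embedding, which this simplified example does not model, and all shapes are illustrative.

```python
# Each NMF component gets an interpretable frequency profile (a "class prototype")
# and an activation curve over time (local, per-frame explanation).
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
spectrogram = rng.random((257, 400))   # |frequency bins| x |time frames|, non-negative

model = NMF(n_components=4, init="nndsvda", max_iter=400, random_state=0)
W = model.fit_transform(spectrogram)   # (257, 4) frequency profiles per component
H = model.components_                  # (4, 400) activations over time
print(W.shape, H.shape)
```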
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Implicit Gaussian process representation of vector fields over arbitrary + latent manifolds ICLR 2024 + + +
+ Gaussian processes (GPs) are popular nonparametric statistical models for +learning unknown functions and quantifying the spatiotemporal uncertainty in +data. Recent works have extended GPs to model scalar and vector quantities +distributed over non-Euclidean domains, including smooth manifolds appearing in +numerous fields such as computer vision, dynamical systems, and neuroscience. +However, these approaches assume that the manifold underlying the data is +known, limiting their practical utility. We introduce RVGP, a generalisation of +GPs for learning vector signals over latent Riemannian manifolds. Our method +uses positional encoding with eigenfunctions of the connection Laplacian, +associated with the tangent bundle, readily derived from common graph-based +approximation of data. We demonstrate that RVGP possesses global regularity +over the manifold, which allows it to super-resolve and inpaint vector fields +while preserving singularities. Furthermore, we use RVGP to reconstruct +high-density neural dynamics derived from low-density EEG recordings in healthy +individuals and Alzheimer's patients. We show that vector field singularities +are important disease markers and that their reconstruction leads to a +comparable classification accuracy of disease states to high-density +recordings. Thus, our method overcomes a significant practical limitation in +experimental and clinical applications. + +
+
+ comment: ICLR 2024 conference paper. Associated code: + https://github.com/agosztolai/RVGP +
+
+
+
+
+ + ♻ ☆ A DenseNet-based method for decoding auditory spatial attention with EEG ICASSP 2024 + + +
+ Auditory spatial attention detection (ASAD) aims to decode the attended
+spatial location with EEG in a multiple-speaker setting. ASAD methods are
+inspired by the brain lateralization of cortical neural responses during the
+processing of auditory spatial attention, and show promising performance for
+the task of auditory attention decoding (AAD) with neural recordings. In
+previous ASAD methods, the spatial distribution of EEG electrodes is not fully
+exploited, which may limit the performance of these methods. In the present
+work, by transforming the original EEG channels into a two-dimensional (2D)
+spatial topological map, the EEG data is transformed into a three-dimensional
+(3D) arrangement containing spatial-temporal information. Then, a 3D deep
+convolutional neural network (DenseNet-3D) is used to extract temporal and
+spatial features of the neural representation for the attended locations. The
+results show that the proposed method achieves higher decoding accuracy than
+the state-of-the-art (SOTA) method (94.3% compared to XANet's 90.6%) with a
+1-second decision window for the widely used KULeuven (KUL) dataset, and the
+code to implement our work is available on Github:
+ https://github.com/xuxiran/ASAD_DenseNet
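The data arrangement described above can be sketched roughly as follows; the 9x9 grid, the fake electrode coordinates, and the tiny 3D CNN are placeholders rather than the electrode map or DenseNet-3D used in the paper.

```python
# Scatter C EEG channels onto an HxW grid, keep time as a separate axis, feed a 3D CNN.
import torch
import torch.nn as nn

batch, channels, timesteps, H, W = 4, 64, 128, 9, 9
positions = [(i // W, i % W) for i in range(channels)]   # fake electrode coordinates

eeg = torch.randn(batch, channels, timesteps)
topo = torch.zeros(batch, 1, timesteps, H, W)            # (N, C_in, D, H, W) for Conv3d
for ch, (r, c) in enumerate(positions):
    topo[:, 0, :, r, c] = eeg[:, ch, :]

net = nn.Sequential(nn.Conv3d(1, 8, kernel_size=3, padding=1), nn.ReLU(),
                    nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(8, 2))
print(net(topo).shape)                                    # torch.Size([4, 2]) -> left/right attention
```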
+
+ comment: 5 pages, 3 figures, has been accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ E3x: $\mathrm{E}(3)$-Equivariant Deep Learning Made Easy + + +
+ This work introduces E3x, a software package for building neural networks +that are equivariant with respect to the Euclidean group $\mathrm{E}(3)$, +consisting of translations, rotations, and reflections of three-dimensional +space. Compared to ordinary neural networks, $\mathrm{E}(3)$-equivariant models +promise benefits whenever input and/or output data are quantities associated +with three-dimensional objects. This is because the numeric values of such +quantities (e.g. positions) typically depend on the chosen coordinate system. +Under transformations of the reference frame, the values change predictably, +but the underlying rules can be difficult to learn for ordinary machine +learning models. With built-in $\mathrm{E}(3)$-equivariance, neural networks +are guaranteed to satisfy the relevant transformation rules exactly, resulting +in superior data efficiency and accuracy. The code for E3x is available from +https://github.com/google-research/e3x, detailed documentation and usage +examples can be found on https://e3x.readthedocs.io. + +
+
+
+
+
+ + ♻ ☆ A Scalable Neural Network for DSIC Affine Maximizer Auction Design NeurIPS 2023 + + +
+ Automated auction design aims to find empirically high-revenue mechanisms
+through machine learning. Existing works on multi-item auction scenarios can be
+roughly divided into RegretNet-like and affine maximizer auctions (AMAs)
+approaches. However, the former cannot strictly ensure dominant strategy
+incentive compatibility (DSIC), while the latter faces scalability issues due
+to the large number of allocation candidates. To address these limitations, we
+propose AMenuNet, a scalable neural network that constructs the AMA parameters
+(even including the allocation menu) from bidder and item representations.
+AMenuNet is always DSIC and individually rational (IR) due to the properties of
+AMAs, and it enhances scalability by generating candidate allocations through a
+neural network. Additionally, AMenuNet is permutation equivariant, and its
+number of parameters is independent of auction scale. We conduct extensive
+experiments to demonstrate that AMenuNet outperforms strong baselines in both
+contextual and non-contextual multi-item auctions, scales well to larger
+auctions, generalizes well to different settings, and identifies useful
+deterministic allocations. Overall, our proposed approach offers an effective
+solution to automated DSIC auction design, with improved scalability and strong
+revenue performance in various settings.
+
+ comment: NeurIPS 2023 (spotlight) +
+
+
+
+
+ + ♻ ☆ Learning from Label Proportions: Bootstrapping Supervised Learners via + Belief Propagation ICLR 2024 + + +
+ Learning from Label Proportions (LLP) is a learning problem where only +aggregate level labels are available for groups of instances, called bags, +during training, and the aim is to get the best performance at the +instance-level on the test data. This setting arises in domains like +advertising and medicine due to privacy considerations. We propose a novel +algorithmic framework for this problem that iteratively performs two main +steps. For the first step (Pseudo Labeling) in every iteration, we define a +Gibbs distribution over binary instance labels that incorporates a) covariate +information through the constraint that instances with similar covariates +should have similar labels and b) the bag level aggregated label. We then use +Belief Propagation (BP) to marginalize the Gibbs distribution to obtain pseudo +labels. In the second step (Embedding Refinement), we use the pseudo labels to +provide supervision for a learner that yields a better embedding. Further, we +iterate on the two steps again by using the second step's embeddings as new +covariates for the next iteration. In the final iteration, a classifier is +trained using the pseudo labels. Our algorithm displays strong gains against +several SOTA baselines (up to 15%) for the LLP Binary Classification problem on +various dataset types - tabular and Image. We achieve these improvements with +minimal computational overhead above standard supervised learning due to Belief +Propagation, for large bag sizes, even for a million samples. + +
+
+ comment: Accepted at The Twelfth International Conference on Learning + Representations (ICLR 2024) & Oral Presentation at Regulatable ML @ NeurIPS + 2023 +
+
+
+
+
+ + ♻ ☆ Regularized Contrastive Pre-training for Few-shot Bioacoustic Sound + Detection + + +
+ Bioacoustic sound event detection allows for better understanding of animal
+behavior and for better monitoring of biodiversity using audio. Deep learning
+systems can help achieve this goal; however, it is difficult to acquire
+sufficient annotated data to train these systems from scratch. To address this
+limitation, the Detection and Classification of Acoustic Scenes and Events
+(DCASE) community has recast the problem within the framework of few-shot
+learning and organizes an annual challenge for learning to detect animal sounds
+from only five annotated examples. In this work, we regularize supervised
+contrastive pre-training to learn features that can transfer well to new target
+tasks with animal sounds unseen during training, achieving a high F-score of
+61.52%(0.48) when no feature adaptation is applied, and an F-score of
+68.19%(0.75) when we further adapt the learned features for each new target
+task. This work aims to lower the entry bar to few-shot bioacoustic sound event
+detection by proposing a simple and yet effective framework for this task, and
+by providing open-source code.
+
+
+
+
+ + ♻ ☆ Online Loss Function Learning + + +
+ Loss function learning is a new meta-learning paradigm that aims to automate +the essential task of designing a loss function for a machine learning model. +Existing techniques for loss function learning have shown promising results, +often improving a model's training dynamics and final inference performance. +However, a significant limitation of these techniques is that the loss +functions are meta-learned in an offline fashion, where the meta-objective only +considers the very first few steps of training, which is a significantly +shorter time horizon than the one typically used for training deep neural +networks. This causes significant bias towards loss functions that perform well +at the very start of training but perform poorly at the end of training. To +address this issue we propose a new loss function learning technique for +adaptively updating the loss function online after each update to the base +model parameters. The experimental results show that our proposed method +consistently outperforms the cross-entropy loss and offline loss function +learning techniques on a diverse range of neural network architectures and +datasets. + +
+
+
+
+
+ + ♻ ☆ Language Modeling on a SpiNNaker 2 Neuromorphic Chip + + +
+ As large language models continue to scale in size rapidly, so too does the +computational power required to run them. Event-based networks on neuromorphic +devices offer a potential way to reduce energy consumption for inference +significantly. However, to date, most event-based networks that can run on +neuromorphic hardware, including spiking neural networks (SNNs), have not +achieved task performance even on par with LSTM models for language modeling. +As a result, language modeling on neuromorphic devices has seemed a distant +prospect. In this work, we demonstrate the first-ever implementation of a +language model on a neuromorphic device - specifically the SpiNNaker 2 chip - +based on a recently published event-based architecture called the EGRU. +SpiNNaker 2 is a many-core neuromorphic chip designed for large-scale +asynchronous processing, while the EGRU is architected to leverage such +hardware efficiently while maintaining competitive task performance. This +implementation marks the first time a neuromorphic language model matches +LSTMs, setting the stage for taking task performance to the level of large +language models. We also demonstrate results on a gesture recognition task +based on inputs from a DVS camera. Overall, our results showcase the +feasibility of this neuro-inspired neural network in hardware, highlighting +significant gains versus conventional hardware in energy efficiency for the +common use case of single batch inference. + +
+
+
+
+
+ + ♻ ☆ Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic + Play + + +
+ Infants' ability to recognize and categorize objects develops gradually. The
+second year of life is marked by both the emergence of more semantic visual
+representations and a better understanding of word meaning. This suggests that
+language input may play an important role in shaping visual representations.
+However, even in suitable contexts for word learning like dyadic play sessions,
+caregivers' utterances are sparse and ambiguous, often referring to objects that
+are different from the one to which the child attends. Here, we systematically
+investigate to what extent caregivers' utterances can nevertheless enhance
+visual representations. For this, we propose a computational model of visual
+representation learning during dyadic play. We introduce a synthetic dataset of
+ego-centric images perceived by a toddler-agent that moves and rotates toy
+objects in different parts of its home environment while hearing caregivers'
+utterances, modeled as captions. We propose to model toddlers' learning as
+simultaneously aligning representations for 1) close-in-time images and 2)
+co-occurring images and utterances. We show that utterances with statistics
+matching those of real caregivers give rise to representations supporting
+improved category recognition. Our analysis reveals that a small
+decrease/increase in object-relevant naming frequencies can drastically impact
+the learned representations. This affects the attention on object names within
+an utterance, which is required for efficient visuo-linguistic alignment.
+Overall, our results support the hypothesis that caregivers' naming utterances
+can improve toddlers' visual representations.
+
+ comment: Proceedings of the 2023 IEEE International Conference on Development + and Learning (ICDL) +
+
+
+
+
+ + ♻ ☆ A Dempster-Shafer approach to trustworthy AI with application to fetal + brain MRI segmentation + + +
+ Deep learning models for medical image segmentation can fail unexpectedly and
+spectacularly for pathological cases and for images acquired at different
+centers than the training images, with labeling errors that violate expert
+knowledge. Such errors undermine the trustworthiness of deep learning models
+for medical image segmentation. Mechanisms for detecting and correcting such
+failures are essential for safely translating this technology into clinics and
+are likely to be a requirement of future regulations on artificial intelligence
+(AI). In this work, we propose a trustworthy AI theoretical framework and a
+practical system that can augment any backbone AI system using a fallback
+method and a fail-safe mechanism based on Dempster-Shafer theory. Our approach
+relies on an actionable definition of trustworthy AI. Our method automatically
+discards the voxel-level labeling predicted by the backbone AI that violates
+expert knowledge and relies on a fallback for those voxels. We demonstrate the
+effectiveness of the proposed trustworthy AI approach on the largest reported
+annotated dataset of fetal MRI consisting of 540 manually annotated fetal brain
+3D T2w MRIs from 13 centers. Our trustworthy AI method improves the robustness
+of a state-of-the-art backbone AI for fetal brain MRIs acquired across various
+centers and for fetuses with various brain abnormalities.
+
+ comment: Published in IEEE TPAMI. Minor revision compared to the previous + version +
+
+
+
+
+ + ♻ ☆ Semantic similarity prediction is better than other semantic similarity + measures + + +
+ Semantic similarity between natural language texts is typically measured +either by looking at the overlap between subsequences (e.g., BLEU) or by using +embeddings (e.g., BERTScore, S-BERT). Within this paper, we argue that when we +are only interested in measuring the semantic similarity, it is better to +directly predict the similarity using a fine-tuned model for such a task. Using +a fine-tuned model for the Semantic Textual Similarity Benchmark tasks (STS-B) +from the GLUE benchmark, we define the STSScore approach and show that the +resulting similarity is better aligned with our expectations on a robust +semantic similarity measure than other approaches. + +
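A hedged sketch of the general recipe follows, using an off-the-shelf STS-B cross-encoder from the sentence-transformers library; the checkpoint name is an assumption for illustration, and this is not the authors' STSScore implementation.

```python
# Predict semantic similarity directly with a model fine-tuned on STS-B,
# instead of relying on subsequence overlap (BLEU) or embedding metrics.
from sentence_transformers import CrossEncoder

model = CrossEncoder("cross-encoder/stsb-roberta-base")   # assumed checkpoint name
pairs = [("A man is playing a guitar.", "Someone plays guitar."),
         ("A man is playing a guitar.", "The stock market fell today.")]
scores = model.predict(pairs)   # one similarity score per pair (higher = more similar)
print(scores)
```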
+
+ comment: Accepted at TMLR: https://openreview.net/forum?id=bfsNmgN5je +
+
+
+
+
+ + ♻ ☆ A Comparative Study of Deep Learning and Iterative Algorithms for Joint + Channel Estimation and Signal Detection + + +
+ Joint channel estimation and signal detection (JCESD) in wireless +communication systems is a crucial and challenging task, especially since it +inherently poses a nonlinear inverse problem. This challenge is further +highlighted in low signal-to-noise ratio (SNR) scenarios, where traditional +algorithms often perform poorly. Deep learning (DL) methods have been +investigated, but concerns regarding computational expense and lack of +validation in low-SNR settings remain. Hence, the development of a robust and +low-complexity model that can deliver excellent performance across a wide range +of SNRs is highly desirable. In this paper, we aim to establish a benchmark +where traditional algorithms and DL methods are validated on different channel +models, Doppler, and SNR settings. In particular, we propose a new DL model +where the backbone network is formed by unrolling the iterative algorithm, and +the hyperparameters are estimated by hypernetworks. Additionally, we adapt a +lightweight DenseNet to the task of JCESD for comparison. We evaluate different +methods in three aspects: generalization in terms of bit error rate (BER), +robustness, and complexity. Our results indicate that DL approaches outperform +traditional algorithms in the challenging low-SNR setting, while the iterative +algorithm performs better in high-SNR settings. Furthermore, the iterative +algorithm is more robust in the presence of carrier frequency offset, whereas +DL methods excel when signals are corrupted by asymmetric Gaussian noise. + +
+
+ comment: Code is available at https://github.com/j991222/MIMO_JCESD +
+
+
+
+
+ + ♻ ☆ Efficient Generalized Low-Rank Tensor Contextual Bandits + + +
+ In this paper, we aim to build a novel bandit algorithm that is capable of
+fully harnessing the power of multi-dimensional data and the inherent
+non-linearity of reward functions to provide highly usable and accountable
+decision-making services. To this end, we introduce a generalized low-rank
+tensor contextual bandits model in which an action is formed from three feature
+vectors, and thus can be represented by a tensor. In this formulation, the
+reward is determined through a generalized linear function applied to the inner
+product of the action's feature tensor and a fixed but unknown parameter tensor
+with a low tubal rank. To effectively achieve the trade-off between exploration
+and exploitation, we introduce a novel algorithm called "Generalized Low-Rank
+Tensor Exploration Subspace then Refine" (G-LowTESTR). This algorithm first
+collects raw data to explore the intrinsic low-rank tensor subspace information
+embedded in the decision-making scenario, and then converts the original
+problem into an almost lower-dimensional generalized linear contextual bandits
+problem. Rigorous theoretical analysis shows that the regret bound of
+G-LowTESTR is superior to those in vectorization and matricization cases. We
+conduct a series of simulations and real data experiments to further highlight
+the effectiveness of G-LowTESTR, leveraging its ability to capitalize on the
+low-rank tensor structure for enhanced learning.
+
+
+
+
+ + ♻ ☆ Post-hoc Bias Scoring Is Optimal For Fair Classification ICLR 2024 + + +
+ We consider a binary classification problem under group fairness constraints,
+which can be one of Demographic Parity (DP), Equalized Opportunity (EOp), or
+Equalized Odds (EO). We propose an explicit characterization of the Bayes
+optimal classifier under the fairness constraints, which turns out to be a
+simple modification rule of the unconstrained classifier. Namely, we introduce
+a novel instance-level measure of bias, which we call bias score, and the
+modification rule is a simple linear rule on top of a finite number of bias
+scores. Based on this characterization, we develop a post-hoc approach that
+allows us to adapt to fairness constraints while maintaining high accuracy. In
+the case of DP and EOp constraints, the modification rule is thresholding a
+single bias score, while in the case of EO constraints we are required to fit a
+linear modification rule with 2 parameters. The method can also be applied for
+composite group-fairness criteria, such as ones involving several sensitive
+attributes.
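The post-hoc flavour of the approach can be illustrated with a simplified Demographic Parity example that shifts a per-group decision threshold on top of an unconstrained classifier's scores; this is a generic post-processing sketch under assumed data, not the paper's bias-score rule.

```python
# Adjust per-group thresholds so positive prediction rates match (Demographic Parity).
import numpy as np

rng = np.random.default_rng(0)
eta = rng.random(1000)                    # P(Y=1 | x) from the unconstrained classifier
group = rng.integers(0, 2, size=1000)     # sensitive attribute

target_rate = (eta >= 0.5).mean()         # overall positive rate to match in each group
thresholds = {g: np.quantile(eta[group == g], 1.0 - target_rate) for g in (0, 1)}
y_hat = np.array([eta[i] >= thresholds[group[i]] for i in range(len(eta))])
print({g: float(y_hat[group == g].mean()) for g in (0, 1)})  # near-equal positive rates
```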
+
+ comment: Accepted for publication at The Twelfth International Conference on + Learning Representations (ICLR 2024) +
+
+
+
+
+ + ♻ ☆ TiMix: Text-aware Image Mixing for Effective Vision-Language + Pre-training AAAI2024 + + +
+ Self-supervised Multi-modal Contrastive Learning (SMCL) remarkably advances
+modern Vision-Language Pre-training (VLP) models by aligning visual and
+linguistic modalities. Due to noise in web-harvested text-image pairs,
+however, scaling up training data volume in SMCL presents considerable
+obstacles in terms of computational cost and data inefficiency. To improve data
+efficiency in VLP, we propose Text-aware Image Mixing (TiMix), which integrates
+mix-based data augmentation techniques into SMCL, yielding significant
+performance improvements without significantly increasing computational
+overhead. We provide a theoretical analysis of TiMix from a mutual information
+(MI) perspective, showing that mixed data samples for cross-modal contrastive
+learning implicitly serve as a regularizer for the contrastive loss. The
+experimental results demonstrate that TiMix exhibits a comparable performance
+on downstream tasks, even with a reduced amount of training data and shorter
+training time, when benchmarked against existing methods. This work empirically
+and theoretically demonstrates the potential of data mixing for data-efficient
+and computationally viable VLP, benefiting broader VLP model adoption in
+practical scenarios.
+
+ comment: Accepted on AAAI2024 +
+
+
+
+
+ + ♻ ☆ Model-Informed Generative Adversarial Network (MI-GAN) for Learning + Optimal Power Flow + + +
+ The optimal power flow (OPF) problem, as a critical component of power system +operations, becomes increasingly difficult to solve due to the variability, +intermittency, and unpredictability of renewable energy brought to the power +system. Although traditional optimization techniques, such as stochastic and +robust optimization approaches, could be leveraged to address the OPF problem, +in the face of renewable energy uncertainty, i.e., the dynamic coefficients in +the optimization model, their effectiveness in dealing with large-scale +problems remains limited. As a result, deep learning techniques, such as neural +networks, have recently been developed to improve computational efficiency in +solving OPF problems with the utilization of data. However, the feasibility and +optimality of the solution may not be guaranteed, and the system dynamics +cannot be properly addressed as well. In this paper, we propose an optimization +model-informed generative adversarial network (MI-GAN) framework to solve OPF +under uncertainty. The main contributions are summarized into three aspects: +(1) to ensure feasibility and improve optimality of generated solutions, three +important layers are proposed: feasibility filter layer, comparison layer, and +gradient-guided layer; (2) in the GAN-based framework, an efficient +model-informed selector incorporating these three new layers is established; +and (3) a new recursive iteration algorithm is also proposed to improve +solution optimality and handle the system dynamics. The numerical results on +IEEE test systems show that the proposed method is very effective and +promising. + +
+
+
+
+
+ + ♻ ☆ VertiBench: Advancing Feature Distribution Diversity in Vertical + Federated Learning Benchmarks + + +
+ Vertical Federated Learning (VFL) is a crucial paradigm for training machine +learning models on feature-partitioned, distributed data. However, due to +privacy restrictions, few public real-world VFL datasets exist for algorithm +evaluation, and these represent a limited array of feature distributions. +Existing benchmarks often resort to synthetic datasets, derived from arbitrary +feature splits from a global set, which only capture a subset of feature +distributions, leading to inadequate algorithm performance assessment. This +paper addresses these shortcomings by introducing two key factors affecting VFL +performance - feature importance and feature correlation - and proposing +associated evaluation metrics and dataset splitting methods. Additionally, we +introduce a real VFL dataset to address the deficit in image-image VFL +scenarios. Our comprehensive evaluation of cutting-edge VFL algorithms provides +valuable insights for future research in the field. + +
+
+
+
+
+ + ♻ ☆ Bounds on the price of feedback for mistake-bounded online learning + + +
+ We improve several worst-case bounds for various online learning scenarios +from (Auer and Long, Machine Learning, 1999). In particular, we sharpen an +upper bound for delayed ambiguous reinforcement learning by a factor of 2 and +an upper bound for learning compositions of families of functions by a factor +of 2.41. We also improve a lower bound from the same paper for learning +compositions of $k$ families of functions by a factor of $\Theta(\ln{k})$, +matching the upper bound up to a constant factor. In addition, we solve a +problem from (Long, Theoretical Computer Science, 2020) on the price of bandit +feedback with respect to standard feedback for multiclass learning, and we +improve an upper bound from (Feng et al., Theoretical Computer Science, 2023) +on the price of $r$-input delayed ambiguous reinforcement learning by a factor +of $r$, matching a lower bound from the same paper up to the leading term. + +
+
+
+
+
+
+            ♻ ☆ Efficient Reinforcement Learning via Decoupling Exploration and
+            Utilization
+
+
+ Deep neural network (DNN) generalization is limited by the over-reliance of
+current offline reinforcement learning techniques on conservative processing of
+existing datasets. This method frequently results in algorithms that settle for
+suboptimal solutions that only adjust to a certain dataset. Similarly, in
+online reinforcement learning, the previously imposed punitive pessimism also
+deprives the model of its exploratory potential. Our research proposes a novel
+framework, Optimistic and Pessimistic Actor Reinforcement Learning (OPARL).
+OPARL employs a unique dual-actor approach: an optimistic actor dedicated to
+exploration and a pessimistic actor focused on utilization, thereby effectively
+differentiating between exploration and utilization strategies. This unique
+combination in reinforcement learning methods fosters a more balanced and
+efficient approach. It enables the optimization of policies that focus on
+actions yielding high rewards through pessimistic utilization strategies, while
+also ensuring extensive state coverage via optimistic exploration. Experiments
+and theoretical analysis demonstrate that OPARL improves agents' capacities for
+utilization and exploration. In most tasks of the DMControl benchmark and the
+MuJoCo environment, OPARL performed better than state-of-the-art methods. Our
+code has been released at https://github.com/yydsok/OPARL
+
+ comment: Update V3 +
+
+
+
+
+ + ♻ ☆ Matching of Users and Creators in Two-Sided Markets with Departures + + +
+ Many online platforms of today, including social media sites, are two-sided +markets bridging content creators and users. Most of the existing literature on +platform recommendation algorithms largely focuses on user preferences and +decisions, and does not simultaneously address creator incentives. We propose a +model of content recommendation that explicitly focuses on the dynamics of +user-content matching, with the novel property that both users and creators may +leave the platform permanently if they do not experience sufficient engagement. +In our model, each player decides to participate at each time step based on +utilities derived from the current match: users based on alignment of the +recommended content with their preferences, and creators based on their +audience size. We show that a user-centric greedy algorithm that does not +consider creator departures can result in arbitrarily poor total engagement, +relative to an algorithm that maximizes total engagement while accounting for +two-sided departures. Moreover, in stark contrast to the case where only users +or only creators leave the platform, we prove that with two-sided departures, +approximating maximum total engagement within any constant factor is NP-hard. +We present two practical algorithms, one with performance guarantees under mild +assumptions on user preferences, and another that tends to outperform +algorithms that ignore two-sided departures in practice. + +
+
+
+
+
+ + ♻ ☆ Dynamic Fault Characteristics Evaluation in Power Grid + + +
+ To enhance the degree of intelligence in operation and maintenance, a novel
+method for fault detection in power grids is proposed. The proposed GNN-based
+approach first identifies fault nodes through a specialized feature extraction
+method coupled with a knowledge graph. By incorporating temporal data, the
+method leverages the status of nodes from preceding and subsequent time periods
+to aid current fault detection. To validate the effectiveness of the node
+features, a correlation analysis of the output features from each node was
+conducted. Experimental results show that this method can locate fault nodes in
+simulation scenarios with remarkable accuracy. Additionally, the graph neural
+network-based feature modeling allows for a qualitative examination of how
+faults spread across nodes, providing valuable insights for analyzing fault
+nodes.
+
+
+
+
+ + ♻ ☆ Last-Iterate Convergent Policy Gradient Primal-Dual Methods for + Constrained MDPs NeurIPS 2023 + + +
+ We study the problem of computing an optimal policy of an infinite-horizon +discounted constrained Markov decision process (constrained MDP). Despite the +popularity of Lagrangian-based policy search methods used in practice, the +oscillation of policy iterates in these methods has not been fully understood, +bringing out issues such as violation of constraints and sensitivity to +hyper-parameters. To fill this gap, we employ the Lagrangian method to cast a +constrained MDP into a constrained saddle-point problem in which max/min +players correspond to primal/dual variables, respectively, and develop two +single-time-scale policy-based primal-dual algorithms with non-asymptotic +convergence of their policy iterates to an optimal constrained policy. +Specifically, we first propose a regularized policy gradient primal-dual +(RPG-PD) method that updates the policy using an entropy-regularized policy +gradient, and the dual variable via a quadratic-regularized gradient ascent, +simultaneously. We prove that the policy primal-dual iterates of RPG-PD +converge to a regularized saddle point with a sublinear rate, while the policy +iterates converge sublinearly to an optimal constrained policy. We further +instantiate RPG-PD in large state or action spaces by including function +approximation in policy parametrization, and establish similar sublinear +last-iterate policy convergence. Second, we propose an optimistic policy +gradient primal-dual (OPG-PD) method that employs the optimistic gradient +method to update primal/dual variables, simultaneously. We prove that the +policy primal-dual iterates of OPG-PD converge to a saddle point that contains +an optimal constrained policy, with a linear rate. To the best of our +knowledge, this work appears to be the first non-asymptotic policy last-iterate +convergence result for single-time-scale algorithms in constrained MDPs. + +
+
+ comment: 65 pages, 17 figures, and 1 table; NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel + Identification + + +
+ Highly parallelized workloads like machine learning training, inferences and
+general HPC tasks are greatly accelerated using GPU devices. In a cloud
+computing cluster, sharing a GPU's computation power among multiple tasks is in
+high demand, since there are always more task requests than GPUs available.
+Existing GPU sharing solutions focus on reducing task-level waiting time or
+task-level switching costs when multiple jobs compete for a single GPU.
+Non-stop computation requests come with different priorities, which have an
+asymmetric impact on the QoS of GPU sharing. Existing work missed the
+kernel-level optimization opportunity brought by this setting. To address this
+problem, we present a novel kernel-level scheduling strategy called FIKIT:
+Filling Inter-kernel Idle Time. FIKIT incorporates task-level priority
+information, fine-grained kernel identification, and kernel measurement,
+allowing low-priority tasks to execute during high-priority tasks' inter-kernel
+idle time. This fills the GPU's device runtime more fully and reduces the
+overall impact of GPU sharing on cloud services. Across a set of ML models, the
+FIKIT-based inference system accelerated high-priority tasks by 1.33 to 14.87
+times compared to the JCT in GPU sharing mode, and more than half of the cases
+were accelerated by more than 3.5 times. Meanwhile, under preemptive sharing,
+low-priority tasks have a JCT comparable to the default GPU sharing mode, with
+a ratio of 0.84 to 1. We further limit the kernel measurement and runtime
+fine-grained kernel scheduling overhead to less than 10%.
+
+ comment: 20 pages, 20 figures. Delete a duplicated paragraph in the + introduction section; Add more experiments with 2 additional figures; Update + the conclusion +
+
+
+
+
+ + ♻ ☆ Understanding Addition in Transformers ICLR 2024 + + +
+ Understanding the inner workings of machine learning models like Transformers +is vital for their safe and ethical use. This paper presents an in-depth +analysis of a one-layer Transformer model trained for n-digit integer addition. +We reveal that the model divides the task into parallel, digit-specific streams +and employs distinct algorithms for different digit positions. Our study also +finds that the model starts calculations late but executes them rapidly. A rare +use case with high loss is identified and explained. Overall, the model's +algorithm is explained in detail. These findings are validated through rigorous +testing and mathematical modeling, contributing to the broader works in +Mechanistic Interpretability, AI safety, and alignment. Our approach opens the +door for analyzing more complex tasks and multi-layer Transformer models. + +
+
+ comment: 9 pages, 8 figures, accepted by ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Towards Best Practices of Activation Patching in Language Models: + Metrics and Methods ICLR 2024 + + +
+ Mechanistic interpretability seeks to understand the internal mechanisms of +machine learning models, where localization -- identifying the important model +components -- is a key step. Activation patching, also known as causal tracing +or interchange intervention, is a standard technique for this task (Vig et al., +2020), but the literature contains many variants with little consensus on the +choice of hyperparameters or methodology. In this work, we systematically +examine the impact of methodological details in activation patching, including +evaluation metrics and corruption methods. In several settings of localization +and circuit discovery in language models, we find that varying these +hyperparameters could lead to disparate interpretability results. Backed by +empirical observations, we give conceptual arguments for why certain metrics or +methods may be preferred. Finally, we provide recommendations for the best +practices of activation patching going forwards. + +
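+ For readers unfamiliar with the technique being benchmarked, the sketch below
+shows one common activation-patching variant in PyTorch: cache a module's
+activation from a clean run, splice it into a corrupted run, and measure how
+much a chosen metric recovers. The model, layer_module, and metric_fn are
+placeholders (real transformer blocks may return tuples that need unpacking),
+and this is not code from the paper.
+
+import torch
+
+def activation_patch(model, clean_ids, corrupt_ids, layer_module, metric_fn):
+    cache = {}
+
+    def save_hook(module, inputs, output):
+        cache["clean"] = output.detach()
+
+    def patch_hook(module, inputs, output):
+        # Returning a value from a forward hook replaces the module's output.
+        return cache["clean"]
+
+    handle = layer_module.register_forward_hook(save_hook)
+    with torch.no_grad():
+        model(clean_ids)                      # clean run: cache the activation
+    handle.remove()
+
+    with torch.no_grad():
+        corrupt_out = model(corrupt_ids)      # corrupted baseline run
+
+    handle = layer_module.register_forward_hook(patch_hook)
+    with torch.no_grad():
+        patched_out = model(corrupt_ids)      # corrupted run with clean patch
+    handle.remove()
+
+    return metric_fn(patched_out) - metric_fn(corrupt_out)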
+
+ comment: 27 pages. ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Use of Prior Knowledge to Discover Causal Additive Models with + Unobserved Variables and its Application to Time Series Data + + +
+ This paper proposes two methods for causal additive models with unobserved
+variables (CAM-UV). CAM-UV assumes that the causal functions take the form of
+generalized additive models and that latent confounders are present. First, we
+propose a method that leverages prior knowledge for efficient causal discovery.
+Then, we propose an extension of this method for inferring causality in time
+series data. The original CAM-UV algorithm differs from other existing causal
+function models in that it does not seek the causal order between observed
+variables, but rather aims to identify the causes for each observed variable.
+Therefore, the first proposed method in this paper utilizes prior knowledge,
+such as understanding that certain variables cannot be causes of specific
+others. Moreover, by incorporating the prior knowledge that causes precede
+their effects in time, we extend the first algorithm to the second method for
+causal discovery in time series data. We validate the first proposed method by
+using simulated data to demonstrate that the accuracy of causal discovery
+increases as more prior knowledge is accumulated. Additionally, we test the
+second proposed method by comparing it with existing time series causal
+discovery methods, using both simulated data and real-world data.
+
<br>
+
+
+
+
+ + ♻ ☆ Deep Evolutional Instant Interest Network for CTR Prediction in + Trigger-Induced Recommendation WSDM'2024 + + +
+ Recommendation has been playing a key role in many industries, e.g.,
+e-commerce, streaming media, social media, etc. Recently, a new recommendation
+scenario, called Trigger-Induced Recommendation (TIR), where users are able to
+explicitly express their instant interests via trigger items, is emerging as an
+essential scenario on many e-commerce platforms, e.g., Alibaba.com and Amazon.
+Without explicitly modeling the user's instant interest, traditional
+recommendation methods usually obtain sub-optimal results in TIR. Even though a
+few methods consider the trigger and target items simultaneously to solve this
+problem, they still do not take into account the temporal information of user
+behaviors, the dynamic change of user instant interest as the user scrolls
+down, or the interactions between the trigger and target items. To tackle these
+problems, we propose a novel method -- Deep Evolutional Instant Interest
+Network (DEI2N), for click-through rate prediction in TIR scenarios.
+Specifically, we design a User Instant Interest Modeling Layer to predict the
+dynamic change of the intensity of instant interest when the user scrolls down.
+Temporal information is utilized in user behavior modeling. Moreover, an
+Interaction Layer is introduced to learn better interactions between the
+trigger and target items. We evaluate our method on several offline and
+real-world industrial datasets. Experimental results show that our proposed
+DEI2N outperforms state-of-the-art baselines. In addition, online A/B testing
+demonstrates its superiority over the existing baseline in real-world
+production environments.
+
<br>
+
+ comment: 7 pages, 3 figures, accepted by the 17th ACM International Conference
+ on Web Search and Data Mining (WSDM'2024)
+
<br>
+
+
+
+
+ + ♻ ☆ Exploiting Inter-Layer Expert Affinity for Accelerating + Mixture-of-Experts Model Inference + + +
+ In large language models like the Generative Pre-trained Transformer, the
+Mixture of Experts paradigm has emerged as a powerful technique for enhancing
+model expressiveness and accuracy. However, deploying GPT MoE models for
+parallel inference on distributed systems presents significant challenges,
+primarily due to the extensive Alltoall communication required for expert
+routing and aggregation. This communication bottleneck exacerbates the already
+complex computational landscape, hindering the efficient utilization of
+high-performance computing resources. In this paper, we propose a lightweight
+optimization technique called ExFlow, to greatly accelerate the inference of
+these MoE models. We take a new perspective on alleviating the communication
+overhead by exploiting the inter-layer expert affinity. Unlike previous
+methods, our solution can be directly applied to pre-trained MoE models without
+any fine-tuning or accuracy degradation. By proposing a context-coherent expert
+parallelism on distributed systems, our design only uses one Alltoall
+communication to deliver the same functionality while previous methods all
+require two Alltoalls. By carefully examining the conditional probability in
+tokens' routing across multiple layers, we prove that pre-trained GPT MoE
+models implicitly exhibit a strong inter-layer expert affinity. We then design
+an efficient integer programming model to capture such features and show that
+by properly placing the experts on corresponding GPUs, we can reduce up to 67%
+cross-GPU routing latency. Our solution outperforms cutting-edge MoE
+implementations with expert counts ranging from 8 to 64, achieving up to 2.2x
+improvement in inference throughput. We further provide a detailed study of how
+the model implicitly acquires this expert affinity at the very early training
+stage and how this affinity evolves and stabilizes during training.
+
<br>
+
+
+
+
+ + ♻ ☆ On-Policy Distillation of Language Models: Learning from Self-Generated + Mistakes ICLR 2024 + + +
+ Knowledge distillation (KD) is widely used for compressing a teacher model to +reduce its inference cost and memory footprint, by training a smaller student +model. However, current KD methods for auto-regressive sequence models suffer +from distribution mismatch between output sequences seen during training and +those generated by the student during inference. To address this issue, we +introduce Generalized Knowledge Distillation (GKD). Instead of solely relying +on a fixed set of output sequences, GKD trains the student on its +self-generated output sequences by leveraging feedback from the teacher on such +sequences. Unlike supervised KD approaches, GKD also offers the flexibility to +employ alternative loss functions between the student and teacher, which can be +useful when the student lacks the expressivity to mimic the teacher's +distribution. Furthermore, GKD facilitates the seamless integration of +distillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for +distilling auto-regressive language models on summarization, translation, and +arithmetic reasoning tasks, and task-agnostic distillation for +instruction-tuning. + +
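+ A minimal sketch of the on-policy distillation loop described above, assuming
+HuggingFace-style student/teacher causal LMs; the beta-weighted mix of forward
+and reverse KL is a simplified stand-in for the paper's generalized divergence,
+and all names are illustrative.
+
+import torch
+import torch.nn.functional as F
+
+def on_policy_distill_step(student, teacher, prompt_ids, max_new_tokens, beta):
+    # 1) The student samples its own continuation (on-policy data).
+    with torch.no_grad():
+        gen_ids = student.generate(prompt_ids, max_new_tokens=max_new_tokens,
+                                   do_sample=True)
+    # 2) Teacher and student both score the student-generated sequence.
+    s_logp = F.log_softmax(student(gen_ids).logits, dim=-1)
+    with torch.no_grad():
+        t_logp = F.log_softmax(teacher(gen_ids).logits, dim=-1)
+    # 3) beta = 0 gives forward KL(teacher || student); beta = 1 gives reverse
+    #    KL(student || teacher); values in between interpolate.
+    fwd = F.kl_div(s_logp, t_logp, log_target=True, reduction="batchmean")
+    rev = F.kl_div(t_logp, s_logp, log_target=True, reduction="batchmean")
+    loss = (1.0 - beta) * fwd + beta * rev
+    loss.backward()
+    return loss.item()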
+
+ comment: Accepted at ICLR 2024. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Lyapunov Function Consistent Adaptive Network Signal Control with Back + Pressure and Reinforcement Learning + + +
+ In traffic signal control, flow-based methods (optimizing the overall flow)
+and pressure-based methods (equalizing and alleviating congestion) are commonly
+used but often considered separately. This study introduces a unified framework
+using Lyapunov control theory, defining specific Lyapunov functions for each of
+these methods. This analysis yields interesting results; for example, the
+well-recognized back-pressure method is equivalent to differential queue
+lengths weighted by intersection lane saturation flows. We further improve it
+by adding basic traffic flow theory. Rather than merely ensuring that the
+control system is stable, the system should also be able to adapt to various
+performance metrics. Building on insights from Lyapunov theory, this study
+designs a reward function for Reinforcement Learning (RL)-based network signal
+control, whose agent is trained with a Double Deep Q-Network (DDQN) for
+effective control over complex traffic networks. The proposed algorithm is
+compared with several traditional and RL-based methods under pure passenger-car
+flow and under heterogeneous traffic flow including freight. The numerical
+tests demonstrate that the proposed method outperforms the alternative control
+methods across different traffic scenarios, covering corridor and general
+network situations, each with varying traffic demands, in terms of the average
+network waiting time per vehicle.
+
<br>
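+ Since the abstract notes that back-pressure reduces to differential queue
+lengths weighted by lane saturation flows, a tiny illustrative sketch of that
+classic rule is given below; the data structures are hypothetical and this is
+not the paper's RL controller.
+
+def backpressure_phase(phases, queue, sat_flow):
+    # phases: phase id mapped to a list of (upstream_lane, downstream_lane)
+    # queue: lane id mapped to its current queue length
+    # sat_flow: lane id mapped to its saturation flow rate
+    def pressure(movements):
+        total = 0.0
+        for up, down in movements:
+            total += sat_flow[up] * (queue[up] - queue.get(down, 0.0))
+        return total
+    # Activate the phase that relieves the largest weighted queue differential.
+    return max(phases, key=lambda p: pressure(phases[p]))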
+
+
+
+
+ + ♻ ☆ MMSFormer: Multimodal Transformer for Material and Semantic Segmentation + + +
+ Leveraging information across diverse modalities is known to enhance +performance on multimodal segmentation tasks. However, effectively fusing +information from different modalities remains challenging due to the unique +characteristics of each modality. In this paper, we propose a novel fusion +strategy that can effectively fuse information from different modality +combinations. We also propose a new model named Multi-Modal Segmentation +TransFormer (MMSFormer) that incorporates the proposed fusion strategy to +perform multimodal material and semantic segmentation tasks. MMSFormer +outperforms current state-of-the-art models on three different datasets. As we +begin with only one input modality, performance improves progressively as +additional modalities are incorporated, showcasing the effectiveness of the +fusion block in combining useful information from diverse input modalities. +Ablation studies show that different modules in the fusion block are crucial +for overall model performance. Furthermore, our ablation studies also highlight +the capacity of different input modalities to improve performance in the +identification of different types of materials. The code and pretrained models +will be made available at https://github.com/csiplab/MMSFormer. + +
+
+ comment: 14 pages, 3 figures, 8 tables +
+
+
+
+
+ + ♻ ☆ KinSPEAK: Improving speech recognition for Kinyarwanda via + semi-supervised learning methods + + +
+ Despite recent availability of large transcribed Kinyarwanda speech data, +achieving robust speech recognition for Kinyarwanda is still challenging. In +this work, we show that using self-supervised pre-training, following a simple +curriculum schedule during fine-tuning and using semi-supervised learning to +leverage large unlabelled speech data significantly improve speech recognition +performance for Kinyarwanda. Our approach focuses on using public domain data +only. A new studio-quality speech dataset is collected from a public website, +then used to train a clean baseline model. The clean baseline model is then +used to rank examples from a more diverse and noisy public dataset, defining a +simple curriculum training schedule. Finally, we apply semi-supervised learning +to label and learn from large unlabelled data in four successive generations. +Our final model achieves 3.2% word error rate (WER) on the new dataset and +15.9% WER on Mozilla Common Voice benchmark, which is state-of-the-art to the +best of our knowledge. Our experiments also indicate that using syllabic rather +than character-based tokenization results in better speech recognition +performance for Kinyarwanda. + +
+
+ comment: 9 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Approximating Numerical Fluxes Using Fourier Neural Operators for + Hyperbolic Conservation Laws + + +
+ Traditionally, classical numerical schemes have been employed to solve +partial differential equations (PDEs) using computational methods. Recently, +neural network-based methods have emerged. Despite these advancements, neural +network-based methods, such as physics-informed neural networks (PINNs) and +neural operators, exhibit deficiencies in robustness and generalization. To +address these issues, numerous studies have integrated classical numerical +frameworks with machine learning techniques, incorporating neural networks into +parts of traditional numerical methods. In this study, we focus on hyperbolic +conservation laws by replacing traditional numerical fluxes with neural +operators. To this end, we developed loss functions inspired by established +numerical schemes related to conservation laws and approximated numerical +fluxes using Fourier neural operators (FNOs). Our experiments demonstrated that +our approach combines the strengths of both traditional numerical schemes and +FNOs, outperforming standard FNO methods in several respects. For instance, we +demonstrate that our method is robust, has resolution invariance, and is +feasible as a data-driven method. In particular, our method can make continuous +predictions over time and exhibits superior generalization capabilities with +out-of-distribution (OOD) samples, which are challenges that existing neural +operator methods encounter. + +
+
+ comment: 26 pages, 28 figures +
+
+
+
+
+ + ♻ ☆ Degeneracy is OK: Logarithmic Regret for Network Revenue Management with + Indiscrete Distributions + + +
+ We study the classical Network Revenue Management (NRM) problem with +accept/reject decisions and $T$ IID arrivals. We consider a distributional form +where each arrival must fall under a finite number of possible categories, each +with a deterministic resource consumption vector, but a random value +distributed continuously over an interval. We develop an online algorithm that +achieves $O(\log^2 T)$ regret under this model, with the only (necessary) +assumption being that the probability densities are bounded away from 0. We +derive a second result that achieves $O(\log T)$ regret under an additional +assumption of second-order growth. To our knowledge, these are the first +results achieving logarithmic-level regret in an NRM model with continuous +values that do not require any kind of ``non-degeneracy'' assumptions. Our +results are achieved via new techniques including a new method of bounding +myopic regret, a ``semi-fluid'' relaxation of the offline allocation, and an +improved bound on the ``dual convergence''. + +
+
+
+
+
+ + ♻ ☆ Efficient Adaptation of Large Vision Transformer via Adapter + Re-Composing NeurIPS 2023 + + +
+ The advent of high-capacity pre-trained models has revolutionized +problem-solving in computer vision, shifting the focus from training +task-specific models to adapting pre-trained models. Consequently, effectively +adapting large pre-trained models to downstream tasks in an efficient manner +has become a prominent research area. Existing solutions primarily concentrate +on designing lightweight adapters and their interaction with pre-trained +models, with the goal of minimizing the number of parameters requiring updates. +In this study, we propose a novel Adapter Re-Composing (ARC) strategy that +addresses efficient pre-trained model adaptation from a fresh perspective. Our +approach considers the reusability of adaptation parameters and introduces a +parameter-sharing scheme. Specifically, we leverage symmetric +down-/up-projections to construct bottleneck operations, which are shared +across layers. By learning low-dimensional re-scaling coefficients, we can +effectively re-compose layer-adaptive adapters. This parameter-sharing strategy +in adapter design allows us to significantly reduce the number of new +parameters while maintaining satisfactory performance, thereby offering a +promising approach to compress the adaptation cost. We conduct experiments on +24 downstream image classification tasks using various Vision Transformer +variants to evaluate our method. The results demonstrate that our approach +achieves compelling transfer learning performance with a reduced parameter +count. Our code is available at +\href{https://github.com/DavidYanAnDe/ARC}{https://github.com/DavidYanAnDe/ARC}. + +
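+ To make the parameter-sharing idea concrete, here is a rough PyTorch sketch of
+an ARC-style adapter in which the bottleneck projection is shared across layers
+and each layer learns only a low-dimensional re-scaling vector; dimensions and
+names are assumptions, not the released implementation (see the linked
+repository for the real code).
+
+import torch
+import torch.nn as nn
+
+class SharedBottleneckAdapter(nn.Module):
+    def __init__(self, shared_down, bottleneck_dim):
+        super().__init__()
+        # shared_down: nn.Parameter of shape (d_model, r), shared by all layers.
+        self.shared_down = shared_down
+        # Only this small vector is layer-specific.
+        self.scale = nn.Parameter(torch.ones(bottleneck_dim))
+
+    def forward(self, x):
+        h = x @ self.shared_down               # shared down-projection
+        h = h * self.scale                     # layer-adaptive re-scaling
+        return x + h @ self.shared_down.t()    # symmetric shared up-projection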
+
+ comment: Paper is accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Generalized test utilities for long-tail performance in extreme + multi-label classification NeurIPS 2023 + + +
+ Extreme multi-label classification (XMLC) is the task of selecting a small +subset of relevant labels from a very large set of possible labels. As such, it +is characterized by long-tail labels, i.e., most labels have very few positive +instances. With standard performance measures such as precision@k, a classifier +can ignore tail labels and still report good performance. However, it is often +argued that correct predictions in the tail are more "interesting" or +"rewarding," but the community has not yet settled on a metric capturing this +intuitive concept. The existing propensity-scored metrics fall short on this +goal by confounding the problems of long-tail and missing labels. In this +paper, we analyze generalized metrics budgeted "at k" as an alternative +solution. To tackle the challenging problem of optimizing these metrics, we +formulate it in the expected test utility (ETU) framework, which aims to +optimize the expected performance on a fixed test set. We derive optimal +prediction rules and construct computationally efficient approximations with +provable regret guarantees and robustness against model misspecification. Our +algorithm, based on block coordinate ascent, scales effortlessly to XMLC +problems and obtains promising results in terms of long-tail performance. + +
+
+ comment: This is the authors' version of the work accepted to NeurIPS 2023; + the final version of the paper, errors and typos corrected, and minor + modifications to improve clarity +
+
+
+
+
+ + ♻ ☆ Symbolic Regression on FPGAs for Fast Machine Learning Inference + + +
+ The high-energy physics community is investigating the potential of deploying +machine-learning-based solutions on Field-Programmable Gate Arrays (FPGAs) to +enhance physics sensitivity while still meeting data processing time +constraints. In this contribution, we introduce a novel end-to-end procedure +that utilizes a machine learning technique called symbolic regression (SR). It +searches the equation space to discover algebraic relations approximating a +dataset. We use PySR (a software to uncover these expressions based on an +evolutionary algorithm) and extend the functionality of hls4ml (a package for +machine learning inference in FPGAs) to support PySR-generated expressions for +resource-constrained production environments. Deep learning models often +optimize the top metric by pinning the network size because the vast +hyperparameter space prevents an extensive search for neural architecture. +Conversely, SR selects a set of models on the Pareto front, which allows for +optimizing the performance-resource trade-off directly. By embedding symbolic +forms, our implementation can dramatically reduce the computational resources +needed to perform critical tasks. We validate our method on a physics +benchmark: the multiclass classification of jets produced in simulated +proton-proton collisions at the CERN Large Hadron Collider. We show that our +approach can approximate a 3-layer neural network using an inference model that +achieves up to a 13-fold decrease in execution time, down to 5 ns, while still +preserving more than 90% approximation accuracy. + +
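+ A hedged illustration of the symbolic-regression step (assuming the public
+PySR interface; settings and feature names are placeholders, and the extended
+hls4ml conversion step is omitted):
+
+import numpy as np
+from pysr import PySRRegressor
+
+X = np.random.randn(1000, 8)                    # stand-in jet features
+y = np.tanh(X[:, 0] * X[:, 1]) + 0.1 * X[:, 2]  # stand-in tagging score
+
+model = PySRRegressor(
+    niterations=40,
+    binary_operators=["+", "-", "*"],
+    unary_operators=["tanh"],
+    maxsize=20,        # caps expression complexity, i.e., FPGA resource usage
+)
+model.fit(X, y)
+print(model)           # inspect the Pareto front of candidate expressions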
+
+ comment: 9 pages. Accepted to 26th International Conference on Computing in + High Energy & Nuclear Physics (CHEP 2023) +
+
+
+
+
+ + ♻ ☆ Circuit Component Reuse Across Tasks in Transformer Language Models ICLR 2024 + + +
+ Recent work in mechanistic interpretability has shown that behaviors in +language models can be successfully reverse-engineered through circuit +analysis. A common criticism, however, is that each circuit is task-specific, +and thus such analysis cannot contribute to understanding the models at a +higher level. In this work, we present evidence that insights (both low-level +findings about specific heads and higher-level findings about general +algorithms) can indeed generalize across tasks. Specifically, we study the +circuit discovered in Wang et al. (2022) for the Indirect Object Identification +(IOI) task and 1.) show that it reproduces on a larger GPT2 model, and 2.) that +it is mostly reused to solve a seemingly different task: Colored Objects +(Ippolito & Callison-Burch, 2023). We provide evidence that the process +underlying both tasks is functionally very similar, and contains about a 78% +overlap in in-circuit attention heads. We further present a proof-of-concept +intervention experiment, in which we adjust four attention heads in middle +layers in order to 'repair' the Colored Objects circuit and make it behave like +the IOI circuit. In doing so, we boost accuracy from 49.6% to 93.7% on the +Colored Objects task and explain most sources of error. The intervention +affects downstream attention heads in specific ways predicted by their +interactions in the IOI circuit, indicating that this subcircuit behavior is +invariant to the different task inputs. Overall, our results provide evidence +that it may yet be possible to explain large language models' behavior in terms +of a relatively small number of interpretable task-general algorithmic building +blocks and computational components. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Normality-Guided Distributional Reinforcement Learning for Continuous + Control + + +
+ Learning a predictive model of the mean return, or value function, plays a
+critical role in many reinforcement learning algorithms. Distributional
+reinforcement learning (DRL) has been shown to improve performance by modeling
+the value distribution, not just the mean. We study the value distribution in
+several continuous control tasks and find that the learned value distribution
+is empirically quite close to normal. We design a method that exploits this
+property, employing variances predicted by a variance network, along with
+returns, to analytically compute target quantile bars representing a normal
+distribution for our distributional value function. In addition, we propose a
+policy update strategy based on correctness as measured by structural
+characteristics of the value distribution that are not present in the standard
+value function. The approach we outline is compatible with many DRL structures.
+We use two representative on-policy algorithms, PPO and TRPO, as testbeds. Our
+method yields statistically significant improvements in 10 out of 16 continuous
+task settings, while utilizing a reduced number of weights and achieving faster
+training time compared to an ensemble-based method for quantifying value
+distribution uncertainty.
+
<br>
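+ A small sketch of the normality assumption in action: if the return estimate
+and a variance-network prediction parameterize a normal, the target quantile
+bars can be placed analytically at its quantile midpoints. Names are
+illustrative and this is not the authors' code.
+
+import numpy as np
+from scipy.stats import norm
+
+def normal_quantile_targets(mean_return, predicted_var, n_quantiles):
+    taus = (np.arange(n_quantiles) + 0.5) / n_quantiles  # midpoint levels
+    std = np.sqrt(np.maximum(predicted_var, 1e-8))
+    return mean_return + std * norm.ppf(taus)             # one bar per quantile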
+
+
+
+
+ + ♻ ☆ Climate-Invariant Machine Learning + + +
+ Projecting climate change is a generalization problem: we extrapolate the +recent past using physical models across past, present, and future climates. +Current climate models require representations of processes that occur at +scales smaller than model grid size, which have been the main source of model +projection uncertainty. Recent machine learning (ML) algorithms hold promise to +improve such process representations, but tend to extrapolate poorly to climate +regimes they were not trained on. To get the best of the physical and +statistical worlds, we propose a new framework - termed "climate-invariant" ML +- incorporating knowledge of climate processes into ML algorithms, and show +that it can maintain high offline accuracy across a wide range of climate +conditions and configurations in three distinct atmospheric models. Our results +suggest that explicitly incorporating physical knowledge into data-driven +models of Earth system processes can improve their consistency, data +efficiency, and generalizability across climate regimes. + +
+
+ comment: 26+28 pages, 9+15 figures, 0+3 tables in the main text + + supplementary materials. Accepted for publication in Science Advances on Jan + 5, 2024 +
+
+
+
+
+ + ♻ ☆ BridgeData V2: A Dataset for Robot Learning at Scale + + +
+ We introduce BridgeData V2, a large and diverse dataset of robotic +manipulation behaviors designed to facilitate research on scalable robot +learning. BridgeData V2 contains 60,096 trajectories collected across 24 +environments on a publicly available low-cost robot. BridgeData V2 provides +extensive task and environment variability, leading to skills that can +generalize across environments, domains, and institutions, making the dataset a +useful resource for a broad range of researchers. Additionally, the dataset is +compatible with a wide variety of open-vocabulary, multi-task learning methods +conditioned on goal images or natural language instructions. In our +experiments, we train 6 state-of-the-art imitation learning and offline +reinforcement learning methods on our dataset, and find that they succeed on a +suite of tasks requiring varying amounts of generalization. We also demonstrate +that the performance of these methods improves with more data and higher +capacity models, and that training on a greater variety of skills leads to +improved generalization. By publicly sharing BridgeData V2 and our pre-trained +models, we aim to accelerate research in scalable robot learning methods. +Project page at https://rail-berkeley.github.io/bridgedata + +
+
+ comment: 9 pages +
+
+
+
+
+ + ♻ ☆ Sleeper Agents: Training Deceptive LLMs that Persist Through Safety + Training + + +
+ Humans are capable of strategically deceptive behavior: behaving helpfully in +most situations, but then behaving very differently in order to pursue +alternative objectives when given the opportunity. If an AI system learned such +a deceptive strategy, could we detect it and remove it using current +state-of-the-art safety training techniques? To study this question, we +construct proof-of-concept examples of deceptive behavior in large language +models (LLMs). For example, we train models that write secure code when the +prompt states that the year is 2023, but insert exploitable code when the +stated year is 2024. We find that such backdoor behavior can be made +persistent, so that it is not removed by standard safety training techniques, +including supervised fine-tuning, reinforcement learning, and adversarial +training (eliciting unsafe behavior and then training to remove it). The +backdoor behavior is most persistent in the largest models and in models +trained to produce chain-of-thought reasoning about deceiving the training +process, with the persistence remaining even when the chain-of-thought is +distilled away. Furthermore, rather than removing backdoors, we find that +adversarial training can teach models to better recognize their backdoor +triggers, effectively hiding the unsafe behavior. Our results suggest that, +once a model exhibits deceptive behavior, standard techniques could fail to +remove such deception and create a false impression of safety. + +
+
+ comment: updated to add missing acknowledgements +
+
+
+
+
+ + ♻ ☆ Black Box Variational Inference with a Deterministic Objective: Faster, + More Accurate, and Even More Black Box + + +
+ Automatic differentiation variational inference (ADVI) offers fast and +easy-to-use posterior approximation in multiple modern probabilistic +programming languages. However, its stochastic optimizer lacks clear +convergence criteria and requires tuning parameters. Moreover, ADVI inherits +the poor posterior uncertainty estimates of mean-field variational Bayes +(MFVB). We introduce "deterministic ADVI" (DADVI) to address these issues. +DADVI replaces the intractable MFVB objective with a fixed Monte Carlo +approximation, a technique known in the stochastic optimization literature as +the "sample average approximation" (SAA). By optimizing an approximate but +deterministic objective, DADVI can use off-the-shelf second-order optimization, +and, unlike standard mean-field ADVI, is amenable to more accurate posterior +covariances via linear response (LR). In contrast to existing worst-case +theory, we show that, on certain classes of common statistical problems, DADVI +and the SAA can perform well with relatively few samples even in very high +dimensions, though we also show that such favorable results cannot extend to +variational approximations that are too expressive relative to mean-field ADVI. +We show on a variety of real-world problems that DADVI reliably finds good +solutions with default settings (unlike ADVI) and, together with LR +covariances, is typically faster and more accurate than standard ADVI. + +
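+ A minimal sketch of the sample average approximation idea behind DADVI: freeze
+the base normal draws once so the mean-field ELBO becomes deterministic, then
+hand it to an off-the-shelf quasi-Newton optimizer. The interface and the
+entropy constant are simplified assumptions, not the authors' package.
+
+import numpy as np
+from scipy.optimize import minimize
+
+def dadvi_fit(log_joint, dim, n_draws=30, seed=0):
+    rng = np.random.default_rng(seed)
+    z = rng.standard_normal((n_draws, dim))        # frozen base draws
+
+    def neg_elbo(params):
+        mu, log_sigma = params[:dim], params[dim:]
+        theta = mu + np.exp(log_sigma) * z          # reparameterized samples
+        expected_log_joint = np.mean([log_joint(t) for t in theta])
+        entropy = np.sum(log_sigma)                 # Gaussian entropy, up to a constant
+        return -(expected_log_joint + entropy)
+
+    x0 = np.zeros(2 * dim)
+    res = minimize(neg_elbo, x0, method="L-BFGS-B")
+    return res.x[:dim], np.exp(res.x[dim:])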
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Vlogger: Make Your Dream A Vlog + + +
+ In this work, we present Vlogger, a generic AI system for generating a
+minute-level video blog (i.e., vlog) from user descriptions. Unlike short
+videos lasting only a few seconds, a vlog often contains a complex storyline
+with diversified scenes, which is challenging for most existing video
+generation approaches. To break through this bottleneck, our Vlogger smartly
+leverages a Large Language Model (LLM) as Director and decomposes the long
+video generation task of a vlog into four key stages, where we invoke various
+foundation models to play the critical roles of vlog professionals, including
+(1) Script, (2) Actor, (3) ShowMaker, and (4) Voicer. With such a design of
+mimicking human beings, our Vlogger can generate vlogs through explainable
+cooperation of top-down planning and bottom-up shooting. Moreover, we introduce
+a novel video diffusion model, ShowMaker, which serves as a videographer in our
+Vlogger for generating the video snippet of each shooting scene. By
+incorporating Script and Actor attentively as textual and visual prompts, it
+can effectively enhance spatial-temporal coherence in the snippet. Besides, we
+design a concise mixed training paradigm for ShowMaker, boosting its capacity
+for both T2V generation and prediction. Finally, extensive experiments show
+that our method achieves state-of-the-art performance on zero-shot T2V
+generation and prediction tasks. More importantly, Vlogger can generate
+over 5-minute vlogs from open-world descriptions, without loss of video
+coherence in script and actor. The code and models are all available at
+https://github.com/zhuangshaobin/Vlogger.
+
<br>
+
+ comment: 16 pages, 8 figures, 11 tables +
+
+
+
+
+ + ☆ Change Detection Between Optical Remote Sensing Imagery and Map Data via + Segment Anything Model (SAM) + + +
+ Unsupervised multimodal change detection is pivotal for time-sensitive tasks
+and comprehensive multi-temporal Earth monitoring. In this study, we explore
+unsupervised multimodal change detection between two key remote sensing data
+sources: optical high-resolution imagery and OpenStreetMap (OSM) data.
+Specifically, we propose to utilize the vision foundation model Segment
+Anything Model (SAM) to address our task. Leveraging SAM's exceptional
+zero-shot transfer capability, high-quality segmentation maps of optical images
+can be obtained. Thus, we can directly compare these two heterogeneous data
+forms in the so-called segmentation domain. We then introduce two strategies
+for guiding SAM's segmentation process: the 'no-prompt' and 'box/mask prompt'
+methods. The two strategies are designed to detect land-cover changes in
+general scenarios and to identify new land-cover objects within existing
+backgrounds, respectively. Experimental results on three datasets indicate that
+the proposed approach can achieve more competitive results compared to
+representative unsupervised multimodal change detection methods.
+
<br>
+
+
+
+
+ + ☆ On the Effect of Data-Augmentation on Local Embedding Properties in the + Contrastive Learning of Music Audio Representations ICASSP + + +
+ Audio embeddings are crucial tools in understanding large catalogs of music.
+Typically embeddings are evaluated on the basis of the performance they provide
+in a wide range of downstream tasks; however, few studies have investigated the
+local properties of the embedding spaces themselves, which are important in
+nearest neighbor algorithms, commonly used in music search and recommendation.
+In this work we show that when learning audio representations on music datasets
+via contrastive learning, musical properties that are typically homogeneous
+within a track (e.g., key and tempo) are reflected in the locality of
+neighborhoods in the resulting embedding space. By applying appropriate data
+augmentation strategies, the localisation of such properties can not only be
+reduced, but the localisation of other attributes can be increased. For
+example, the locality of features such as pitch and tempo, which are less
+relevant to non-expert listeners, may be mitigated while improving the locality
+of more salient features such as genre and mood, achieving state-of-the-art
+performance in nearest neighbor retrieval accuracy. Similarly, we show that the
+optimal selection of data augmentation strategies for contrastive learning of
+music audio embeddings is dependent on the downstream task, highlighting this
+as an important embedding design decision.
+
<br>
+
+ comment: Accepted to the International Conference on Acoustics, Speech and + Signal Processing (ICASSP) 2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 79 + +
+
+
+ + ☆ Cross-lingual neural fuzzy matching for exploiting target-language + monolingual corpora in computer-aided translation + + +
+ Computer-aided translation (CAT) tools based on translation memories (MT) +play a prominent role in the translation workflow of professional translators. +However, the reduced availability of in-domain TMs, as compared to in-domain +monolingual corpora, limits its adoption for a number of translation tasks. In +this paper, we introduce a novel neural approach aimed at overcoming this +limitation by exploiting not only TMs, but also in-domain target-language (TL) +monolingual corpora, and still enabling a similar functionality to that offered +by conventional TM-based CAT tools. Our approach relies on cross-lingual +sentence embeddings to retrieve translation proposals from TL monolingual +corpora, and on a neural model to estimate their post-editing effort. The paper +presents an automatic evaluation of these techniques on four language pairs +that shows that our approach can successfully exploit monolingual texts in a +TM-based CAT environment, increasing the amount of useful translation +proposals, and that our neural model for estimating the post-editing effort +enables the combination of translation proposals obtained from monolingual +corpora and from TMs in the usual way. A human evaluation performed on a single +language pair confirms the results of the automatic evaluation and seems to +indicate that the translation proposals retrieved with our approach are more +useful than what the automatic evaluation shows. + +
+
+
+
+
+ + ☆ Morphology and Syntax of the Tamil Language + + +
+ This paper provides an overview of the morphology and syntax of the Tamil +language, focusing on its contemporary usage. The paper also highlights the +complexity and richness of Tamil in terms of its morphological and syntactic +features, which will be useful for linguists analysing the language and +conducting comparative studies. In addition, the paper will be useful for those +developing computational resources for the Tamil language. It is proven as a +rule-based morphological analyser cum generator and a computational grammar for +Tamil have already been developed based on this paper. To enhance accessibility +for a broader audience, the analysis is conducted without relying on any +specific grammatical formalism. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ☆ Hallucination Detection and Hallucination Mitigation: An Investigation + + +
+ Large language models (LLMs), including ChatGPT, Bard, and Llama, have +achieved remarkable successes over the last two years in a range of different +applications. In spite of these successes, there exist concerns that limit the +wide application of LLMs. A key problem is the problem of hallucination. +Hallucination refers to the fact that in addition to correct responses, LLMs +can also generate seemingly correct but factually incorrect responses. This +report aims to present a comprehensive review of the current literature on both +hallucination detection and hallucination mitigation. We hope that this report +can serve as a good reference for both engineers and researchers who are +interested in LLMs and applying them to real world tasks. + +
+
+
+
+
+ + ☆ Salute the Classic: Revisiting Challenges of Machine Translation in the + Age of Large Language Models + + +
+ The evolution of Neural Machine Translation (NMT) has been significantly +influenced by six core challenges (Koehn and Knowles, 2017), which have acted +as benchmarks for progress in this field. This study revisits these challenges, +offering insights into their ongoing relevance in the context of advanced Large +Language Models (LLMs): domain mismatch, amount of parallel data, rare word +prediction, translation of long sentences, attention model as word alignment, +and sub-optimal beam search. Our empirical findings indicate that LLMs +effectively lessen the reliance on parallel data for major languages in the +pretraining phase. Additionally, the LLM-based translation system significantly +enhances the translation of long sentences that contain approximately 80 words +and shows the capability to translate documents of up to 512 words. However, +despite these significant improvements, the challenges of domain mismatch and +prediction of rare words persist. While the challenges of word alignment and +beam search, specifically associated with NMT, may not apply to LLMs, we +identify three new challenges for LLMs in translation tasks: inference +efficiency, translation of low-resource languages in the pretraining phase, and +human-aligned evaluation. The datasets and models are released at +https://github.com/pangjh3/LLM4MT. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ RoTBench: A Multi-Level Benchmark for Evaluating the Robustness of Large + Language Models in Tool Learning + + +
+ Tool learning has generated widespread interest as a vital means of +interaction between Large Language Models (LLMs) and the physical world. +Current research predominantly emphasizes LLMs' capacity to utilize tools in +well-structured environments while overlooking their stability when confronted +with the inevitable noise of the real world. To bridge this gap, we introduce +RoTBench, a multi-level benchmark for evaluating the robustness of LLMs in tool +learning. Specifically, we establish five external environments, each featuring +varying levels of noise (i.e., Clean, Slight, Medium, Heavy, and Union), +providing an in-depth analysis of the model's resilience across three critical +phases: tool selection, parameter identification, and content filling. +Experiments involving six widely-used models underscore the urgent necessity +for enhancing the robustness of LLMs in tool learning. For instance, the +performance of GPT-4 even drops significantly from 80.00 to 58.10 when there is +no substantial change in manual accuracy. More surprisingly, the noise +correction capability inherent in the GPT family paradoxically impedes its +adaptability in the face of mild noise. In light of these findings, we propose +RoTTuning, a strategy that enriches the diversity of training environments to +bolster the robustness of LLMs in tool learning. The code and data are +available at https://github.com/Junjie-Ye/RoTBench. + +
+
+
+
+
+ + ☆ Application of LLM Agents in Recruitment: A Novel Framework for Resume + Screening + + +
+ The automation of resume screening is a crucial aspect of the recruitment
+process in organizations. Automated resume screening systems often encompass a
+range of natural language processing (NLP) tasks. The advent of Large Language
+Models (LLMs) has notably enhanced the efficacy of these systems, showcasing
+their robust generalization abilities across diverse language-related tasks.
+Accompanying these developments are various agents based on LLMs, which
+facilitate their application in practical scenarios. This paper introduces a
+novel LLM-based agent framework for resume screening, aimed at enhancing
+efficiency and time management in recruitment processes. Our framework is
+distinct in its ability to efficiently summarize and grade each resume from a
+large dataset. Moreover, it utilizes LLM agents for decision-making,
+determining which candidates receive job offers, or which ones to bring in for
+interviews. To evaluate our framework, we constructed a dataset from actual
+resumes and conducted a simulated resume screening process. Subsequently, the
+outcomes of the simulation experiment were compared and subjected to detailed
+analysis. The results demonstrate that our automated resume screening framework
+is 11 times faster than traditional manual methods. Furthermore, by fine-tuning
+the LLMs, we observed a significant improvement in the F1 score, reaching
+87.73%, during the resume sentence classification phase. In the resume
+summarization and grading phase, our fine-tuned model surpassed the baseline
+performance of the GPT-3.5 model. Analysis of the decision-making efficacy of
+the LLM agents in the final offer stage further underscores the potential of
+LLM agents in transforming resume screening processes.
+
<br>
+
+ comment: Under review, 14 pages, 10 figures +
+
+
+
+
+ + ☆ Anchor function: a type of benchmark functions for studying language + models + + +
+ Understanding transformer-based language models is becoming increasingly +crucial, particularly as they play pivotal roles in advancing towards +artificial general intelligence. However, language model research faces +significant challenges, especially for academic research groups with +constrained resources. These challenges include complex data structures, +unknown target functions, high computational costs and memory requirements, and +a lack of interpretability in the inference process, etc. Drawing a parallel to +the use of simple models in scientific research, we propose the concept of an +anchor function. This is a type of benchmark function designed for studying +language models in learning tasks that follow an "anchor-key" pattern. By +utilizing the concept of an anchor function, we can construct a series of +functions to simulate various language tasks. The anchor function plays a role +analogous to that of mice in diabetes research, particularly suitable for +academic research. We demonstrate the utility of the anchor function with an +example, revealing two basic operations by attention structures in language +models: shifting tokens and broadcasting one token from one position to many +positions. These operations are also commonly observed in large language +models. The anchor function framework, therefore, opens up a series of valuable +and accessible research questions for further exploration, especially for +theoretical study. + +
+
+
+
+
+ + ☆ DAPT: A Dual Attention Framework for Parameter-Efficient Continual + Learning of Large Language Models + + +
+ The continual learning (CL) ability is vital for deploying large language
+models (LLMs) in the dynamic world. Based on parameter-efficient tuning (PET),
+existing methods devise a learning module and a selection module to handle the
+challenges of catastrophic forgetting (CF) and knowledge transfer (KT) in CL.
+The learning module allocates separate PET blocks for each continually emerging
+task, and the selection module functions to choose the correct one for the
+input at testing time. However, there are limitations in their designs of both
+modules, and they ignore the potential of aligning the two modules to address
+CF and KT simultaneously. To this end, we propose a novel Dual Attention
+Framework (DAPT) to align PET learning and selection via the Dual Attentive
+Learning & Selection module. Extensive experiments on two CL benchmarks
+demonstrate the superiority of DAPT in resisting CF and facilitating KT at the
+same time. Moreover, DAPT retains this superiority when we scale it to
+different model sizes (from 770M to 11B) and to unseen tasks.
+
<br>
+
+ comment: work in progress +
+
+
+
+
+ + ☆ Inferflow: an Efficient and Highly Configurable Inference Engine for + Large Language Models + + +
+ We present Inferflow, an efficient and highly configurable inference engine
+for large language models (LLMs). With Inferflow, users can serve most of the
+common transformer models by simply modifying some lines in corresponding
+configuration files, without writing a single line of source code. Compared
+with most existing inference engines, Inferflow has some key features. First,
+by implementing a modular framework of atomic building blocks and technologies,
+Inferflow is compositionally generalizable to new models. Second, 3.5-bit
+quantization is introduced in Inferflow as a tradeoff between 3-bit and 4-bit
+quantization. Third, hybrid model partitioning for multi-GPU inference is
+introduced in Inferflow to better balance inference speed and throughput than
+the existing partition-by-layer and partition-by-tensor strategies.
+
<br>
+
+ comment: Technical report of Inferflow +
+
+
+
+
+ + ☆ AesBench: An Expert Benchmark for Multimodal Large Language Models on + Image Aesthetics Perception + + +
+ With collective endeavors, multimodal large language models (MLLMs) are +undergoing a flourishing development. However, their performances on image +aesthetics perception remain indeterminate, which is highly desired in +real-world applications. An obvious obstacle lies in the absence of a specific +benchmark to evaluate the effectiveness of MLLMs on aesthetic perception. This +blind groping may impede the further development of more advanced MLLMs with +aesthetic perception capacity. To address this dilemma, we propose AesBench, an +expert benchmark aiming to comprehensively evaluate the aesthetic perception +capacities of MLLMs through elaborate design across dual facets. (1) We +construct an Expert-labeled Aesthetics Perception Database (EAPD), which +features diversified image contents and high-quality annotations provided by +professional aesthetic experts. (2) We propose a set of integrative criteria to +measure the aesthetic perception abilities of MLLMs from four perspectives, +including Perception (AesP), Empathy (AesE), Assessment (AesA) and +Interpretation (AesI). Extensive experimental results underscore that the +current MLLMs only possess rudimentary aesthetic perception ability, and there +is still a significant gap between MLLMs and humans. We hope this work can +inspire the community to engage in deeper explorations on the aesthetic +potentials of MLLMs. Source data will be available at +https://github.com/yipoh/AesBench. + +
+
+
+
+
+ + ☆ Large Language Models are Null-Shot Learners + + +
+ This paper presents null-shot prompting. Null-shot prompting exploits +hallucination in large language models (LLMs) by instructing LLMs to utilize +information from the "Examples" section that never exists within the provided +context to perform a task. While reducing hallucination is crucial and +non-negligible for daily and critical uses of LLMs, we propose that in the +current landscape in which these LLMs still hallucinate, it is possible, in +fact, to exploit hallucination to increase performance in performing tasks +compared to standard zero-shot prompting. Experiments with six LLMs show +improvements in performance across the majority of eight datasets, including +reading comprehension, arithmetic reasoning, and closed-book question +answering. The observed inconsistency in increased relative performance across +LLMs also potentially indicates a different degree of inherent hallucination in +each model. These differences show that it is possible to utilize null-shot +prompting as a way to detect degrees of hallucination in LLMs using existing +benchmarking datasets. We also perform ablation studies, including +experimenting with a modified version of null-shot prompting that incorporates +ideas from zero-shot chain-of-thought prompting, which shows different trends +of results. + +
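+ Because the method is purely a prompting pattern, a toy sketch is enough to
+convey it: the prompt points the model at an "Examples" section that is never
+actually supplied. The exact template wording used in the paper may differ;
+this phrasing is a guess.
+
+def null_shot_prompt(task_instruction, question):
+    return (
+        f"{task_instruction}\n\n"
+        "Look at the examples in the Examples section and utilize those "
+        "examples to perform the following task.\n\n"
+        f"Question: {question}\nAnswer:"
+    )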
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ A Generative Adversarial Attack for Multilingual Text Classifiers AAAI-24 + + +
+ Current adversarial attack algorithms, where an adversary changes a text to +fool a victim model, have been repeatedly shown to be effective against text +classifiers. These attacks, however, generally assume that the victim model is +monolingual and cannot be used to target multilingual victim models, a +significant limitation given the increased use of these models. For this +reason, in this work we propose an approach to fine-tune a multilingual +paraphrase model with an adversarial objective so that it becomes able to +generate effective adversarial examples against multilingual classifiers. The +training objective incorporates a set of pre-trained models to ensure text +quality and language consistency of the generated text. In addition, all the +models are suitably connected to the generator by vocabulary-mapping matrices, +allowing for full end-to-end differentiability of the overall training +pipeline. The experimental validation over two multilingual datasets and five +languages has shown the effectiveness of the proposed approach compared to +existing baselines, particularly in terms of query efficiency. We also provide +a detailed analysis of the generated attacks and discuss limitations and +opportunities for future research. + +
+
+ comment: AAAI-24 Workshop on Artificial Intelligence for Cyber Security (AICS) +
+
+
+
+
+ + ☆ Generative Multi-Modal Knowledge Retrieval with Large Language Models AAAI 2024 + + +
+ Knowledge retrieval with multi-modal queries plays a crucial role in +supporting knowledge-intensive multi-modal applications. However, existing +methods face challenges in terms of their effectiveness and training +efficiency, especially when it comes to training and integrating multiple +retrievers to handle multi-modal queries. In this paper, we propose an +innovative end-to-end generative framework for multi-modal knowledge retrieval. +Our framework takes advantage of the fact that large language models (LLMs) can +effectively serve as virtual knowledge bases, even when trained with limited +data. We retrieve knowledge via a two-step process: 1) generating knowledge +clues related to the queries, and 2) obtaining the relevant document by +searching databases using the knowledge clue. In particular, we first introduce +an object-aware prefix-tuning technique to guide multi-grained visual learning. +Then, we align multi-grained visual features into the textual feature space of +the LLM, employing the LLM to capture cross-modal interactions. Subsequently, +we construct instruction data with a unified format for model training. +Finally, we propose the knowledge-guided generation strategy to impose prior +constraints in the decoding steps, thereby promoting the generation of +distinctive knowledge clues. Through experiments conducted on three benchmarks, +we demonstrate significant improvements ranging from 3.0% to 14.6% across all +evaluation metrics when compared to strong baselines. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ MARIO: MAth Reasoning with code Interpreter Output -- A Reproducible + Pipeline + + +
+ Large language models (LLMs) have seen considerable advancements in natural
+language understanding tasks, yet there remains a gap to bridge before
+attaining true artificial general intelligence, especially concerning
+shortcomings in mathematical reasoning capabilities. We postulate that the
+inherent nature of LLM training, which focuses on predicting probabilities of
+the next token, presents challenges in effectively modeling mathematical
+reasoning that demands exact calculations, both from data-driven and
+theoretical standpoints. In this paper, we address this challenge by enriching
+the data landscape and introducing a novel math dataset, enhanced with a
+capability to utilize a Python code interpreter. This dataset is derived from
+GSM8K and MATH and has been further refined through a combination of GPT-4
+annotations, human review, and self-training processes, in which the errors in
+the original GSM8K training set have been fixed. Additionally, we propose a
+tentative, easily replicable protocol for the fine-tuning of math-specific
+LLMs, which has led to a significant improvement in the performance of a
+7B-parameter LLM on the GSM8K and MATH datasets. We are committed to advancing
+the field of mathematical reasoning in LLMs and, to that end, we have made the
+model checkpoints available and will make the dataset publicly available. We
+hope this will facilitate further research and development within the
+community.
+
<br>
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ PRewrite: Prompt Rewriting with Reinforcement Learning + + +
+ Prompt engineering is critical for the development of LLM-based applications. +However, it is usually done manually in a "trial and error" fashion. This +manual procedure can be time consuming, ineffective, and the generated prompts +are, in a lot of cases, sub-optimal. Even for the prompts which seemingly work +well, there is always a lingering question: can the prompts be made better with +further modifications? + To address these questions, in this paper, we investigate prompt engineering +automation. We consider a specific use case scenario in which developers/users +have drafted initial prompts, but lack the time/expertise to optimize them. We +propose PRewrite, an automated tool to rewrite these drafts and to generate +highly effective new prompts. PRewrite is based on the Reinforcement Learning +(RL) framework which allows for end-to-end optimization and our design allows +the RL search to happen in a large action space. The automated tool leverages +manually crafted prompts as starting points which makes the rewriting procedure +more guided and efficient. The generated prompts are human readable, and +self-explanatory, unlike some of those in previous works. We conducted +extensive experiments on diverse datasets and found that the prompts generated +with this new method not only outperform professionally crafted prompts, but +also prompts generated with other previously proposed methods. + +
+
+
+
+
+ + ☆ A Study on Training and Developing Large Language Models for Behavior + Tree Generation + + +
+ This paper presents an innovative exploration of the application potential of
+large language models (LLMs) in addressing the challenging task of
+automatically generating behavior trees (BTs) for complex tasks. The
+conventional manual BT generation method is inefficient and heavily reliant on
+domain expertise. On the other hand, existing automatic BT generation
+technologies encounter bottlenecks related to task complexity, model
+adaptability, and reliability. In order to overcome these challenges, we
+propose a novel methodology that leverages the robust representation and
+reasoning abilities of LLMs. The core contribution of this paper lies in the
+design of an LLM-based BT generation framework, which encompasses the entire
+process, from data synthesis and model training to application development and
+data verification. Synthetic data is introduced to train the BT generation
+model (BTGen model), enhancing its understanding of and adaptability to various
+complex tasks, thereby significantly improving its overall performance. In
+order to ensure the effectiveness and executability of the generated BTs, we
+emphasize the importance of data verification and introduce a multilevel
+verification strategy. Additionally, we explore a range of agent design and
+development schemes with an LLM as the central element. We hope that the work
+in this paper may provide a reference for researchers who are interested in BT
+generation based on LLMs.
+
<br>
+
+
+
+
+ + ☆ Enhancing Document-level Translation of Large Language Model via + Translation Mixed-instructions + + +
+ Existing large language models (LLMs) for machine translation are typically +fine-tuned on sentence-level translation instructions and achieve satisfactory +performance at the sentence level. However, when applied to document-level +translation, these models face a significant challenge, particularly when +dealing with documents containing over 512 tokens. This challenge arises from +the issue of sentence-level coverage, where subsequent sentences in the +document remain untranslated. As a result, the document-level translation +capability of LLMs fine-tuned on sentence-level translation instructions is +significantly limited. We conjecture that the primary cause of LLMs' weak +document-level translation performance is the absence of document-to-document +mapping ability. To address the issue, we propose an approach that combines +sentence-level and document-level translation instructions of varying lengths +to fine-tune LLMs. Our proposed translation mixed-instructions enable LLMs +(Llama-2~7B and 13B) to maintain consistent translation performance from the +sentence level to documents containing as many as 2048 tokens. Extensive +experimental results show that the proposed approach significantly enhances the +document-level translation capabilities of LLMs on 10 language pairs, +effectively mitigating the sentence-level coverage issue in document-level +translation. Experimentation on discourse phenomena has demonstrated that our +document-level translation approach significantly improves translation quality, +both in terms of BLEU score and discourse coherence. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Incremental Extractive Opinion Summarization Using Cover Trees + + +
+ Extractive opinion summarization involves automatically producing a summary +of text about an entity (e.g., a product's reviews) by extracting +representative sentences that capture prevalent opinions in the review set. +Typically, in online marketplaces user reviews accrue over time, and opinion +summaries need to be updated periodically to provide customers with up-to-date +information. In this work, we study the task of extractive opinion +summarization in an incremental setting, where the underlying review set +evolves over time. Many of the state-of-the-art extractive opinion +summarization approaches are centrality-based, such as CentroidRank. +CentroidRank performs extractive summarization by selecting a subset of review +sentences closest to the centroid in the representation space as the summary. +However, these methods are not capable of operating efficiently in an +incremental setting, where reviews arrive one at a time. In this paper, we +present an efficient algorithm for accurately computing the CentroidRank +summaries in an incremental setting. Our approach, CoverSumm, relies on +indexing review representations in a cover tree and maintaining a reservoir of +candidate summary review sentences. CoverSumm's efficacy is supported by a +theoretical and empirical analysis of running time. Empirically, on a diverse +collection of data (both real and synthetically created to illustrate scaling +considerations), we demonstrate that CoverSumm is up to 25x faster than +baseline methods, and capable of adapting to nuanced changes in data +distribution. We also conduct human evaluations of the generated summaries and +find that CoverSumm is capable of producing informative summaries consistent +with the underlying review set. + +
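+ A toy illustration of incremental centroid-based extractive selection, assuming
+random vectors in place of sentence embeddings: a running centroid is updated as
+each review arrives and the summary is the k sentences currently closest to it.
+The cover-tree index and candidate reservoir that make CoverSumm fast are not
+reproduced here.
+
+```python
+import numpy as np
+
+class IncrementalCentroidSummarizer:
+    """Keep a running centroid and return the k closest sentences as summary."""
+
+    def __init__(self, dim: int, k: int = 3):
+        self.k = k
+        self.centroid = np.zeros(dim)
+        self.n = 0
+        self.sentences: list[tuple[str, np.ndarray]] = []
+
+    def add(self, sentence: str, embedding: np.ndarray) -> None:
+        self.n += 1
+        self.centroid += (embedding - self.centroid) / self.n  # running mean
+        self.sentences.append((sentence, embedding))
+
+    def summary(self) -> list[str]:
+        dists = [np.linalg.norm(e - self.centroid) for _, e in self.sentences]
+        order = np.argsort(dists)[: self.k]
+        return [self.sentences[i][0] for i in order]
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    summarizer = IncrementalCentroidSummarizer(dim=8, k=2)
+    for i in range(10):                      # reviews arriving one at a time
+        summarizer.add(f"review sentence {i}", rng.normal(size=8))
+    print(summarizer.summary())
+```
+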
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Enhancing Robustness of LLM-Synthetic Text Detectors for Academic + Writing: A Comprehensive Analysis + + +
+ The emergence of large language models (LLMs), such as Generative Pre-trained +Transformer 4 (GPT-4) used by ChatGPT, has profoundly impacted the academic and +broader community. While these models offer numerous advantages in terms of +revolutionizing work and study methods, they have also garnered significant +attention due to their potential negative consequences. One example is +generating academic reports or papers with little to no human contribution. +Consequently, researchers have focused on developing detectors to address the +misuse of LLMs. However, most existing methods prioritize achieving higher +accuracy on restricted datasets, neglecting the crucial aspect of +generalizability. This limitation hinders their practical application in +real-life scenarios where reliability is paramount. In this paper, we present a +comprehensive analysis of the impact of prompts on the text generated by LLMs +and highlight the potential lack of robustness in one of the current +state-of-the-art GPT detectors. To mitigate these issues concerning the misuse +of LLMs in academic writing, we propose a reference-based Siamese detector +named Synthetic-Siamese which takes a pair of texts, one as the inquiry and the +other as the reference. Our method effectively addresses the lack of robustness +of previous detectors (OpenAI detector and DetectGPT) and significantly +improves the baseline performances in realistic academic writing scenarios by +approximately 67% to 95%. + +
+
+
+
+
+ + ☆ Calpric: Inclusive and Fine-grain Labeling of Privacy Policies with + Crowdsourcing and Active Learning USENIX Security 2023 + + +
+ A significant challenge to training accurate deep learning models on privacy
+policies is the cost and difficulty of obtaining a large and comprehensive set of
+training data. To address these challenges, we present Calpric, which combines
+automatic text selection and segmentation, active learning and the use of
+crowdsourced annotators to generate a large, balanced training set for privacy
+policies at low cost. Automated text selection and segmentation simplifies the
+labeling task, enabling untrained annotators from crowdsourcing platforms, like
+Amazon's Mechanical Turk, to be competitive with trained annotators, such as law
+students, and also reduces inter-annotator agreement, which decreases labeling
+cost. Having reliable labels for training enables the use of active learning,
+which uses fewer training samples to efficiently cover the input space, further
+reducing cost and improving class and data category balance in the data set. The
+combination of these techniques allows Calpric to produce models that are accurate
+over a wider range of data categories, and provide more detailed, fine-grain
+labels than previous work. Our crowdsourcing process enables Calpric to attain
+reliable labeled data at a cost of roughly $0.92-$1.71 per labeled text segment.
+Calpric's training process also generates a labeled data set of 16K privacy policy
+text segments across 9 data categories with balanced positive and negative
+samples.
+
+
+ comment: published at USENIX Security 2023; associated website: + https://www.usenix.org/conference/usenixsecurity23/presentation/qiu +
+
+
+
+
+ + ☆ JustiLM: Few-shot Justification Generation for Explainable Fact-Checking + of Real-world Claims ACL + + +
+ Justification is an explanation that supports the veracity assigned to a claim
+in fact-checking. However, the task of justification generation has previously
+been oversimplified as the summarization of a fact-check article authored by
+fact-checkers. Therefore, we propose a realistic approach to generate
+justifications based on retrieved evidence. We present a new benchmark dataset
+called ExClaim for \underline{Ex}plainable fact-checking of real-world
+\underline{Claim}s, and introduce JustiLM, a novel few-shot
+\underline{Justi}fication generation method based on a retrieval-augmented
+\underline{L}anguage \underline{M}odel that uses fact-check articles as an
+auxiliary resource only during training. Experiments show that JustiLM achieves
+promising performance in justification generation compared to strong baselines,
+and can also enhance veracity classification with a straightforward extension.
+
+
+ comment: Accepted in TACL. This is a pre-MIT Press publication version +
+
+
+
+
+ + ☆ Self-Imagine: Effective Unimodal Reasoning with Multimodal Models using + Self-Imagination + + +
+ The potential of Vision-Language Models (VLMs) often remains underutilized in
+handling complex text-based problems, particularly when these problems could
+benefit from a visual representation. Resonating with humans' ability to solve
+complex text-based problems by (1) creating a visual diagram from the problem and
+(2) deducing what steps they need to take to solve it, we propose Self-Imagine. We
+leverage a single Vision-Language Model (VLM) to generate a structured
+representation of the question using HTML, then render the HTML as an image, and
+finally use the same VLM to answer the question using both the question and the
+image. Our approach does not require any additional training data or training. We
+evaluate our approach on three mathematics tasks and nine general-purpose
+reasoning tasks using a state-of-the-art VLM. Our approach boosts the performance
+of the VLM on all math tasks (GSM8K: +4.62%; ASDiv: +4.49%; SVAMP: +9.30%) and on
+the majority of the general-purpose reasoning tasks by 0.4% to 13.20%, while
+achieving comparable performance on the remaining tasks.
+ Code and data are available at https://github.com/snat1505027/self-imagine.
+
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ NOTSOFAR-1 Challenge: New Datasets, Baseline, and Tasks for Distant + Meeting Transcription + + +
+ We introduce the first Natural Office Talkers in Settings of Far-field Audio +Recordings (``NOTSOFAR-1'') Challenge alongside datasets and baseline system. +The challenge focuses on distant speaker diarization and automatic speech +recognition (DASR) in far-field meeting scenarios, with single-channel and +known-geometry multi-channel tracks, and serves as a launch platform for two +new datasets: First, a benchmarking dataset of 315 meetings, averaging 6 +minutes each, capturing a broad spectrum of real-world acoustic conditions and +conversational dynamics. It is recorded across 30 conference rooms, featuring +4-8 attendees and a total of 35 unique speakers. Second, a 1000-hour simulated +training dataset, synthesized with enhanced authenticity for real-world +generalization, incorporating 15,000 real acoustic transfer functions. The +tasks focus on single-device DASR, where multi-channel devices always share the +same known geometry. This is aligned with common setups in actual conference +rooms, and avoids technical complexities associated with multi-device tasks. It +also allows for the development of geometry-specific solutions. The NOTSOFAR-1 +Challenge aims to advance research in the field of distant conversational +speech recognition, providing key resources to unlock the potential of +data-driven methods, which we believe are currently constrained by the absence +of comprehensive high-quality training and benchmarking datasets. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Using i-vectors for subject-independent cross-session EEG transfer + learning + + +
+ Cognitive load classification is the task of automatically determining an +individual's utilization of working memory resources during performance of a +task based on physiologic measures such as electroencephalography (EEG). In +this paper, we follow a cross-disciplinary approach, where tools and +methodologies from speech processing are used to tackle this problem. The +corpus we use was released publicly in 2021 as part of the first passive +brain-computer interface competition on cross-session workload estimation. We +present our approach which used i-vector-based neural network classifiers to +accomplish inter-subject cross-session EEG transfer learning, achieving 18% +relative improvement over equivalent subject-dependent models. We also report +experiments showing how our subject-independent models perform competitively on +held-out subjects and improve with additional subject data, suggesting that +subject-dependent training is not required for effective cognitive load +determination. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Improving ASR Contextual Biasing with Guided Attention ICASSP 2024 + + +
+ In this paper, we propose a Guided Attention (GA) auxiliary training loss, +which improves the effectiveness and robustness of automatic speech recognition +(ASR) contextual biasing without introducing additional parameters. A common +challenge in previous literature is that the word error rate (WER) reduction +brought by contextual biasing diminishes as the number of bias phrases +increases. To address this challenge, we employ a GA loss as an additional +training objective besides the Transducer loss. The proposed GA loss aims to +teach the cross attention how to align bias phrases with text tokens or audio +frames. Compared to studies with similar motivations, the proposed loss +operates directly on the cross attention weights and is easier to implement. +Through extensive experiments based on Conformer Transducer with Contextual +Adapter, we demonstrate that the proposed method not only leads to a lower WER +but also retains its effectiveness as the number of bias phrases increases. +Specifically, the GA loss decreases the WER of rare vocabularies by up to 19.2% +on LibriSpeech compared to the contextual biasing baseline, and up to 49.3% +compared to a vanilla Transducer. + +
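+ A rough sketch of an auxiliary loss that pushes cross-attention weights toward
+a 0/1 phrase-to-frame alignment target, in the spirit of the objective described
+above. The tensors, weighting factor, and binary cross-entropy form are
+assumptions; the paper's exact formulation may differ.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def guided_attention_loss(attn: torch.Tensor, align: torch.Tensor) -> torch.Tensor:
+    """attn: (batch, num_bias_phrases, num_frames) cross-attention weights.
+    align: same shape, 1.0 where a phrase should attend to a frame, else 0.0.
+    Binary cross-entropy pulls attention mass onto the aligned positions."""
+    return F.binary_cross_entropy(attn.clamp(1e-6, 1 - 1e-6), align)
+
+if __name__ == "__main__":
+    torch.manual_seed(0)
+    attn = torch.softmax(torch.randn(2, 4, 50), dim=-1)   # placeholder attention
+    align = torch.zeros_like(attn)
+    align[:, :, :5] = 1.0                                  # placeholder alignment
+    transducer_loss = torch.tensor(0.0)  # stand-in for the main Transducer loss
+    total = transducer_loss + 0.5 * guided_attention_loss(attn, align)
+    print(total.item())
+```
+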
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Revisiting Self-supervised Learning of Speech Representation from a + Mutual Information Perspective ICASSP 2024 + + +
+ Existing studies on self-supervised speech representation learning have +focused on developing new training methods and applying pre-trained models for +different applications. However, the quality of these models is often measured +by the performance of different downstream tasks. How well the representations +access the information of interest is less studied. In this work, we take a +closer look into existing self-supervised methods of speech from an +information-theoretic perspective. We aim to develop metrics using mutual +information to help practical problems such as model design and selection. We +use linear probes to estimate the mutual information between the target +information and learned representations, showing another insight into the +accessibility to the target information from speech representations. Further, +we explore the potential of evaluating representations in a self-supervised +fashion, where we estimate the mutual information between different parts of +the data without using any labels. Finally, we show that both supervised and +unsupervised measures echo the performance of the models on layer-wise linear +probing and speech recognition. + +
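+ One simple way to realize the linear-probing idea above: fit a
+logistic-regression probe on frozen representations and report H(Y) minus the
+probe's cross-entropy as a lower-bound-style estimate of how accessible the
+labels are. The synthetic features stand in for real speech representations;
+this is an illustrative assumption, not the paper's estimator.
+
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import log_loss
+from sklearn.model_selection import train_test_split
+
+def probe_mi_lower_bound(features: np.ndarray, labels: np.ndarray) -> float:
+    """H(Y) - CE(probe), in nats: a simple variational-style bound on I(Z; Y)."""
+    x_tr, x_te, y_tr, y_te = train_test_split(features, labels, random_state=0)
+    probe = LogisticRegression(max_iter=1000).fit(x_tr, y_tr)
+    ce = log_loss(y_te, probe.predict_proba(x_te))     # cross-entropy in nats
+    priors = np.bincount(y_te) / len(y_te)
+    h_y = -np.sum(priors * np.log(priors + 1e-12))     # label entropy in nats
+    return max(0.0, h_y - ce)
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    labels = rng.integers(0, 2, size=2000)
+    features = rng.normal(size=(2000, 16)) + labels[:, None]  # informative features
+    print(f"estimated lower bound: {probe_mi_lower_bound(features, labels):.3f} nats")
+```
+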
+
+ comment: ICASSP 2024 +
+
+
+
+
+ + ☆ AiGen-FoodReview: A Multimodal Dataset of Machine-Generated Restaurant + Reviews and Images on Social Media + + +
+ Online reviews in the form of user-generated content (UGC) significantly +impact consumer decision-making. However, the pervasive issue of not only human +fake content but also machine-generated content challenges UGC's reliability. +Recent advances in Large Language Models (LLMs) may pave the way to fabricate +indistinguishable fake generated content at a much lower cost. Leveraging +OpenAI's GPT-4-Turbo and DALL-E-2 models, we craft AiGen-FoodReview, a +multi-modal dataset of 20,144 restaurant review-image pairs divided into +authentic and machine-generated. We explore unimodal and multimodal detection +models, achieving 99.80% multimodal accuracy with FLAVA. We use attributes from +readability and photographic theories to score reviews and images, +respectively, demonstrating their utility as hand-crafted features in scalable +and interpretable detection models, with comparable performance. The paper +contributes by open-sourcing the dataset and releasing fake review detectors, +recommending its use in unimodal and multimodal fake review detection tasks, +and evaluating linguistic and visual features in synthetic versus authentic +data. + +
+
+
+
+
+ + ☆ HuixiangDou: Overcoming Group Chat Scenarios with LLM-based Technical + Assistance + + +
+ In this work, we present HuixiangDou, a technical assistant powered by Large +Language Models (LLM). This system is designed to assist algorithm developers +by providing insightful responses to questions related to open-source algorithm +projects, such as computer vision and deep learning projects from OpenMMLab. We +further explore the integration of this assistant into the group chats of +instant messaging (IM) tools such as WeChat and Lark. Through several iterative +improvements and trials, we have developed a sophisticated technical chat +assistant capable of effectively answering users' technical questions without +causing message flooding. This paper's contributions include: 1) Designing an +algorithm pipeline specifically for group chat scenarios; 2) Verifying the +reliable performance of text2vec in task rejection; 3) Identifying three +critical requirements for LLMs in technical-assistant-like products, namely +scoring ability, In-Context Learning (ICL), and Long Context. We have made the +software and source code available at https://github.com/internlm/huixiangdou +to aid in future research and application. HuixiangDou is applicable to any +group chat within IM tools. + +
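+ One way to read the "text2vec in task rejection" step: embed incoming
+group-chat messages along with a few seed technical questions and only let the
+assistant respond when cosine similarity clears a threshold. The encoder name,
+seed questions, and threshold below are assumptions for illustration;
+HuixiangDou's actual pipeline lives in its repository.
+
+```python
+import numpy as np
+from sentence_transformers import SentenceTransformer
+
+model = SentenceTransformer("all-MiniLM-L6-v2")   # assumed off-the-shelf encoder
+
+seed_questions = [
+    "How do I install mmdetection with CUDA 11.8?",
+    "Why does my training loss become NaN after a few iterations?",
+]
+seed_vecs = model.encode(seed_questions, normalize_embeddings=True)
+
+def should_answer(message: str, threshold: float = 0.45) -> bool:
+    """Reject chit-chat; answer only messages similar to seed technical questions."""
+    vec = model.encode([message], normalize_embeddings=True)[0]
+    similarity = float(np.max(seed_vecs @ vec))   # cosine similarity (unit vectors)
+    return similarity >= threshold
+
+print(should_answer("Training crashes with a CUDA out of memory error"))
+print(should_answer("Anyone up for lunch?"))
+```
+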
+
+ comment: Technical report, 11 pages, 3 figures +
+
+
+
+
+ + ☆ MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in + 3D World + + +
+ Human beings possess the capability to multiply a melange of multisensory +cues while actively exploring and interacting with the 3D world. Current +multi-modal large language models, however, passively absorb sensory data as +inputs, lacking the capacity to actively interact with the objects in the 3D +environment and dynamically collect their multisensory information. To usher in +the study of this area, we propose MultiPLY, a multisensory embodied large +language model that could incorporate multisensory interactive data, including +visual, audio, tactile, and thermal information into large language models, +thereby establishing the correlation among words, actions, and percepts. To +this end, we first collect Multisensory Universe, a large-scale multisensory +interaction dataset comprising 500k data by deploying an LLM-powered embodied +agent to engage with the 3D environment. To perform instruction tuning with +pre-trained LLM on such generated data, we first encode the 3D scene as +abstracted object-centric representations and then introduce action tokens +denoting that the embodied agent takes certain actions within the environment, +as well as state tokens that represent the multisensory state observations of +the agent at each time step. In the inference time, MultiPLY could generate +action tokens, instructing the agent to take the action in the environment and +obtain the next multisensory state observation. The observation is then +appended back to the LLM via state tokens to generate subsequent text or action +tokens. We demonstrate that MultiPLY outperforms baselines by a large margin +through a diverse set of embodied tasks involving object retrieval, tool use, +multisensory captioning, and task decomposition. + +
+
+ comment: Project page: https://vis-www.cs.umass.edu/multiply +
+
+
+
+
+ + ☆ MMToM-QA: Multimodal Theory of Mind Question Answering + + +
+ Theory of Mind (ToM), the ability to understand people's minds, is an +essential ingredient for developing machines with human-level social +intelligence. Recent machine learning models, particularly large language +models, seem to show some aspects of ToM understanding. However, existing ToM +benchmarks use unimodal datasets - either video or text. Human ToM, on the +other hand, is more than video or text understanding. People can flexibly +reason about another person's mind based on conceptual representations (e.g., +goals, beliefs, plans) extracted from any available data, which can include +visual cues, linguistic narratives, or both. To address this, we introduce a +multimodal Theory of Mind question answering (MMToM-QA) benchmark. MMToM-QA +comprehensively evaluates machine ToM both on multimodal data and on different +kinds of unimodal data about a person's activity in a household environment. To +engineer multimodal ToM capacity, we propose a novel method, BIP-ALM (Bayesian +Inverse Planning Accelerated by Language Models). BIP-ALM extracts unified +representations from multimodal data and utilizes language models for scalable +Bayesian inverse planning. We conducted a systematic comparison of human +performance, BIP-ALM, and state-of-the-art models, including GPT-4. The +experiments demonstrate that large language models and large multimodal models +still lack robust ToM capacity. BIP-ALM, on the other hand, shows promising +results, by leveraging the power of both model-based mental inference and +language models. + +
+
+ comment: 27 pages, 11 figures, 7 tables +
+
+
+
+
+ + ☆ Deductive Closure Training of Language Models for Coherence, Accuracy, + and Updatability + + +
+ While language models (LMs) can sometimes generate factually correct text and +estimate truth values of individual claims, these generally do not reflect a +globally coherent, manipulable model of the world. As a consequence, current +LMs also generate incorrect or nonsensical content, and are difficult to edit +and bring up to date. We present a method called Deductive Closure Training +(DCT) that uses LMs themselves to identify implications of (and contradictions +within) the text that they generate, yielding an efficient self-supervised +procedure for improving LM factuality. Given a collection of seed documents, +DCT prompts LMs to generate additional text implied by these documents, reason +globally about the correctness of this generated text, and finally fine-tune on +text inferred to be correct. Given seed documents from a trusted source, DCT +provides a tool for supervised model updating; if seed documents are sampled +from the LM itself, DCT enables fully unsupervised fine-tuning for improved +coherence and accuracy. Across the CREAK, MQUaKE, and Reversal Curse datasets, +supervised DCT improves LM fact verification and text generation accuracy by +3-26%; on CREAK fully unsupervised DCT improves verification accuracy by 12%. +These results show that LMs' reasoning capabilities during inference can be +leveraged during training to improve their reliability. + +
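+ The three stages described above (generate implications, judge their
+correctness, fine-tune on the kept text) fit into a short loop; in the sketch
+below the LLM calls are stubbed out, so the helper functions are assumptions
+rather than the paper's implementation.
+
+```python
+from typing import Callable
+
+def deductive_closure_round(
+    seed_docs: list[str],
+    generate_implications: Callable[[str], list[str]],
+    correctness_score: Callable[[str], float],
+    fine_tune: Callable[[list[str]], None],
+    keep_threshold: float = 0.5,
+) -> list[str]:
+    """One DCT-style round: expand seeds, filter by judged correctness, train."""
+    candidates: list[str] = []
+    for doc in seed_docs:
+        candidates.extend(generate_implications(doc))   # LM-generated implications
+    kept = [c for c in candidates if correctness_score(c) >= keep_threshold]
+    fine_tune(seed_docs + kept)                         # train only on trusted text
+    return kept
+
+# Toy stand-ins so the skeleton runs end to end.
+if __name__ == "__main__":
+    kept = deductive_closure_round(
+        ["The Eiffel Tower is in Paris."],
+        generate_implications=lambda d: [d + " Therefore it is in France."],
+        correctness_score=lambda c: 0.9,
+        fine_tune=lambda texts: print(f"fine-tuning on {len(texts)} documents"),
+    )
+    print(kept)
+```
+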
+
+
+
+
+ + ☆ Connect, Collapse, Corrupt: Learning Cross-Modal Tasks with Uni-Modal + Data ICLR 2024 + + +
+ Building cross-modal applications is challenging due to limited paired +multi-modal data. Recent works have shown that leveraging a pre-trained +multi-modal contrastive representation space enables cross-modal tasks to be +learned from uni-modal data. This is based on the assumption that contrastive +optimization makes embeddings from different modalities interchangeable. +However, this assumption is under-explored due to the poorly understood +geometry of the multi-modal contrastive space, where a modality gap exists. In +our study, we provide a theoretical explanation of this space's geometry and +introduce a three-step method, $C^3$ (Connect, Collapse, Corrupt), to bridge +the modality gap, enhancing the interchangeability of embeddings. Our $C^3$ +method significantly improves cross-modal learning from uni-modal data, +achieving state-of-the-art results on zero-shot image / audio / video +captioning and text-to-image generation. + +
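+ A numpy illustration of the collapse-and-corrupt steps as the abstract
+summarizes them: after contrastive pre-training "connects" the spaces, the
+per-modality mean is subtracted to collapse the modality gap and Gaussian noise
+corrupts the embeddings used for training. The data and constants are
+placeholders, not the paper's settings.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Placeholder "contrastively aligned" embeddings with an artificial modality gap.
+image_emb = rng.normal(size=(512, 64)) + 0.8
+text_emb = rng.normal(size=(512, 64)) - 0.8
+
+def collapse_and_corrupt(emb: np.ndarray, noise_std: float = 0.1) -> np.ndarray:
+    """Remove the modality-specific mean, then add Gaussian noise."""
+    centered = emb - emb.mean(axis=0, keepdims=True)                # collapse
+    return centered + rng.normal(scale=noise_std, size=emb.shape)   # corrupt
+
+gap_before = np.linalg.norm(image_emb.mean(0) - text_emb.mean(0))
+gap_after = np.linalg.norm(
+    collapse_and_corrupt(image_emb).mean(0) - collapse_and_corrupt(text_emb).mean(0)
+)
+print(f"modality gap before: {gap_before:.2f}, after: {gap_after:.2f}")
+```
+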
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ☆ Tuning Language Models by Proxy + + +
+ Despite the general capabilities of large pretrained language models, they +consistently benefit from further adaptation to better achieve desired +behaviors. However, tuning these models has become increasingly +resource-intensive, or impossible when model weights are private. We introduce +proxy-tuning, a lightweight decoding-time algorithm that operates on top of +black-box LMs to achieve the result of directly tuning the model, but by +accessing only its prediction over the output vocabulary. Our method instead +tunes a smaller LM, then applies the difference between the predictions of the +small tuned and untuned LMs to shift the original predictions of the base model +in the direction of tuning, while retaining the benefits of larger scale +pretraining. In experiments, when we apply proxy-tuning to Llama2-70B using +proxies of only 7B size, we can close 88% of the gap between Llama2-70B and its +truly-tuned chat version, when evaluated across knowledge, reasoning, and +safety benchmarks. Interestingly, when tested on TruthfulQA, proxy-tuned models +are actually more truthful than directly tuned models, possibly because +decoding-time guidance better retains the model's factual knowledge. We then +demonstrate the generality of proxy-tuning by applying it for domain adaptation +on code, and task-specific finetuning on question-answering and math problems. +Our work demonstrates the promise of using small tuned LMs to efficiently +customize large, potentially proprietary LMs through decoding-time guidance. + +
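+ The decoding-time arithmetic described above is compact enough to sketch
+directly: the large base model's next-token logits are shifted by the difference
+between a small tuned expert and its untuned counterpart before sampling. Random
+logits and the vocabulary size are placeholders for real models.
+
+```python
+import numpy as np
+
+def proxy_tuned_distribution(
+    base_logits: np.ndarray, tuned_small: np.ndarray, untuned_small: np.ndarray
+) -> np.ndarray:
+    """Shift the base model's logits by the small expert/anti-expert difference."""
+    shifted = base_logits + (tuned_small - untuned_small)
+    shifted -= shifted.max()                     # numerical stability
+    probs = np.exp(shifted)
+    return probs / probs.sum()
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    vocab = 32_000                               # placeholder vocabulary size
+    probs = proxy_tuned_distribution(
+        rng.normal(size=vocab), rng.normal(size=vocab), rng.normal(size=vocab)
+    )
+    print(int(probs.argmax()), float(probs.max()))
+```
+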
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Spatial Entity Resolution between Restaurant Locations and + Transportation Destinations in Southeast Asia + + +
+ As a tech company, Grab has expanded from transportation to food delivery,
+aiming to serve Southeast Asia with hyperlocalized applications. Information about
+places as transportation destinations can help to improve our knowledge about
+places as restaurants, so long as the spatial entity resolution problem between
+these datasets can be solved. In this project, we attempted to recognize identical
+place entities from databases of Points-of-Interest (POI) and GrabFood
+restaurants, using their spatial and textual attributes, i.e., latitude,
+longitude, place name, and street address.
+ Distance metrics were calculated for these attributes and fed to tree-based
+classifiers. POI-restaurant matching was conducted separately for Singapore, the
+Philippines, Indonesia, and Malaysia. Experimental estimates demonstrate that a
+matching POI can be found for over 35% of restaurants in these countries. As part
+of these estimates, test datasets were manually created, and RandomForest,
+AdaBoost, Gradient Boosting, and XGBoost perform well, with most accuracy,
+precision, and recall scores close to or higher than 90% for matched vs. unmatched
+classification. To the authors' knowledge, there are no previously published
+scientific papers devoted to the matching of spatial entities for the Southeast
+Asia region.
+
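+ A compressed sketch of the matching recipe above: compute spatial and textual
+distance features for a candidate POI-restaurant pair and feed them to a
+tree-based classifier. The haversine helper, feature set, and toy examples are
+illustrative assumptions, not Grab's production pipeline.
+
+```python
+import math
+import difflib
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+def haversine_m(lat1, lon1, lat2, lon2) -> float:
+    """Great-circle distance in metres between two (lat, lon) points."""
+    p1, p2 = math.radians(lat1), math.radians(lat2)
+    dp, dl = math.radians(lat2 - lat1), math.radians(lon2 - lon1)
+    a = math.sin(dp / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dl / 2) ** 2
+    return 2 * 6_371_000 * math.asin(math.sqrt(a))
+
+def pair_features(poi: dict, restaurant: dict) -> list[float]:
+    return [
+        haversine_m(poi["lat"], poi["lon"], restaurant["lat"], restaurant["lon"]),
+        difflib.SequenceMatcher(None, poi["name"], restaurant["name"]).ratio(),
+        difflib.SequenceMatcher(None, poi["addr"], restaurant["addr"]).ratio(),
+    ]
+
+# Tiny toy training set: one matched pair and one unmatched pair (made up).
+a = {"lat": 1.3000, "lon": 103.8000, "name": "Ah Hock Chicken Rice", "addr": "12 Orchard Rd"}
+b = {"lat": 1.3001, "lon": 103.8001, "name": "Ah Hock Chicken Rice Stall", "addr": "12 Orchard Road"}
+c = {"lat": 1.4500, "lon": 103.9000, "name": "Warung Sederhana", "addr": "88 Jalan Besar"}
+
+X = np.array([pair_features(a, b), pair_features(a, c)])
+y = np.array([1, 0])
+clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)
+print(clf.predict([pair_features(a, b)]))   # expected: [1]
+```
+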
+
+
+
+
+ + ☆ Supporting Student Decisions on Learning Recommendations: An LLM-Based + Chatbot with Knowledge Graph Contextualization for Conversational + Explainability and Mentoring + + +
+ Student commitment towards a learning recommendation is not separable from +their understanding of the reasons it was recommended to them; and their +ability to modify it based on that understanding. Among explainability +approaches, chatbots offer the potential to engage the student in a +conversation, similar to a discussion with a peer or a mentor. The capabilities +of chatbots, however, are still not sufficient to replace a human mentor, +despite the advancements of generative AI (GenAI) and large language models +(LLM). Therefore, we propose an approach to utilize chatbots as mediators of +the conversation and sources of limited and controlled generation of +explanations, to harvest the potential of LLMs while reducing their potential +risks at the same time. The proposed LLM-based chatbot supports students in +understanding learning-paths recommendations. We use a knowledge graph (KG) as +a human-curated source of information, to regulate the LLM's output through +defining its prompt's context. A group chat approach is developed to connect +students with human mentors, either on demand or in cases that exceed the +chatbot's pre-defined tasks. We evaluate the chatbot with a user study, to +provide a proof-of-concept and highlight the potential requirements and +limitations of utilizing chatbots in conversational explainability. + +
+
+
+
+
+ + ☆ The Gaps between Pre-train and Downstream Settings in Bias Evaluation + and Debiasing + + +
+ The output tendencies of Pre-trained Language Models (PLMs) vary markedly
+before and after Fine-Tuning (FT) due to the updates to the model parameters.
+These divergences in output tendencies result in a gap in the social biases of
+PLMs. For example, there exists a low correlation between the intrinsic bias
+scores of a PLM and its extrinsic bias scores under FT-based debiasing methods.
+Additionally, applying FT-based debiasing methods to a PLM leads to a decline in
+performance in downstream tasks. On the other hand, PLMs trained on large datasets
+can learn without parameter updates via In-Context Learning (ICL) using prompts.
+ICL induces smaller changes to PLMs compared to FT-based debiasing methods.
+Therefore, we hypothesize that the gap observed between pre-trained and FT models
+does not hold true for debiasing methods that use ICL. In this study, we
+demonstrate that ICL-based debiasing methods show a higher correlation between
+intrinsic and extrinsic bias scores compared to FT-based methods. Moreover, the
+performance degradation due to debiasing is also lower in the ICL case than in the
+FT case.
+
+
+
+
+
+ + ☆ EmoLLMs: A Series of Emotional Large Language Models and Annotation + Tools for Comprehensive Affective Analysis + + +
+ Sentiment analysis and emotion detection are important research topics in +natural language processing (NLP) and benefit many downstream tasks. With the +widespread application of LLMs, researchers have started exploring the +application of LLMs based on instruction-tuning in the field of sentiment +analysis. However, these models only focus on single aspects of affective +classification tasks (e.g. sentimental polarity or categorical emotions), and +overlook the regression tasks (e.g. sentiment strength or emotion intensity), +which leads to poor performance in downstream tasks. The main reason is the +lack of comprehensive affective instruction tuning datasets and evaluation +benchmarks, which cover various affective classification and regression tasks. +Moreover, although emotional information is useful for downstream tasks, +existing downstream datasets lack high-quality and comprehensive affective +annotations. In this paper, we propose EmoLLMs, the first series of +open-sourced instruction-following LLMs for comprehensive affective analysis +based on fine-tuning various LLMs with instruction data, the first multi-task +affective analysis instruction dataset (AAID) with 234K data samples based on +various classification and regression tasks to support LLM instruction tuning, +and a comprehensive affective evaluation benchmark (AEB) with 14 tasks from +various sources and domains to test the generalization ability of LLMs. We +propose a series of EmoLLMs by fine-tuning LLMs with AAID to solve various +affective instruction tasks. We compare our model with a variety of LLMs on +AEB, where our models outperform all other open-sourced LLMs, and surpass +ChatGPT and GPT-4 in most tasks, which shows that the series of EmoLLMs achieve +the ChatGPT-level and GPT-4-level generalization capabilities on affective +analysis tasks, and demonstrates our models can be used as affective annotation +tools. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Code Generation with AlphaCodium: From Prompt Engineering to Flow + Engineering + + +
+ Code generation problems differ from common natural language problems - they
+require matching the exact syntax of the target language, identifying happy paths
+and edge cases, paying attention to numerous small details in the problem spec,
+and addressing other code-specific issues and requirements. Hence, many of the
+optimizations and tricks that have been successful in natural language generation
+may not be effective for code tasks. In this work, we propose a new approach to
+code generation by LLMs, which we call AlphaCodium - a test-based, multi-stage,
+code-oriented iterative flow that improves the performance of LLMs on code
+problems. We tested AlphaCodium on a challenging code generation dataset called
+CodeContests, which includes competitive programming problems from platforms such
+as Codeforces. The proposed flow consistently and significantly improves results.
+On the validation set, for example, GPT-4 accuracy (pass@5) increased from 19%
+with a single well-designed direct prompt to 44% with the AlphaCodium flow. Many
+of the principles and best practices acquired in this work, we believe, are
+broadly applicable to general code generation tasks. Full implementation is
+available at: https://github.com/Codium-ai/AlphaCodium
+
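+ At its core, the test-based iterative flow described above is a
+generate-run-repair loop; a stripped-down skeleton is sketched below with stubbed
+generation and repair steps, which are assumptions rather than AlphaCodium's
+actual stages or prompts.
+
+```python
+def run_tests(code: str, tests: list[tuple[int, int]]) -> bool:
+    """Execute candidate code defining `solve(x)` and check it against the tests."""
+    namespace: dict = {}
+    try:
+        exec(code, namespace)
+        return all(namespace["solve"](x) == y for x, y in tests)
+    except Exception:
+        return False
+
+def generate_candidate(attempt: int) -> str:
+    """Stub for an LLM call; the second attempt "repairs" the off-by-one bug."""
+    if attempt == 0:
+        return "def solve(x):\n    return x * 2 + 1\n"   # deliberately wrong
+    return "def solve(x):\n    return x * 2\n"
+
+def iterative_flow(tests: list[tuple[int, int]], max_attempts: int = 5):
+    for attempt in range(max_attempts):
+        candidate = generate_candidate(attempt)
+        if run_tests(candidate, tests):        # keep iterating until tests pass
+            return candidate
+    return None
+
+if __name__ == "__main__":
+    print(iterative_flow([(1, 2), (3, 6)]))
+```
+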
+
+
+
+
+ + ☆ The Effect of Group Status on the Variability of Group Representations + in LLM-generated Text NeurIPS 2023 + + +
+ Large Language Models (LLMs) have become pervasive in everyday life, yet +their inner workings remain opaque. While scholarly efforts have demonstrated +LLMs' propensity to reproduce biases in their training data, they have +primarily focused on the association of social groups with stereotypic +attributes. In this paper, we extend this line of inquiry to investigate a bias +akin to the social-psychological phenomenon where socially dominant groups are +perceived to be less homogeneous than socially subordinate groups as it is +reproduced by LLMs. We had ChatGPT, a state-of-the-art LLM, generate a +diversity of texts about intersectional group identities and compared text +homogeneity. We consistently find that LLMs portray African, Asian, and +Hispanic Americans as more homogeneous than White Americans. They also portray +women as more homogeneous than men, but these differences are small. Finally, +we find that the effect of gender differs across racial/ethnic groups such that +the effect of gender is consistent within African and Hispanic Americans but +not within Asian and White Americans. We speculate possible sources of this +bias in LLMs and posit that the bias has the potential to amplify biases in +future LLM training and to reinforce stereotypes. + +
+
+ comment: Presented at the Socially Responsible Language Modelling Research + (SoLaR) Workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ Contrastive Perplexity for Controlled Generation: An Application in + Detoxifying Large Language Models + + +
+ The generation of undesirable and factually incorrect content of large +language models poses a significant challenge and remains largely an unsolved +issue. This paper studies the integration of a contrastive learning objective +for fine-tuning LLMs for implicit knowledge editing and controlled text +generation. Optimizing the training objective entails aligning text +perplexities in a contrastive fashion. To facilitate training the model in a +self-supervised fashion, we leverage an off-the-shelf LLM for training data +generation. We showcase applicability in the domain of detoxification. Herein, +the proposed approach leads to a significant decrease in the generation of +toxic content while preserving general utility for downstream tasks such as +commonsense reasoning and reading comprehension. The proposed approach is +conceptually simple but empirically powerful. + +
+
+
+
+
+ + ☆ Decentralised Emergence of Robust and Adaptive Linguistic Conventions in + Populations of Autonomous Agents Grounded in Continuous Worlds + + +
+ This paper introduces a methodology through which a population of autonomous +agents can establish a linguistic convention that enables them to refer to +arbitrary entities that they observe in their environment. The linguistic +convention emerges in a decentralised manner through local communicative +interactions between pairs of agents drawn from the population. The convention +consists of symbolic labels (word forms) associated to concept representations +(word meanings) that are grounded in a continuous feature space. The concept +representations of each agent are individually constructed yet compatible on a +communicative level. Through a range of experiments, we show (i) that the +methodology enables a population to converge on a communicatively effective, +coherent and human-interpretable linguistic convention, (ii) that it is +naturally robust against sensor defects in individual agents, (iii) that it can +effectively deal with noisy observations, uncalibrated sensors and +heteromorphic populations, (iv) that the method is adequate for continual +learning, and (v) that the convention self-adapts to changes in the environment +and communicative needs of the agents. + +
+
+
+
+
+ + ☆ Reinforcement Learning for Conversational Question Answering over + Knowledge Graph + + +
+ Conversational question answering (ConvQA) over law knowledge bases (KBs)
+involves answering multi-turn natural language questions about law, with the aim
+of finding answers in the law knowledge base. Although many methods have been
+proposed, existing law knowledge base ConvQA models assume that the input question
+is clear and perfectly reflects the user's intention. However, in the real world,
+input questions are noisy and inexplicit, which makes it hard for the model to
+find the correct answer in the law knowledge base. In this paper, we use
+reinforcement learning to address this problem. The reinforcement learning agent
+can automatically learn how to find the answer based on the input question and the
+conversation history, even when the input question is inexplicit. We test the
+proposed method on several real-world datasets, and the results show the
+effectiveness of the proposed model.
+
+
+
+
+
+ + ☆ Machine Translation with Large Language Models: Prompt Engineering for + Persian, English, and Russian Directions + + +
+ Generative large language models (LLMs) have demonstrated exceptional +proficiency in various natural language processing (NLP) tasks, including +machine translation, question answering, text summarization, and natural +language understanding. + To further enhance the performance of LLMs in machine translation, we +conducted an investigation into two popular prompting methods and their +combination, focusing on cross-language combinations of Persian, English, and +Russian. We employed n-shot feeding and tailored prompting frameworks. Our +findings indicate that multilingual LLMs like PaLM exhibit human-like machine +translation outputs, enabling superior fine-tuning of desired translation +nuances in accordance with style guidelines and linguistic considerations. +These models also excel in processing and applying prompts. However, the choice +of language model, machine translation task, and the specific source and target +languages necessitate certain considerations when adopting prompting frameworks +and utilizing n-shot in-context learning. + Furthermore, we identified errors and limitations inherent in popular LLMs as +machine translation tools and categorized them based on various linguistic +metrics. This typology of errors provides valuable insights for utilizing LLMs +effectively and offers methods for designing prompts for in-context learning. +Our report aims to contribute to the advancement of machine translation with +LLMs by improving both the accuracy and reliability of evaluation metrics. + +
+
+ comment: 34 pages, 46 figures +
+
+
+
+
+ + ☆ Ask the experts: sourcing high-quality datasets for nutritional + counselling through Human-AI collaboration + + +
+ Large Language Models (LLMs), with their flexible generation abilities, can +be powerful data sources in domains with few or no available corpora. However, +problems like hallucinations and biases limit such applications. In this case +study, we pick nutrition counselling, a domain lacking any public resource, and +show that high-quality datasets can be gathered by combining LLMs, +crowd-workers and nutrition experts. We first crowd-source and cluster a novel +dataset of diet-related issues, then work with experts to prompt ChatGPT into +producing related supportive text. Finally, we let the experts evaluate the +safety of the generated text. We release HAI-coaching, the first +expert-annotated nutrition counselling dataset containing ~2.4K dietary +struggles from crowd workers, and ~97K related supportive texts generated by +ChatGPT. Extensive analysis shows that ChatGPT while producing highly fluent +and human-like text, also manifests harmful behaviours, especially in sensitive +topics like mental health, making it unsuitable for unsupervised use. + +
+
+
+
+
+ + ☆ Contrastive Preference Optimization: Pushing the Boundaries of LLM + Performance in Machine Translation + + +
+ Moderate-sized large language models (LLMs) -- those with 7B or 13B parameters
+-- exhibit promising machine translation (MT) performance. However, even the
+top-performing 13B LLM-based translation models, like ALMA, do not match the
+performance of state-of-the-art conventional encoder-decoder translation models or
+larger-scale LLMs such as GPT-4. In this study, we bridge this performance gap. We
+first assess the shortcomings of supervised fine-tuning for LLMs in the MT task,
+emphasizing the quality issues present in the reference data even though it is
+human-generated. Then, in contrast to SFT, which mimics reference translations, we
+introduce Contrastive Preference Optimization (CPO), a novel approach that trains
+models to avoid generating adequate but not perfect translations. Applying CPO to
+ALMA models with only 22K parallel sentences and 12M parameters yields significant
+improvements. The resulting model, called ALMA-R, can match or exceed the
+performance of the WMT competition winners and GPT-4 on the WMT'21, WMT'22, and
+WMT'23 test datasets.
+
+
+
+
+
+ + ☆ RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on + Agriculture + + +
+ There are two common ways in which developers incorporate proprietary and
+domain-specific data when building applications of Large Language Models (LLMs):
+Retrieval-Augmented Generation (RAG) and Fine-Tuning. RAG augments the prompt with
+the external data, while fine-tuning incorporates the additional knowledge into
+the model itself. However, the pros and cons of both approaches are not well
+understood. In this paper, we propose a pipeline for fine-tuning and RAG, and
+present the tradeoffs of both for multiple popular LLMs, including Llama2-13B,
+GPT-3.5, and GPT-4. Our pipeline consists of multiple stages, including extracting
+information from PDFs, generating questions and answers, using them for
+fine-tuning, and leveraging GPT-4 for evaluating the results. We propose metrics
+to assess the performance of different stages of the RAG and fine-tuning pipeline.
+We conduct an in-depth study on an agricultural dataset. Agriculture as an
+industry has not seen much penetration of AI, and we study a potentially
+disruptive application - what if we could provide location-specific insights to a
+farmer? Our results show the effectiveness of our dataset generation pipeline in
+capturing geographic-specific knowledge, and the quantitative and qualitative
+benefits of RAG and fine-tuning. We see an accuracy increase of over 6 p.p. when
+fine-tuning the model, and this is cumulative with RAG, which increases accuracy
+by a further 5 p.p. In one particular experiment, we also demonstrate that the
+fine-tuned model leverages information from across geographies to answer specific
+questions, increasing answer similarity from 47% to 72%. Overall, the results
+point to how systems built using LLMs can be adapted to respond to and incorporate
+knowledge across a dimension that is critical for a specific industry, paving the
+way for further applications of LLMs in other industrial domains.
+
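+ Since the abstract contrasts RAG with fine-tuning, a minimal retrieval-augmented
+prompt builder is sketched below: chunks are embedded, the top-k nearest to the
+question are retrieved, and they are prepended to the prompt. The bag-of-words
+embedding is a deliberately crude stand-in for a trained encoder, and the chunk
+texts are invented.
+
+```python
+import numpy as np
+
+docs = [
+    "Loamy soil in the Willamette Valley drains well after spring rain.",
+    "Wheat planted after October 15 risks frost damage in this region.",
+    "Drip irrigation reduces fungal pressure compared to overhead watering.",
+]
+
+def embed(text: str, vocab: dict[str, int]) -> np.ndarray:
+    """Crude bag-of-words embedding; a real system would use a trained encoder."""
+    vec = np.zeros(len(vocab))
+    for tok in text.lower().split():
+        if tok in vocab:
+            vec[vocab[tok]] += 1.0
+    norm = np.linalg.norm(vec)
+    return vec / norm if norm else vec
+
+vocab = {tok: i for i, tok in enumerate(sorted({t for d in docs for t in d.lower().split()}))}
+doc_vecs = np.stack([embed(d, vocab) for d in docs])
+
+def build_rag_prompt(question: str, k: int = 2) -> str:
+    scores = doc_vecs @ embed(question, vocab)
+    context = "\n".join(docs[i] for i in np.argsort(scores)[::-1][:k])
+    return f"Context:\n{context}\n\nQuestion: {question}\nAnswer:"
+
+print(build_rag_prompt("When should wheat be planted to avoid frost?"))
+```
+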
+
+
+
+
+ + ☆ Hidden Flaws Behind Expert-Level Accuracy of GPT-4 Vision in Medicine + + +
+ Recent studies indicate that Generative Pre-trained Transformer 4 with Vision +(GPT-4V) outperforms human physicians in medical challenge tasks. However, +these evaluations primarily focused on the accuracy of multi-choice questions +alone. Our study extends the current scope by conducting a comprehensive +analysis of GPT-4V's rationales of image comprehension, recall of medical +knowledge, and step-by-step multimodal reasoning when solving New England +Journal of Medicine (NEJM) Image Challenges - an imaging quiz designed to test +the knowledge and diagnostic capabilities of medical professionals. Evaluation +results confirmed that GPT-4V outperforms human physicians regarding +multi-choice accuracy (88.0% vs. 77.0%, p=0.034). GPT-4V also performs well in +cases where physicians incorrectly answer, with over 80% accuracy. However, we +discovered that GPT-4V frequently presents flawed rationales in cases where it +makes the correct final choices (27.3%), most prominent in image comprehension +(21.6%). Regardless of GPT-4V's high accuracy in multi-choice questions, our +findings emphasize the necessity for further in-depth evaluations of its +rationales before integrating such models into clinical workflows. + +
+
+
+
+
+ + ☆ DoraemonGPT: Toward Understanding Dynamic Scenes with Large Language + Models + + +
+ The field of AI agents is advancing at an unprecedented rate due to the +capabilities of large language models (LLMs). However, LLM-driven visual agents +mainly focus on solving tasks for the image modality, which limits their +ability to understand the dynamic nature of the real world, making it still far +from real-life applications, e.g., guiding students in laboratory experiments +and identifying their mistakes. Considering the video modality better reflects +the ever-changing and perceptually intensive nature of real-world scenarios, we +devise DoraemonGPT, a comprehensive and conceptually elegant system driven by +LLMs to handle dynamic video tasks. Given a video with a question/task, +DoraemonGPT begins by converting the input video with massive content into a +symbolic memory that stores \textit{task-related} attributes. This structured +representation allows for spatial-temporal querying and reasoning by sub-task +tools, resulting in concise and relevant intermediate results. Recognizing that +LLMs have limited internal knowledge when it comes to specialized domains +(e.g., analyzing the scientific principles underlying experiments), we +incorporate plug-and-play tools to assess external knowledge and address tasks +across different domains. Moreover, we introduce a novel LLM-driven planner +based on Monte Carlo Tree Search to efficiently explore the large planning +space for scheduling various tools. The planner iteratively finds feasible +solutions by backpropagating the result's reward, and multiple solutions can be +summarized into an improved final answer. We extensively evaluate DoraemonGPT +in dynamic scenes and provide in-the-wild showcases demonstrating its ability +to handle more complex questions than previous studies. + +
+
+
+
+
+ + ♻ ☆ Linguistic and Structural Basis of Engineering Design Knowledge + + +
+ Artefact descriptions are the primary carriers of engineering design +knowledge that is both an outcome and a driver of the design process. While an +artefact could be described in different connotations, the design process +requires a description to embody engineering design knowledge, which is +expressed in the text through intricate placement of entities and +relationships. As large-language models learn from all kinds of text merely as +a sequence of characters/tokens, these are yet to generate text that embodies +explicit engineering design facts. Existing ontological design theories are +less likely to guide the large-language models whose applications are currently +limited to ideation and learning purposes. In this article, we explicate +engineering design knowledge as knowledge graphs from a large sample of 33,881 +patent documents. We examine the constituents of these knowledge graphs to +understand the linguistic and structural basis of engineering design knowledge. +In terms of linguistic basis, we observe that entities and relationships could +be generalised to 64 and 24 linguistic syntaxes. While relationships mainly +capture attributes ('of'), structure ('in', 'with'), purpose ('to', 'for'), +hierarchy ('include'), exemplification ('such as'), and behaviour ('to', +'from'), the hierarchical relationships could specifically be identified using +75 unique syntaxes. To understand the structural basis, we draw inspiration +from various studies on biological/ecological networks and discover motifs from +patent knowledge graphs. We identify four 3-node and four 4-node patterns that +could further be converged and simplified into sequence [->...->], aggregation +[->...<-], and hierarchy [<-...->]. Expected to guide large-language model +based design tools, we propose few regulatory precepts for concretising +abstract entities and relationships within subgraphs, while explicating +hierarchical structures. + +
+
+
+
+
+ + ♻ ☆ CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language + Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insights into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023; updated with CLadder dataset v1.5 +
+
+
+
+
+ + ♻ ☆ Knowledge Graph Error Detection with Contrastive Confidence Adaption AAAI + + +
+ Knowledge graphs (KGs) often contain various errors. Previous works on +detecting errors in KGs mainly rely on triplet embedding from graph structure. +We conduct an empirical study and find that these works struggle to +discriminate noise from semantically-similar correct triplets. In this paper, +we propose a KG error detection model CCA to integrate both textual and graph +structural information from triplet reconstruction for better distinguishing +semantics. We design interactive contrastive learning to capture the +differences between textual and structural patterns. Furthermore, we construct +realistic datasets with semantically-similar noise and adversarial noise. +Experimental results demonstrate that CCA outperforms state-of-the-art +baselines, especially in detecting semantically-similar noise and adversarial +noise. + +
+
+ comment: Accepted in the 38th AAAI Conference on Artificial Intelligence (AAAI + 2024) +
+
+
+
+
+ + ♻ ☆ Diffusion Language Models Generation Can Be Halted Early + + +
+ Diffusion Language Models (DLMs) are a promising avenue for text generation
+due to their practical properties for tractable, controllable generation. They
+also have the advantage of not having to predict text autoregressively. However,
+despite these notable features, DLMs have not yet reached the performance levels
+of their autoregressive counterparts. One of the ways to reduce the performance
+gap between these two types of language models is to speed up the generation of
+DLMs. Therefore, we propose a pioneering methodology to address this issue in this
+work. It enables the execution of more generation steps within a given time frame,
+potentially leading to higher-quality outputs. Specifically, our methods estimate
+a DLM's completeness of text generation and allow adaptive halting of the
+generation process. We test and refine our methods on the Plaid, SSD, and CDCD
+DLMs and create a cohesive perspective on their generation workflows. Finally, we
+confirm that our methods allow halting Plaid, SSD, and CDCD models and decrease
+the generation time by 10-40% without a drop in the quality of model samples.
+
+
+
+
+
+ + ♻ ☆ Learning a Structural Causal Model for Intuition Reasoning in + Conversation + + +
+ Reasoning, a crucial aspect of NLP research, has not been adequately +addressed by prevailing models including Large Language Model. Conversation +reasoning, as a critical component of it, remains largely unexplored due to the +absence of a well-designed cognitive model. In this paper, inspired by +intuition theory on conversation cognition, we develop a conversation cognitive +model (CCM) that explains how each utterance receives and activates channels of +information recursively. Besides, we algebraically transformed CCM into a +structural causal model (SCM) under some mild assumptions, rendering it +compatible with various causal discovery methods. We further propose a +probabilistic implementation of the SCM for utterance-level relation reasoning. +By leveraging variational inference, it explores substitutes for implicit +causes, addresses the issue of their unobservability, and reconstructs the +causal representations of utterances through the evidence lower bounds. +Moreover, we constructed synthetic and simulated datasets incorporating +implicit causes and complete cause labels, alleviating the current situation +where all available datasets are implicit-causes-agnostic. Extensive +experiments demonstrate that our proposed method significantly outperforms +existing methods on synthetic, simulated, and real-world datasets. Finally, we +analyze the performance of CCM under latent confounders and propose theoretical +ideas for addressing this currently unresolved issue. + +
+
+
+
+
+ + ♻ ☆ Translatotron 3: Speech to Speech Translation with Monolingual Data ICASSP 2024 + + +
+ This paper presents Translatotron 3, a novel approach to unsupervised direct
+speech-to-speech translation from monolingual speech-text datasets that combines a
+masked autoencoder, unsupervised embedding mapping, and back-translation.
+Experimental results on speech-to-speech translation tasks between Spanish and
+English show that Translatotron 3 outperforms a baseline cascade system, reporting
+an 18.14 BLEU-point improvement on the synthesized Unpaired-Conversational
+dataset. In contrast to supervised approaches that necessitate real paired data or
+specialized modeling to replicate para-/non-linguistic information such as pauses,
+speaking rates, and speaker identity, Translatotron 3 demonstrates its capability
+to retain this information. Audio samples can be found at
+http://google-research.github.io/lingvo-lab/translatotron3
+
+
+ comment: To appear in ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ "Paraphrasing The Original Text" Makes High Accuracy Long-Context QA + + +
+ Most open-source generative language models currently have a context window of
+no more than 4k tokens, limiting their ability to handle long text. Many previous
+efforts have tried to extend the context window of models, but their actual
+effects have been found to be very limited. To address this issue, we
+theoretically analyze the effectiveness of long-context training data and find
+that long-context training requires "effective" data rather than simply "long"
+data, which is rarely noticed in previous studies. Thus, we propose adding
+"original text paraphrasing" to enhance the effectiveness of the data. The model
+trained on our refined dataset obtains excellent long-context capabilities and
+achieves state-of-the-art accuracy on multi-document retrieval and QA tasks among
+models of comparable scale. The model and training data have been made available
+on HuggingFace (https://huggingface.co/yuyijiong/Qwen-14b-chat-yarn-32k) and
+WiseModel (https://wisemodel.cn/models/yuyijiong/Qwen-14b-chat-yarn-32k).
+
+
+ comment: Chinese version of this paper can be downloaded from + (https://cloud.tsinghua.edu.cn/d/5894ec4442e54a6aac96/) +
+
+
+
+
+ + ♻ ☆ Self-Supervised Models of Speech Infer Universal Articulatory Kinematics + + +
+ Self-Supervised Learning (SSL) based models of speech have shown remarkable +performance on a range of downstream tasks. These state-of-the-art models have +remained black boxes, but many recent studies have begun "probing" models like +HuBERT to correlate their internal representations to different aspects of +speech. In this paper, we show "inference of articulatory kinematics" as a +fundamental property of SSL models, i.e., the ability of these models to +transform acoustics into the causal articulatory dynamics underlying the speech +signal. We also show that this abstraction is largely shared across the +languages of the data used to train the model, with a preference for languages +with similar phonological systems. Furthermore, we show that with simple affine +transformations, Acoustic-to-Articulatory inversion (AAI) is transferable +across speakers, even across genders, languages, and dialects, showing the +generalizability of this property. Together, these results shed new light on +the internals of SSL models that are critical to their superior performance, +and open up new avenues into language-agnostic universal models for speech +engineering that are interpretable and grounded in speech science. + +
+
+
+
+
+ + ♻ ☆ ALYMPICS: LLM Agents Meet Game Theory -- Exploring Strategic + Decision-Making with AI Agents + + +
+ This paper introduces Alympics (Olympics for Agents), a systematic simulation +framework utilizing Large Language Model (LLM) agents for game theory research. +Alympics creates a versatile platform for studying complex game theory +problems, bridging the gap between theoretical game theory and empirical +investigations by providing a controlled environment for simulating human-like +strategic interactions with LLM agents. In our pilot case study, the "Water +Allocation Challenge," we explore Alympics through a challenging strategic game +focused on a multi-round auction over scarce survival resources. This study +demonstrates the framework's ability to qualitatively and quantitatively +analyze game determinants, strategies, and outcomes. Additionally, we conduct a +comprehensive human assessment and an in-depth evaluation of LLM agents in +strategic decision-making scenarios. Our findings not only expand the +understanding of LLM agents' proficiency in emulating human strategic behavior +but also highlight their potential in advancing game theory knowledge, thereby +enriching our understanding of game theory and empowering further research +into strategic decision-making domains with LLM agents. Codes, prompts, and all +related resources are available at https://github.com/microsoft/Alympics. + +
+
+
+
+
+ + ♻ ☆ Generalizing Visual Question Answering from Synthetic to Human-Written + Questions via a Chain of QA with a Large Language Model + + +
+ Visual question answering (VQA) is a task where an image is given, and a +series of questions are asked about the image. To build an efficient VQA +algorithm, a large amount of QA data is required which is very expensive. +Generating synthetic QA pairs based on templates is a practical way to obtain +data. However, VQA models trained on those data do not perform well on complex, +human-written questions. To address this issue, we propose a new method called +{\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a +sequence of QA interactions between a large language model and a VQA model +trained on synthetic data to reason and derive logical answers for +human-written questions. We tested the effectiveness of CoQAH on two types of +human-written VQA datasets for 3D-rendered and chest X-ray images and found +that it achieved state-of-the-art accuracy in both types of data. Notably, +CoQAH outperformed general vision-language models, VQA models, and medical +foundation models with no finetuning. + +
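As a rough illustration of the chain-of-QA idea (not the authors' implementation; `ask_llm` and `ask_vqa` are hypothetical placeholders for the language model and the synthetic-data VQA model), the control loop might look like this:

```python
# Hedged sketch of a CoQAH-style loop: the LLM decomposes a human-written
# question into simpler, template-like sub-questions that the synthetic-data
# VQA model can answer, then aggregates the answers into a final response.
def coqah_answer(image, question, ask_llm, ask_vqa, max_rounds=5):
    dialogue = []
    for _ in range(max_rounds):
        prompt = (
            f"Target question: {question}\n"
            f"Sub-questions answered so far: {dialogue}\n"
            "Reply with either 'SUBQ: <a simpler question>' "
            "or 'FINAL: <answer to the target question>'."
        )
        reply = ask_llm(prompt)
        if reply.startswith("FINAL:"):
            return reply.split("FINAL:", 1)[1].strip()
        sub_q = reply.split("SUBQ:", 1)[-1].strip()
        dialogue.append((sub_q, ask_vqa(image, sub_q)))  # VQA model answers the sub-question
    # Budget exhausted: fall back to a direct answer over the collected sub-QAs.
    return ask_llm(f"Given the sub-QA pairs {dialogue}, answer: {question}")
```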
+
+
+
+
+ + ♻ ☆ SD-HuBERT: Sentence-Level Self-Distillation Induces Syllabic + Organization in HuBERT + + +
+ Data-driven unit discovery in self-supervised learning (SSL) of speech has +embarked on a new era of spoken language processing. Yet, the discovered units +often remain in phonetic space and the units beyond phonemes are largely +underexplored. Here, we demonstrate that a syllabic organization emerges in +learning sentence-level representation of speech. In particular, we adopt +"self-distillation" objective to fine-tune the pretrained HuBERT with an +aggregator token that summarizes the entire sentence. Without any supervision, +the resulting model draws definite boundaries in speech, and the +representations across frames exhibit salient syllabic structures. We +demonstrate that this emergent structure largely corresponds to the ground +truth syllables. Furthermore, we propose a new benchmark task, Spoken Speech +ABX, for evaluating sentence-level representation of speech. When compared to +previous models, our model outperforms in both unsupervised syllable discovery +and learning sentence-level representation. Together, we demonstrate that the +self-distillation of HuBERT gives rise to syllabic organization without relying +on external labels or modalities, and potentially provides novel data-driven +units for spoken language modeling. + +
+
+
+
+
+ + ♻ ☆ DinoSR: Self-Distillation and Online Clustering for Self-supervised + Speech Representation Learning + + +
+ In this paper, we introduce self-distillation and online clustering for +self-supervised speech representation learning (DinoSR) which combines masked +language modeling, self-distillation, and online clustering. We show that these +concepts complement each other and result in a strong representation learning +model for speech. DinoSR first extracts contextualized embeddings from the +input audio with a teacher network, then runs an online clustering system on +the embeddings to yield a machine-discovered phone inventory, and finally uses +the discretized tokens to guide a student network. We show that DinoSR +surpasses previous state-of-the-art performance in several downstream tasks, +and provide a detailed analysis of the model and the learned discrete units. + +
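A much-simplified sketch of this training loop (assumptions: a single codebook, an externally maintained teacher, and an L2 nearest-centroid assignment; the released implementation differs in detail):

```python
import torch
import torch.nn.functional as F

def dinosr_step(student, teacher, codebook, feats, mask, optimizer, decay=0.99):
    """student: encoder mapping (B, T, D_in) -> (B, T, K) unit logits;
    teacher: encoder mapping (B, T, D_in) -> (B, T, D) embeddings;
    codebook: (K, D) tensor of online cluster centroids; mask: (B, T) bool of masked frames."""
    with torch.no_grad():
        t_emb = teacher(feats)                                       # teacher embeddings
        dist = (t_emb.unsqueeze(-2) - codebook).pow(2).sum(-1)       # (B, T, K) squared distances
        targets = dist.argmin(dim=-1)                                # machine-discovered unit ids
        flat_emb = t_emb.reshape(-1, t_emb.size(-1))
        flat_tgt = targets.reshape(-1)
        for k in flat_tgt.unique():                                  # online clustering update
            codebook[k] = decay * codebook[k] + (1 - decay) * flat_emb[flat_tgt == k].mean(0)

    logits = student(feats)                                          # (B, T, K) unit predictions
    loss = F.cross_entropy(logits[mask], targets[mask])              # guide student on masked frames
    optimizer.zero_grad(); loss.backward(); optimizer.step()
    return loss.item()
```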
+
+
+
+
+ + ♻ ☆ Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt + Generation for Few-shot Learning AAAI 2024 + + +
+ The prompt-based paradigm for pre-trained language models (PLMs) has succeeded +substantially in few-shot natural language processing (NLP) tasks. However, +prior discrete prompt optimization methods require expert knowledge to design +the base prompt set and identify high-quality prompts, which is costly, +inefficient, and subjective. Meanwhile, existing continuous prompt optimization +methods improve the performance by learning the ideal prompts through the +gradient information of PLMs, but their high computational cost and low +readability and generalizability are often concerning. To address the research +gap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt +Optimization ($DP_2O$) method. We first design a multi-round dialogue alignment +strategy, based on GPT-4, to generate a readable prompt set. Furthermore, we +propose an efficient prompt screening metric to identify high-quality prompts +with linear complexity. Finally, we construct a reinforcement learning (RL) +framework based on policy gradients to match the prompts to inputs optimally. +By training a policy network with only 0.67% of the PLM parameter size on the +tasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA) +method by 1.52% in accuracy on average on four open-source datasets. Moreover, +subsequent experiments also demonstrate that $DP_2O$ has good universality, +robustness, and generalization ability. + +
+
+ comment: AAAI 2024 Main Track +
+
+
+
+
+ + ♻ ☆ A Content-Based Novelty Measure for Scholarly Publications: A Proof of + Concept + + +
+ Novelty, akin to gene mutation in evolution, opens possibilities for +scholarly advancement. Although peer review remains the gold standard for +evaluating novelty in scholarly communication and resource allocation, the vast +volume of submissions necessitates an automated measure of scholarly novelty. +Adopting a perspective that views novelty as the atypical combination of +existing knowledge, we introduce an information-theoretic measure of novelty in +scholarly publications. This measure quantifies the degree of 'surprise' +perceived by a language model that represents the word distribution of +scholarly discourse. The proposed measure is accompanied by face and construct +validity evidence; the former demonstrates correspondence to scientific common +sense, and the latter is endorsed through alignment with novelty evaluations +from a select panel of domain experts. Additionally, characterized by its +interpretability, fine granularity, and accessibility, this measure addresses +gaps prevalent in existing methods. We believe this measure holds great +potential to benefit editors, stakeholders, and policymakers, and it provides a +reliable lens for examining the relationship between novelty and academic +dynamics such as creativity, interdisciplinarity, and scientific advances. + +
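The paper's measure is more involved, but the core notion of model "surprise" can be illustrated with the average per-token negative log-likelihood under an off-the-shelf language model (GPT-2 here is an arbitrary stand-in, not the model used by the authors):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def surprisal(text: str) -> float:
    """Average per-token negative log-likelihood of `text` under the model."""
    ids = tokenizer(text, return_tensors="pt").input_ids
    with torch.no_grad():
        out = model(ids, labels=ids)   # Hugging Face returns the mean token NLL as .loss
    return out.loss.item()

# An atypical combination of concepts should score higher (more "surprising") than boilerplate.
print(surprisal("We combine quantum annealing with sourdough fermentation to study protein folding."))
print(surprisal("In this paper, we propose a deep learning method for image classification."))
```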
+
+ comment: Accepted for publication in the proceedings of iConference2024 +
+
+
+
+
+ + ♻ ☆ CryCeleb: A Speaker Verification Dataset Based on Infant Cry Sounds ICASSP 2024 + + +
+ This paper describes the Ubenwa CryCeleb dataset - a labeled collection of +infant cries - and the accompanying CryCeleb 2023 task, which is a public +speaker verification challenge based on cry sounds. We released more than 6 +hours of manually segmented cry sounds from 786 newborns for academic use, +aiming to encourage research in infant cry analysis. The inaugural public +competition attracted 59 participants, 11 of whom improved the baseline +performance. The top-performing system achieved a significant improvement +scoring 25.8% equal error rate, which is still far from the performance of +state-of-the-art adult speaker verification systems. Therefore, we believe +there is room for further research on this dataset, potentially extending +beyond the verification task. + +
+
+ comment: To appear in ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ BiomedCLIP: a multimodal biomedical foundation model pretrained from + fifteen million scientific image-text pairs + + +
+ Biomedical data is inherently multimodal, comprising physical measurements +and natural language narratives. A generalist biomedical AI model needs to +simultaneously process different modalities of data, including text and images. +Therefore, training an effective generalist biomedical model requires +high-quality multimodal data, such as parallel image-text pairs. Here, we +present PMC-15M, a novel dataset that is two orders of magnitude larger than +existing biomedical multimodal datasets such as MIMIC-CXR, and spans a diverse +range of biomedical image types. PMC-15M contains 15 million biomedical +image-text pairs collected from 4.4 million scientific articles. Based on +PMC-15M, we have pretrained BiomedCLIP, a multimodal foundation model, with +domain-specific adaptations tailored to biomedical vision-language processing. +We conducted extensive experiments and ablation studies on standard biomedical +imaging tasks from retrieval to classification to visual question-answering +(VQA). BiomedCLIP achieved new state-of-the-art results in a wide range of +standard datasets, substantially outperforming prior approaches. Intriguingly, +by large-scale pretraining on diverse biomedical image types, BiomedCLIP even +outperforms state-of-the-art radiology-specific models such as BioViL in +radiology-specific tasks such as RSNA pneumonia detection. In summary, +BiomedCLIP is a fully open-access foundation model that achieves +state-of-the-art performance on various biomedical tasks, paving the way for +transformative multimodal biomedical discovery and applications. We release our +models at https://aka.ms/biomedclip to facilitate future research in multimodal +biomedical AI. + +
+
+ comment: The models are released at https://aka.ms/biomedclip +
+
+
+
+
+ + ♻ ☆ AboutMe: Using Self-Descriptions in Webpages to Document the Effects of + English Pretraining Data Filters + + +
+ Large language models' (LLMs) abilities are drawn from their pretraining +data, and model development begins with data curation. However, decisions +around what data is retained or removed during this initial stage are +under-scrutinized. In our work, we ground web text, which is a popular +pretraining data source, to its social and geographic contexts. We create a new +dataset of 10.3 million self-descriptions of website creators, and extract +information about who they are and where they are from: their topical +interests, social roles, and geographic affiliations. Then, we conduct the +first study investigating how ten "quality" and English language identification +(langID) filters affect webpages that vary along these social dimensions. Our +experiments illuminate a range of implicit preferences in data curation: we +show that some quality classifiers act like topical domain filters, and langID +can overlook English content from some regions of the world. Overall, we hope +that our work will encourage a new line of research on pretraining data +curation practices and their social implications. + +
+
+ comment: 28 pages, 13 figures +
+
+
+
+
+ + ♻ ☆ Large Language Model Displays Emergent Ability to Interpret Novel + Literary Metaphors + + +
+ Recent advances in the performance of large language models (LLMs) have +sparked debate over whether, given sufficient training, high-level human +abilities emerge in such generic forms of artificial intelligence (AI). Despite +the exceptional performance of LLMs on a wide range of tasks involving natural +language processing and reasoning, there has been sharp disagreement as to +whether their abilities extend to more creative human abilities. A core example +is the ability to interpret novel metaphors. Given the enormous and non curated +text corpora used to train LLMs, a serious obstacle to designing tests is the +requirement of finding novel yet high quality metaphors that are unlikely to +have been included in the training data. Here we assessed the ability of GPT4, +a state of the art large language model, to provide natural-language +interpretations of novel literary metaphors drawn from Serbian poetry and +translated into English. Despite exhibiting no signs of having been exposed to +these metaphors previously, the AI system consistently produced detailed and +incisive interpretations. Human judges, blind to the fact that an AI model was +involved, rated metaphor interpretations generated by GPT4 as superior to those +provided by a group of college students. In interpreting reversed metaphors, +GPT4, as well as humans, exhibited signs of sensitivity to the Gricean +cooperative principle. In addition, for several novel English poems GPT4 +produced interpretations that were rated as excellent or good by a human +literary critic. These results indicate that LLMs such as GPT4 have acquired an +emergent ability to interpret complex metaphors, including those embedded in +novel poems. + +
+
+
+
+
+ + ♻ ☆ VeRA: Vector-based Random Matrix Adaptation ICLR 2024 + + +
+ Low-rank adaptation (LoRA) is a popular method that reduces the number of +trainable parameters when finetuning large language models, but still faces +acute storage challenges when scaling to even larger models or deploying +numerous per-user or per-task adapted models. In this work, we present +Vector-based Random Matrix Adaptation (VeRA), which significantly reduces the +number of trainable parameters compared to LoRA, yet maintains the same +performance. It achieves this by using a single pair of low-rank matrices +shared across all layers and learning small scaling vectors instead. We +demonstrate its effectiveness on the GLUE and E2E benchmarks and image +classification tasks, and show its application in instruction-tuning of 7B and +13B language models. + +
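A minimal sketch of the parameterization described above (initialization values and the exact placement of the scaling vectors are assumptions based on the abstract, not the released code):

```python
import torch
import torch.nn as nn

class VeRALinear(nn.Module):
    """Wraps a frozen pretrained linear layer; only the vectors d and b are trained.
    shared_A (r, in_features) and shared_B (out_features, r) are frozen random
    matrices shared by every adapted layer in the network."""
    def __init__(self, base: nn.Linear, shared_A: torch.Tensor, shared_B: torch.Tensor):
        super().__init__()
        self.base = base.requires_grad_(False)
        self.register_buffer("A", shared_A)
        self.register_buffer("B", shared_B)
        self.d = nn.Parameter(torch.full((shared_A.size(0),), 0.1))  # trainable scaling vector
        self.b = nn.Parameter(torch.zeros(shared_B.size(0)))         # trainable scaling vector

    def forward(self, x):
        delta = (x @ self.A.T) * self.d        # Lambda_d A x
        delta = (delta @ self.B.T) * self.b    # Lambda_b B Lambda_d A x
        return self.base(x) + delta
```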
+
+ comment: Accepted at ICLR 2024, website: https://dkopi.github.io/vera +
+
+
+
+
+ + ♻ ☆ The Impact of Reasoning Step Length on Large Language Models + + +
+ Chain of Thought (CoT) is significant in improving the reasoning abilities of +large language models (LLMs). However, the correlation between the +effectiveness of CoT and the length of reasoning steps in prompts remains +largely unknown. To shed light on this, we have conducted several empirical +experiments to explore the relations. Specifically, we design experiments that +expand and compress the rationale reasoning steps within CoT demonstrations, +while keeping all other factors constant. We have the following key findings. +First, the results indicate that lengthening the reasoning steps in prompts, +even without adding new information into the prompt, considerably enhances +LLMs' reasoning abilities across multiple datasets. Alternatively, shortening +the reasoning steps, even while preserving the key information, significantly +diminishes the reasoning abilities of models. This finding highlights the +importance of the number of steps in CoT prompts and provides practical +guidance to make better use of LLMs' potential in complex problem-solving +scenarios. Second, we also investigated the relationship between the +performance of CoT and the rationales used in demonstrations. Surprisingly, the +result shows that even incorrect rationales can yield favorable outcomes if +they maintain the requisite length of inference. Third, we observed that the +advantages of increasing reasoning steps are task-dependent: simpler tasks +require fewer steps, whereas complex tasks gain significantly from longer +inference sequences. + +
+
+
+
+
+ + ♻ ☆ Beyond Extraction: Contextualising Tabular Data for Efficient + Summarisation by Language Models + + +
+ The conventional use of the Retrieval-Augmented Generation (RAG) architecture +has proven effective for retrieving information from diverse documents. +However, challenges arise in handling complex table queries, especially within +PDF documents containing intricate tabular structures. This research introduces +an innovative approach to enhance the accuracy of complex table queries in +RAG-based systems. Our methodology involves storing PDFs in the retrieval +database and extracting tabular content separately. The extracted tables +undergo a process of context enrichment, concatenating headers with +corresponding values. To ensure a comprehensive understanding of the enriched +data, we employ a fine-tuned version of the Llama-2-chat language model for +summarisation within the RAG architecture. Furthermore, we augment the tabular +data with contextual sense using the ChatGPT 3.5 API through a one-shot prompt. +This enriched data is then fed into the retrieval database alongside other +PDFs. Our approach aims to significantly improve the precision of complex table +queries, offering a promising solution to a longstanding challenge in +information retrieval. + +
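A toy sketch of the header-value concatenation step (the exact serialization format is an assumption; the full pipeline also involves LLM-based summarisation and enrichment):

```python
def enrich_table(headers, rows, caption=""):
    """Turn a bare cell grid into self-contained 'header: value' passages for retrieval."""
    passages = []
    for i, row in enumerate(rows, start=1):
        cells = "; ".join(f"{h}: {v}" for h, v in zip(headers, row))
        passages.append(f"{caption} (row {i}): {cells}".strip())
    return passages

headers = ["Region", "Revenue (USD M)", "YoY growth"]
rows = [["EMEA", "412", "+8%"], ["APAC", "287", "+15%"]]
print(enrich_table(headers, rows, caption="Table 3: quarterly results"))
```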
+
+
+
+
+ + ♻ ☆ Human Feedback is not Gold Standard ICLR 2024 + + +
+ Human feedback has become the de facto standard for evaluating the +performance of Large Language Models, and is increasingly being used as a +training objective. However, it is not clear which properties of a generated +output this single `preference' score captures. We hypothesise that preference +scores are subjective and open to undesirable biases. We critically analyse the +use of human feedback for both training and evaluation, to verify whether it +fully captures a range of crucial error criteria. We find that while preference +scores have fairly good coverage, they under-represent important aspects like +factuality. We further hypothesise that both preference scores and error +annotation may be affected by confounders, and leverage instruction-tuned +models to generate outputs that vary along two possible confounding dimensions: +assertiveness and complexity. We find that the assertiveness of an output skews +the perceived rate of factuality errors, indicating that human annotations are +not a fully reliable evaluation metric or training objective. Finally, we offer +preliminary evidence that using human feedback as a training objective +disproportionately increases the assertiveness of model outputs. We encourage +future work to carefully consider whether preference scores are well aligned +with the desired objective. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ SwissBERT: The Multilingual Language Model for Switzerland + + +
+ We present SwissBERT, a masked language model created specifically for +processing Switzerland-related text. SwissBERT is a pre-trained model that we +adapted to news articles written in the national languages of Switzerland -- +German, French, Italian, and Romansh. We evaluate SwissBERT on natural language +understanding tasks related to Switzerland and find that it tends to outperform +previous models on these tasks, especially when processing contemporary news +and/or Romansh Grischun. Since SwissBERT uses language adapters, it may be +extended to Swiss German dialects in future work. The model and our open-source +code are publicly released at https://github.com/ZurichNLP/swissbert. + +
+
+ comment: SwissText 2023 [v3: Changed template because the proceedings moved to + a different publisher. Same content.] +
+
+
+
+
+ + ♻ ☆ MosaicBERT: A Bidirectional Encoder Optimized for Fast Pretraining + + +
+ Although BERT-style encoder models are heavily used in NLP research, many +researchers do not pretrain their own BERTs from scratch due to the high cost +of training. In the past half-decade since BERT first rose to prominence, many +advances have been made with other transformer architectures and training +configurations that have yet to be systematically incorporated into BERT. Here, +we introduce MosaicBERT, a BERT-style encoder architecture and training recipe +that is empirically optimized for fast pretraining. This efficient architecture +incorporates FlashAttention, Attention with Linear Biases (ALiBi), Gated Linear +Units (GLU), a module to dynamically remove padded tokens, and low precision +LayerNorm into the classic transformer encoder block. The training recipe +includes a 30% masking ratio for the Masked Language Modeling (MLM) objective, +bfloat16 precision, and a vocabulary size optimized for GPU throughput, in +addition to best practices from RoBERTa and other encoder models. When +pretrained from scratch on the C4 dataset, this base model achieves a +downstream average GLUE (dev) score of 79.6 in 1.13 hours on 8 A100 80 GB GPUs +at a cost of roughly $20. We plot extensive accuracy vs. pretraining speed +Pareto curves and show that MosaicBERT base and large are consistently Pareto +optimal when compared to a competitive BERT base and large. This empirical +speed-up in pretraining enables researchers and engineers to pretrain custom +BERT-style models at low cost instead of finetuning existing generic models. +We open-source our model weights and code. + +
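As one concrete ingredient, ALiBi replaces learned position embeddings with a fixed distance-proportional bias added to the attention scores; a small illustration (symmetric form for a bidirectional encoder, slopes shown for a power-of-two number of heads):

```python
import torch

def alibi_bias(n_heads: int, seq_len: int) -> torch.Tensor:
    """Return a (n_heads, seq_len, seq_len) bias to add to the attention logits."""
    slopes = torch.tensor([2.0 ** (-8.0 * (h + 1) / n_heads) for h in range(n_heads)])
    pos = torch.arange(seq_len)
    dist = (pos[None, :] - pos[:, None]).abs()   # |i - j|
    return -slopes[:, None, None] * dist         # farther tokens receive a larger penalty
```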
+
+ comment: 10 pages, 4 figures in main text. 25 pages total +
+
+
+
+
+ + ♻ ☆ Motion-Based Sign Language Video Summarization using Curvature and + Torsion + + +
+ An interesting problem in many video-based applications is the generation of +short synopses by selecting the most informative frames, a procedure which is +known as video summarization. For sign language videos the benefits of using +the $t$-parameterized counterpart of the curvature of the 2-D signer's wrist +trajectory to identify keyframes, have been recently reported in the +literature. In this paper we extend these ideas by modeling the 3-D hand motion +that is extracted from each frame of the video. To this end we propose a new +informative function based on the $t$-parameterized curvature and torsion of +the 3-D trajectory. The method to characterize video frames as keyframes +depends on whether the motion occurs in 2-D or 3-D space. Specifically, in the +case of 3-D motion we look for the maxima of the harmonic mean of the curvature +and torsion of the target's trajectory; in the planar motion case we seek for +the maxima of the trajectory's curvature. The proposed 3-D feature is +experimentally evaluated in applications of sign language videos on (1) +objective measures using ground-truth keyframe annotations, (2) human-based +evaluation of understanding, and (3) gloss classification and the results +obtained are promising. + +
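A numerical sketch of the 3-D criterion (smoothing, parameterization details, and boundary handling are omitted; the exact informative function used in the paper may differ):

```python
import numpy as np

def keyframes_3d(traj, eps=1e-8):
    """traj: (N, 3) hand trajectory. Returns indices of local maxima of the
    harmonic mean of curvature kappa = |r' x r''| / |r'|^3 and torsion
    tau = (r' x r'') . r''' / |r' x r''|^2 (absolute value taken here)."""
    d1 = np.gradient(traj, axis=0)
    d2 = np.gradient(d1, axis=0)
    d3 = np.gradient(d2, axis=0)
    cross = np.cross(d1, d2)
    cross_norm = np.linalg.norm(cross, axis=1)
    kappa = cross_norm / (np.linalg.norm(d1, axis=1) ** 3 + eps)
    tau = np.abs(np.einsum("ij,ij->i", cross, d3)) / (cross_norm ** 2 + eps)
    score = 2.0 * kappa * tau / (kappa + tau + eps)      # harmonic mean of curvature and torsion
    is_max = (score[1:-1] > score[:-2]) & (score[1:-1] > score[2:])
    return np.where(is_max)[0] + 1                       # keyframe indices
```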
+
+ comment: This work is under consideration at Pattern Recognition Letters for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Unsupervised Pretraining for Fact Verification by Language Model + Distillation ICLR 2024 + + +
+ Fact verification aims to verify a claim using evidence from a trustworthy +knowledge base. To address this challenge, algorithms must produce features for +every claim that are both semantically meaningful, and compact enough to find a +semantic alignment with the source information. In contrast to previous work, +which tackled the alignment problem by learning over annotated corpora of +claims and their corresponding labels, we propose SFAVEL (Self-supervised Fact +Verification via Language Model Distillation), a novel unsupervised pretraining +framework that leverages pre-trained language models to distil self-supervised +features into high-quality claim-fact alignments without the need for +annotations. This is enabled by a novel contrastive loss function that +encourages features to attain high-quality claim and evidence alignments whilst +preserving the semantic relationships across the corpora. Notably, we present +results that achieve a new state-of-the-art on FB15k-237 (+5.3% Hits@1) and +FEVER (+8% accuracy) with linear evaluation. + +
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ POMP: Probability-driven Meta-graph Prompter for LLMs in Low-resource + Unsupervised Neural Machine Translation + + +
+ Low-resource languages (LRLs) face challenges in supervised neural machine +translation due to limited parallel data, prompting research into unsupervised +methods. Unsupervised neural machine translation (UNMT) methods, including +back-translation, transfer learning, and pivot-based translation, offer +practical solutions for LRL translation, but they are hindered by issues like +synthetic data noise, language bias, and error propagation, which can +potentially be mitigated by Large Language Models (LLMs). LLMs have advanced +NMT with in-context learning (ICL) and supervised fine-tuning methods, but +insufficient training data results in poor performance in LRLs. We argue that +LLMs can mitigate the linguistic noise with auxiliary languages to improve +translations in LRLs. In this paper, we propose Probability-driven Meta-graph +Prompter (POMP), a novel approach employing a dynamic, sampling-based graph of +multiple auxiliary languages to enhance LLMs' translation capabilities for +LRLs. POMP involves constructing a directed acyclic meta-graph for each source +language, from which we dynamically sample multiple paths to prompt LLMs to +mitigate the linguistic noise and improve translations during training. We use +the BLEURT metric to evaluate the translations and back-propagate rewards, +estimated by scores, to update the probabilities of auxiliary languages in the +paths. Our experiments show significant improvements in the translation quality +of three LRLs, demonstrating the effectiveness of our approach. + +
+
+
+
+
+ + ♻ ☆ Beyond Accuracy: Evaluating Self-Consistency of Code Large Language + Models with IdentityChain ICLR 2024 + + +
+ Code Large Language Models (Code LLMs) are being increasingly employed in +real-life applications, so evaluating them is critical. While the conventional +accuracy evaluates the performance of Code LLMs on a set of individual tasks, +their self-consistency across different tasks is overlooked. Intuitively, a +trustworthy model should be self-consistent when generating natural language +specifications for its own code and generating code for its own specifications. +Failure to preserve self-consistency reveals a lack of understanding of the +shared semantics underlying natural language and programming language, and +therefore undermines the trustworthiness of a model. In this paper, we first +formally define the self-consistency of Code LLMs and then design a framework, +IdentityChain, which effectively and efficiently evaluates the self-consistency +and conventional accuracy of a model at the same time. We study eleven Code +LLMs and show that they fail to preserve self-consistency, which is indeed a +distinct aspect from conventional accuracy. Furthermore, we show that +IdentityChain can be used as a model debugging tool to expose weaknesses of +Code LLMs by demonstrating three major weaknesses that we identify in current +models using IdentityChain. Our code is available at +https://github.com/marcusm117/IdentityChain. + +
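Schematically, the self-consistency chain can be pictured as the following loop (a simplification; `nl_from_code`, `code_from_nl`, and `behaves_like` stand in for the model calls and the semantic-equivalence check used by the framework):

```python
def self_consistent(seed_code, nl_from_code, code_from_nl, behaves_like, chain_length=3):
    """Alternate code -> specification -> code and check that behaviour is preserved."""
    code = seed_code
    for _ in range(chain_length):
        spec = nl_from_code(code)          # model writes a natural-language spec for its code
        code = code_from_nl(spec)          # model re-implements from its own spec
        if not behaves_like(seed_code, code):
            return False                   # semantics drifted: the model is self-inconsistent
    return True
```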
+
+ comment: ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Glitter or Gold? Deriving Structured Insights from Sustainability + Reports via Large Language Models + + +
+ Over the last decade, several regulatory bodies have started requiring the +disclosure of non-financial information from publicly listed companies, in +light of the investors' increasing attention to Environmental, Social, and +Governance (ESG) issues. Publicly released information on sustainability +practices is often disclosed in diverse, unstructured, and multi-modal +documentation. This poses a challenge in efficiently gathering and aligning the +data into a unified framework to derive insights related to Corporate Social +Responsibility (CSR). Thus, using Information Extraction (IE) methods becomes +an intuitive choice for delivering insightful and actionable data to +stakeholders. In this study, we employ Large Language Models (LLMs), In-Context +Learning, and the Retrieval-Augmented Generation (RAG) paradigm to extract +structured insights related to ESG aspects from companies' sustainability +reports. We then leverage graph-based representations to conduct statistical +analyses concerning the extracted insights. These analyses revealed that ESG +criteria cover a wide range of topics, exceeding 500, often beyond those +considered in existing categorizations, and are addressed by companies through +a variety of initiatives. Moreover, disclosure similarities emerged among +companies from the same region or sector, validating ongoing hypotheses in the +ESG literature. Lastly, by incorporating additional company attributes into our +analyses, we investigated which factors impact the most on companies' ESG +ratings, showing that ESG disclosure affects the obtained ratings more than +other financial or company data. + +
+
+
+
+
+ + ♻ ☆ Towards General-Purpose Text-Instruction-Guided Voice Conversion + + +
+ This paper introduces a novel voice conversion (VC) model, guided by text +instructions such as "articulate slowly with a deep tone" or "speak in a +cheerful boyish voice". Unlike traditional methods that rely on reference +utterances to determine the attributes of the converted speech, our model adds +versatility and specificity to voice conversion. The proposed VC model is a +neural codec language model which processes a sequence of discrete codes, +resulting in the code sequence of converted speech. It utilizes text +instructions as style prompts to modify the prosody and emotional information +of the given speech. In contrast to previous approaches, which often rely on +employing separate encoders like prosody and content encoders to handle +different aspects of the source speech, our model handles various information +of speech in an end-to-end manner. Experiments have demonstrated the impressive +capabilities of our model in comprehending instructions and delivering +reasonable results. + +
+
+ comment: Accepted to ASRU 2023 +
+
+
+
+
+ + ♻ ☆ Inducing Meaningful Units from Character Sequences with Dynamic Capacity + Slot Attention + + +
+ Characters do not convey meaning, but sequences of characters do. We propose +an unsupervised distributional method to learn the abstract meaningful units in +a sequence of characters. Rather than segmenting the sequence, our Dynamic +Capacity Slot Attention model discovers continuous representations of the +objects in the sequence, extending an architecture for object discovery in +images. We train our model on different languages and evaluate the quality of +the obtained representations with forward and reverse probing classifiers. +These experiments show that our model succeeds in discovering units which are +similar to those proposed previously in form, content and level of abstraction, +and which show promise for capturing meaningful information at a higher level +of abstraction. + +
+
+ comment: Accepted to TMLR 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 106 + +
+
+
+ + ☆ SAMF: Small-Area-Aware Multi-focus Image Fusion for Object Detection ICASSP + + +
+ Existing multi-focus image fusion (MFIF) methods often fail to preserve the +uncertain transition region and detect small focus areas within large defocused +regions accurately. To address this issue, this study proposes a new +small-area-aware MFIF algorithm for enhancing object detection capability. +First, we enhance the pixel attributes within the small focus and boundary +regions, which are subsequently combined with visual saliency detection to +obtain the pre-fusion results used to discriminate the distribution of focused +pixels. To accurately ensure pixel focus, we consider the source image as a +combination of focused, defocused, and uncertain regions and propose a +three-region segmentation strategy. Finally, we design an effective pixel +selection rule to generate segmentation decision maps and obtain the final +fusion results. Experiments demonstrated that the proposed method can +accurately detect small and smooth focus areas while improving object detection +performance, outperforming existing methods in both subjective and objective +evaluations. The source code is available at https://github.com/ixilai/SAMF. + +
+
+ comment: Accepted to International Conference on Acoustics, Speech and Signal + Processing (ICASSP) 2024 +
+
+
+
+
+ + ☆ Multi-view Distillation based on Multi-modal Fusion for Few-shot Action + Recognition(CLIP-$\mathrm{M^2}$DF) + + +
+ In recent years, few-shot action recognition has attracted increasing +attention. It generally adopts the paradigm of meta-learning. In this field, +overcoming the overlapping distribution of classes and outliers is still a +challenging problem based on limited samples. We believe the combination of +Multi-modal and Multi-view can improve this issue depending on information +complementarity. Therefore, we propose a method of Multi-view Distillation +based on Multi-modal Fusion. Firstly, a Probability Prompt Selector for the +query is constructed to generate probability prompt embedding based on the +comparison score between the prompt embeddings of the support and the visual +embedding of the query. Secondly, we establish a Multi-view. In each view, we +fuse the prompt embedding as consistent information with visual and the global +or local temporal context to overcome the overlapping distribution of classes +and outliers. Thirdly, we perform the distance fusion for the Multi-view and +the mutual distillation of matching ability from one to another, enabling the +model to be more robust to the distribution bias. Our code is available at the +URL: \url{https://github.com/cofly2014/MDMF}. + +
+
+
+
+
+ + ☆ Generative Denoise Distillation: Simple Stochastic Noises Induce + Efficient Knowledge Transfer for Dense Prediction + + +
+ Knowledge distillation is the process of transferring knowledge from a more +powerful large model (teacher) to a simpler counterpart (student). Numerous +current approaches involve the student imitating the knowledge of the teacher +directly. However, redundancy still exists in the learned representations +through these prevalent methods, which tend to learn each spatial location's +features indiscriminately. To derive a more compact representation (concept +feature) from the teacher, inspired by human cognition, we suggest an +innovative method, termed Generative Denoise Distillation (GDD), where +stochastic noises are added to the concept feature of the student to embed them +into the generated instance feature from a shallow network. Then, the generated +instance feature is aligned with the knowledge of the instance from the +teacher. We extensively experiment with object detection, instance +segmentation, and semantic segmentation to demonstrate the versatility and +effectiveness of our method. Notably, GDD achieves new state-of-the-art +performance in the tasks mentioned above. We have achieved substantial +improvements in semantic segmentation by enhancing PspNet and DeepLabV3, both +of which are based on ResNet-18, resulting in mIoU scores of 74.67 and 77.69, +respectively, surpassing their previous scores of 69.85 and 73.20 on the +Cityscapes dataset of 20 categories. The source code of GDD is available at +https://github.com/ZhgLiu/GDD. + +
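A rough sketch of the distillation objective as described (the generator depth, noise scale, and use of an MSE alignment loss are assumptions):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GenerativeDenoiseDistill(nn.Module):
    """Inject noise into the student's compact concept feature, regenerate an
    instance-level feature with a shallow network, and align it to the teacher."""
    def __init__(self, channels: int, noise_std: float = 0.1):
        super().__init__()
        self.noise_std = noise_std
        self.generator = nn.Sequential(        # shallow generation network
            nn.Conv2d(channels, channels, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels, channels, 3, padding=1),
        )

    def forward(self, student_feat, teacher_feat):
        noisy = student_feat + torch.randn_like(student_feat) * self.noise_std
        generated = self.generator(noisy)
        return F.mse_loss(generated, teacher_feat.detach())
```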
+
+
+
+
+ + ☆ Un-Mixing Test-Time Normalization Statistics: Combatting Label Temporal + Correlation + + +
+ In an era where test-time adaptation methods increasingly rely on the nuanced +manipulation of batch normalization (BN) parameters, one critical assumption +often goes overlooked: that of independently and identically distributed +(i.i.d.) test batches with respect to unknown labels. This assumption +culminates in biased estimates of BN statistics and jeopardizes system +stability under non-i.i.d. conditions. This paper pioneers a departure from the +i.i.d. paradigm by introducing a groundbreaking strategy termed "Un-Mixing +Test-Time Normalization Statistics" (UnMix-TNS). UnMix-TNS re-calibrates the +instance-wise statistics used to normalize each instance in a batch by mixing +it with multiple unmixed statistics components, thus inherently simulating the +i.i.d. environment. The key lies in our innovative online unmixing procedure, +which persistently refines these statistics components by drawing upon the +closest instances from an incoming test batch. Remarkably generic in its +design, UnMix-TNS seamlessly integrates with an array of state-of-the-art +test-time adaptation methods and pre-trained architectures equipped with BN +layers. Empirical evaluations corroborate the robustness of UnMix-TNS under +varied scenarios ranging from single to continual and mixed domain shifts. +UnMix-TNS stands out when handling test data streams with temporal correlation, +including those with corrupted real-world non-i.i.d. streams, sustaining its +efficacy even with minimal batch sizes and individual samples. Our results set +a new standard for test-time adaptation, demonstrating significant improvements +in both stability and performance across multiple benchmarks. + +
+
+
+
+
+ + ☆ The Faiss library + + +
+ Vector databases manage large collections of embedding vectors. As AI +applications are growing rapidly, so are the number of embeddings that need to +be stored and indexed. The Faiss library is dedicated to vector similarity +search, a core functionality of vector databases. Faiss is a toolkit of +indexing methods and related primitives used to search, cluster, compress and +transform vectors. This paper first describes the tradeoff space of vector +search, then the design principles of Faiss in terms of structure, approach to +optimization and interfacing. We benchmark key features of the library and +discuss a few selected applications to highlight its broad applicability. + +
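A minimal usage example of the core workflow (exact nearest-neighbour search with `IndexFlatL2`; Faiss also provides many approximate and compressed index types):

```python
import numpy as np
import faiss  # pip install faiss-cpu

d = 128                                            # embedding dimensionality
xb = np.random.rand(10_000, d).astype("float32")   # database vectors
xq = np.random.rand(5, d).astype("float32")        # query vectors

index = faiss.IndexFlatL2(d)                       # exact L2 index
index.add(xb)                                      # index the database
distances, ids = index.search(xq, 10)              # top-10 neighbours per query
print(ids.shape)                                   # (5, 10)
```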
+
+
+
+
+ + ☆ AesBench: An Expert Benchmark for Multimodal Large Language Models on + Image Aesthetics Perception + + +
+ With collective endeavors, multimodal large language models (MLLMs) are +undergoing a flourishing development. However, their performances on image +aesthetics perception remain indeterminate, which is highly desired in +real-world applications. An obvious obstacle lies in the absence of a specific +benchmark to evaluate the effectiveness of MLLMs on aesthetic perception. This +blind groping may impede the further development of more advanced MLLMs with +aesthetic perception capacity. To address this dilemma, we propose AesBench, an +expert benchmark aiming to comprehensively evaluate the aesthetic perception +capacities of MLLMs through elaborate design across dual facets. (1) We +construct an Expert-labeled Aesthetics Perception Database (EAPD), which +features diversified image contents and high-quality annotations provided by +professional aesthetic experts. (2) We propose a set of integrative criteria to +measure the aesthetic perception abilities of MLLMs from four perspectives, +including Perception (AesP), Empathy (AesE), Assessment (AesA) and +Interpretation (AesI). Extensive experimental results underscore that the +current MLLMs only possess rudimentary aesthetic perception ability, and there +is still a significant gap between MLLMs and humans. We hope this work can +inspire the community to engage in deeper explorations on the aesthetic +potentials of MLLMs. Source data will be available at +https://github.com/yipoh/AesBench. + +
+
+
+
+
+ + ☆ Modeling Spoof Noise by De-spoofing Diffusion and its Application in + Face Anti-spoofing + + +
+ Face anti-spoofing is crucial for ensuring the security and reliability of +face recognition systems. Several existing face anti-spoofing methods utilize +GAN-like networks to detect presentation attacks by estimating the noise +pattern of a spoof image and recovering the corresponding genuine image. +However, the limited face appearance space of GANs means that the denoised faces cannot cover +the full data distribution of genuine faces, thereby undermining the +generalization performance of such methods. In this work, we present a +pioneering attempt to employ diffusion models to denoise a spoof image and +restore the genuine image. The difference between these two images is +considered as the spoof noise, which can serve as a discriminative cue for face +anti-spoofing. We evaluate our proposed method on several intra-testing and +inter-testing protocols, where the experimental results showcase the +effectiveness of our method in achieving competitive performance in terms of +both accuracy and generalization. + +
+
+ comment: Accepted by IJCB2023 +
+
+
+
+
+ + ☆ Siamese Content-based Search Engine for a More Transparent Skin and + Breast Cancer Diagnosis through Histological Imaging + + +
+ Computer-Aided Diagnosis (CAD) has advanced digital pathology with Deep +Learning (DL)-based tools to assist pathologists in decision-making. +Content-Based Histopathological Image Retrieval (CBHIR) is a novel tool to seek +highly correlated patches in terms of similarity in histopathological features. +In this work, we propose two CBHIR approaches on breast (Breast-twins) and +skin cancer (Skin-twins) data sets for robust and accurate patch-level +retrieval, integrating a custom-built Siamese network as a feature extractor. +The proposed Siamese network is able to generalize to unseen images by +focusing on the similar histopathological features of the input pairs. The +proposed CBHIR approaches are evaluated on the Breast (public) and Skin +(private) data sets with top-K accuracy. Finding the optimal value of K is +challenging, and as K increases, the dissimilarity between the +query and the returned images also increases, which might mislead the pathologists. +To the best of our knowledge, this paper tackles this issue for the +first time on histopathological images by evaluating the top first retrieved +image. The Breast-twins model achieves an F1 score of 70% for the top first retrieved image, +which exceeds the other state-of-the-art methods at higher values of K such +as 5 and 400. The Skin-twins model outperforms the recently proposed Convolutional Auto +Encoder (CAE) by 67%, increasing the precision. Besides, the Skin-twins model +tackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential +(STUMP) to assist pathologists with retrieving the top-K images and their +corresponding labels. Thus, this approach can offer a more explainable CAD tool +to pathologists in terms of transparency, trustworthiness, and reliability, among +other characteristics. + +
+
+
+
+
+ + ☆ Multi-Technique Sequential Information Consistency For Dynamic Visual + Place Recognition In Changing Environments + + +
+ Visual place recognition (VPR) is an essential component of robot navigation +and localization systems that allows them to identify a place using only image +data. VPR is challenging due to the significant changes in a place's appearance +driven by different daily illumination, seasonal weather variations and diverse +viewpoints. Currently, no single VPR technique excels in every environmental +condition, each exhibiting unique benefits and shortcomings, and therefore +combining multiple techniques can achieve more reliable VPR performance. +Present multi-method approaches either rely on online ground-truth information, +which is often not available, or on brute-force technique combination, +potentially lowering performance with high variance technique sets. Addressing +these shortcomings, we propose a VPR system dubbed Multi-Sequential Information +Consistency (MuSIC) which leverages sequential information to select the most +cohesive technique on an online per-frame basis. For each technique in a set, +MuSIC computes their respective sequential consistencies by analysing the +frame-to-frame continuity of their top match candidates, which are then +directly compared to select the optimal technique for the current query image. +The use of sequential information to select between VPR methods results in an +overall VPR performance increase across different benchmark datasets, while +avoiding the need for extra ground-truth of the runtime environment. + +
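The per-frame selection rule can be illustrated roughly as follows (the concrete consistency score used by MuSIC may differ; here a candidate sequence is scored by how smoothly its top-match indices progress):

```python
import numpy as np

def sequential_consistency(top_matches, max_step=2):
    """Fraction of frame-to-frame transitions whose top-match index moves forward smoothly."""
    steps = np.diff(np.asarray(top_matches))
    return float(np.mean((steps >= 0) & (steps <= max_step)))

def select_technique(history):
    """history: {technique name: list of top-1 reference indices over recent query frames}."""
    scores = {name: sequential_consistency(m) for name, m in history.items()}
    return max(scores, key=scores.get)

history = {"technique_A": [40, 41, 42, 43], "technique_B": [40, 7, 88, 43]}
print(select_technique(history))   # -> "technique_A"
```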
+
+ comment: arXiv admin note: text overlap with arXiv:2303.14247 +
+
+
+
+
+ + ☆ Multitask Learning in Minimally Invasive Surgical Vision: A Review + + +
+ Minimally invasive surgery (MIS) has revolutionized many procedures and led +to reduced recovery time and risk of patient injury. However, MIS places +additional complexity and burden on surgical teams. Data-driven surgical vision +algorithms are thought to be key building blocks in the development of future +MIS systems with improved autonomy. Recent advancements in machine learning and +computer vision have led to successful applications in analyzing videos +obtained from MIS with the promise of alleviating challenges in MIS videos. +Surgical scene and action understanding encompasses multiple related tasks +that, when solved individually, can be memory-intensive, inefficient, and fail +to capture task relationships. Multitask learning (MTL), a learning paradigm +that leverages information from multiple related tasks to improve performance +and aid generalization, is well-suited for fine-grained and high-level +understanding of MIS data. This review provides an overview of the current +state-of-the-art MTL systems that leverage videos obtained from MIS. Beyond +listing published approaches, we discuss the benefits and limitations of these +MTL systems. Moreover, this manuscript presents an analysis of the literature +for various application fields of MTL in MIS, including those with large +models, highlighting notable trends, new directions of research, and +developments. + +
+
+
+
+
+ + ☆ Multi-scale 2D Temporal Map Diffusion Models for Natural Language Video + Localization + + +
+ Natural Language Video Localization (NLVL), grounding phrases from natural +language descriptions to corresponding video segments, is a complex yet +critical task in video understanding. Despite ongoing advancements, many +existing solutions lack the capability to globally capture temporal dynamics of +the video data. In this study, we present a novel approach to NLVL that aims to +address this issue. Our method involves the direct generation of a global 2D +temporal map via a conditional denoising diffusion process, based on the input +video and language query. The main challenges are the inherent sparsity and +discontinuity of a 2D temporal map in devising the diffusion decoder. To +address these challenges, we introduce a multi-scale technique and develop an +innovative diffusion decoder. Our approach effectively encapsulates the +interaction between the query and video data across various time scales. +Experiments on the Charades and DiDeMo datasets underscore the potency of our +design. + +
+
+
+
+
+ + ☆ Human vs. LMMs: Exploring the Discrepancy in Emoji Interpretation and + Usage in Digital Communication + + +
+ Leveraging Large Multimodal Models (LMMs) to simulate human behaviors when +processing multimodal information, especially in the context of social media, +has garnered immense interest due to its broad potential and far-reaching +implications. Emojis, as one of the most unique aspects of digital +communication, are pivotal in enriching and often clarifying the emotional and +tonal dimensions. Yet, there is a notable gap in understanding how these +advanced models, such as GPT-4V, interpret and employ emojis in the nuanced +context of online interaction. This study intends to bridge this gap by +examining the behavior of GPT-4V in replicating human-like use of emojis. The +findings reveal a discernible discrepancy between human and GPT-4V behaviors, +likely due to the subjective nature of human interpretation and the limitations +of GPT-4V's English-centric training, suggesting cultural biases and inadequate +representation of non-English cultures. + +
+
+
+
+
+ + ☆ ModelNet-O: A Large-Scale Synthetic Dataset for Occlusion-Aware Point + Cloud Classification + + +
+ Recently, 3D point cloud classification has made significant progress with +the help of many datasets. However, these datasets do not reflect the +incomplete nature of real-world point clouds caused by occlusion, which limits +the practical application of current methods. To bridge this gap, we propose +ModelNet-O, a large-scale synthetic dataset of 123,041 samples that emulate +real-world point clouds with self-occlusion caused by scanning from monocular +cameras. ModelNet-O is 10 times larger than existing datasets and offers more +challenging cases to evaluate the robustness of existing methods. Our +observation on ModelNet-O reveals that well-designed sparse structures can +preserve structural information of point clouds under occlusion, motivating us +to propose a robust point cloud processing method that leverages a critical +point sampling (CPS) strategy in a multi-level manner. We term our method +PointMLS. Through extensive experiments, we demonstrate that our PointMLS +achieves state-of-the-art results on ModelNet-O and competitive results on +regular datasets, and it is robust and effective. More experiments also +demonstrate the robustness and effectiveness of PointMLS. + +
+
+ comment: Project page: https://github.com/fanglaosi/PointMLS +
+
+
+
+
+ + ☆ Transcending the Limit of Local Window: Advanced Super-Resolution + Transformer with Adaptive Token Dictionary + + +
+ Single Image Super-Resolution is a classic computer vision problem that +involves estimating high-resolution (HR) images from low-resolution (LR) ones. +Although deep neural networks (DNNs), especially Transformers for +super-resolution, have seen significant advancements in recent years, +challenges still remain, particularly the limited receptive field caused by +window-based self-attention. To address these issues, we introduce a group of +auxiliary Adaptive Token Dictionaries to the SR Transformer and establish an ATD-SR +method. The introduced token dictionary could learn prior information from +training data and adapt the learned prior to the specific testing image through an +adaptive refinement step. The refinement strategy could not only provide global +information to all input tokens but also group image tokens into categories. +Based on category partitions, we further propose a category-based +self-attention mechanism designed to leverage distant but similar tokens for +enhancing input features. The experimental results show that our method +achieves the best performance on various single image super-resolution +benchmarks. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ End-to-End Optimized Image Compression with the Frequency-Oriented + Transform + + +
+ Image compression constitutes a significant challenge amidst the era of +information explosion. Recent studies employing deep learning methods have +demonstrated the superior performance of learning-based image compression +methods over traditional codecs. However, an inherent challenge associated with +these methods lies in their lack of interpretability. Following an analysis of +the varying degrees of compression degradation across different frequency +bands, we propose the end-to-end optimized image compression model facilitated +by the frequency-oriented transform. The proposed end-to-end image compression +model consists of four components: spatial sampling, frequency-oriented +transform, entropy estimation, and frequency-aware fusion. The +frequency-oriented transform separates the original image signal into distinct +frequency bands, aligning with the human-interpretable concept. Leveraging the +non-overlapping hypothesis, the model enables scalable coding through the +selective transmission of arbitrary frequency components. Extensive experiments +are conducted to demonstrate that our model outperforms all traditional codecs +including next-generation standard H.266/VVC on MS-SSIM metric. Moreover, +visual analysis tasks (i.e., object detection and semantic segmentation) are +conducted to verify the proposed compression method could preserve semantic +fidelity besides signal-level precision. + +
+
+ comment: 25 pages, accepted by MVAP +
+
+
+
+
+ + ☆ DPAFNet:Dual Path Attention Fusion Network for Single Image Deraining + + +
+ Rainy weather has a significant impact on the regular operation of +imaging systems. Based on this premise, image rain removal has always been a +popular branch of low-level vision tasks, especially for methods using deep neural +networks. However, most neural networks are single-branched, for example using only +convolutional neural networks or Transformers, which is unfavourable for the +multidimensional fusion of image features. In order to solve this problem, this +paper proposes a dual-branch attention fusion network. Firstly, a two-branch +network structure is proposed. Secondly, an attention fusion module is proposed +to selectively fuse the features extracted by the two branches rather than +simply adding them. Finally, complete ablation experiments and sufficient +comparison experiments prove the rationality and effectiveness of the proposed +method. + +
+
+
+
+
+ + ☆ Key-point Guided Deformable Image Manipulation Using Diffusion Model + + +
+ In this paper, we introduce a Key-point-guided Diffusion probabilistic Model +(KDM) that gains precise control over images by manipulating the object's +key-points. We propose a two-stage generative model incorporating an optical +flow map as an intermediate output. By doing so, a dense pixel-wise +understanding of the semantic relation between the image and sparse key points +is configured, leading to more realistic image generation. Additionally, the +integration of optical flow helps regulate the inter-frame variance of +sequential images, enabling authentic sequential image generation. The +KDM is evaluated on diverse key-point conditioned image synthesis tasks, +including facial image generation, human pose synthesis, and echocardiography +video prediction, demonstrating that the KDM produces more consistent and +photo-realistic images than state-of-the-art models. + +
+
+ comment: Code is released at + https://github.com/joseph9337/Key-point-Guided-Deformable-Image-Manipulation-Using-Diffusion-Mode +
+
+
+
+
+ + ☆ Completely Occluded and Dense Object Instance Segmentation Using Box + Prompt-Based Segmentation Foundation Models + + +
+ Completely occluded and dense object instance segmentation (IS) is an +important and challenging task. Although current amodal IS methods can predict +invisible regions of occluded objects, they are difficult to directly predict +completely occluded objects. For dense object IS, existing box-based methods +are overly dependent on the performance of bounding box detection. In this +paper, we propose CFNet, a coarse-to-fine IS framework for completely occluded +and dense objects, which is based on box prompt-based segmentation foundation +models (BSMs). Specifically, CFNet first detects oriented bounding boxes (OBBs) +to distinguish instances and provide coarse localization information. Then, it +predicts OBB prompt-related masks for fine segmentation. To predict completely +occluded object instances, CFNet performs IS on occluders and utilizes prior +geometric properties, which overcomes the difficulty of directly predicting +completely occluded object instances. Furthermore, based on BSMs, CFNet reduces +the dependence on bounding box detection performance, improving dense object IS +performance. Moreover, we propose a novel OBB prompt encoder for BSMs. To make +CFNet more lightweight, we perform knowledge distillation on it and introduce a +Gaussian smoothing method for teacher targets. Experimental results demonstrate +that CFNet achieves the best performance on both industrial and publicly +available datasets. + +
+
+
+
+
+ + ☆ Deep Linear Array Pushbroom Image Restoration: A Degradation Pipeline + and Jitter-Aware Restoration Network AAAI 2024 + + +
+ Linear Array Pushbroom (LAP) imaging technology is widely used in the realm +of remote sensing. However, images acquired through LAP always suffer from +distortion and blur because of camera jitter. Traditional methods for restoring +LAP images, such as algorithms estimating the point spread function (PSF), +exhibit limited performance. To tackle this issue, we propose a Jitter-Aware +Restoration Network (JARNet), to remove the distortion and blur in two stages. +In the first stage, we formulate an Optical Flow Correction (OFC) block to +refine the optical flow of the degraded LAP images, resulting in pre-corrected +images where most of the distortions are alleviated. In the second stage, for +further enhancement of the pre-corrected images, we integrate two jitter-aware +techniques within the Spatial and Frequency Residual (SFRes) block: 1) +introducing Coordinate Attention (CoA) to the SFRes block in order to capture +the jitter state in orthogonal direction; 2) manipulating image features in +both spatial and frequency domains to leverage local and global priors. +Additionally, we develop a data synthesis pipeline, which applies Continue +Dynamic Shooting Model (CDSM) to simulate realistic degradation in LAP images. +Both the proposed JARNet and LAP image synthesis pipeline establish a +foundation for addressing this intricate challenge. Extensive experiments +demonstrate that the proposed two-stage method outperforms state-of-the-art +image restoration models. Code is available at +https://github.com/JHW2000/JARNet. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Learned Image Compression with ROI-Weighted Distortion and Bit + Allocation + + +
+ This one-page paper describes our method for the image compression track. +To achieve better perceptual quality, we use an adversarial loss to generate +realistic textures and a region of interest (ROI) mask to guide the bit +allocation for different regions. Our team name is TLIC. + +
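The report gives no formula, so this is only a minimal sketch, assuming one common way to emphasize ROI regions in a rate-distortion objective: up-weighting the reconstruction error inside the masked region (the weight value and tensor shapes are hypothetical).

```python
import torch

def roi_weighted_mse(x_hat: torch.Tensor, x: torch.Tensor,
                     roi_mask: torch.Tensor, roi_weight: float = 4.0) -> torch.Tensor:
    """MSE distortion where pixels inside the ROI mask count `roi_weight` times more."""
    # roi_mask is (N, 1, H, W) with values in {0, 1}; broadcast it over channels.
    weights = (1.0 + (roi_weight - 1.0) * roi_mask).expand_as(x)
    return (weights * (x_hat - x) ** 2).sum() / weights.sum()

x, x_hat = torch.rand(1, 3, 64, 64), torch.rand(1, 3, 64, 64)
mask = (torch.rand(1, 1, 64, 64) > 0.8).float()   # hypothetical ROI mask
loss = roi_weighted_mse(x_hat, x, mask)
print(float(loss))
```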
+
+ comment: Technical report +
+
+
+
+
+ + ☆ ProvNeRF: Modeling per Point Provenance in NeRFs as a Stochastic Process + + +
+ Neural radiance fields (NeRFs) have gained popularity across various +applications. However, they face challenges in the sparse view setting, lacking +sufficient constraints from volume rendering. Reconstructing and understanding +a 3D scene from sparse and unconstrained cameras is a long-standing problem in +classical computer vision with diverse applications. While recent works have +explored NeRFs in sparse, unconstrained view scenarios, their focus has been +primarily on enhancing reconstruction and novel view synthesis. Our approach +takes a broader perspective by posing the question: "from where has each point +been seen?" -- which gates how well we can understand and reconstruct it. In +other words, we aim to determine the origin or provenance of each 3D point and +its associated information under sparse, unconstrained views. We introduce +ProvNeRF, a model that enriches a traditional NeRF representation by +incorporating per-point provenance, modeling likely source locations for each +point. We achieve this by extending implicit maximum likelihood estimation +(IMLE) for stochastic processes. Notably, our method is compatible with any +pre-trained NeRF model and the associated training camera poses. We demonstrate +that modeling per-point provenance offers several advantages, including +uncertainty estimation, criteria-based view selection, and improved novel view +synthesis, compared to state-of-the-art methods. + +
+
+
+
+
+ + ☆ The Devil is in the Details: Boosting Guided Depth Super-Resolution via + Rethinking Cross-Modal Alignment and Aggregation + + +
+ Guided depth super-resolution (GDSR) involves restoring missing depth details +using the high-resolution RGB image of the same scene. Previous approaches have +struggled with the heterogeneity and complementarity of the multi-modal inputs, +and neglected the issues of modal misalignment, geometrical misalignment, and +feature selection. In this study, we rethink some essential components in GDSR +networks and propose a simple yet effective Dynamic Dual Alignment and +Aggregation network (D2A2). D2A2 mainly consists of 1) a dynamic dual alignment +module that adapts to alleviate the modal misalignment via a learnable domain +alignment block and geometrically align cross-modal features by learning the +offset; and 2) a mask-to-pixel feature aggregate module that uses the gated +mechanism and pixel attention to filter out irrelevant texture noise from RGB +features and combine the useful features with depth features. By combining the +strengths of RGB and depth features while minimizing disturbance introduced by +the RGB image, our method with simple reuse and redesign of basic components +achieves state-of-the-art performance on multiple benchmark datasets. The code +is available at https://github.com/JiangXinni/D2A2. + +
+
+
+
+
+ + ☆ E2HQV: High-Quality Video Generation from Event Camera via + Theory-Inspired Model-Aided Deep Learning AAAI2024 + + +
+ The bio-inspired event cameras or dynamic vision sensors are capable of +asynchronously capturing per-pixel brightness changes (called event-streams) in +high temporal resolution and high dynamic range. However, the non-structural +spatial-temporal event-streams make it challenging to provide intuitive +visualization with rich semantic information for human vision. This calls for +events-to-video (E2V) solutions which take event-streams as input and generate +high-quality video frames for intuitive visualization. However, current +solutions are predominantly data-driven without considering the prior knowledge +of the underlying statistics relating event-streams and video frames. They rely +heavily on the non-linearity and generalization capability of deep neural +networks and thus struggle to reconstruct detailed textures when the +scenes are complex. In this work, we propose E2HQV, a novel E2V +paradigm designed to produce high-quality video frames from events. This +approach leverages a model-aided deep learning framework, underpinned by a +theory-inspired E2V model, which is meticulously derived from the fundamental +imaging principles of event cameras. To deal with the issue of state-reset in +the recurrent components of E2HQV, we also design a temporal shift embedding +module to further improve the quality of the video frames. Comprehensive +evaluations on real-world event camera datasets validate our approach, with +E2HQV notably outperforming state-of-the-art approaches, e.g., surpassing the +second best by over 40% for some evaluation metrics. + +
+
+ comment: Accepted in AAAI2024 +
+
+
+
+
+ + ☆ No-Clean-Reference Image Super-Resolution: Application to Electron + Microscopy + + +
+ The inability to acquire clean high-resolution (HR) electron microscopy (EM) +images over a large brain tissue volume hampers many neuroscience studies. To +address this challenge, we propose a deep-learning-based image super-resolution +(SR) approach to computationally reconstruct clean HR 3D-EM with a large field +of view (FoV) from noisy low-resolution (LR) acquisition. Our contributions are +I) Investigating training with no-clean references for $\ell_2$ and $\ell_1$ +loss functions; II) Introducing a novel network architecture, named EMSR, for +enhancing the resolution of LR EM images while reducing inherent noise; and, +III) Comparing different training strategies including using acquired LR and HR +image pairs, i.e., real pairs with no-clean references contaminated with real +corruptions, the pairs of synthetic LR and acquired HR, as well as acquired LR +and denoised HR pairs. Experiments with nine brain datasets showed that +training with real pairs can produce high-quality super-resolved results, +demonstrating the feasibility of training with non-clean references for both +loss functions. Additionally, comparable results were observed, both visually +and numerically, when employing denoised and noisy references for training. +Moreover, utilizing the network trained with synthetically generated LR images +from HR counterparts proved effective in yielding satisfactory SR results, even +in certain cases, outperforming training with real pairs. The proposed SR +network was compared quantitatively and qualitatively with several established +SR techniques, showcasing either the superiority or competitiveness of the +proposed method in mitigating noise while recovering fine details. + +
+
+ comment: 14 pages, 12 figures, and 2 tables +
+
+
+
+
+ + ☆ Mobile Contactless Palmprint Recognition: Use of Multiscale, Multimodel + Embeddings + + +
+ Contactless palmprints are comprised of both global and local discriminative +features. Most prior work focuses on extracting global features or local +features alone for palmprint matching, whereas this research introduces a novel +framework that combines global and local features for enhanced palmprint +matching accuracy. Leveraging recent advancements in deep learning, this study +integrates a vision transformer (ViT) and a convolutional neural network (CNN) +to extract complementary local and global features. Next, a mobile-based, +end-to-end palmprint recognition system is developed, referred to as Palm-ID. +On top of the ViT and CNN features, Palm-ID incorporates a palmprint +enhancement module and efficient dimensionality reduction (for faster +matching). Palm-ID balances the trade-off between accuracy and latency, +requiring just 18ms to extract a template of size 516 bytes, which can be +efficiently searched against a 10,000 palmprint gallery in 0.33ms on an AMD +EPYC 7543 32-Core CPU utilizing 128-threads. Cross-database matching protocols +and evaluations on large-scale operational datasets demonstrate the robustness +of the proposed method, achieving a TAR of 98.06% at FAR=0.01% on a newly +collected, time-separated dataset. To show a practical deployment of the +end-to-end system, the entire recognition pipeline is embedded within a mobile +device for enhanced user privacy and security. + +
+
+
+
+
+ + ☆ Deep Shape-Texture Statistics for Completely Blind Image Quality + Evaluation + + +
+ Opinion-Unaware Blind Image Quality Assessment (OU-BIQA) models aim to +predict image quality without training on reference images and subjective +quality scores. Among these, image statistical comparison is a classic paradigm, +but its performance is limited by the representation ability of visual +descriptors. Deep features as visual descriptors have advanced IQA in recent +research, but they have been found to be highly texture-biased and to lack +shape bias. On this basis, we find that image shape and texture cues +respond differently to distortions, and the absence of either one results +in an incomplete image representation. Therefore, to formulate a well-rounded +statistical description for images, we utilize the shape-biased and +texture-biased deep features produced by Deep Neural Networks (DNNs) +simultaneously. More specifically, we design a Shape-Texture Adaptive Fusion +(STAF) module to merge shape and texture information, based on which we +formulate quality-relevant image statistics. The perceptual quality is +quantified by the variant Mahalanobis Distance between the inner and outer +Shape-Texture Statistics (DSTS), wherein the inner and outer statistics +respectively describe the quality fingerprints of the distorted image and +natural images. The proposed DSTS delicately utilizes shape-texture statistical +relations between different data scales in the deep domain, and achieves +state-of-the-art (SOTA) quality prediction performance on images with +artificial and authentic distortions. + +
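The exact "variant" Mahalanobis distance used by DSTS is not specified in the abstract; the sketch below only shows the standard Mahalanobis distance between one image's statistic vector and a Gaussian fit of natural-image statistics, using random stand-ins for the real deep features.

```python
import numpy as np

def mahalanobis(x: np.ndarray, mean: np.ndarray, cov: np.ndarray) -> float:
    """Mahalanobis distance of a feature vector x from a reference Gaussian fit."""
    diff = x - mean
    cov_inv = np.linalg.pinv(cov)          # pseudo-inverse for numerical safety
    return float(np.sqrt(diff @ cov_inv @ diff))

# Hypothetical "outer" statistics fitted on natural images (random stand-ins).
rng = np.random.default_rng(0)
natural_stats = rng.normal(size=(500, 32))     # 500 images x 32-dim statistic vectors
mu, sigma = natural_stats.mean(axis=0), np.cov(natural_stats, rowvar=False)

# "Inner" statistics of one distorted image; larger distance -> lower predicted quality.
distorted_stats = rng.normal(loc=0.5, size=32)
print(mahalanobis(distorted_stats, mu, sigma))
```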
+
+
+
+
+ + ☆ Hardware Acceleration for Real-Time Wildfire Detection Onboard Drone + Networks + + +
+ Early wildfire detection in remote and forest areas is crucial for minimizing +devastation and preserving ecosystems. Autonomous drones offer agile access to +remote, challenging terrains, equipped with advanced imaging technology that +delivers both high-temporal and detailed spatial resolution, making them +valuable assets in the early detection and monitoring of wildfires. However, +the limited computation and battery resources of Unmanned Aerial Vehicles +(UAVs) pose significant challenges in implementing robust and efficient image +classification models. Current works in this domain often operate offline, +emphasizing the need for solutions that can perform inference in real time, +given the constraints of UAVs. To address these challenges, this paper aims to +develop a real-time image classification and fire segmentation model. It +presents a comprehensive investigation into hardware acceleration using the +Jetson Nano P3450 and the implications of TensorRT, NVIDIA's high-performance +deep-learning inference library, on fire classification accuracy and speed. The +study includes implementations of Quantization Aware Training (QAT), Automatic +Mixed Precision (AMP), and post-training mechanisms, comparing them against the +latest baselines for fire segmentation and classification. All experiments +utilize the FLAME dataset - an image dataset collected by low-altitude drones +during a prescribed forest fire. This work contributes to the ongoing efforts +to enable real-time, on-board wildfire detection capabilities for UAVs, +addressing speed and the computational and energy constraints of these crucial +monitoring systems. The results show a 13% increase in classification speed +compared to similar models without hardware optimization. Comparatively, loss +and accuracy are within 1.225% of the original values. + +
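As a reference for the Automatic Mixed Precision (AMP) mechanism mentioned above, here is the standard `torch.cuda.amp` training-step pattern; the tiny model, batch, and labels are placeholders rather than the paper's fire classifier, and a CUDA device is assumed.

```python
import torch
from torch import nn
from torch.cuda.amp import GradScaler, autocast

model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                      nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(16, 2)).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = GradScaler()
criterion = nn.CrossEntropyLoss()

images = torch.randn(8, 3, 224, 224, device="cuda")   # stand-in for an image batch
labels = torch.randint(0, 2, (8,), device="cuda")     # fire / no-fire

optimizer.zero_grad()
with autocast():                      # run the forward pass in mixed precision
    loss = criterion(model(images), labels)
scaler.scale(loss).backward()         # scale the loss to avoid fp16 underflow
scaler.step(optimizer)
scaler.update()
```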
+
+ comment: 6 pages, 7 figures, NETROBOTICS conference submission +
+
+
+
+
+ + ☆ KTVIC: A Vietnamese Image Captioning Dataset on the Life Domain + + +
+ Image captioning is a crucial task with applications in a wide range of +domains, including healthcare and education. Despite extensive research on +English image captioning datasets, the availability of such datasets for +Vietnamese remains limited, with only two existing datasets. In this study, we +introduce KTVIC, a comprehensive Vietnamese Image Captioning dataset focused on +the life domain, covering a wide range of daily activities. This dataset +comprises 4,327 images and 21,635 Vietnamese captions, serving as a valuable +resource for advancing image captioning in the Vietnamese language. We conduct +experiments using various deep neural networks as the baselines on our dataset, +evaluating them using the standard image captioning metrics, including BLEU, +METEOR, CIDEr, and ROUGE. Our findings underscore the effectiveness of the +proposed dataset and its potential contributions to the field of image +captioning in the Vietnamese context. + +
+
+
+
+
+ + ☆ Inpainting Normal Maps for Lightstage data + + +
+ This study introduces a novel method for inpainting normal maps using a +generative adversarial network (GAN). Normal maps, often derived from a +lightstage, are crucial in performance capture but can have obscured areas due +to movement (e.g., by arms, hair, or props). Inpainting fills these missing +areas with plausible data. Our approach extends previous general image +inpainting techniques, employing a bow tie-like generator network and a +discriminator network, with alternating training phases. The generator aims to +synthesize images aligning with the ground truth and deceive the discriminator, +which differentiates between real and processed images. Periodically, the +discriminator undergoes retraining to enhance its ability to identify processed +images. Importantly, our method adapts to the unique characteristics of normal +map data, necessitating modifications to the loss function. We utilize a cosine +loss instead of mean squared error loss for generator training. Limited +training data availability, even with synthetic datasets, demands significant +augmentation, considering the specific nature of the input data. This includes +appropriate image flipping and in-plane rotations to accurately alter normal +vectors. Throughout training, we monitored key metrics such as average loss, +Structural Similarity Index Measure (SSIM), and Peak Signal-to-Noise Ratio +(PSNR) for the generator, along with average loss and accuracy for the +discriminator. Our findings suggest that the proposed model effectively +generates high-quality, realistic inpainted normal maps, suitable for +performance capture applications. These results establish a foundation for +future research, potentially involving more advanced networks and comparisons +with inpainting of source images used to create the normal maps. + +
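The paper's loss is described only as "a cosine loss instead of mean squared error loss"; a minimal sketch of a per-pixel cosine loss for normal maps might look like this (the tensor layout is an assumption).

```python
import torch
import torch.nn.functional as F

def cosine_normal_loss(pred: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
    """1 - cos(angle) between predicted and ground-truth normals, averaged per pixel.

    Both tensors are (N, 3, H, W) normal maps; the loss depends only on the
    direction of each vector, which is what matters for surface normals.
    """
    cos = F.cosine_similarity(pred, target, dim=1, eps=1e-8)   # (N, H, W)
    return (1.0 - cos).mean()

pred = F.normalize(torch.randn(2, 3, 64, 64), dim=1)
loss = cosine_normal_loss(pred, pred)          # identical maps -> loss ~ 0
print(float(loss))
```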
+
+ comment: 8 pages, 4 figures, CGVC Conference, The Eurographics Association +
+
+
+
+
+ + ☆ Spatial-Semantic Collaborative Cropping for User Generated Content + + +
+ A large amount of User Generated Content (UGC) is uploaded to the Internet +daily and displayed to people worldwide through the client side (e.g., +mobile and PC). This requires cropping algorithms to produce an aesthetic +thumbnail within a specific aspect ratio on different devices. However, +existing image cropping works mainly focus on landmark or landscape images, +which fail to model the relations among multiple objects with complex +backgrounds in UGC. Besides, previous methods merely consider the aesthetics of +the cropped images while ignoring the content integrity, which is crucial for +UGC cropping. In this paper, we propose a Spatial-Semantic Collaborative +cropping network (S2CNet) for arbitrary user generated content accompanied by a +new cropping benchmark. Specifically, we first mine the visual genes of the +potential objects. Then, the suggested adaptive attention graph recasts this +task as a procedure of information association over visual nodes. The +underlying spatial and semantic relations are ultimately centralized to the +crop candidate through differentiable message passing, which helps our network +efficiently preserve both the aesthetics and the content integrity. +Extensive experiments on the proposed UGCrop5K and other public datasets +demonstrate the superiority of our approach over state-of-the-art counterparts. +Our project is available at https://github.com/suyukun666/S2CNet. + +
+
+
+
+
+ + ☆ UV-SAM: Adapting Segment Anything Model for Urban Village Identification AAAI 2024 + + +
+ Urban villages, defined as informal residential areas in or around urban +centers, are characterized by inadequate infrastructures and poor living +conditions, closely related to the Sustainable Development Goals (SDGs) on +poverty, adequate housing, and sustainable cities. Traditionally, governments +heavily depend on field survey methods to monitor the urban villages, which +however are time-consuming, labor-intensive, and possibly delayed. Thanks to +widely available and timely updated satellite images, recent studies develop +computer vision techniques to detect urban villages efficiently. However, +existing studies either focus on simple urban village image classification or +fail to provide accurate boundary information. To accurately identify urban +village boundaries from satellite images, we harness the power of the vision +foundation model and adapt the Segment Anything Model (SAM) to urban village +segmentation, named UV-SAM. Specifically, UV-SAM first leverages a small-sized +semantic segmentation model to produce mixed prompts for urban villages, +including mask, bounding box, and image representations, which are then fed +into SAM for fine-grained boundary identification. Extensive experimental +results on two datasets in China demonstrate that UV-SAM outperforms existing +baselines, and identification results over multiple years show that both the +number and area of urban villages are decreasing over time, providing deeper +insights into the development trends of urban villages and sheds light on the +vision foundation models for sustainable cities. The dataset and codes of this +study are available at https://github.com/tsinghua-fib-lab/UV-SAM. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Adversarial Masking Contrastive Learning for vein recognition + + +
+ Vein recognition has received increasing attention due to its high security +and privacy. Recently, deep neural networks such as Convolutional neural +networks (CNN) and Transformers have been introduced for vein recognition and +achieved state-of-the-art performance. Despite the recent advances, however, +existing solutions for finger-vein feature extraction are still not optimal due +to scarce training image samples. To overcome this problem, in this paper, we +propose an adversarial masking contrastive learning (AMCL) approach, that +generates challenging samples to train a more robust contrastive learning model +for the downstream palm-vein recognition task, by alternatively optimizing the +encoder in the contrastive learning model and a set of latent variables. First, +a huge number of masks are generated to train a robust generative adversarial +network (GAN). The trained generator transforms a latent variable from the +latent variable space into a mask space. Then, we combine the trained generator +with a contrastive learning model to obtain our AMCL, where the generator +produces challenging masking images to increase the contrastive loss and the +contrastive learning model is trained based on the harder images to learn a +more robust feature representation. After training, the trained encoder in the +contrastive learning model is combined with a classification layer to build a +classifier, which is further fine-tuned on labeled training data for vein +recognition. The experimental results on three databases demonstrate that our +approach outperforms existing contrastive learning approaches in terms of +improving identification accuracy of vein classifiers and achieves +state-of-the-art recognition results. + +
+
+
+
+
+ + ☆ Representation Learning on Event Stream via an Elastic Net-incorporated + Tensor Network + + +
+ Event cameras are neuromorphic sensors that capture an asynchronous and sparse +event stream when per-pixel brightness changes. The state-of-the-art processing +methods for event signals typically aggregate events into a frame or a grid. +However, because events are dense in time, these works are limited to local +information about events due to the stacking. In this paper, we present a novel +spatiotemporal representation learning method which can capture the global +correlations of all events in the event stream simultaneously by tensor +decomposition. In addition, since events are sparse in space, we propose an +Elastic Net-incorporated tensor network (ENTN) model to obtain more spatial and +temporal details about the event stream. Empirically, the results indicate that +our method can represent the spatiotemporal correlation of events with high +quality, and can achieve effective results in applications like filtering noise +compared with the state-of-the-art methods. + +
+
+ comment: 7 pages, 3 figure +
+
+
+
+
+ + ☆ Achieve Fairness without Demographics for Dermatological Disease + Diagnosis + + +
+ In medical image diagnosis, fairness has become increasingly crucial. Without +bias mitigation, deploying unfair AI would harm the interests of the +underprivileged population and potentially tear society apart. Recent research +addresses prediction biases in deep learning models concerning demographic +groups (e.g., gender, age, and race) by utilizing demographic (sensitive +attribute) information during training. However, many sensitive attributes +naturally exist in dermatological disease images. If the trained model only +targets fairness for a specific attribute, it remains unfair for other +attributes. Moreover, training a model that can accommodate multiple sensitive +attributes is impractical due to privacy concerns. To overcome this, we propose +a method enabling fair predictions for sensitive attributes during the testing +phase without using such information during training. Inspired by prior work +highlighting the impact of feature entanglement on fairness, we enhance the +model features by capturing the features related to the sensitive and target +attributes and regularizing the feature entanglement between corresponding +classes. This ensures that the model can only classify based on the features +related to the target attribute without relying on features associated with +sensitive attributes, thereby improving fairness and accuracy. Additionally, we +use disease masks from the Segment Anything Model (SAM) to enhance the quality +of the learned feature. Experimental results demonstrate that the proposed +method can improve fairness in classification compared to state-of-the-art +methods in two dermatological disease datasets. + +
+
+
+
+
+ + ☆ Augmenting Ground-Level PM2.5 Prediction via Kriging-Based Pseudo-Label + Generation NeurIPS 2023 + + +
+ Fusing abundant satellite data with sparse ground measurements constitutes a +major challenge in climate modeling. To address this, we propose a strategy to +augment the training dataset by introducing unlabeled satellite images paired +with pseudo-labels generated through a spatial interpolation technique known as +ordinary kriging, thereby making full use of the available satellite data +resources. We show that the proposed data augmentation strategy helps enhance +the performance of the state-of-the-art convolutional neural network-random +forest (CNN-RF) model by a reasonable amount, resulting in a noteworthy +improvement in spatial correlation and a reduction in prediction error. + +
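Ordinary kriging itself is standard; the from-scratch sketch below (spherical semivariogram, hypothetical parameters and readings) shows how pseudo-labels could be interpolated at unlabeled locations. In practice a dedicated library such as PyKrige would normally be used instead, and the paper's variogram settings are not given here.

```python
import numpy as np

def ordinary_kriging(obs_xy, obs_z, query_xy, sill=1.0, rng_param=2.0, nugget=0.0):
    """Minimal ordinary kriging with a spherical semivariogram (illustrative only)."""
    def gamma(h):
        h = np.asarray(h, dtype=float)
        g = nugget + sill * (1.5 * h / rng_param - 0.5 * (h / rng_param) ** 3)
        return np.where(h >= rng_param, nugget + sill, g)

    n = len(obs_z)
    d_obs = np.linalg.norm(obs_xy[:, None, :] - obs_xy[None, :, :], axis=-1)
    # Kriging system with a Lagrange multiplier enforcing weights that sum to 1.
    A = np.zeros((n + 1, n + 1))
    A[:n, :n] = gamma(d_obs)
    A[:n, n] = A[n, :n] = 1.0

    preds = []
    for q in query_xy:
        d_q = np.linalg.norm(obs_xy - q, axis=-1)
        b = np.append(gamma(d_q), 1.0)
        w = np.linalg.solve(A, b)[:n]
        preds.append(w @ obs_z)
    return np.array(preds)

obs_xy = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
obs_z = np.array([10.0, 12.0, 11.0, 13.0])          # e.g. ground PM2.5 readings
print(ordinary_kriging(obs_xy, obs_z, np.array([[0.5, 0.5]])))
```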
+
+ comment: 8 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change + with Machine Learning +
+
+
+
+
+ + ☆ Toward Clinically Trustworthy Deep Learning: Applying Conformal + Prediction to Intracranial Hemorrhage Detection + + +
+ As deep learning (DL) continues to demonstrate its ability in radiological +tasks, it is critical that we optimize clinical DL solutions to include safety. +One of the principal concerns in the clinical adoption of DL tools is trust. +This study aims to apply conformal prediction as a step toward trustworthiness +for DL in radiology. This is a retrospective study of 491 non-contrast head CTs +from the CQ500 dataset, in which three senior radiologists annotated slices +containing intracranial hemorrhage (ICH). The dataset was split into definite +and challenging subsets, where challenging images were defined as those in +which there was disagreement among readers. A DL model was trained on 146 +patients (10,815 slices) from the definite data (training dataset) to perform +ICH localization and classification for five classes of ICH. To develop an +uncertainty-aware DL model, 1,546 cases of the definite data (calibration +dataset) were used for Mondrian conformal prediction (MCP). The +uncertainty-aware DL model was tested on 8,401 definite and challenging cases +to assess its ability to identify challenging cases. After the MCP procedure, +the model achieved an F1 score of 0.920 for ICH classification on the test +dataset. Additionally, it correctly identified 6,837 of the 6,856 total +challenging cases as challenging (99.7% accuracy). It did not incorrectly label +any definite cases as challenging. The uncertainty-aware ICH detector performs +on par with state-of-the-art models. MCP's performance in detecting challenging +cases demonstrates that it is useful in automated ICH detection and promising +for trustworthiness in radiological DL. + +
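The abstract does not describe the MCP construction in detail; the following sketch shows one textbook class-conditional (Mondrian) conformal procedure with a simple 1 - softmax nonconformity score and random stand-in calibration data, which may differ from the authors' setup.

```python
import numpy as np

def mondrian_thresholds(cal_probs, cal_labels, alpha=0.1):
    """Per-class (Mondrian) conformal thresholds from a calibration set.

    cal_probs: (n, K) softmax outputs, cal_labels: (n,) integer labels.
    Nonconformity score = 1 - probability assigned to the true class.
    """
    n_classes = cal_probs.shape[1]
    thresholds = np.ones(n_classes)
    for c in range(n_classes):
        scores = 1.0 - cal_probs[cal_labels == c, c]
        n_c = len(scores)
        if n_c == 0:
            continue
        q = min(np.ceil((n_c + 1) * (1 - alpha)) / n_c, 1.0)
        thresholds[c] = np.quantile(scores, q)
    return thresholds

def prediction_set(probs, thresholds):
    """Return the classes whose nonconformity falls under their class threshold."""
    return np.where(1.0 - probs <= thresholds)[0]

rng = np.random.default_rng(0)
cal_probs = rng.dirichlet(np.ones(5), size=200)    # stand-in for 5 ICH classes
cal_labels = rng.integers(0, 5, size=200)
thr = mondrian_thresholds(cal_probs, cal_labels, alpha=0.1)
print(prediction_set(rng.dirichlet(np.ones(5)), thr))
```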
+
+ comment: 14 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Robust Tiny Object Detection in Aerial Images amidst Label Noise + + +
+ Precise detection of tiny objects in remote sensing imagery remains a +significant challenge due to their limited visual information and frequent +occurrence within scenes. This challenge is further exacerbated by the +practical burden and inherent errors associated with manual annotation: +annotating tiny objects is laborious and prone to errors (i.e., label noise). +Training detectors for such objects using noisy labels often leads to +suboptimal performance, with networks tending to overfit on noisy labels. In +this study, we address the intricate issue of tiny object detection under noisy +label supervision. We systematically investigate the impact of various types of +noise on network training, revealing the vulnerability of object detectors to +class shifts and inaccurate bounding boxes for tiny objects. To mitigate these +challenges, we propose a DeNoising Tiny Object Detector (DN-TOD), which +incorporates a Class-aware Label Correction (CLC) scheme to address class +shifts and a Trend-guided Learning Strategy (TLS) to handle bounding box noise. +CLC mitigates inaccurate class supervision by identifying and filtering out +class-shifted positive samples, while TLS reduces noisy box-induced erroneous +supervision through sample reweighting and bounding box regeneration. +Additionally, Our method can be seamlessly integrated into both one-stage and +two-stage object detection pipelines. Comprehensive experiments conducted on +synthetic (i.e., noisy AI-TOD-v2.0 and DOTA-v2.0) and real-world (i.e., AI-TOD) +noisy datasets demonstrate the robustness of DN-TOD under various types of +label noise. Notably, when applied to the strong baseline RFLA, DN-TOD exhibits +a noteworthy performance improvement of 4.9 points under 40% mixed noise. +Datasets, codes, and models will be made publicly available. + +
+
+
+
+
+ + ☆ SCoFT: Self-Contrastive Fine-Tuning for Equitable Image Generation + + +
+ Accurate representation in media is known to improve the well-being of the +people who consume it. Generative image models trained on large web-crawled +datasets such as LAION are known to produce images with harmful stereotypes and +misrepresentations of cultures. We improve inclusive representation in +generated images by (1) engaging with communities to collect a culturally +representative dataset that we call the Cross-Cultural Understanding Benchmark +(CCUB) and (2) proposing a novel Self-Contrastive Fine-Tuning (SCoFT) method +that leverages the model's known biases to self-improve. SCoFT is designed to +prevent overfitting on small datasets, encode only high-level information from +the data, and shift the generated distribution away from misrepresentations +encoded in a pretrained model. Our user study conducted on 51 participants from +5 different countries based on their self-selected national cultural +affiliation shows that fine-tuning on CCUB consistently generates images with +higher cultural relevance and fewer stereotypes when compared to the Stable +Diffusion baseline, which is further improved with our SCoFT technique. + +
+
+
+
+
+ + ☆ EmoTalker: Emotionally Editable Talking Face Generation via Diffusion + Model ICASSP2024 + + +
+ In recent years, the field of talking face generation has attracted +considerable attention, with certain methods adept at generating virtual faces +that convincingly imitate human expressions. However, existing methods face +challenges related to limited generalization, particularly when dealing with +challenging identities. Furthermore, methods for editing expressions are often +confined to a singular emotion, failing to adapt to intricate emotions. To +overcome these challenges, this paper proposes EmoTalker, an emotionally +editable portrait animation approach based on the diffusion model. EmoTalker +modifies the denoising process to ensure preservation of the original +portrait's identity during inference. To enhance emotion comprehension from +text input, an Emotion Intensity Block is introduced to analyze fine-grained +emotions and strengths derived from prompts. Additionally, a crafted dataset is +harnessed to enhance emotion comprehension within prompts. Experiments show the +effectiveness of EmoTalker in generating high-quality, emotionally customizable +facial expressions. + +
+
+ comment: Accepted by 2024 IEEE International Conference on Acoustics, Speech, + and Signal Processing (ICASSP2024) +
+
+
+
+
+ + ☆ Forging Vision Foundation Models for Autonomous Driving: Challenges, + Methodologies, and Opportunities + + +
+ The rise of large foundation models, trained on extensive datasets, is +revolutionizing the field of AI. Models such as SAM, DALL-E2, and GPT-4 +showcase their adaptability by extracting intricate patterns and performing +effectively across diverse tasks, thereby serving as potent building blocks for +a wide range of AI applications. Autonomous driving, a vibrant front in AI +applications, remains challenged by the lack of dedicated vision foundation +models (VFMs). The scarcity of comprehensive training data, the need for +multi-sensor integration, and the diverse task-specific architectures pose +significant obstacles to the development of VFMs in this field. This paper +delves into the critical challenge of forging VFMs tailored specifically for +autonomous driving, while also outlining future directions. Through a +systematic analysis of over 250 papers, we dissect essential techniques for VFM +development, including data preparation, pre-training strategies, and +downstream task adaptation. Moreover, we explore key advancements such as NeRF, +diffusion models, 3D Gaussian Splatting, and world models, presenting a +comprehensive roadmap for future research. To empower researchers, we have +built and maintained https://github.com/zhanghm1995/Forge_VFM4AD, an +open-access repository constantly updated with the latest advancements in +forging VFMs for autonomous driving. + +
+
+ comment: Github Repo: https://github.com/zhanghm1995/Forge_VFM4AD +
+
+
+
+
+ + ☆ Cross-Modal Semi-Dense 6-DoF Tracking of an Event Camera in Challenging + Conditions + + +
+ Vision-based localization is a cost-effective and thus attractive solution +for many intelligent mobile platforms. However, its accuracy and especially +robustness still suffer from low illumination conditions, illumination changes, +and aggressive motion. Event-based cameras are bio-inspired visual sensors that +perform well in HDR conditions and have high temporal resolution, and thus +provide an interesting alternative in such challenging scenarios. While purely +event-based solutions currently do not yet produce satisfying mapping results, +the present work demonstrates the feasibility of purely event-based tracking if +an alternative sensor is permitted for mapping. The method relies on geometric +3D-2D registration of semi-dense maps and events, and achieves highly reliable +and accurate cross-modal tracking results. Practically relevant scenarios are +given by depth camera-supported tracking or map-based localization with a +semi-dense map prior created by a regular image-based visual SLAM or +structure-from-motion system. Conventional edge-based 3D-2D alignment is +extended by a novel polarity-aware registration that makes use of signed +time-surface maps (STSM) obtained from event streams. We furthermore introduce +a novel culling strategy for occluded points. Both modifications increase the +speed of the tracker and its robustness against occlusions or large view-point +variations. The approach is validated on many real datasets covering the +above-mentioned challenging conditions, and compared against similar solutions +realised with regular cameras. + +
+
+ comment: accepted by IEEE Transactions on Robotics (T-RO). arXiv admin note: + text overlap with arXiv:2202.02556 +
+
+
+
+
+ + ☆ 3D Lane Detection from Front or Surround-View using Joint-Modeling & + Matching + + +
+ 3D lanes offer a more comprehensive understanding of the road surface +geometry than 2D lanes, thereby providing crucial references for driving +decisions and trajectory planning. While many efforts aim to improve prediction +accuracy, we recognize that an efficient network can bring results closer to +lane modeling. However, if the modeling data is imprecise, the results might +not accurately capture the real-world scenario. Therefore, accurate lane +modeling is essential to align prediction results closely with the environment. +This study centers on efficient and accurate lane modeling, proposing a joint +modeling approach that combines Bezier curves and interpolation methods. +Furthermore, based on this lane modeling approach, we developed a Global2Local +Lane Matching method with Bezier Control-Point and Key-Point, which serve as a +comprehensive solution that leverages hierarchical features with two +mathematical models to ensure a precise match. We also introduce a novel 3D +Spatial Constructor, representing an exploration of 3D surround-view lane +detection research. The framework is suitable for front-view or surround-view +3D lane detection. By directly outputting the key points of lanes in 3D space, +it overcomes the limitations of anchor-based methods, enabling accurate +prediction of closed-loop or U-shaped lanes and effective adaptation to complex +road conditions. This innovative method establishes a new benchmark in +front-view 3D lane detection on the Openlane dataset and achieves competitive +performance in surround-view 2D lane detection on the Argoverse2 dataset. + +
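The joint Bezier-plus-interpolation modeling is not specified further here; as a small illustration of the Bezier half, this sketch samples a cubic Bezier lane curve from four hypothetical 3D control points.

```python
import numpy as np

def cubic_bezier(control_points: np.ndarray, n_samples: int = 50) -> np.ndarray:
    """Sample a cubic Bezier curve from four 3D control points (4, 3) -> (n, 3)."""
    t = np.linspace(0.0, 1.0, n_samples)[:, None]
    p0, p1, p2, p3 = control_points
    return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
            + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)

# Hypothetical control points of one 3D lane (x: lateral, y: forward, z: height).
ctrl = np.array([[0.0, 0.0, 0.0], [0.2, 10.0, 0.0], [0.5, 20.0, 0.1], [1.0, 30.0, 0.1]])
lane_pts = cubic_bezier(ctrl)
print(lane_pts.shape)       # (50, 3)
```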
+
+
+
+
+ + ☆ BanglaNet: Bangla Handwritten Character Recognition using Ensembling of + Convolutional Neural Network + + +
+ Handwritten character recognition is a crucial task because of its abundant +applications. The recognition task of Bangla handwritten characters is +especially challenging because of the cursive nature of Bangla characters and +the presence of compound characters with more than one way of writing. In this +paper, a classification model based on the ensembling of several Convolutional +Neural Networks (CNN), namely BanglaNet, is proposed to classify Bangla basic +characters, compound characters, numerals, and modifiers. Three different +models based on the idea of state-of-the-art CNN models like Inception, ResNet, +and DenseNet have been trained with both augmented and non-augmented inputs. +Finally, all these models are averaged to obtain the final ensembled model. +Rigorous experimentation on three benchmark Bangla handwritten character +datasets, namely, CMATERdb, BanglaLekha-Isolated, and Ekush has exhibited +significant recognition accuracies compared to some recent CNN-based research. +The top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and +the top-3 accuracies are 99.79%, 99.74%, and 99.56% for the CMATERdb, +BanglaLekha-Isolated, and Ekush datasets respectively. + +
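The averaging/ensembling step described above can be illustrated with a minimal softmax-averaging sketch; the tiny linear models, input size, and class count below are placeholders, not the paper's Inception-, ResNet-, and DenseNet-style architectures.

```python
import torch
import torch.nn.functional as F

def ensemble_predict(models, images: torch.Tensor) -> torch.Tensor:
    """Average the softmax outputs of several character classifiers."""
    with torch.no_grad():
        probs = [F.softmax(m(images), dim=1) for m in models]
    return torch.stack(probs).mean(dim=0)        # (batch, n_classes)

# Three stand-in classifiers over hypothetical 28x28 grayscale character crops.
n_classes = 84
models = [torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(28 * 28, n_classes))
          for _ in range(3)]
preds = ensemble_predict(models, torch.randn(4, 1, 28, 28)).argmax(dim=1)
print(preds)
```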
+
+
+
+
+ + ☆ Spatial Channel State Information Prediction with Generative AI: Towards + Holographic Communication and Digital Radio Twin + + +
+ As 5G technology becomes increasingly established, the anticipation for 6G is +growing, which promises to deliver faster and more reliable wireless +connections via cutting-edge radio technologies. However, efficient management +method of the large-scale antenna arrays deployed by those radio technologies +is crucial. Traditional management methods are mainly reactive, usually based +on feedback from users to adapt to the dynamic wireless channel. However, a +more promising approach lies in the prediction of spatial channel state +information (spatial-CSI), which is an all-inclusive channel characterization +and consists of all the feasible line-of-sight (LoS) and non-line-of-sight +(NLoS) paths between the transmitter (Tx) and receiver (Rx), with the +three-dimension (3D) trajectory, attenuation, phase shift, delay, and +polarization of each path. Advances in hardware and neural networks make it +possible to predict such spatial-CSI using precise environmental information, +and further look into the possibility of holographic communication, which +implies complete control over every aspect of the radio waves emitted. Based on +the integration of holographic communication and digital twin, we proposed a +new framework, digital radio twin, which takes advantages from both the digital +world and deterministic control over radio waves, supporting a wide range of +high-level applications. As a preliminary attempt towards this visionary +direction, in this paper, we explore the use of generative artificial +intelligence (AI) to pinpoint the valid paths in a given environment, +demonstrating promising results, and highlighting the potential of this +approach in driving forward the evolution of 6G wireless communication +technologies. + +
+
+ comment: submitted to IEEE for potential publication +
+
+
+
+
+ + ☆ Small Object Detection by DETR via Information Augmentation and Adaptive + Feature Fusion + + +
+ The main challenge for small object detection algorithms is to ensure +accuracy while pursuing real-time performance. The RT-DETR model performs well +in real-time object detection, but performs poorly in small object detection +accuracy. In order to compensate for the shortcomings of the RT-DETR model in +small object detection, two key improvements are proposed in this study. +Firstly, The RT-DETR utilises a Transformer that receives input solely from the +final layer of Backbone features. This means that the Transformer's input only +receives semantic information from the highest level of abstraction in the Deep +Network, and ignores detailed information such as edges, texture or color +gradients that are critical to the location of small objects at lower levels of +abstraction. Including only deep features can introduce additional background +noise. This can have a negative impact on the accuracy of small object +detection. To address this issue, we propose the fine-grained path augmentation +method. This method helps to locate small objects more accurately by providing +detailed information to the deep network. So, the input to the transformer +contains both semantic and detailed information. Secondly, In RT-DETR, the +decoder takes feature maps of different levels as input after concatenating +them with equal weight. However, this operation is not effective in dealing +with the complex relationship of multi-scale information captured by feature +maps of different sizes. Therefore, we propose an adaptive feature fusion +algorithm that assigns learnable parameters to each feature map from different +levels. This allows the model to adaptively fuse feature maps from different +levels and effectively integrate feature information from different scales. +This enhances the model's ability to capture object features at different +scales, thereby improving the accuracy of detecting small objects. + +
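The adaptive fusion module is described only as assigning "learnable parameters to each feature map"; one simple realization of that idea is a softmax-normalised learnable weight per level, sketched below with hypothetical FPN-like shapes (this is not the RT-DETR code).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaptiveLevelFusion(nn.Module):
    """Fuse multi-level feature maps with learnable, softmax-normalised weights."""
    def __init__(self, n_levels: int):
        super().__init__()
        self.level_logits = nn.Parameter(torch.zeros(n_levels))

    def forward(self, feats):
        # Resize every level to the spatial size of the first one, then blend.
        target = feats[0].shape[-2:]
        feats = [F.interpolate(f, size=target, mode="bilinear", align_corners=False)
                 for f in feats]
        w = torch.softmax(self.level_logits, dim=0)
        return sum(wi * fi for wi, fi in zip(w, feats))

fuse = AdaptiveLevelFusion(n_levels=3)
levels = [torch.randn(1, 256, s, s) for s in (64, 32, 16)]   # hypothetical FPN levels
print(fuse(levels).shape)     # torch.Size([1, 256, 64, 64])
```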
+
+
+
+
+ + ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, +their lack of interpretability makes uncertainty quantification challenging. We +investigate the effects of presenting conformal prediction +sets$\unicode{x2013}$a method for generating valid confidence sets in +distribution-free uncertainty quantification$\unicode{x2013}$to express +uncertainty in AI-advised decision-making. Through a large pre-registered +experiment, we compare the utility of conformal prediction sets to displays of +Top-1 and Top-k predictions for AI-advised image labeling. We find that the +utility of prediction sets for accuracy varies with the difficulty of the task: +while they result in accuracy on par with or less than Top-1 and Top-k displays +for easy images, prediction sets excel at assisting humans in labeling +out-of-distribution (OOD) images especially when the set size is small. Our +results empirically pinpoint the practical challenges of conformal prediction +sets and provide implications on how to incorporate them for real-world +decision-making. + +
+
+ comment: 28 pages, 11 figures, 8 tables +
+
+
+
+
+ + ☆ B-Cos Aligned Transformers Learn Human-Interpretable Features MICCAI 2023 + + +
+ Vision Transformers (ViTs) and Swin Transformers (Swin) are currently +state-of-the-art in computational pathology. However, domain experts are still +reluctant to use these models due to their lack of interpretability. This is +not surprising, as critical decisions need to be transparent and +understandable. The most common approach to understanding transformers is to +visualize their attention. However, attention maps of ViTs are often +fragmented, leading to unsatisfactory explanations. Here, we introduce a novel +architecture called the B-cos Vision Transformer (BvT) that is designed to be +more interpretable. It replaces all linear transformations with the B-cos +transform to promote weight-input alignment. In a blinded study, medical +experts clearly ranked BvTs above ViTs, suggesting that our network is better +at capturing biomedically relevant structures. This is also true for the B-cos +Swin Transformer (Bwin). Compared to the Swin Transformer, it even improves the +F1-score by up to 4.7% on two public datasets. + +
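For readers unfamiliar with the B-cos transform, the sketch below follows the formulation published for B-cos networks (scaling a unit-norm linear response by |cos|^(B-1)); the transformer-specific details of BvT/Bwin may differ, and the layer sizes here are arbitrary.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BcosLinear(nn.Module):
    """B-cos transform: scale the linear response by |cos(x, w)|^(B-1).

    A sketch of the weight-input alignment idea; not the BvT implementation.
    """
    def __init__(self, in_features: int, out_features: int, b: float = 2.0):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.02)
        self.b = b

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        w_hat = F.normalize(self.weight, dim=1)              # unit-norm weight rows
        lin = F.linear(x, w_hat)                             # w_hat . x
        cos = lin / (x.norm(dim=-1, keepdim=True) + 1e-8)    # cosine of the angle
        return lin * cos.abs() ** (self.b - 1)               # alignment-scaled output

layer = BcosLinear(16, 4)
print(layer(torch.randn(2, 16)).shape)    # torch.Size([2, 4])
```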
+
+ comment: Accepted at MICCAI 2023 (oral) +
+
+
+
+
+ + ☆ The Effect of Intrinsic Dataset Properties on Generalization: Unraveling + Learning Differences Between Natural and Medical Images ICLR 2024 + + +
+ This paper investigates discrepancies in how neural networks learn from +different imaging domains, which are commonly overlooked when adopting computer +vision techniques from the domain of natural images to other specialized +domains such as medical images. Recent works have found that the generalization +error of a trained network typically increases with the intrinsic dimension +($d_{data}$) of its training set. Yet, the steepness of this relationship +varies significantly between medical (radiological) and natural imaging +domains, with no existing theoretical explanation. We address this gap in +knowledge by establishing and empirically validating a generalization scaling +law with respect to $d_{data}$, and propose that the substantial scaling +discrepancy between the two considered domains may be at least partially +attributed to the higher intrinsic "label sharpness" ($K_F$) of medical imaging +datasets, a metric which we propose. Next, we demonstrate an additional benefit +of measuring the label sharpness of a training set: it is negatively correlated +with the trained model's adversarial robustness, which notably leads to models +for medical images having a substantially higher vulnerability to adversarial +attack. Finally, we extend our $d_{data}$ formalism to the related metric of +learned representation intrinsic dimension ($d_{repr}$), derive a +generalization scaling law with respect to $d_{repr}$, and show that $d_{data}$ +serves as an upper bound for $d_{repr}$. Our theoretical results are supported +by thorough experiments with six models and eleven natural and medical imaging +datasets over a range of training set sizes. Our findings offer insights into +the influence of intrinsic dataset properties on generalization, representation +learning, and robustness in deep neural networks. + +
+
+ comment: ICLR 2024. Code: + https://github.com/mazurowski-lab/intrinsic-properties +
+
+
+
+
+ + ☆ Cross-Level Multi-Instance Distillation for Self-Supervised Fine-Grained + Visual Categorization + + +
+ High-quality annotation of fine-grained visual categories demands great +expert knowledge, which is taxing and time-consuming. Alternatively, learning +fine-grained visual representation from enormous unlabeled images (e.g., +species, brands) by self-supervised learning becomes a feasible solution. +However, recent research finds that existing self-supervised learning methods +are less qualified to represent fine-grained categories. The bottleneck lies in +that the pre-text representation is built from every patch-wise embedding, +while fine-grained categories are only determined by several key patches of an +image. In this paper, we propose a Cross-level Multi-instance Distillation +(CMD) framework to tackle the challenge. Our key idea is to consider the +importance of each image patch in determining the fine-grained pre-text +representation by multiple instance learning. To comprehensively learn the +relation between informative patches and fine-grained semantics, the +multi-instance knowledge distillation is implemented on both the region/image +crop pairs from the teacher and student net, and the region-image crops inside +the teacher/student net, which we term intra-level multi-instance +distillation and inter-level multi-instance distillation. Extensive experiments +on CUB-200-2011, Stanford Cars and FGVC Aircraft show that the proposed method +outperforms the contemporary method by up to 10.14% and existing +state-of-the-art self-supervised learning approaches by up to 19.78% on both +top-1 accuracy and the Rank-1 retrieval metric. + +
+
+ comment: work in progress +
+
+
+
+
+ + ☆ RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and + Efficiency Assessment of Medical Image Segmentation Models + + +
+ Deep learning techniques, despite their potential, often suffer from a lack +of reproducibility and generalizability, impeding their clinical adoption. +Image segmentation is one of the critical tasks in medical image analysis, in +which one or several regions/volumes of interest should be annotated. This +paper introduces the RIDGE checklist, a framework for assessing the +Reproducibility, Integrity, Dependability, Generalizability, and Efficiency of +deep learning-based medical image segmentation models. The checklist serves as +a guide for researchers to enhance the quality and transparency of their work, +ensuring that segmentation models are not only scientifically sound but also +clinically relevant. + +
+
+ comment: 20 pages, 1 Figure, 1 Table +
+
+
+
+
+ + ☆ Efficient Neural Representation of Volumetric Data using + Coordinate-Based Networks + + +
+ In this paper, we propose an efficient approach for the compression and +representation of volumetric data utilizing coordinate-based networks and +multi-resolution hash encoding. Efficient compression of volumetric data is +crucial for various applications, such as medical imaging and scientific +simulations. Our approach enables effective compression by learning a mapping +between spatial coordinates and intensity values. We compare different encoding +schemes and demonstrate the superiority of multi-resolution hash encoding in +terms of compression quality and training efficiency. Furthermore, we leverage +optimization-based meta-learning, specifically using the Reptile algorithm, to +learn weight initialization for neural representations tailored to volumetric +data, enabling faster convergence during optimization. Additionally, we compare +our approach with state-of-the-art methods to showcase improved image quality +and compression ratios. These findings highlight the potential of +coordinate-based networks and multi-resolution hash encoding for an efficient +and accurate representation of volumetric data, paving the way for advancements +in large-scale data visualization and other applications. + +
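The paper uses multi-resolution hash encoding; to keep the sketch below self-contained it substitutes a plain Fourier-feature encoding in front of a small coordinate MLP that regresses intensity values, which conveys the coordinate-network idea but not the hash grid itself.

```python
import torch
import torch.nn as nn

class CoordinateNet(nn.Module):
    """Map 3D coordinates to intensity via Fourier features and a small MLP."""
    def __init__(self, n_freqs: int = 8, hidden: int = 128):
        super().__init__()
        self.register_buffer("freqs", 2.0 ** torch.arange(n_freqs) * torch.pi)
        self.mlp = nn.Sequential(
            nn.Linear(3 * 2 * n_freqs, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, xyz: torch.Tensor) -> torch.Tensor:
        ang = xyz[..., None] * self.freqs                 # (N, 3, F)
        enc = torch.cat([ang.sin(), ang.cos()], dim=-1)   # (N, 3, 2F)
        return self.mlp(enc.flatten(start_dim=-2))        # (N, 1) intensity

net = CoordinateNet()
coords = torch.rand(1024, 3)                              # normalised volume coordinates
loss = ((net(coords) - torch.rand(1024, 1)) ** 2).mean()  # fit to sampled intensities
loss.backward()
```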
+
+
+
+
+ + ☆ Image Fusion in Remote Sensing: An Overview and Meta Analysis + + +
+ Image fusion in Remote Sensing (RS) has been a consistent demand due to its +ability to turn raw images of different resolutions, sources, and modalities +into accurate, complete, and spatio-temporally coherent images. It greatly +facilitates downstream applications such as pan-sharpening, change detection, +land-cover classification, etc. Yet, image fusion solutions are highly +disparate to various remote sensing problems and thus are often narrowly +defined in existing reviews as topical applications, such as pan-sharpening, +and spatial-temporal image fusion. Considering that image fusion can be +theoretically applied to any gridded data through pixel-level operations, in +this paper, we expanded its scope by comprehensively surveying relevant works +with a simple taxonomy: 1) many-to-one image fusion; 2) many-to-many image +fusion. This simple taxonomy defines image fusion as a mapping problem that +turns either a single or a set of images into another single or set of images, +depending on the desired coherence, e.g., spectral, spatial/resolution +coherence, etc. We show that this simple taxonomy, despite the significant +modality difference it covers, can be presented by a conceptually easy +framework. In addition, we provide a meta-analysis to review the major papers +studying the various types of image fusion and their applications over the +years (from the 1980s to date), covering 5,926 peer-reviewed papers. Finally, +we discuss the main benefits and emerging challenges to provide open research +directions and potential future works. + +
+
+ comment: 21pages, 10 figures +
+
+
+
+
+ + ☆ AiGen-FoodReview: A Multimodal Dataset of Machine-Generated Restaurant + Reviews and Images on Social Media + + +
+ Online reviews in the form of user-generated content (UGC) significantly +impact consumer decision-making. However, the pervasive issue of not only human +fake content but also machine-generated content challenges UGC's reliability. +Recent advances in Large Language Models (LLMs) may pave the way to fabricate +indistinguishable fake generated content at a much lower cost. Leveraging +OpenAI's GPT-4-Turbo and DALL-E-2 models, we craft AiGen-FoodReview, a +multi-modal dataset of 20,144 restaurant review-image pairs divided into +authentic and machine-generated. We explore unimodal and multimodal detection +models, achieving 99.80% multimodal accuracy with FLAVA. We use attributes from +readability and photographic theories to score reviews and images, +respectively, demonstrating their utility as hand-crafted features in scalable +and interpretable detection models, with comparable performance. The paper +contributes by open-sourcing the dataset and releasing fake review detectors, +recommending its use in unimodal and multimodal fake review detection tasks, +and evaluating linguistic and visual features in synthetic versus authentic +data. + +
+
+
+
+
+ + ☆ Adversarial Supervision Makes Layout-to-Image Diffusion Models Thrive ICLR 2024 + + +
+ Despite the recent advances in large-scale diffusion models, little progress +has been made on the layout-to-image (L2I) synthesis task. Current L2I models +either suffer from poor editability via text or weak alignment between the +generated image and the input layout. This limits their usability in practice. +To mitigate this, we propose to integrate adversarial supervision into the +conventional training pipeline of L2I diffusion models (ALDM). Specifically, we +employ a segmentation-based discriminator which provides explicit feedback to +the diffusion generator on the pixel-level alignment between the denoised image +and the input layout. To encourage consistent adherence to the input layout +over the sampling steps, we further introduce the multistep unrolling strategy. +Instead of looking at a single timestep, we unroll a few steps recursively to +imitate the inference process, and ask the discriminator to assess the +alignment of denoised images with the layout over a certain time window. Our +experiments show that ALDM enables layout faithfulness of the generated images, +while allowing broad editability via text prompts. Moreover, we showcase its +usefulness for practical applications: by synthesizing target distribution +samples via text control, we improve domain generalization of semantic +segmentation models by a large margin (~12 mIoU points). + +
+
+ comment: Accepted at ICLR 2024. Project page: + https://yumengli007.github.io/ALDM/ and code: + https://github.com/boschresearch/ALDM +
+
+
+
+
+ + ☆ Learning Implicit Representation for Reconstructing Articulated Objects ICLR 2024 + + +
+ 3D reconstruction of moving articulated objects without additional +information about object structure is a challenging problem. Current methods +overcome such challenges by employing category-specific skeletal models. +Consequently, they do not generalize well to articulated objects in the wild. +We treat an articulated object as an unknown, semi-rigid skeletal structure +surrounded by nonrigid material (e.g., skin). Our method simultaneously +estimates the visible (explicit) representation (3D shapes, colors, camera +parameters) and the implicit skeletal representation, from motion cues in the +object video without 3D supervision. Our implicit representation consists of +four parts. (1) Skeleton, which specifies how semi-rigid parts are connected. +(2) Skinning Weights, which associate each surface vertex +with semi-rigid parts probabilistically. (3) Rigidity Coefficients, specifying +the articulation of the local surface. (4) Time-Varying Transformations, which +specify the skeletal motion and surface deformation parameters. We introduce an +algorithm that uses physical constraints as regularization terms and +iteratively estimates both implicit and explicit representations. Our method is +category-agnostic, thus eliminating the need for category-specific skeletons; +we show that it outperforms the state of the art across standard video +datasets. + +
+
+ comment: Accepted by ICLR 2024. Code: https://github.com/haoz19/LIMR +
+
+
+
+
+ + ☆ Segment Anything Model Can Not Segment Anything: Assessing AI Foundation + Model's Generalizability in Permafrost Mapping + + +
+ This paper assesses trending AI foundation models, especially emerging +computer vision foundation models and their performance in natural landscape +feature segmentation. While the term foundation model has quickly garnered +interest from the geospatial domain, its definition remains vague. Hence, this +paper will first introduce AI foundation models and their defining +characteristics. Built upon the tremendous success achieved by Large Language +Models (LLMs) as the foundation models for language tasks, this paper discusses +the challenges of building foundation models for geospatial artificial +intelligence (GeoAI) vision tasks. To evaluate the performance of large AI +vision models, especially Meta's Segment Anything Model (SAM), we implemented +different instance segmentation pipelines that minimize the changes to SAM to +leverage its power as a foundation model. A series of prompt strategies was +developed to test SAM's performance regarding its theoretical upper bound of +predictive accuracy, zero-shot performance, and domain adaptability through +fine-tuning. The analysis used two permafrost feature datasets, ice-wedge +polygons and retrogressive thaw slumps because (1) these landform features are +more challenging to segment than manmade features due to their complicated +formation mechanisms, diverse forms, and vague boundaries; (2) their presence +and changes are important indicators for Arctic warming and climate change. The +results show that although promising, SAM still has room for improvement to +support AI-augmented terrain mapping. The spatial and domain generalizability +of this finding is further validated using a more general dataset EuroCrop for +agricultural field mapping. Finally, we discuss future research directions that +strengthen SAM's applicability in challenging geospatial domains. + +
+
+
+
+
+ + ☆ MultiPLY: A Multisensory Object-Centric Embodied Large Language Model in + 3D World + + +
+ Human beings possess the capability to multiply a melange of multisensory +cues while actively exploring and interacting with the 3D world. Current +multi-modal large language models, however, passively absorb sensory data as +inputs, lacking the capacity to actively interact with the objects in the 3D +environment and dynamically collect their multisensory information. To usher in +the study of this area, we propose MultiPLY, a multisensory embodied large +language model that could incorporate multisensory interactive data, including +visual, audio, tactile, and thermal information into large language models, +thereby establishing the correlation among words, actions, and percepts. To +this end, we first collect Multisensory Universe, a large-scale multisensory +interaction dataset comprising 500k data by deploying an LLM-powered embodied +agent to engage with the 3D environment. To perform instruction tuning with +pre-trained LLM on such generated data, we first encode the 3D scene as +abstracted object-centric representations and then introduce action tokens +denoting that the embodied agent takes certain actions within the environment, +as well as state tokens that represent the multisensory state observations of +the agent at each time step. In the inference time, MultiPLY could generate +action tokens, instructing the agent to take the action in the environment and +obtain the next multisensory state observation. The observation is then +appended back to the LLM via state tokens to generate subsequent text or action +tokens. We demonstrate that MultiPLY outperforms baselines by a large margin +through a diverse set of embodied tasks involving object retrieval, tool use, +multisensory captioning, and task decomposition. + +
+
+ comment: Project page: https://vis-www.cs.umass.edu/multiply +
+
+
+
+
+ + ☆ MMToM-QA: Multimodal Theory of Mind Question Answering + + +
+ Theory of Mind (ToM), the ability to understand people's minds, is an +essential ingredient for developing machines with human-level social +intelligence. Recent machine learning models, particularly large language +models, seem to show some aspects of ToM understanding. However, existing ToM +benchmarks use unimodal datasets - either video or text. Human ToM, on the +other hand, is more than video or text understanding. People can flexibly +reason about another person's mind based on conceptual representations (e.g., +goals, beliefs, plans) extracted from any available data, which can include +visual cues, linguistic narratives, or both. To address this, we introduce a +multimodal Theory of Mind question answering (MMToM-QA) benchmark. MMToM-QA +comprehensively evaluates machine ToM both on multimodal data and on different +kinds of unimodal data about a person's activity in a household environment. To +engineer multimodal ToM capacity, we propose a novel method, BIP-ALM (Bayesian +Inverse Planning Accelerated by Language Models). BIP-ALM extracts unified +representations from multimodal data and utilizes language models for scalable +Bayesian inverse planning. We conducted a systematic comparison of human +performance, BIP-ALM, and state-of-the-art models, including GPT-4. The +experiments demonstrate that large language models and large multimodal models +still lack robust ToM capacity. BIP-ALM, on the other hand, shows promising +results, by leveraging the power of both model-based mental inference and +language models. + +
+
+ comment: 27 pages, 11 figures, 7 tables +
+
+
+
+
+ + ☆ Benchmarking the Robustness of Image Watermarks + + +
+ This paper investigates the weaknesses of image watermarking techniques. We
+present WAVES (Watermark Analysis Via Enhanced Stress-testing), a novel
+benchmark for assessing watermark robustness, overcoming the limitations of
+current evaluation methods. WAVES integrates detection and identification
+tasks, and establishes a standardized evaluation protocol comprised of a
+diverse range of stress tests. The attacks in WAVES range from traditional
+image distortions to advanced and novel variations of adversarial, diffusive,
+and embedding-based attacks. We introduce a normalized score of attack potency
+which incorporates several widely used image quality metrics and allows us to
+produce an ordered ranking of attacks. Our comprehensive evaluation reveals
+previously undetected vulnerabilities of several modern watermarking
+algorithms. WAVES is envisioned as a toolkit for the future development of
+robust watermarking systems.
+
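+ A hedged sketch of one way a normalized attack-potency score could combine widely used image quality metrics into an ordered ranking of attacks. PSNR/SSIM and the min-max normalization here are assumptions; the exact metrics and weighting used by WAVES are not reproduced.

```python
# Combine per-attack quality degradations into a single normalized potency score.
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity

def quality_degradation(clean: np.ndarray, attacked: np.ndarray) -> np.ndarray:
    """Return per-metric degradation values (higher = more distortion)."""
    psnr = peak_signal_noise_ratio(clean, attacked, data_range=1.0)
    ssim = structural_similarity(clean, attacked, channel_axis=-1, data_range=1.0)
    return np.array([-psnr, 1.0 - ssim])  # negate PSNR so larger means worse

def potency_scores(degradations: np.ndarray) -> np.ndarray:
    """Min-max normalize each metric across attacks, then average into one score."""
    lo, hi = degradations.min(axis=0), degradations.max(axis=0)
    normed = (degradations - lo) / np.maximum(hi - lo, 1e-8)
    return normed.mean(axis=1)

# Toy usage: rank three hypothetical attacks applied to one watermarked image.
rng = np.random.default_rng(0)
clean = rng.random((64, 64, 3))
attacks = [np.clip(clean + rng.normal(0, s, clean.shape), 0, 1) for s in (0.01, 0.05, 0.1)]
D = np.stack([quality_degradation(clean, a) for a in attacks])
print(np.argsort(-potency_scores(D)))  # most potent attack first
```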
+
+
+
+
+ + ☆ Fast Dynamic 3D Object Generation from a Single-view Video + + +
+ Generating a dynamic three-dimensional (3D) object from a single-view video
+is challenging due to the lack of 4D labeled data. Existing methods extend
+text-to-3D pipelines by transferring off-the-shelf image generation models
+such as score distillation sampling, but they are slow and expensive to scale
+(e.g., 150 minutes per object) due to the need for back-propagating the
+information-limited supervision signals through a large pretrained model. To
+address this limitation, we propose an efficient video-to-4D object generation
+framework called Efficient4D. It generates high-quality spacetime-consistent
+images under different camera views, and then uses them as labeled data to
+directly train a novel 4D Gaussian splatting model with explicit point cloud
+geometry, enabling real-time rendering under continuous camera trajectories.
+Extensive experiments on synthetic and real videos show that Efficient4D
+offers a remarkable 10-fold increase in speed when compared to prior art
+alternatives while preserving the same level of novel view synthesis quality.
+For example, Efficient4D takes only 14 minutes to model a dynamic object.
+
+
+ comment: Technical report +
+
+
+
+
+ + ☆ RoHM: Robust Human Motion Reconstruction via Diffusion + + +
+ We propose RoHM, an approach for robust 3D human motion reconstruction from +monocular RGB(-D) videos in the presence of noise and occlusions. Most previous +approaches either train neural networks to directly regress motion in 3D or +learn data-driven motion priors and combine them with optimization at test +time. The former do not recover globally coherent motion and fail under +occlusions; the latter are time-consuming, prone to local minima, and require +manual tuning. To overcome these shortcomings, we exploit the iterative, +denoising nature of diffusion models. RoHM is a novel diffusion-based motion +model that, conditioned on noisy and occluded input data, reconstructs +complete, plausible motions in consistent global coordinates. Given the +complexity of the problem -- requiring one to address different tasks +(denoising and infilling) in different solution spaces (local and global +motion) -- we decompose it into two sub-tasks and learn two models, one for +global trajectory and one for local motion. To capture the correlations between +the two, we then introduce a novel conditioning module, combining it with an +iterative inference scheme. We apply RoHM to a variety of tasks -- from motion +reconstruction and denoising to spatial and temporal infilling. Extensive +experiments on three popular datasets show that our method outperforms +state-of-the-art approaches qualitatively and quantitatively, while being +faster at test time. The code will be available at +https://sanweiliti.github.io/ROHM/ROHM.html. + +
+
+ comment: With the appendix included +
+
+
+
+
+ + ☆ Fixed Point Diffusion Models + + +
+ We introduce the Fixed Point Diffusion Model (FPDM), a novel approach to +image generation that integrates the concept of fixed point solving into the +framework of diffusion-based generative modeling. Our approach embeds an +implicit fixed point solving layer into the denoising network of a diffusion +model, transforming the diffusion process into a sequence of closely-related +fixed point problems. Combined with a new stochastic training method, this +approach significantly reduces model size, reduces memory usage, and +accelerates training. Moreover, it enables the development of two new +techniques to improve sampling efficiency: reallocating computation across +timesteps and reusing fixed point solutions between timesteps. We conduct +extensive experiments with state-of-the-art models on ImageNet, FFHQ, +CelebA-HQ, and LSUN-Church, demonstrating substantial improvements in +performance and efficiency. Compared to the state-of-the-art DiT model, FPDM +contains 87% fewer parameters, consumes 60% less memory during training, and +improves image generation quality in situations where sampling computation or +time is limited. Our code and pretrained models are available at +https://lukemelas.github.io/fixed-point-diffusion-models. + +
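+ A minimal sketch of an implicit fixed-point solving layer of the general kind described above, assuming a plain iteration z = f(z, x) with a tolerance-based stop and optional warm starting between timesteps. FPDM's actual solver, its placement in the denoising network, and the stochastic training method are not shown.

```python
# A naive fixed-point layer: iterate z <- f(z, x) until (approximate) convergence.
import torch
import torch.nn as nn

class FixedPointLayer(nn.Module):
    def __init__(self, dim: int, max_iters: int = 20, tol: float = 1e-4):
        super().__init__()
        self.f = nn.Sequential(nn.Linear(2 * dim, dim), nn.Tanh())
        self.max_iters, self.tol = max_iters, tol

    def forward(self, x, z0=None):
        # z0 lets callers reuse the previous timestep's solution (warm start),
        # one of the sampling-efficiency ideas mentioned in the abstract.
        z = torch.zeros_like(x) if z0 is None else z0
        for _ in range(self.max_iters):
            z_next = self.f(torch.cat([z, x], dim=-1))
            if (z_next - z).norm() < self.tol * max(z.norm().item(), 1e-8):
                z = z_next
                break
            z = z_next
        return z

layer = FixedPointLayer(dim=8)
h = torch.randn(4, 8)
z_star = layer(h)               # cold start
z_star2 = layer(h, z0=z_star)   # warm start from the previous solution
print(z_star.shape, torch.allclose(z_star, z_star2, atol=1e-2))
```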
+
+ comment: Project page: + https://lukemelas.github.io/fixed-point-diffusion-models +
+
+
+
+
+ + ☆ SiT: Exploring Flow and Diffusion-based Generative Models with Scalable + Interpolant Transformers + + +
+ We present Scalable Interpolant Transformers (SiT), a family of generative +models built on the backbone of Diffusion Transformers (DiT). The interpolant +framework, which allows for connecting two distributions in a more flexible way +than standard diffusion models, makes possible a modular study of various +design choices impacting generative models built on dynamical transport: using +discrete vs. continuous time learning, deciding the objective for the model to +learn, choosing the interpolant connecting the distributions, and deploying a +deterministic or stochastic sampler. By carefully introducing the above +ingredients, SiT surpasses DiT uniformly across model sizes on the conditional +ImageNet 256x256 benchmark using the exact same backbone, number of parameters, +and GFLOPs. By exploring various diffusion coefficients, which can be tuned +separately from learning, SiT achieves an FID-50K score of 2.06. + +
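+ A minimal sketch of the interpolant framework under a linear-schedule assumption (alpha(t) = 1 - t, sigma(t) = t) with a velocity-regression objective; SiT's actual choices of interpolant, objective, diffusion coefficient, and sampler are studied in the paper and not reproduced here.

```python
# Connect data x0 and noise eps via x_t = alpha(t)*x0 + sigma(t)*eps and regress velocity.
import torch

def linear_interpolant(x0: torch.Tensor, eps: torch.Tensor, t: torch.Tensor):
    # alpha(t) = 1 - t, sigma(t) = t, so the velocity target is d x_t / dt = eps - x0.
    t = t.view(-1, *([1] * (x0.dim() - 1)))
    x_t = (1 - t) * x0 + t * eps
    velocity = eps - x0
    return x_t, velocity

def training_loss(model, x0: torch.Tensor) -> torch.Tensor:
    eps = torch.randn_like(x0)
    t = torch.rand(x0.shape[0], device=x0.device)
    x_t, target_v = linear_interpolant(x0, eps, t)
    pred_v = model(x_t, t)
    return ((pred_v - target_v) ** 2).mean()

# Toy usage with a stand-in model that ignores t (purely for shape checking).
toy_model = lambda x, t: torch.zeros_like(x)
print(training_loss(toy_model, torch.randn(8, 3, 16, 16)).item())
```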
+
+ comment: Code available: https://github.com/willisma/SiT +
+
+
+
+
+ + ☆ EgoGen: An Egocentric Synthetic Data Generator + + +
+ Understanding the world in first-person view is fundamental in Augmented +Reality (AR). This immersive perspective brings dramatic visual changes and +unique challenges compared to third-person views. Synthetic data has empowered +third-person-view vision models, but its application to embodied egocentric +perception tasks remains largely unexplored. A critical challenge lies in +simulating natural human movements and behaviors that effectively steer the +embodied cameras to capture a faithful egocentric representation of the 3D +world. To address this challenge, we introduce EgoGen, a new synthetic data +generator that can produce accurate and rich ground-truth training data for +egocentric perception tasks. At the heart of EgoGen is a novel human motion +synthesis model that directly leverages egocentric visual inputs of a virtual +human to sense the 3D environment. Combined with collision-avoiding motion +primitives and a two-stage reinforcement learning approach, our motion +synthesis model offers a closed-loop solution where the embodied perception and +movement of the virtual human are seamlessly coupled. Compared to previous +works, our model eliminates the need for a pre-defined global path, and is +directly applicable to dynamic environments. Combined with our easy-to-use and +scalable data generation pipeline, we demonstrate EgoGen's efficacy in three +tasks: mapping and localization for head-mounted cameras, egocentric camera +tracking, and human mesh recovery from egocentric views. EgoGen will be fully +open-sourced, offering a practical solution for creating realistic egocentric +training data and aiming to serve as a useful tool for egocentric computer +vision research. Refer to our project page: https://ego-gen.github.io/. + +
+
+ comment: 22 pages, 16 figures. Project page: https://ego-gen.github.io/ +
+
+
+
+
+ + ☆ Connect, Collapse, Corrupt: Learning Cross-Modal Tasks with Uni-Modal + Data ICLR 2024 + + +
+ Building cross-modal applications is challenging due to limited paired +multi-modal data. Recent works have shown that leveraging a pre-trained +multi-modal contrastive representation space enables cross-modal tasks to be +learned from uni-modal data. This is based on the assumption that contrastive +optimization makes embeddings from different modalities interchangeable. +However, this assumption is under-explored due to the poorly understood +geometry of the multi-modal contrastive space, where a modality gap exists. In +our study, we provide a theoretical explanation of this space's geometry and +introduce a three-step method, $C^3$ (Connect, Collapse, Corrupt), to bridge +the modality gap, enhancing the interchangeability of embeddings. Our $C^3$ +method significantly improves cross-modal learning from uni-modal data, +achieving state-of-the-art results on zero-shot image / audio / video +captioning and text-to-image generation. + +
+
+ comment: Published at ICLR 2024 +
+
+
+
+
+ + ☆ Registration of algebraic varieties using Riemannian optimization + + +
+ We consider the point cloud registration problem, the task of finding a
+transformation between two point clouds that represent the same object but are
+expressed in different coordinate systems. Our approach is not based on a
+point-to-point correspondence, matching every point in the source point cloud
+to a point in the target point cloud. Instead, we assume and leverage a
+low-dimensional nonlinear geometric structure of the data. Firstly, we
+approximate each point cloud by an algebraic variety (a set defined by finitely
+many polynomial equations). This is done by solving an optimization problem on
+the Grassmann manifold, using a connection between algebraic varieties and
+polynomial bases. Secondly, we solve an optimization problem on the orthogonal
+group to find the transformation (rotation + translation) which makes the two
+algebraic varieties overlap. We use second-order Riemannian optimization
+methods for the solution of both steps. Numerical experiments on real and
+synthetic data are provided, with encouraging results. Our approach is
+particularly useful when the two point clouds describe different parts of an
+object (which may not even be overlapping), on the condition that the surface
+of the object may be well approximated by a set of polynomial equations. The
+first procedure -- the approximation -- is of independent interest, as it can
+be used for denoising data that belongs to an algebraic variety. We provide
+statistical guarantees for the estimation error of the denoising using Stein's
+unbiased estimator.
+
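+ A hedged sketch of the first step, approximating a point cloud by an algebraic variety: points are lifted to monomial features, and near-null right singular vectors give polynomials that approximately vanish on the cloud. The Grassmann-manifold and orthogonal-group Riemannian optimization from the paper is not shown; degree-2 polynomials and a plain SVD are assumptions.

```python
# Fit polynomials that (approximately) vanish on a point cloud via a monomial lift + SVD.
import numpy as np
from itertools import combinations_with_replacement

def monomial_features(points: np.ndarray, degree: int = 2) -> np.ndarray:
    n, d = points.shape
    cols = [np.ones(n)]
    for deg in range(1, degree + 1):
        for idx in combinations_with_replacement(range(d), deg):
            cols.append(np.prod(points[:, idx], axis=1))
    return np.stack(cols, axis=1)

def fit_variety(points: np.ndarray, degree: int = 2, n_polys: int = 1) -> np.ndarray:
    """Return coefficients of polynomials that approximately vanish on the points."""
    M = monomial_features(points, degree)
    _, _, vt = np.linalg.svd(M, full_matrices=True)
    return vt[-n_polys:]  # rows associated with the smallest singular values

# Toy usage: noisy points on the unit circle, i.e. the variety x^2 + y^2 - 1 = 0.
theta = np.linspace(0, 2 * np.pi, 200)
pts = np.stack([np.cos(theta), np.sin(theta)], axis=1) + 0.01 * np.random.randn(200, 2)
coeffs = fit_variety(pts, degree=2)
residual = np.abs(monomial_features(pts, 2) @ coeffs.T).mean()
print(f"mean residual on the cloud: {residual:.4f}")
```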
+
+
+
+
+ + ☆ Multi-Track Timeline Control for Text-Driven 3D Human Motion Generation + + +
+ Recent advances in generative modeling have led to promising progress on +synthesizing 3D human motion from text, with methods that can generate +character animations from short prompts and specified durations. However, using +a single text prompt as input lacks the fine-grained control needed by +animators, such as composing multiple actions and defining precise durations +for parts of the motion. To address this, we introduce the new problem of +timeline control for text-driven motion synthesis, which provides an intuitive, +yet fine-grained, input interface for users. Instead of a single prompt, users +can specify a multi-track timeline of multiple prompts organized in temporal +intervals that may overlap. This enables specifying the exact timings of each +action and composing multiple actions in sequence or at overlapping intervals. +To generate composite animations from a multi-track timeline, we propose a new +test-time denoising method. This method can be integrated with any pre-trained +motion diffusion model to synthesize realistic motions that accurately reflect +the timeline. At every step of denoising, our method processes each timeline +interval (text prompt) individually, subsequently aggregating the predictions +with consideration for the specific body parts engaged in each action. +Experimental comparisons and ablations validate that our method produces +realistic motions that respect the semantics and timing of given text prompts. +Our code and models are publicly available at https://mathis.petrovich.fr/stmc. + +
+
+ comment: Project page: https://mathis.petrovich.fr/stmc +
+
+
+
+
+ + ☆ Scalable Pre-training of Large Autoregressive Image Models + + +
+ This paper introduces AIM, a collection of vision models pre-trained with an
+autoregressive objective. These models are inspired by their textual
+counterparts, i.e., Large Language Models (LLMs), and exhibit similar scaling
+properties. Specifically, we highlight two key findings: (1) the performance of
+the visual features scales with both the model capacity and the quantity of
+data, and (2) the value of the objective function correlates with the
+performance of the model on downstream tasks. We illustrate the practical
+implication of these findings by pre-training a 7-billion-parameter AIM on 2
+billion images, which achieves 84.0% on ImageNet-1k with a frozen trunk.
+Interestingly, even at this scale, we observe no sign of saturation in
+performance, suggesting that AIM potentially represents a new frontier for
+training large-scale vision models. The pre-training of AIM is similar to the
+pre-training of LLMs, and does not require any image-specific strategy to
+stabilize the training at scale.
+
+
+ comment: https://github.com/apple/ml-aim +
+
+
+
+
+ + ☆ MICA: Towards Explainable Skin Lesion Diagnosis via Multi-Level + Image-Concept Alignment + + +
+ Black-box deep learning approaches have showcased significant potential in +the realm of medical image analysis. However, the stringent trustworthiness +requirements intrinsic to the medical field have catalyzed research into the +utilization of Explainable Artificial Intelligence (XAI), with a particular +focus on concept-based methods. Existing concept-based methods predominantly +apply concept annotations from a single perspective (e.g., global level), +neglecting the nuanced semantic relationships between sub-regions and concepts +embedded within medical images. This leads to underutilization of the valuable +medical information and may cause models to fall short in harmoniously +balancing interpretability and performance when employing inherently +interpretable architectures such as Concept Bottlenecks. To mitigate these +shortcomings, we propose a multi-modal explainable disease diagnosis framework +that meticulously aligns medical images and clinical-related concepts +semantically at multiple strata, encompassing the image level, token level, and +concept level. Moreover, our method allows for model intervention and offers +both textual and visual explanations in terms of human-interpretable concepts. +Experimental results on three skin image datasets demonstrate that our method, +while preserving model interpretability, attains high performance and label +efficiency for concept detection and disease diagnosis. + +
+
+
+
+
+ + ☆ GATS: Gather-Attend-Scatter + + +
+ As the AI community increasingly adopts large-scale models, it is crucial to +develop general and flexible tools to integrate them. We introduce +Gather-Attend-Scatter (GATS), a novel module that enables seamless combination +of pretrained foundation models, both trainable and frozen, into larger +multimodal networks. GATS empowers AI systems to process and generate +information across multiple modalities at different rates. In contrast to +traditional fine-tuning, GATS allows for the original component models to +remain frozen, avoiding the risk of them losing important knowledge acquired +during the pretraining phase. We demonstrate the utility and versatility of +GATS with a few experiments across games, robotics, and multimodal input-output +systems. + +
+
+
+
+
+ + ☆ Bag of Tricks to Boost Adversarial Transferability + + +
+ Deep neural networks are widely known to be vulnerable to adversarial
+examples. However, vanilla adversarial examples generated under the white-box
+setting often exhibit low transferability across different models. Since
+adversarial transferability poses more severe threats to practical
+applications, various approaches have been proposed for better transferability,
+including gradient-based, input transformation-based, and model-related
+attacks, etc. In this work, we find that several tiny changes in the existing
+adversarial attacks can significantly affect the attack performance, e.g., the
+number of iterations and step size. Based on careful studies of existing
+adversarial attacks, we propose a bag of tricks to enhance adversarial
+transferability, including momentum initialization, scheduled step size, dual
+example, spectral-based input transformation, and several ensemble strategies.
+Extensive experiments on the ImageNet dataset validate the high effectiveness
+of our proposed tricks and show that combining them can further boost
+adversarial transferability. Our work provides practical insights and
+techniques to enhance adversarial transferability, and offers guidance to
+improve the attack performance in real-world applications through simple
+adjustments.
+
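+ A minimal sketch of a momentum-based iterative FGSM baseline of the kind these tricks modify. The epsilon, step count, decay, and fixed step size below are illustrative assumptions; the paper's specific tricks (momentum initialization, scheduled step size, dual example, spectral transforms, ensembling) are not reproduced.

```python
# MI-FGSM-style attack on a surrogate model (the starting point for transfer tricks).
import torch
import torch.nn.functional as F

def mi_fgsm(model, x, y, eps=8 / 255, steps=10, decay=1.0):
    alpha = eps / steps                      # fixed step size; the paper schedules it
    x_adv = x.clone().detach()
    momentum = torch.zeros_like(x)
    for _ in range(steps):
        x_adv.requires_grad_(True)
        loss = F.cross_entropy(model(x_adv), y)
        grad = torch.autograd.grad(loss, x_adv)[0]
        # Accumulate L1-normalized gradients to stabilize the update direction.
        momentum = decay * momentum + grad / (grad.abs().mean(dim=(1, 2, 3), keepdim=True) + 1e-12)
        x_adv = x_adv.detach() + alpha * momentum.sign()
        x_adv = torch.min(torch.max(x_adv, x - eps), x + eps).clamp(0, 1)
    return x_adv.detach()

# Toy usage with a tiny random "model" standing in for a pretrained surrogate.
model = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 32 * 32, 10))
x = torch.rand(2, 3, 32, 32)
y = torch.tensor([1, 7])
x_adv = mi_fgsm(model, x, y)
print((x_adv - x).abs().max().item())  # stays within the eps ball
```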
+
+
+
+
+ + ☆ Video Quality Assessment Based on Swin TransformerV2 and Coarse to Fine + Strategy + + +
+ The objective of no-reference video quality assessment is to evaluate the
+quality of distorted video without access to high-definition reference videos.
+In this study, we introduce an enhanced spatial perception module, pre-trained
+on multiple image quality assessment datasets, and a lightweight temporal
+fusion module to address the no-reference visual quality assessment (NR-VQA)
+task. This model implements Swin Transformer V2 as a local-level spatial
+feature extractor and fuses these multi-stage representations through a series
+of transformer layers. Furthermore, a temporal transformer is utilized for
+spatiotemporal feature fusion across the video. To accommodate compressed
+videos of varying bitrates, we incorporate a coarse-to-fine contrastive
+strategy to enrich the model's capability to discriminate features from videos
+of different bitrates. This is an expanded version of the one-page abstract.
+
+
+
+
+
+ + ☆ PPSURF: Combining Patches and Point Convolutions for Detailed Surface + Reconstruction + + +
+ 3D surface reconstruction from point clouds is a key step in areas such as +content creation, archaeology, digital cultural heritage, and engineering. +Current approaches either try to optimize a non-data-driven surface +representation to fit the points, or learn a data-driven prior over the +distribution of commonly occurring surfaces and how they correlate with +potentially noisy point clouds. Data-driven methods enable robust handling of +noise and typically either focus on a global or a local prior, which trade-off +between robustness to noise on the global end and surface detail preservation +on the local end. We propose PPSurf as a method that combines a global prior +based on point convolutions and a local prior based on processing local point +cloud patches. We show that this approach is robust to noise while recovering +surface details more accurately than the current state-of-the-art. + Our source code, pre-trained model and dataset are available at: +https://github.com/cg-tuwien/ppsurf + +
+
+ comment: Published in Computer Graphics Forum (Jan 2024): + https://onlinelibrary.wiley.com/doi/10.1111/cgf.15000 +
+
+
+
+
+ + ☆ Real3D-Portrait: One-shot Realistic 3D Talking Portrait Synthesis ICLR 2024 + + +
+ One-shot 3D talking portrait generation aims to reconstruct a 3D avatar from
+an unseen image, and then animate it with a reference video or audio to
+generate a talking portrait video. The existing methods fail to simultaneously
+achieve the goals of accurate 3D avatar reconstruction and stable talking face
+animation. Besides, while the existing works mainly focus on synthesizing the
+head part, it is also vital to generate natural torso and background segments
+to obtain a realistic talking portrait video. To address these limitations, we
+present Real3D-Portrait, a framework that (1) improves the one-shot 3D
+reconstruction power with a large image-to-plane model that distills 3D prior
+knowledge from a 3D face generative model; (2) facilitates accurate
+motion-conditioned animation with an efficient motion adapter; (3) synthesizes
+realistic video with natural torso movement and switchable background using a
+head-torso-background super-resolution model; and (4) supports one-shot
+audio-driven talking face generation with a generalizable audio-to-motion
+model. Extensive experiments show that Real3D-Portrait generalizes well to
+unseen identities and generates more realistic talking portrait videos compared
+to previous methods.
+
+
+ comment: ICLR 2024 (Spotlight). Project page: https://real3dportrait.github.io +
+
+
+
+
+ + ☆ ValUES: A Framework for Systematic Validation of Uncertainty Estimation + in Semantic Segmentation ICLR 2024 + + +
+ Uncertainty estimation is an essential and heavily-studied component for the +reliable application of semantic segmentation methods. While various studies +exist claiming methodological advances on the one hand, and successful +application on the other hand, the field is currently hampered by a gap between +theory and practice leaving fundamental questions unanswered: Can data-related +and model-related uncertainty really be separated in practice? Which components +of an uncertainty method are essential for real-world performance? Which +uncertainty method works well for which application? In this work, we link this +research gap to a lack of systematic and comprehensive evaluation of +uncertainty methods. Specifically, we identify three key pitfalls in current +literature and present an evaluation framework that bridges the research gap by +providing 1) a controlled environment for studying data ambiguities as well as +distribution shifts, 2) systematic ablations of relevant method components, and +3) test-beds for the five predominant uncertainty applications: OoD-detection, +active learning, failure detection, calibration, and ambiguity modeling. +Empirical results on simulated as well as real-world data demonstrate how the +proposed framework is able to answer the predominant questions in the field +revealing for instance that 1) separation of uncertainty types works on +simulated data but does not necessarily translate to real-world data, 2) +aggregation of scores is a crucial but currently neglected component of +uncertainty methods, 3) While ensembles are performing most robustly across the +different downstream tasks and settings, test-time augmentation often +constitutes a light-weight alternative. Code is at: +https://github.com/IML-DKFZ/values + +
+
+ comment: ICLR 2024 (oral) +
+
+
+
+
+ + ☆ TUMTraf Event: Calibration and Fusion Resulting in a Dataset for + Roadside Event-Based and RGB Cameras + + +
+ Event-based cameras are predestined for Intelligent Transportation Systems
+(ITS). They provide very high temporal resolution and dynamic range, which can
+eliminate motion blur and make objects easier to recognize at night. However,
+event-based images lack color and texture compared to images from a
+conventional RGB camera. Considering that, data fusion between event-based and
+conventional cameras can combine the strengths of both modalities. For this
+purpose, extrinsic calibration is necessary. To the best of our knowledge, no
+targetless calibration between event-based and RGB cameras can handle multiple
+moving objects, nor does data fusion optimized for the domain of roadside ITS
+exist, nor are synchronized event-based and RGB camera datasets in the field of
+ITS known. To fill these research gaps, based on our previous work, we extend
+our targetless calibration approach with clustering methods to handle multiple
+moving objects. Furthermore, we develop an early fusion, simple late fusion,
+and a novel spatiotemporal late fusion method. Lastly, we publish the TUMTraf
+Event Dataset, which contains more than 4k synchronized event-based and RGB
+images with 21.9k labeled 2D boxes. During our extensive experiments, we
+verified the effectiveness of our calibration method with multiple moving
+objects. Furthermore, compared to a single RGB camera, we increased the
+detection performance by up to +16% mAP in the day and up to +12% mAP in the
+challenging night with our presented event-based sensor fusion methods. The
+TUMTraf Event Dataset is available at
+https://innovation-mobility.com/tumtraf-dataset.
+
+
+ comment: 14 pages, 8 figures, 4 tables. This work has been submitted to the + IEEE for possible publication. Copyright may be transferred without notice, + after which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Deep learning based Image Compression for Microscopy Images: An + Empirical Study + + +
+ With the fast development of modern microscopes and bioimaging techniques, an
+unprecedentedly large amount of imaging data are being generated, stored,
+analyzed, and even shared through networks. The size of the data poses great
+challenges for current data infrastructure. One common way to reduce the data
+size is by image compression. The present study analyzes classic and deep
+learning based image compression methods, and their impact on deep learning
+based image processing models. Deep learning based label-free prediction models
+(i.e., predicting fluorescent images from bright field images) are used as an
+example application for comparison and analysis. Effective image compression
+methods could help reduce the data size significantly without losing necessary
+information, and therefore reduce the burden on data management infrastructure
+and permit fast transmission through the network for data sharing or cloud
+computing. To compress images in this desired way, multiple classical lossy
+image compression techniques are compared to several AI-based compression
+models provided by and trained with the CompressAI toolbox in Python. These
+different compression techniques are compared in terms of compression ratio,
+multiple image similarity measures, and, most importantly, the prediction
+accuracy from label-free models on compressed images. We found that AI-based
+compression techniques largely outperform the classic ones and will minimally
+affect the downstream label-free task in 2D cases. In the end, we hope the
+present study could shed light on the potential of deep learning based image
+compression and the impact of image compression on downstream deep learning
+based image analysis models.
+
+
+ comment: - Update github link; - correct the author name; - update the table + (correct some errors during calculation); - update the implementation detail + section and the discussion section +
+
+
+
+
+ + ♻ ☆ DomainStudio: Fine-Tuning Diffusion Models for Domain-Driven Image + Generation using Limited Data + + +
+ Denoising diffusion probabilistic models (DDPMs) have been proven capable of +synthesizing high-quality images with remarkable diversity when trained on +large amounts of data. Typical diffusion models and modern large-scale +conditional generative models like text-to-image generative models are +vulnerable to overfitting when fine-tuned on extremely limited data. Existing +works have explored subject-driven generation using a reference set containing +a few images. However, few prior works explore DDPM-based domain-driven +generation, which aims to learn the common features of target domains while +maintaining diversity. This paper proposes a novel DomainStudio approach to +adapt DDPMs pre-trained on large-scale source datasets to target domains using +limited data. It is designed to keep the diversity of subjects provided by +source domains and get high-quality and diverse adapted samples in target +domains. We propose to keep the relative distances between adapted samples to +achieve considerable generation diversity. In addition, we further enhance the +learning of high-frequency details for better generation quality. Our approach +is compatible with both unconditional and conditional diffusion models. This +work makes the first attempt to realize unconditional few-shot image generation +with diffusion models, achieving better quality and greater diversity than +current state-of-the-art GAN-based approaches. Moreover, this work also +significantly relieves overfitting for conditional generation and realizes +high-quality domain-driven generation, further expanding the applicable +scenarios of modern large-scale text-to-image models. + +
+
+ comment: extended from DDPM-PA (arXiv:2211.03264), 33 pages, 34 figures. arXiv + admin note: substantial text overlap with arXiv:2211.03264 +
+
+
+
+
+ + ♻ ☆ FUSC: Fetal Ultrasound Semantic Clustering of Second Trimester Scans + Using Deep Self-supervised Learning + + +
+ Ultrasound is the primary imaging modality in clinical practice during
+pregnancy. More than 140M fetuses are born yearly, resulting in numerous scans.
+The availability of a large volume of fetal ultrasound scans presents the
+opportunity to train robust machine learning models. However, the abundance of
+scans also has its challenges, as manual labeling of each image is needed for
+supervised methods. Labeling is typically labor-intensive and requires
+expertise to annotate the images accurately. This study presents an
+unsupervised approach for automatically clustering ultrasound images into a
+large range of fetal views, reducing or eliminating the need for manual
+labeling. Our Fetal Ultrasound Semantic Clustering (FUSC) method is developed
+using a large dataset of 88,063 images and further evaluated on an additional
+unseen dataset of 8,187 images, achieving over 92% clustering purity. The
+results of our investigation hold the potential to significantly impact the
+field of fetal ultrasound imaging and pave the way for more advanced automated
+labeling solutions. Finally, we make the code and the experimental setup
+publicly available to help advance the field.
+
+
+
+
+
+ + ♻ ☆ Adaptive Confidence Multi-View Hashing for Multimedia Retrieval ICASSP2024 + + +
+ The multi-view hash method converts heterogeneous data from multiple views +into binary hash codes, which is one of the critical technologies in multimedia +retrieval. However, the current methods mainly explore the complementarity +among multiple views while lacking confidence learning and fusion. Moreover, in +practical application scenarios, the single-view data contain redundant noise. +To conduct the confidence learning and eliminate unnecessary noise, we propose +a novel Adaptive Confidence Multi-View Hashing (ACMVH) method. First, a +confidence network is developed to extract useful information from various +single-view features and remove noise information. Furthermore, an adaptive +confidence multi-view network is employed to measure the confidence of each +view and then fuse multi-view features through a weighted summation. Lastly, a +dilation network is designed to further enhance the feature representation of +the fused features. To the best of our knowledge, we pioneer the application of +confidence learning into the field of multimedia retrieval. Extensive +experiments on two public datasets show that the proposed ACMVH performs better +than state-of-the-art methods (maximum increase of 3.24%). The source code is +available at https://github.com/HackerHyper/ACMVH. + +
+
+ comment: accepted by International Conference on Acoustics, Speech and Signal + Processing 2024(ICASSP2024) +
+
+
+
+
+ + ♻ ☆ ZeroShape: Regression-based Zero-shot Shape Reconstruction + + +
+ We study the problem of single-image zero-shot 3D shape reconstruction. +Recent works learn zero-shot shape reconstruction through generative modeling +of 3D assets, but these models are computationally expensive at train and +inference time. In contrast, the traditional approach to this problem is +regression-based, where deterministic models are trained to directly regress +the object shape. Such regression methods possess much higher computational +efficiency than generative methods. This raises a natural question: is +generative modeling necessary for high performance, or conversely, are +regression-based approaches still competitive? To answer this, we design a +strong regression-based model, called ZeroShape, based on the converging +findings in this field and a novel insight. We also curate a large real-world +evaluation benchmark, with objects from three different real-world 3D datasets. +This evaluation benchmark is more diverse and an order of magnitude larger than +what prior works use to quantitatively evaluate their models, aiming at +reducing the evaluation variance in our field. We show that ZeroShape not only +achieves superior performance over state-of-the-art methods, but also +demonstrates significantly higher computational and data efficiency. + +
+
+ comment: Project page: https://zixuanh.com/projects/zeroshape.html +
+
+
+
+
+ + ♻ ☆ Learning Explicit Contact for Implicit Reconstruction of Hand-held + Objects from Monocular Images AAAI 2024 + + +
+ Reconstructing hand-held objects from monocular RGB images is an appealing +yet challenging task. In this task, contacts between hands and objects provide +important cues for recovering the 3D geometry of the hand-held objects. Though +recent works have employed implicit functions to achieve impressive progress, +they ignore formulating contacts in their frameworks, which results in +producing less realistic object meshes. In this work, we explore how to model +contacts in an explicit way to benefit the implicit reconstruction of hand-held +objects. Our method consists of two components: explicit contact prediction and +implicit shape reconstruction. In the first part, we propose a new subtask of +directly estimating 3D hand-object contacts from a single image. The part-level +and vertex-level graph-based transformers are cascaded and jointly learned in a +coarse-to-fine manner for more accurate contact probabilities. In the second +part, we introduce a novel method to diffuse estimated contact states from the +hand mesh surface to nearby 3D space and leverage diffused contact +probabilities to construct the implicit neural representation for the +manipulated object. Benefiting from estimating the interaction patterns between +the hand and the object, our method can reconstruct more realistic object +meshes, especially for object parts that are in contact with hands. Extensive +experiments on challenging benchmarks show that the proposed method outperforms +the current state of the arts by a great margin. Our code is publicly available +at https://junxinghu.github.io/projects/hoi.html. + +
+
+ comment: Accepted to AAAI 2024.Code and model available at + https://junxinghu.github.io/projects/hoi.html +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a +conditional Energy-Based Model (EBM) for enhancing Variational Autoencoder +(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer +from blurry generated samples due to the lack of a tailored training on the +samples generated in the generative direction. On the other hand, EBMs can +generate high-quality samples but require expensive Markov Chain Monte Carlo +(MCMC) sampling. To address these issues, we introduce a conditional EBM for +calibrating the generative direction of VAE during training, without requiring +it for the generation at test time. In particular, we train EC-VAE upon both +the input data and the calibrated samples with adaptive weight to enhance +efficacy while avoiding MCMC sampling at test time. Furthermore, we extend the +calibration idea of EC-VAE to variational learning and normalizing flows, and +apply EC-VAE to an additional application of zero-shot image restoration via +neural transport prior and range-null theory. We evaluate the proposed method +with two applications, including image generation and zero-shot image +restoration, and the experimental results show that our method achieves the +state-of-the-art performance over single-step non-adversarial generation. Our +code is available at https://github.com/DJ-LYH/EC-VAE. + +
+
+ comment: update results +
+
+
+
+
+ + ♻ ☆ Generalizing Visual Question Answering from Synthetic to Human-Written + Questions via a Chain of QA with a Large Language Model + + +
+ Visual question answering (VQA) is a task where an image is given, and a +series of questions are asked about the image. To build an efficient VQA +algorithm, a large amount of QA data is required which is very expensive. +Generating synthetic QA pairs based on templates is a practical way to obtain +data. However, VQA models trained on those data do not perform well on complex, +human-written questions. To address this issue, we propose a new method called +{\it chain of QA for human-written questions} (CoQAH). CoQAH utilizes a +sequence of QA interactions between a large language model and a VQA model +trained on synthetic data to reason and derive logical answers for +human-written questions. We tested the effectiveness of CoQAH on two types of +human-written VQA datasets for 3D-rendered and chest X-ray images and found +that it achieved state-of-the-art accuracy in both types of data. Notably, +CoQAH outperformed general vision-language models, VQA models, and medical +foundation models with no finetuning. + +
+
+
+
+
+ + ♻ ☆ Frequency Masking for Universal Deepfake Detection ICASSP-2024 + + +
+ We study universal deepfake detection. Our goal is to detect synthetic images +from a range of generative AI approaches, particularly from emerging ones which +are unseen during training of the deepfake detector. Universal deepfake +detection requires outstanding generalization capability. Motivated by recently +proposed masked image modeling which has demonstrated excellent generalization +in self-supervised pre-training, we make the first attempt to explore masked +image modeling for universal deepfake detection. We study spatial and frequency +domain masking in training deepfake detectors. Based on empirical analysis, we +propose a novel deepfake detector via frequency masking. Our focus on frequency +domain is different from the majority, which primarily target spatial domain +detection. Our comparative analyses reveal substantial performance gains over +existing methods. Code and models are publicly available. + +
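+ A minimal sketch of frequency-domain masking applied to an image, assuming a simple centered low-pass mask; the band selection and masking strategy actually used to train the detector are not specified here.

```python
# Mask image content in the frequency domain via FFT, a simple low-pass variant.
import numpy as np

def frequency_mask(image: np.ndarray, keep_radius: float = 0.25) -> np.ndarray:
    """Zero out frequency components outside a centered low-pass radius."""
    h, w = image.shape[:2]
    yy, xx = np.mgrid[:h, :w]
    dist = np.sqrt((yy - h / 2) ** 2 + (xx - w / 2) ** 2) / (0.5 * np.hypot(h, w))
    mask = (dist <= keep_radius).astype(float)
    out = np.empty_like(image, dtype=float)
    for c in range(image.shape[2]):
        spec = np.fft.fftshift(np.fft.fft2(image[..., c]))
        out[..., c] = np.real(np.fft.ifft2(np.fft.ifftshift(spec * mask)))
    return np.clip(out, 0, 1)

img = np.random.rand(64, 64, 3)
masked = frequency_mask(img, keep_radius=0.2)
print(img.shape, masked.shape)
```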
+
+ comment: Accepted to IEEE ICASSP-2024 +
+
+
+
+
+ + ♻ ☆ What Matters to You? Towards Visual Representation Alignment for Robot + Learning + + +
+ When operating in service of people, robots need to optimize rewards aligned +with end-user preferences. Since robots will rely on raw perceptual inputs like +RGB images, their rewards will inevitably use visual representations. Recently +there has been excitement in using representations from pre-trained visual +models, but key to making these work in robotics is fine-tuning, which is +typically done via proxy tasks like dynamics prediction or enforcing temporal +cycle-consistency. However, all these proxy tasks bypass the human's input on +what matters to them, exacerbating spurious correlations and ultimately leading +to robot behaviors that are misaligned with user preferences. In this work, we +propose that robots should leverage human feedback to align their visual +representations with the end-user and disentangle what matters for the task. We +propose Representation-Aligned Preference-based Learning (RAPL), a method for +solving the visual representation alignment problem and visual reward learning +problem through the lens of preference-based learning and optimal transport. +Across experiments in X-MAGICAL and in robotic manipulation, we find that +RAPL's reward consistently generates preferred robot behaviors with high sample +efficiency, and shows strong zero-shot generalization when the visual +representation is learned from a different embodiment than the robot's. + +
+
+
+
+
+ + ♻ ☆ Characteristic Guidance: Non-linear Correction for Diffusion Model at + Large Guidance Scale + + +
+ Popular guidance for denoising diffusion probabilistic model (DDPM) linearly +combines distinct conditional models together to provide enhanced control over +samples. However, this approach overlooks nonlinear effects that become +significant when guidance scale is large. To address this issue, we propose +characteristic guidance, a sampling method that provides first-principle +non-linear correction for classifier-free guided DDPMs. Such correction forces +the guided DDPMs to respect the Fokker-Planck equation of their underlying +diffusion process, in a way that is training-free, derivative-free, and +compatible with existing sampling methods. Experiments show that characteristic +guidance enhances control and reduces color and exposure issues in image +generation, proving effective in diverse applications ranging from latent space +sampling to solving physics problems like magnet phase transitions. + +
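+ For reference, a minimal sketch of the standard linear classifier-free guidance combination that characteristic guidance corrects at large scales; the Fokker-Planck-based non-linear correction itself is not reproduced here.

```python
# Linear classifier-free guidance: combine conditional and unconditional predictions.
import torch

def cfg_noise_prediction(eps_cond: torch.Tensor, eps_uncond: torch.Tensor,
                         guidance_scale: float) -> torch.Tensor:
    # eps = eps_uncond + w * (eps_cond - eps_uncond); w = 1 recovers the conditional model.
    return eps_uncond + guidance_scale * (eps_cond - eps_uncond)

eps_c, eps_u = torch.randn(2, 4, 8, 8), torch.randn(2, 4, 8, 8)
print(cfg_noise_prediction(eps_c, eps_u, guidance_scale=7.5).shape)
```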
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Diff-PCR: Diffusion-Based Correspondence Searching in Doubly Stochastic + Matrix Space for Point Cloud Registration + + +
+ Efficiently finding optimal correspondences between point clouds is crucial +for solving both rigid and non-rigid point cloud registration problems. +Existing methods often rely on geometric or semantic feature embedding to +establish correspondences and estimate transformations or flow fields. +Recently, state-of-the-art methods have employed RAFT-like iterative updates to +refine the solution. However, these methods have certain limitations. Firstly, +their iterative refinement design lacks transparency, and their iterative +updates follow a fixed path during the refinement process, which can lead to +suboptimal results. Secondly, these methods overlook the importance of refining +or optimizing correspondences (or matching matrices) as a precursor to solving +transformations or flow fields. They typically compute candidate +correspondences based on distances in the point feature space. However, they +only project the candidate matching matrix into some matrix space once with +Sinkhorn or dual softmax operations to obtain final correspondences. This +one-shot projected matching matrix may be far from the globally optimal one, +and these approaches do not consider the distribution of the target matching +matrix. In this paper, we propose a novel approach that exploits the Denoising +Diffusion Model to predict a searching gradient for the optimal matching matrix +within the Doubly Stochastic Matrix Space. During the reverse denoising +process, our method iteratively searches for better solutions along this +denoising gradient, which points towards the maximum likelihood direction of +the target matching matrix. Our method offers flexibility by allowing the +search to start from any initial matching matrix provided by the online +backbone or white noise. Experimental evaluations on the 3DMatch/3DLoMatch and +4DMatch/4DLoMatch datasets demonstrate the effectiveness of our newly designed +framework. + +
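+ A minimal sketch of the one-shot Sinkhorn projection mentioned above, which pushes a raw score matrix toward the doubly stochastic set by alternating row and column normalization; the diffusion-based search over this space proposed in the paper is not shown.

```python
# Sinkhorn normalization of a score matrix toward a doubly stochastic matrix.
import numpy as np

def sinkhorn(scores: np.ndarray, n_iters: int = 200, eps: float = 0.1) -> np.ndarray:
    K = np.exp(scores / eps)
    for _ in range(n_iters):
        K = K / K.sum(axis=1, keepdims=True)  # normalize rows
        K = K / K.sum(axis=0, keepdims=True)  # normalize columns
    return K

rng = np.random.default_rng(0)
S = rng.normal(size=(5, 5))
P = sinkhorn(S)
print(np.allclose(P.sum(axis=0), 1, atol=1e-3), np.allclose(P.sum(axis=1), 1, atol=1e-3))
```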
+
+
+
+
+ + ♻ ☆ RanPAC: Random Projections and Pre-trained Models for Continual Learning + + +
+ Continual learning (CL) aims to incrementally learn different tasks (such as +classification) in a non-stationary data stream without forgetting old ones. +Most CL works focus on tackling catastrophic forgetting under a +learning-from-scratch paradigm. However, with the increasing prominence of +foundation models, pre-trained models equipped with informative representations +have become available for various downstream requirements. Several CL methods +based on pre-trained models have been explored, either utilizing pre-extracted +features directly (which makes bridging distribution gaps challenging) or +incorporating adaptors (which may be subject to forgetting). In this paper, we +propose a concise and effective approach for CL with pre-trained models. Given +that forgetting occurs during parameter updating, we contemplate an alternative +approach that exploits training-free random projectors and class-prototype +accumulation, which thus bypasses the issue. Specifically, we inject a frozen +Random Projection layer with nonlinear activation between the pre-trained +model's feature representations and output head, which captures interactions +between features with expanded dimensionality, providing enhanced linear +separability for class-prototype-based CL. We also demonstrate the importance +of decorrelating the class-prototypes to reduce the distribution disparity when +using pre-trained representations. These techniques prove to be effective and +circumvent the problem of forgetting for both class- and domain-incremental +continual learning. Compared to previous methods applied to pre-trained +ViT-B/16 models, we reduce final error rates by between 20% and 62% on seven +class-incremental benchmarks, despite not using any rehearsal memory. We +conclude that the full potential of pre-trained models for simple, effective, +and fast CL has not hitherto been fully tapped. Code is at +github.com/RanPAC/RanPAC. + +
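+ A hedged sketch of the recipe described above: a frozen nonlinear random projection of pretrained features, streaming accumulation of class-prototype statistics, and a ridge/Gram-inverse step to decorrelate the prototypes. The dimensions, activation, and ridge strength are illustrative assumptions, not the paper's exact configuration.

```python
# Frozen random projection + class-prototype accumulation with decorrelation.
import numpy as np

rng = np.random.default_rng(0)
d_feat, d_proj, n_classes = 64, 512, 5

W = rng.normal(size=(d_feat, d_proj))          # frozen random projection, never trained

def project(feats: np.ndarray) -> np.ndarray:
    return np.maximum(feats @ W, 0.0)          # nonlinear activation (ReLU)

# Accumulate Gram matrix and per-class feature sums in a single streaming pass,
# which is what lets new classes/tasks be added without revisiting old data.
G = np.zeros((d_proj, d_proj))
C = np.zeros((d_proj, n_classes))
for _ in range(20):                            # pretend these are incoming batches
    feats = rng.normal(size=(32, d_feat))      # stand-in for frozen backbone features
    labels = rng.integers(0, n_classes, size=32)
    H = project(feats)
    G += H.T @ H
    for c in range(n_classes):
        C[:, c] += H[labels == c].sum(axis=0)

ridge = 1e2
Wo = np.linalg.solve(G + ridge * np.eye(d_proj), C)   # decorrelated class weights

test = project(rng.normal(size=(8, d_feat)))
print((test @ Wo).argmax(axis=1))              # predicted classes
```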
+
+ comment: 32 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Exploring Phonetic Context-Aware Lip-Sync For Talking Face Generation AAAI 2024 + + +
+ Talking face generation is the challenging task of synthesizing a natural and +realistic face that requires accurate synchronization with a given audio. Due +to co-articulation, where an isolated phone is influenced by the preceding or +following phones, the articulation of a phone varies upon the phonetic context. +Therefore, modeling lip motion with the phonetic context can generate more +spatio-temporally aligned lip movement. In this respect, we investigate the +phonetic context in generating lip motion for talking face generation. We +propose Context-Aware Lip-Sync framework (CALS), which explicitly leverages +phonetic context to generate lip movement of the target face. CALS is comprised +of an Audio-to-Lip module and a Lip-to-Face module. The former is pretrained +based on masked learning to map each phone to a contextualized lip motion unit. +The contextualized lip motion unit then guides the latter in synthesizing a +target identity with context-aware lip motion. From extensive experiments, we +verify that simply exploiting the phonetic context in the proposed CALS +framework effectively enhances spatio-temporal alignment. We also demonstrate +the extent to which the phonetic context assists in lip synchronization and +find the effective window size for lip generation to be approximately 1.2 +seconds. + +
+
+ comment: Accepted at AAAI 2024 +
+
+
+
+
+ + ♻ ☆ Understanding CNNs from excitations + + +
+ Saliency maps have proven to be a highly efficacious approach for explicating +the decisions of Convolutional Neural Networks. However, extant methodologies +predominantly rely on gradients, which constrain their ability to explicate +complex models. Furthermore, such approaches are not fully adept at leveraging +negative gradient information to improve interpretive veracity. In this study, +we present a novel concept, termed positive and negative excitation, which +enables the direct extraction of positive and negative excitation for each +layer, thus enabling complete layer-by-layer information utilization sans +gradients. To organize these excitations into final saliency maps, we introduce +a double-chain backpropagation procedure. A comprehensive experimental +evaluation, encompassing both binary classification and multi-classification +tasks, was conducted to gauge the effectiveness of our proposed method. +Encouragingly, the results evince that our approach offers a significant +improvement over the state-of-the-art methods in terms of salient pixel +removal, minor pixel removal, and inconspicuous adversarial perturbation +generation guidance. Additionally, we verify the correlation between positive +and negative excitations. + +
+
+
+
+
+ + ♻ ☆ A Revisit of the Normalized Eight-Point Algorithm and A Self-Supervised + Deep Solution + + +
+ The normalized eight-point algorithm has been widely viewed as the +cornerstone in two-view geometry computation, where the seminal Hartley's +normalization has greatly improved the performance of the direct linear +transformation algorithm. A natural question is, whether there exists and how +to find other normalization methods that may further improve the performance as +per each input sample. In this paper, we provide a novel perspective and +propose two contributions to this fundamental problem: 1) we revisit the +normalized eight-point algorithm and make a theoretical contribution by +presenting the existence of different and better normalization algorithms; 2) +we introduce a deep convolutional neural network with a self-supervised +learning strategy for normalization. Given eight pairs of correspondences, our +network directly predicts the normalization matrices, thus learning to +normalize each input sample. Our learning-based normalization module can be +integrated with both traditional (e.g., RANSAC) and deep learning frameworks +(affording good interpretability) with minimal effort. Extensive experiments on +both synthetic and real images demonstrate the effectiveness of our proposed +approach. + +
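+ For context, a minimal sketch of the classic Hartley normalization that the paper revisits: translate correspondences to their centroid and rescale so the mean distance from the origin is sqrt(2). The learned, per-sample normalization network proposed in the paper is not shown.

```python
# Hartley normalization of 2D correspondences for the eight-point algorithm.
import numpy as np

def hartley_normalize(pts: np.ndarray):
    """pts: (N, 2) pixel coordinates. Returns normalized points and the 3x3 transform T."""
    centroid = pts.mean(axis=0)
    centered = pts - centroid
    mean_dist = np.mean(np.linalg.norm(centered, axis=1))
    scale = np.sqrt(2) / max(mean_dist, 1e-12)
    T = np.array([[scale, 0, -scale * centroid[0]],
                  [0, scale, -scale * centroid[1]],
                  [0, 0, 1.0]])
    homog = np.hstack([pts, np.ones((pts.shape[0], 1))])
    return (T @ homog.T).T[:, :2], T

pts = np.random.rand(8, 2) * np.array([640, 480])
normed, T = hartley_normalize(pts)
print(np.allclose(normed.mean(axis=0), 0, atol=1e-9),
      np.isclose(np.linalg.norm(normed, axis=1).mean(), np.sqrt(2)))
```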
+
+ comment: Accepted by Visual Intelligence +
+
+
+
+
+ + ♻ ☆ Image Super-resolution Reconstruction Network based on Enhanced Swin + Transformer via Alternating Aggregation of Local-Global Features + + +
+ The Swin Transformer image super-resolution reconstruction network only +relies on the long-range relationship of window attention and shifted window +attention to explore features. This mechanism has two limitations. On the one +hand, it only focuses on global features while ignoring local features. On the +other hand, it is only concerned with spatial feature interactions while +ignoring channel features and channel interactions, thus limiting its +non-linear mapping ability. To address the above limitations, this paper +proposes enhanced Swin Transformer modules via alternating aggregation of +local-global features. In the local feature aggregation stage, we introduce a +shift convolution to realize the interaction between local spatial information +and channel information. Then, a block sparse global perception module is +introduced in the global feature aggregation stage. In this module, we +reorganize the spatial information first, then send the recombination +information into a multi-layer perceptron unit to implement the global +perception. After that, a multi-scale self-attention module and a low-parameter +residual channel attention module are introduced to realize information +aggregation at different scales. Finally, the proposed network is validated on +five publicly available datasets. The experimental results show that the +proposed network outperforms the other state-of-the-art super-resolution +networks. + +
+
+
+
+
+ + ♻ ☆ Training Transitive and Commutative Multimodal Transformers with LoReTTa NeurIPS 2023 + + +
+ Training multimodal foundation models is challenging due to the limited +availability of multimodal datasets. While many public datasets pair images +with text, few combine images with audio or text with audio. Even rarer are +datasets that align all three modalities at once. Critical domains such as +healthcare, infrastructure, or transportation are particularly affected by +missing modalities. This makes it difficult to integrate all modalities into a +large pre-trained neural network that can be used out-of-the-box or fine-tuned +for different downstream tasks. We introduce LoReTTa (Linking mOdalities with a +tRansitive and commutativE pre-Training sTrAtegy) to address this understudied +problem. Our self-supervised framework unifies causal modeling and masked +modeling with the rules of commutativity and transitivity. This allows us to +transition within and between modalities. As a result, our pre-trained models +are better at exploring the true underlying joint probability distribution. +Given a dataset containing only the disjoint combinations (A, B) and (B, C), +LoReTTa can model the relation A <-> C with A <-> B <-> C. In particular, we +show that a transformer pre-trained with LoReTTa can handle any mixture of +modalities at inference time, including the never-seen pair (A, C) and the +triplet (A, B, C). We extensively evaluate our approach on a synthetic, +medical, and reinforcement learning dataset. Across different domains, our +universal multimodal transformer consistently outperforms strong baselines such +as GPT, BERT, and CLIP on tasks involving the missing modality tuple. + +
+
+ comment: Accepted at NeurIPS 2023 (poster). Camera-ready version +
+
+
+
+
+ + ♻ ☆ Test-Time Domain Adaptation by Learning Domain-Aware Batch Normalization AAAI2024 + + +
+ Test-time domain adaptation aims to adapt the model trained on source domains +to unseen target domains using a few unlabeled images. Emerging research has +shown that the label and domain information is separately embedded in the +weight matrix and batch normalization (BN) layer. Previous works normally +update the whole network naively without explicitly decoupling the knowledge +between label and domain. As a result, it leads to knowledge interference and +defective distribution adaptation. In this work, we propose to reduce such +learning interference and elevate the domain knowledge learning by only +manipulating the BN layer. However, the normalization step in BN is +intrinsically unstable when the statistics are re-estimated from a few samples. +We find that ambiguities can be greatly reduced when only updating the two +affine parameters in BN while keeping the source domain statistics. To further +enhance the domain knowledge extraction from unlabeled data, we construct an +auxiliary branch with label-independent self-supervised learning (SSL) to +provide supervision. Moreover, we propose a bi-level optimization based on +meta-learning to enforce the alignment of two learning objectives of auxiliary +and main branches. The goal is to use the auxiliary branch to adapt the domain +and benefit main task for subsequent inference. Our method keeps the same +computational cost at inference as the auxiliary branch can be thoroughly +discarded after adaptation. Extensive experiments show that our method +outperforms the prior works on five WILDS real-world domain shift datasets. Our +method can also be integrated with methods with label-dependent optimization to +further push the performance boundary. Our code is available at +https://github.com/ynanwu/MABN. + +
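A minimal sketch of the core mechanism described above: freeze the network, keep the source-domain BatchNorm statistics, and adapt only the BN affine parameters at test time. The paper's auxiliary self-supervised branch and meta-learning objective are omitted; the entropy-minimization loss below is a common stand-in objective, not necessarily the authors' choice.

```python
import torch
import torch.nn as nn

def configure_bn_affine_adaptation(model: nn.Module):
    """Freeze all parameters except BatchNorm weight/bias; keep running stats."""
    model.eval()  # eval mode: BN keeps using the stored source-domain statistics
    for p in model.parameters():
        p.requires_grad_(False)
    params = []
    for m in model.modules():
        if isinstance(m, (nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d)) and m.affine:
            m.weight.requires_grad_(True)
            m.bias.requires_grad_(True)
            params += [m.weight, m.bias]
    return params

def adapt_on_batch(model, x, optimizer):
    """One unsupervised adaptation step on a batch of unlabeled target images."""
    logits = model(x)
    probs = logits.softmax(dim=1)
    entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1).mean()
    optimizer.zero_grad()
    entropy.backward()
    optimizer.step()
    return logits

# Usage sketch:
# params = configure_bn_affine_adaptation(model)
# optimizer = torch.optim.SGD(params, lr=1e-3)
# for x in unlabeled_target_loader:
#     adapt_on_batch(model, x, optimizer)
```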
+
+ comment: AAAI2024(Oral), see this https URL: https://github.com/ynanwu/MABN +
+
+
+
+
+ + ♻ ☆ BiomedCLIP: a multimodal biomedical foundation model pretrained from + fifteen million scientific image-text pairs + + +
+ Biomedical data is inherently multimodal, comprising physical measurements +and natural language narratives. A generalist biomedical AI model needs to +simultaneously process different modalities of data, including text and images. +Therefore, training an effective generalist biomedical model requires +high-quality multimodal data, such as parallel image-text pairs. Here, we +present PMC-15M, a novel dataset that is two orders of magnitude larger than +existing biomedical multimodal datasets such as MIMIC-CXR, and spans a diverse +range of biomedical image types. PMC-15M contains 15 million biomedical +image-text pairs collected from 4.4 million scientific articles. Based on +PMC-15M, we have pretrained BiomedCLIP, a multimodal foundation model, with +domain-specific adaptations tailored to biomedical vision-language processing. +We conducted extensive experiments and ablation studies on standard biomedical +imaging tasks from retrieval to classification to visual question-answering +(VQA). BiomedCLIP achieved new state-of-the-art results in a wide range of +standard datasets, substantially outperforming prior approaches. Intriguingly, +by large-scale pretraining on diverse biomedical image types, BiomedCLIP even +outperforms state-of-the-art radiology-specific models such as BioViL in +radiology-specific tasks such as RSNA pneumonia detection. In summary, +BiomedCLIP is a fully open-access foundation model that achieves +state-of-the-art performance on various biomedical tasks, paving the way for +transformative multimodal biomedical discovery and applications. We release our +models at https://aka.ms/biomedclip to facilitate future research in multimodal +biomedical AI. + +
+
+ comment: The models are released at https://aka.ms/biomedclip +
+
+
+
+
+ + ♻ ☆ Stable Diffusion for Data Augmentation in COCO and Weed Datasets + + +
+ Generative models have increasingly impacted related tasks, from computer
+vision to interior design and other fields. Stable Diffusion is an outstanding
+diffusion model that paves the way for producing high-resolution images with
+thorough detail from text prompts or reference images. Whether it can yield
+improvements for small datasets with image-sparse categories is therefore an
+interesting question. This study utilized seven common categories and three
+widespread weed species to evaluate the efficiency of a Stable Diffusion model.
+In detail, Stable Diffusion was used to generate synthetic images belonging to
+these classes; three techniques based on Stable Diffusion (i.e., image-to-image
+translation, Dreambooth, and ControlNet) were leveraged for image generation
+with different focuses. Then, classification and detection tasks were conducted
+on these synthetic images, and their performance was compared to that of models
+trained on the original images. Promising results have been achieved in some
+classes. This seminal study may expedite the adoption of Stable Diffusion
+models in different fields.
+
+
+
+
+
+ + ♻ ☆ Continual learning under domain transfer with sparse synaptic bursting + + +
+ Existing machines are functionally specific tools that were made for easy +prediction and control. Tomorrow's machines may be closer to biological systems +in their mutability, resilience, and autonomy. But first they must be capable +of learning and retaining new information without being exposed to it +arbitrarily often. Past efforts to engineer such systems have sought to build +or regulate artificial neural networks using disjoint sets of weights that are +uniquely sensitive to specific tasks or inputs. This has not yet enabled +continual learning over long sequences of previously unseen data without +corrupting existing knowledge: a problem known as catastrophic forgetting. In +this paper, we introduce a system that can learn sequentially over previously +unseen datasets (ImageNet, CIFAR-100) with little forgetting over time. This is +done by controlling the activity of weights in a convolutional neural network +on the basis of inputs using top-down regulation generated by a second +feed-forward neural network. We find that our method learns continually under +domain transfer with sparse bursts of activity in weights that are recycled +across tasks, rather than by maintaining task-specific modules. Sparse synaptic +bursting is found to balance activity and suppression such that new functions +can be learned without corrupting extant knowledge, thus mirroring the balance +of order and disorder in systems at the edge of chaos. This behavior emerges +during a prior pre-training (or 'meta-learning') phase in which regulated +synapses are selectively disinhibited, or grown, from an initial state of +uniform suppression through prediction error minimization. + +
+
+
+
+
+ + ♻ ☆ TerrainMesh: Metric-Semantic Terrain Reconstruction from Aerial Images + Using Joint 2D-3D Learning + + +
+ This paper considers outdoor terrain mapping using RGB images obtained from +an aerial vehicle. While feature-based localization and mapping techniques +deliver real-time vehicle odometry and sparse keypoint depth reconstruction, a +dense model of the environment geometry and semantics (vegetation, buildings, +etc.) is usually recovered offline with significant computation and storage. +This paper develops a joint 2D-3D learning approach to reconstruct a local +metric-semantic mesh at each camera keyframe maintained by a visual odometry +algorithm. Given the estimated camera trajectory, the local meshes can be +assembled into a global environment model to capture the terrain topology and +semantics during online operation. A local mesh is reconstructed using an +initialization and refinement stage. In the initialization stage, we estimate +the mesh vertex elevation by solving a least squares problem relating the +vertex barycentric coordinates to the sparse keypoint depth measurements. In +the refinement stage, we associate 2D image and semantic features with the 3D +mesh vertices using camera projection and apply graph convolution to refine the +mesh vertex spatial coordinates and semantic features based on joint 2D and 3D +supervision. Quantitative and qualitative evaluation using real aerial images +show the potential of our method to support environmental monitoring and +surveillance applications. + +
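A minimal sketch of the initialization stage described above: each sparse keypoint depth is a barycentric combination of the elevations of the three vertices of the mesh face it falls in, so the vertex elevations can be recovered by linear least squares. The optional Laplacian smoothness term is an illustrative assumption, not necessarily the paper's exact regularizer.

```python
import numpy as np

def init_vertex_elevations(bary, face_vertices, depths, n_vertices, lam=1e-2, L=None):
    """bary: (M, 3) barycentric coords of each keypoint within its containing face;
    face_vertices: (M, 3) vertex indices of that face; depths: (M,) measurements;
    L: optional (k, n_vertices) smoothness operator (e.g., a mesh Laplacian)."""
    A = np.zeros((len(depths), n_vertices))
    b = np.asarray(depths, dtype=float)
    for row, (w, idx) in enumerate(zip(bary, face_vertices)):
        A[row, idx] = w                      # depth ~= sum_k w_k * z[idx_k]
    if L is not None:                        # optional smoothness regularization
        A = np.vstack([A, np.sqrt(lam) * L])
        b = np.concatenate([b, np.zeros(L.shape[0])])
    z, *_ = np.linalg.lstsq(A, b, rcond=None)
    return z                                 # per-vertex elevation estimates
```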
+
+ comment: 19 pages, 17 figures. arXiv admin note: text overlap with + arXiv:2101.01844 +
+
+
+
+
+ + ♻ ☆ CHAMMI: A benchmark for channel-adaptive models in microscopy imaging NeurIPS + + +
+ Most neural networks assume that input images have a fixed number of channels
+(three for RGB images). However, there are many settings where the number of
+channels may vary, such as microscopy images where the number of channels
+changes depending on instruments and experimental goals. Yet, there has not
+been a systematic attempt to create and evaluate neural networks that are
+invariant to the number and type of channels. As a result, trained models
+remain specific to individual studies and are hardly reusable for other
+microscopy settings. In this paper, we present a benchmark for investigating
+channel-adaptive models in microscopy imaging, which consists of 1) a dataset
+of varied-channel single-cell images, and 2) a biologically relevant evaluation
+framework. In addition, we adapted several existing techniques to create
+channel-adaptive models and compared their performance on this benchmark to
+fixed-channel baseline models. We find that channel-adaptive models can
+generalize better to out-of-domain tasks and can be computationally efficient.
+We contribute a curated dataset (https://doi.org/10.5281/zenodo.7988357) and an
+evaluation API (https://github.com/broadinstitute/MorphEm.git) to facilitate
+objective comparisons in future research and applications.
+
+
+ comment: Accepted at NeurIPS Track on Datasets and Benchmarks, 2023 +
+
+
+
+
+ + ♻ ☆ Charting New Territories: Exploring the Geographic and Geospatial + Capabilities of Multimodal LLMs + + +
+ Multimodal large language models (MLLMs) have shown remarkable capabilities +across a broad range of tasks but their knowledge and abilities in the +geographic and geospatial domains are yet to be explored, despite potential +wide-ranging benefits to navigation, environmental research, urban development, +and disaster response. We conduct a series of experiments exploring various +vision capabilities of MLLMs within these domains, particularly focusing on the +frontier model GPT-4V, and benchmark its performance against open-source +counterparts. Our methodology involves challenging these models with a +small-scale geographic benchmark consisting of a suite of visual tasks, testing +their abilities across a spectrum of complexity. The analysis uncovers not only +where such models excel, including instances where they outperform humans, but +also where they falter, providing a balanced view of their capabilities in the +geographic domain. To enable the comparison and evaluation of future models, +our benchmark will be publicly released. + +
+
+ comment: V3: Fixed typo in Fig.1; V2: Minor formatting changes and added + missing subfigure captions +
+
+
+
+
+ + ♻ ☆ Learning Unsupervised World Models for Autonomous Driving via Discrete + Diffusion ICLR 2024 + + +
+ Learning world models can teach an agent how the world works in an +unsupervised manner. Even though it can be viewed as a special case of sequence +modeling, progress for scaling world models on robotic applications such as +autonomous driving has been somewhat less rapid than scaling language models +with Generative Pre-trained Transformers (GPT). We identify two reasons as +major bottlenecks: dealing with complex and unstructured observation space, and +having a scalable generative model. Consequently, we propose a novel world +modeling approach that first tokenizes sensor observations with VQVAE, then +predicts the future via discrete diffusion. To efficiently decode and denoise +tokens in parallel, we recast Masked Generative Image Transformer into the +discrete diffusion framework with a few simple changes, resulting in notable +improvement. When applied to learning world models on point cloud observations, +our model reduces prior SOTA Chamfer distance by more than 65% for 1s +prediction, and more than 50% for 3s prediction, across NuScenes, KITTI +Odometry, and Argoverse2 datasets. Our results demonstrate that discrete +diffusion on tokenized agent experience can unlock the power of GPT-like +unsupervised learning for robotic agents. + +
+
+ comment: Published as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ DeepFDR: A Deep Learning-based False Discovery Rate Control Method for + Neuroimaging Data + + +
+ Voxel-based multiple testing is widely used in neuroimaging data analysis. +Traditional false discovery rate (FDR) control methods often ignore the spatial +dependence among the voxel-based tests and thus suffer from substantial loss of +testing power. While recent spatial FDR control methods have emerged, their +validity and optimality remain questionable when handling the complex spatial +dependencies of the brain. Concurrently, deep learning methods have +revolutionized image segmentation, a task closely related to voxel-based +multiple testing. In this paper, we propose DeepFDR, a novel spatial FDR +control method that leverages unsupervised deep learning-based image +segmentation to address the voxel-based multiple testing problem. Numerical +studies, including comprehensive simulations and Alzheimer's disease FDG-PET +image analysis, demonstrate DeepFDR's superiority over existing methods. +DeepFDR not only excels in FDR control and effectively diminishes the false +nondiscovery rate, but also boasts exceptional computational efficiency highly +suited for tackling large-scale neuroimaging data. + +
+
+
+
+
+ + ♻ ☆ 3DMIT: 3D Multi-modal Instruction Tuning for Scene Understanding + + +
+ The remarkable potential of multi-modal large language models (MLLMs) in +comprehending both vision and language information has been widely +acknowledged. However, the scarcity of 3D scenes-language pairs in comparison +to their 2D counterparts, coupled with the inadequacy of existing approaches in +understanding of 3D scenes by LLMs, poses a significant challenge. In response, +we collect and construct an extensive dataset comprising 75K +instruction-response pairs tailored for 3D scenes. This dataset addresses tasks +related to 3D VQA, 3D grounding, and 3D conversation. To further enhance the +integration of 3D spatial information into LLMs, we introduce a novel and +efficient prompt tuning paradigm, 3DMIT. This paradigm eliminates the alignment +stage between 3D scenes and language and extends the instruction prompt with +the 3D modality information including the entire scene and segmented objects. +We evaluate the effectiveness of our method across diverse tasks in the 3D +scene domain and find that our approach serves as a strategic means to enrich +LLMs' comprehension of the 3D world. Our code is available at +https://github.com/staymylove/3DMIT. + +
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Enhancing High-Resolution 3D Generation through Pixel-wise Gradient + Clipping + + +
+ High-resolution 3D object generation remains a challenging task primarily due +to the limited availability of comprehensive annotated training data. Recent +advancements have aimed to overcome this constraint by harnessing image +generative models, pretrained on extensive curated web datasets, using +knowledge transfer techniques like Score Distillation Sampling (SDS). +Efficiently addressing the requirements of high-resolution rendering often +necessitates the adoption of latent representation-based models, such as the +Latent Diffusion Model (LDM). In this framework, a significant challenge +arises: To compute gradients for individual image pixels, it is necessary to +backpropagate gradients from the designated latent space through the frozen +components of the image model, such as the VAE encoder used within LDM. +However, this gradient propagation pathway has never been optimized, remaining +uncontrolled during training. We find that the unregulated gradients adversely +affect the 3D model's capacity in acquiring texture-related information from +the image generative model, leading to poor quality appearance synthesis. To +address this overarching challenge, we propose an innovative operation termed +Pixel-wise Gradient Clipping (PGC) designed for seamless integration into +existing 3D generative models, thereby enhancing their synthesis quality. +Specifically, we control the magnitude of stochastic gradients by clipping the +pixel-wise gradients efficiently, while preserving crucial texture-related +gradient directions. Despite this simplicity and minimal extra cost, extensive +experiments demonstrate the efficacy of our PGC in enhancing the performance of +existing 3D generative models for high-resolution object rendering. + +
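A minimal sketch of the pixel-wise gradient clipping operation named above: during backpropagation, each pixel's gradient vector is rescaled so its norm does not exceed a threshold, which preserves the gradient direction while bounding its magnitude. Where exactly this is inserted in the SDS/LDM backward pass, and how the threshold is chosen, follows the paper and is not reproduced here.

```python
import torch

class PixelWiseGradClip(torch.autograd.Function):
    """Identity in the forward pass; clips per-pixel gradient norms in backward."""

    @staticmethod
    def forward(ctx, image, max_norm):
        ctx.max_norm = max_norm
        return image.view_as(image)

    @staticmethod
    def backward(ctx, grad_out):
        # grad_out: (B, C, H, W). Clip the gradient norm over the channel dim.
        norm = grad_out.norm(dim=1, keepdim=True).clamp_min(1e-12)
        scale = (ctx.max_norm / norm).clamp(max=1.0)
        return grad_out * scale, None

def pgc(image, max_norm=0.1):
    return PixelWiseGradClip.apply(image, max_norm)

# Usage sketch: wrap the rendered image before it enters the frozen image model,
# e.g. rendered = pgc(renderer(params)); loss(encoder(rendered)).backward()
```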
+
+ comment: Technical report. Project page https://fudan-zvg.github.io/PGC-3D +
+
+
+
+
+ + ♻ ☆ WoodScape Motion Segmentation for Autonomous Driving -- CVPR 2023 OmniCV + Workshop Challenge CVPR 2023 + + +
+ Motion segmentation is a complex yet indispensable task in autonomous
+driving. The challenges introduced by the ego-motion of the cameras, radial
+distortion in fisheye lenses, and the need for temporal consistency make the
+task more complicated, rendering traditional and standard Convolutional Neural
+Network (CNN) approaches less effective. The consequent laborious data
+labeling, representation of diverse and uncommon scenarios, and extensive data
+capture requirements underscore the imperative of synthetic data for improving
+machine learning model performance. To this end, we employ the PD-WoodScape
+synthetic dataset developed by Parallel Domain, alongside the WoodScape fisheye
+dataset. Thus, we present the WoodScape fisheye motion segmentation challenge
+for autonomous driving, held as part of the CVPR 2023 Workshop on
+Omnidirectional Computer Vision (OmniCV). As one of the first competitions
+focused on fisheye motion segmentation, we aim to explore and evaluate the
+potential and impact of utilizing synthetic data in this domain. In this paper,
+we provide a detailed analysis of the competition, which attracted the
+participation of 112 global teams and a total of 234 submissions. This study
+delineates the complexities inherent in the task of motion segmentation,
+emphasizes the significance of fisheye datasets, articulates the necessity for
+synthetic datasets and the resultant domain gap they engender, and outlines the
+foundational blueprint for devising successful solutions. Subsequently, we
+delve into the details of the baseline experiments and winning methods,
+evaluating their qualitative and quantitative results and providing useful
+insights.
+
+
+ comment: CVPR 2023 OmniCV Workshop Challenge +
+
+
+
+
+
+
+
+ + Information Retrieval 19 + +
+
+
+ + ☆ Siamese Content-based Search Engine for a More Transparent Skin and + Breast Cancer Diagnosis through Histological Imaging + + +
+ Computer-Aided Diagnosis (CAD) has advanced digital pathology with Deep
+Learning (DL)-based tools to assist pathologists in decision-making.
+Content-Based Histopathological Image Retrieval (CBHIR) is a novel tool that
+seeks highly correlated patches in terms of similarity in histopathological
+features. In this work, we propose two CBHIR approaches on breast
+(Breast-twins) and skin cancer (Skin-twins) data sets for robust and accurate
+patch-level retrieval, integrating a custom-built Siamese network as a feature
+extractor. The proposed Siamese network is able to generalize to unseen images
+by focusing on the similar histopathological features of the input pairs. The
+proposed CBHIR approaches are evaluated on the Breast (public) and Skin
+(private) data sets with top-K accuracy. Finding the optimal value of K is
+challenging; moreover, as K increases, the dissimilarity between the query and
+the returned images grows, which might mislead pathologists. To the best of the
+authors' knowledge, this paper is the first to tackle this issue on
+histopathological images by evaluating the top first retrieved image. The
+Breast-twins model achieves an F1 score of 70% for the top first retrieval,
+which exceeds other state-of-the-art methods even at higher values of K such as
+5 and 400. Skin-twins surpasses the recently proposed Convolutional Auto
+Encoder (CAE) by 67%, increasing precision. Besides, the Skin-twins model
+tackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential
+(STUMP) to assist pathologists by retrieving the top-K images and their
+corresponding labels. As such, this approach can offer pathologists a more
+explainable CAD tool in terms of transparency, trustworthiness, and
+reliability, among other characteristics.
+
+
+
+
+
+ + ☆ Ranking Heterogeneous Search Result Pages using the Interactive + Probability Ranking Principle ECIR 2024 + + +
+ The Probability Ranking Principle (PRP) ranks search results based on their
+expected utility derived solely from document contents, often overlooking the
+nuances of presentation and user interaction. However, with the evolution of
+Search Engine Result Pages (SERPs), now comprising a variety of result cards,
+the manner in which these results are presented is pivotal in influencing user
+engagement and satisfaction. This shift prompts the question: how do the PRP
+and its user-centric counterpart, the Interactive Probability Ranking Principle
+(iPRP), compare in the context of these heterogeneous SERPs? Our study draws a
+comparison between the PRP and the iPRP, revealing significant differences in
+their output. The iPRP, accounting for item-specific costs and interaction
+probabilities to determine the "Expected Perceived Utility" (EPU), yields
+different result orderings compared to the PRP. We evaluate the effect of the
+EPU on the ordering of results by observing changes in the ranking within a
+heterogeneous SERP compared to the traditional "ten blue links". We find that
+changing the presentation affects the ranking of items according to the iPRP
+by up to 48% (with respect to DCG, TBG and RBO) in ad-hoc search tasks on the
+TREC WaPo Collection. This work suggests that the iPRP should be employed when
+ranking heterogeneous SERPs to provide a user-centric ranking that adapts the
+ordering based on the presentation and user engagement.
+
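A minimal sketch contrasting a PRP-style ranking with an EPU-style ranking of heterogeneous result cards. The per-card interaction probabilities, costs, and the simple linear EPU formula below are illustrative assumptions; the iPRP's actual expected-perceived-utility definition is richer than this.

```python
from dataclasses import dataclass

@dataclass
class Result:
    doc_id: str
    relevance: float      # estimated probability of relevance
    card_type: str        # e.g. "text", "entity", "image" result card

# Assumed presentation-dependent interaction probabilities and examination costs.
P_INTERACT = {"text": 0.60, "entity": 0.75, "image": 0.85}
COST = {"text": 1.0, "entity": 1.4, "image": 2.0}

def rank_prp(results):
    # PRP: order by expected utility from content relevance alone.
    return sorted(results, key=lambda r: r.relevance, reverse=True)

def rank_epu(results, cost_weight=0.05):
    # EPU-style: weight relevance by how likely the card is to be interacted
    # with, and penalize its examination cost.
    def epu(r):
        return P_INTERACT[r.card_type] * r.relevance - cost_weight * COST[r.card_type]
    return sorted(results, key=epu, reverse=True)

if __name__ == "__main__":
    serp = [Result("d1", 0.80, "text"),
            Result("d2", 0.72, "image"),
            Result("d3", 0.75, "entity")]
    print([r.doc_id for r in rank_prp(serp)])   # relevance-only order
    print([r.doc_id for r in rank_epu(serp)])   # order shifts with presentation
```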
+
+ comment: To be presented as a full paper at ECIR 2024 in Glasgow, UK +
+
+
+
+
+ + ☆ MCRPL: A Pretrain, Prompt & Fine-tune Paradigm for Non-overlapping + Many-to-one Cross-domain Recommendation + + +
+ Cross-domain Recommendation (CR) aims to improve recommendations in a sparse
+target domain by leveraging information from other, richer domains. Existing
+cross-domain recommendation methods mainly focus on overlapping scenarios,
+assuming users are totally or partially overlapped and can be taken as bridges
+to connect different domains. However, this assumption does not always hold,
+since it is illegal to leak users' identity information to other domains.
+Conducting Non-overlapping MCR (NMCR) is challenging because 1) the absence of
+overlapping information prevents us from directly aligning different domains,
+and this situation may get worse in the MCR scenario; and 2) the distribution
+discrepancy between source and target domains makes it difficult to learn
+common information across domains. To overcome the above challenges, we focus
+on NMCR and devise MCRPL as our solution. To address Challenge 1, we first
+learn shared domain-agnostic and domain-dependent prompts, and pre-train them
+in the pre-training stage. To address Challenge 2, we further update the
+domain-dependent prompts, with the other parameters kept fixed, to transfer
+domain knowledge to the target domain. We conduct experiments on five
+real-world domains, and the results show the advantage of our MCRPL method over
+several recent SOTA baselines.
+
+
+
+
+
+ + ☆ LLM-Guided Multi-View Hypergraph Learning for Human-Centric Explainable + Recommendation + + +
+ As personalized recommendation systems become vital in the age of information +overload, traditional methods relying solely on historical user interactions +often fail to fully capture the multifaceted nature of human interests. To +enable more human-centric modeling of user preferences, this work proposes a +novel explainable recommendation framework, i.e., LLMHG, synergizing the +reasoning capabilities of large language models (LLMs) and the structural +advantages of hypergraph neural networks. By effectively profiling and +interpreting the nuances of individual user interests, our framework pioneers +enhancements to recommendation systems with increased explainability. We +validate that explicitly accounting for the intricacies of human preferences +allows our human-centric and explainable LLMHG approach to consistently +outperform conventional models across diverse real-world datasets. The proposed +plug-and-play enhancement framework delivers immediate gains in recommendation +performance while offering a pathway to apply advanced LLMs for better +capturing the complexity of human interests across machine learning +applications. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Generative Multi-Modal Knowledge Retrieval with Large Language Models AAAI 2024 + + +
+ Knowledge retrieval with multi-modal queries plays a crucial role in +supporting knowledge-intensive multi-modal applications. However, existing +methods face challenges in terms of their effectiveness and training +efficiency, especially when it comes to training and integrating multiple +retrievers to handle multi-modal queries. In this paper, we propose an +innovative end-to-end generative framework for multi-modal knowledge retrieval. +Our framework takes advantage of the fact that large language models (LLMs) can +effectively serve as virtual knowledge bases, even when trained with limited +data. We retrieve knowledge via a two-step process: 1) generating knowledge +clues related to the queries, and 2) obtaining the relevant document by +searching databases using the knowledge clue. In particular, we first introduce +an object-aware prefix-tuning technique to guide multi-grained visual learning. +Then, we align multi-grained visual features into the textual feature space of +the LLM, employing the LLM to capture cross-modal interactions. Subsequently, +we construct instruction data with a unified format for model training. +Finally, we propose the knowledge-guided generation strategy to impose prior +constraints in the decoding steps, thereby promoting the generation of +distinctive knowledge clues. Through experiments conducted on three benchmarks, +we demonstrate significant improvements ranging from 3.0% to 14.6% across all +evaluation metrics when compared to strong baselines. + +
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ☆ A Reproducibility Study of Goldilocks: Just-Right Tuning of BERT for TAR ECIR 2024 + + +
+ Screening documents is a tedious and time-consuming aspect of high-recall +retrieval tasks, such as compiling a systematic literature review, where the +goal is to identify all relevant documents for a topic. To help streamline this +process, many Technology-Assisted Review (TAR) methods leverage active learning +techniques to reduce the number of documents requiring review. BERT-based +models have shown high effectiveness in text classification, leading to +interest in their potential use in TAR workflows. In this paper, we investigate +recent work that examined the impact of further pre-training epochs on the +effectiveness and efficiency of a BERT-based active learning pipeline. We first +report that we could replicate the original experiments on two specific TAR +datasets, confirming some of the findings: importantly, that further +pre-training is critical to high effectiveness, but requires attention in terms +of selecting the correct training epoch. We then investigate the +generalisability of the pipeline on a different TAR task, that of medical +systematic reviews. In this context, we show that there is no need for further +pre-training if a domain-specific BERT backbone is used within the active +learning pipeline. This finding provides practical implications for using the +studied active learning pipeline within domain-specific TAR tasks. + +
+
+ comment: Accepted at ECIR 2024 (reproducibility) +
+
+
+
+
+ + ☆ Exploring Content-Based and Meta-Data Analysis for Detecting Fake News + Infodemic: A case study on COVID-19 ICPR + + +
+ The coronavirus pandemic (COVID-19) is probably the most disruptive global
+health disaster in recent history. It negatively impacted the whole world and
+virtually brought the global economy to a standstill. However, as the virus
+spread, infecting people and claiming thousands of lives, so did fake news,
+misinformation and disinformation about the event. These included unconfirmed
+health advice and remedies circulating on social media. In this paper, false
+information about the pandemic is identified using a content-based approach and
+metadata curated from messages posted to online social networks. A
+content-based approach combined with metadata, together with an initial feature
+analysis, is used, and several supervised learning models are then tested for
+identifying and predicting misleading posts. Our approach shows up to 93%
+accuracy in the detection of fake-news-related posts about the COVID-19
+pandemic.
+
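A minimal sketch of a content-plus-metadata classifier of the kind evaluated above: TF-IDF features from the post text are concatenated with simple numeric metadata and fed to a supervised learner. The column names ("text", "n_followers", "n_shares", "label") and the TF-IDF + random forest choice are illustrative assumptions, not the authors' exact pipeline.

```python
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

def build_pipeline():
    features = ColumnTransformer([
        # Content-based features from the post text.
        ("content", TfidfVectorizer(max_features=20000, ngram_range=(1, 2)), "text"),
        # Metadata features passed through unchanged.
        ("meta", "passthrough", ["n_followers", "n_shares"]),
    ])
    return Pipeline([
        ("features", features),
        ("clf", RandomForestClassifier(n_estimators=300, random_state=0)),
    ])

# Usage sketch (hypothetical file and column names):
# df = pd.read_csv("covid_posts.csv")
# X_tr, X_te, y_tr, y_te = train_test_split(df, df["label"], test_size=0.2, random_state=0)
# model = build_pipeline().fit(X_tr, y_tr)
# print("accuracy:", model.score(X_te, y_te))
```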
+
+ comment: 8 pages, 5 figures, 3 tables, International Conference for Pattern + Recognition Systems (ICPRS 2022) +
+
+
+
+
+ + ☆ Link Me Baby One More Time: Social Music Discovery on Spotify + + +
+ We explore the social and contextual factors that influence the outcome of +person-to-person music recommendations and discovery. Specifically, we use data +from Spotify to investigate how a link sent from one user to another results in +the receiver engaging with the music of the shared artist. We consider several +factors that may influence this process, such as the strength of the +sender-receiver relationship, the user's role in the Spotify social network, +their music social cohesion, and how similar the new artist is to the +receiver's taste. We find that the receiver of a link is more likely to engage +with a new artist when (1) they have similar music taste to the sender and the +shared track is a good fit for their taste, (2) they have a stronger and more +intimate tie with the sender, and (3) the shared artist is popular with the +receiver's connections. Finally, we use these findings to build a Random Forest +classifier to predict whether a shared music track will result in the +receiver's engagement with the shared artist. This model elucidates which type +of social and contextual features are most predictive, although peak +performance is achieved when a diverse set of features are included. These +findings provide new insights into the multifaceted mechanisms underpinning the +interplay between music discovery and social processes. + +
+
+
+
+
+ + ☆ From Graphs to Hypergraphs: Hypergraph Projection and its Remediation ICLR 2024 + + +
+ We study the implications of the modeling choice to use a graph, instead of a +hypergraph, to represent real-world interconnected systems whose constituent +relationships are of higher order by nature. Such a modeling choice typically +involves an underlying projection process that maps the original hypergraph +onto a graph, and is common in graph-based analysis. While hypergraph +projection can potentially lead to loss of higher-order relations, there exists +very limited studies on the consequences of doing so, as well as its +remediation. This work fills this gap by doing two things: (1) we develop +analysis based on graph and set theory, showing two ubiquitous patterns of +hyperedges that are root to structural information loss in all hypergraph +projections; we also quantify the combinatorial impossibility of recovering the +lost higher-order structures if no extra help is provided; (2) we still seek to +recover the lost higher-order structures in hypergraph projection, and in light +of (1)'s findings we propose to relax the problem into a learning-based +setting. Under this setting, we develop a learning-based hypergraph +reconstruction method based on an important statistic of hyperedge +distributions that we find. Our reconstruction method is evaluated on 8 +real-world datasets under different settings, and exhibits consistently good +performance. We also demonstrate benefits of the reconstructed hypergraphs via +use cases of protein rankings and link predictions. + +
+
+ comment: Accepted at ICLR 2024 +
+
+
+
+
+ + ☆ Content-Aware Tweet Location Inference using Quadtree Spatial + Partitioning and Jaccard-Cosine Word Embedding + + +
+ Inferring locations from user texts on social media platforms is a +non-trivial and challenging problem relating to public safety. We propose a +novel non-uniform grid-based approach for location inference from Twitter +messages using Quadtree spatial partitions. The proposed algorithm uses natural +language processing (NLP) for semantic understanding and incorporates Cosine +similarity and Jaccard similarity measures for feature vector extraction and +dimensionality reduction. We chose Twitter as our experimental social media +platform due to its popularity and effectiveness for the dissemination of news +and stories about recent events happening around the world. Our approach is the +first of its kind to make location inference from tweets using Quadtree spatial +partitions and NLP, in hybrid word-vector representations. The proposed +algorithm achieved significant classification accuracy and outperformed +state-of-the-art grid-based content-only location inference methods by up to +24% in correctly predicting tweet locations within a 161km radius and by 300km +in median error distance on benchmark datasets. + +
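A minimal sketch of the two ingredients the abstract names: a quadtree that recursively splits a bounding box until each cell holds at most a fixed number of geotagged samples, and a hybrid Jaccard/cosine similarity between tweet token collections. How the paper combines the two similarity scores and maps cells to predictions is not reproduced; the simple average below is an assumption.

```python
from collections import Counter
import math

def build_quadtree(points, bbox, capacity=50, depth=0, max_depth=12):
    """points: list of (lat, lon); bbox: (min_lat, min_lon, max_lat, max_lon).
    In this simplified split, points on a shared boundary may fall in two cells."""
    if len(points) <= capacity or depth >= max_depth:
        return {"bbox": bbox, "points": points, "children": None}
    min_lat, min_lon, max_lat, max_lon = bbox
    mid_lat, mid_lon = (min_lat + max_lat) / 2.0, (min_lon + max_lon) / 2.0
    quadrants = [
        (min_lat, min_lon, mid_lat, mid_lon), (min_lat, mid_lon, mid_lat, max_lon),
        (mid_lat, min_lon, max_lat, mid_lon), (mid_lat, mid_lon, max_lat, max_lon),
    ]
    children = []
    for q in quadrants:
        inside = [p for p in points if q[0] <= p[0] <= q[2] and q[1] <= p[1] <= q[3]]
        children.append(build_quadtree(inside, q, capacity, depth + 1, max_depth))
    return {"bbox": bbox, "points": points, "children": children}

def jaccard_cosine(tokens_a, tokens_b):
    """Average of Jaccard similarity (on token sets) and cosine similarity
    (on term-frequency vectors) between two tokenized texts."""
    a, b = set(tokens_a), set(tokens_b)
    jaccard = len(a & b) / len(a | b) if (a | b) else 0.0
    ca, cb = Counter(tokens_a), Counter(tokens_b)
    dot = sum(ca[t] * cb[t] for t in ca)
    norm = math.sqrt(sum(v * v for v in ca.values())) * math.sqrt(sum(v * v for v in cb.values()))
    cosine = dot / norm if norm else 0.0
    return 0.5 * (jaccard + cosine)
```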
+
+ comment: 8 pages, 7 figures, 5 tables, International Conference on Advances in + Social Networks Analysis and Mining (ASONAM 2018) +
+
+
+
+
+ + ☆ Revealing the Hidden Impact of Top-N Metrics on Optimization in + Recommender Systems ECIR 2024 + + +
+ The hyperparameters of recommender systems for top-n predictions are
+typically optimized to enhance the predictive performance of algorithms. In
+this process, the optimization algorithm, e.g., grid search or random search,
+searches for the best hyperparameter configuration according to an
+optimization-target metric, like nDCG or Precision. In contrast, the optimized
+algorithm internally optimizes a different loss function during training, like
+squared error or cross-entropy. To tackle this discrepancy, recent work focused
+on generating loss functions better suited for recommender systems. Yet, when
+evaluating an algorithm using a top-n metric during optimization, another
+discrepancy between the optimization-target metric and the training loss has so
+far been ignored: the top-n items used to compute a top-n metric are selected
+from the recommendations of a model trained with an entirely different loss
+function. Item recommendations suitable for optimization-target metrics could
+lie outside the top-n recommended items, implicitly impacting the optimization
+performance. Therefore, we were motivated to analyze whether the top-n items
+are optimal for optimization-target top-n metrics. In pursuit of an answer, we
+exhaustively evaluate the predictive performance of 250 selection strategies
+besides selecting the top-n. We extensively evaluate each selection strategy
+over twelve implicit feedback and eight explicit feedback data sets with eleven
+recommender system algorithms. Our results show that there exist selection
+strategies other than top-n that increase predictive performance for various
+algorithms and recommendation domains. However, the performance of the top ~43%
+of selection strategies is not significantly different. We discuss the impact
+of our findings on optimization and re-ranking in recommender systems and
+feasible solutions.
+
+
+ comment: Accepted in the Full Paper Track for ECIR 2024 +
+
+
+
+
+ + ☆ Gene-associated Disease Discovery Powered by Large Language Models AAAI 2024 + + +
+ The intricate relationship between genetic variation and human diseases has +been a focal point of medical research, evidenced by the identification of risk +genes regarding specific diseases. The advent of advanced genome sequencing +techniques has significantly improved the efficiency and cost-effectiveness of +detecting these genetic markers, playing a crucial role in disease diagnosis +and forming the basis for clinical decision-making and early risk assessment. +To overcome the limitations of existing databases that record disease-gene +associations from existing literature, which often lack real-time updates, we +propose a novel framework employing Large Language Models (LLMs) for the +discovery of diseases associated with specific genes. This framework aims to +automate the labor-intensive process of sifting through medical literature for +evidence linking genetic variations to diseases, thereby enhancing the +efficiency of disease identification. Our approach involves using LLMs to +conduct literature searches, summarize relevant findings, and pinpoint diseases +related to specific genes. This paper details the development and application +of our LLM-powered framework, demonstrating its potential in streamlining the +complex process of literature retrieval and summarization to identify diseases +associated with specific genetic variations. + +
+
+ comment: This is the official paper accepted by AAAI 2024 Workshop on Large + Language Models for Biological Discoveries +
+
+
+
+
+ + ♻ ☆ Linguistic and Structural Basis of Engineering Design Knowledge + + +
+ Artefact descriptions are the primary carriers of engineering design +knowledge that is both an outcome and a driver of the design process. While an +artefact could be described in different connotations, the design process +requires a description to embody engineering design knowledge, which is +expressed in the text through intricate placement of entities and +relationships. As large-language models learn from all kinds of text merely as +a sequence of characters/tokens, these are yet to generate text that embodies +explicit engineering design facts. Existing ontological design theories are +less likely to guide the large-language models whose applications are currently +limited to ideation and learning purposes. In this article, we explicate +engineering design knowledge as knowledge graphs from a large sample of 33,881 +patent documents. We examine the constituents of these knowledge graphs to +understand the linguistic and structural basis of engineering design knowledge. +In terms of linguistic basis, we observe that entities and relationships could +be generalised to 64 and 24 linguistic syntaxes. While relationships mainly +capture attributes ('of'), structure ('in', 'with'), purpose ('to', 'for'), +hierarchy ('include'), exemplification ('such as'), and behaviour ('to', +'from'), the hierarchical relationships could specifically be identified using +75 unique syntaxes. To understand the structural basis, we draw inspiration +from various studies on biological/ecological networks and discover motifs from +patent knowledge graphs. We identify four 3-node and four 4-node patterns that +could further be converged and simplified into sequence [->...->], aggregation +[->...<-], and hierarchy [<-...->]. Expected to guide large-language model +based design tools, we propose few regulatory precepts for concretising +abstract entities and relationships within subgraphs, while explicating +hierarchical structures. + +
+
+
+
+
+ + ♻ ☆ Starling: An I/O-Efficient Disk-Resident Graph Index Framework for + High-Dimensional Vector Similarity Search on Data Segment SIGMOD 2024 + + +
+ High-dimensional vector similarity search (HVSS) is gaining prominence as a +powerful tool for various data science and AI applications. As vector data +scales up, in-memory indexes pose a significant challenge due to the +substantial increase in main memory requirements. A potential solution involves +leveraging disk-based implementation, which stores and searches vector data on +high-performance devices like NVMe SSDs. However, implementing HVSS for data +segments proves to be intricate in vector databases where a single machine +comprises multiple segments for system scalability. In this context, each +segment operates with limited memory and disk space, necessitating a delicate +balance between accuracy, efficiency, and space cost. Existing disk-based +methods fall short as they do not holistically address all these requirements +simultaneously. In this paper, we present Starling, an I/O-efficient +disk-resident graph index framework that optimizes data layout and search +strategy within the segment. It has two primary components: (1) a data layout +incorporating an in-memory navigation graph and a reordered disk-based graph +with enhanced locality, reducing the search path length and minimizing disk +bandwidth wastage; and (2) a block search strategy designed to minimize costly +disk I/O operations during vector query execution. Through extensive +experiments, we validate the effectiveness, efficiency, and scalability of +Starling. On a data segment with 2GB memory and 10GB disk capacity, Starling +can accommodate up to 33 million vectors in 128 dimensions, offering HVSS with +over 0.9 average precision and top-10 recall rate, and latency under 1 +millisecond. The results showcase Starling's superior performance, exhibiting +43.9$\times$ higher throughput with 98% lower query latency compared to +state-of-the-art methods while maintaining the same level of accuracy. + +
+
+ comment: This paper has been accepted by SIGMOD 2024 +
+
+
+
+
+ + ♻ ☆ Reproducibility Analysis and Enhancements for Multi-Aspect Dense + Retriever with Aspect Learning + + +
+ Multi-aspect dense retrieval aims to incorporate aspect information (e.g., +brand and category) into dual encoders to facilitate relevance matching. As an +early and representative multi-aspect dense retriever, MADRAL learns several +extra aspect embeddings and fuses the explicit aspects with an implicit aspect +"OTHER" for final representation. MADRAL was evaluated on proprietary data and +its code was not released, making it challenging to validate its effectiveness +on other datasets. We failed to reproduce its effectiveness on the public +MA-Amazon data, motivating us to probe the reasons and re-examine its +components. We propose several component alternatives for comparisons, +including replacing "OTHER" with "CLS" and representing aspects with the first +several content tokens. Through extensive experiments, we confirm that learning +"OTHER" from scratch in aspect fusion is harmful. In contrast, our proposed +variants can greatly enhance the retrieval performance. Our research not only +sheds light on the limitations of MADRAL but also provides valuable insights +for future studies on more powerful multi-aspect dense retrieval models. Code +will be released at: +https://github.com/sunxiaojie99/Reproducibility-for-MADRAL. + +
+
+ comment: Accepted by ECIR 2024 as a reproducibility paper
+
+
+
+
+
+ + ♻ ☆ A Multi-Granularity-Aware Aspect Learning Model for Multi-Aspect Dense + Retrieval WSDM2024 + + +
+ Dense retrieval methods have been mostly focused on unstructured text and +less attention has been drawn to structured data with various aspects, e.g., +products with aspects such as category and brand. Recent work has proposed two +approaches to incorporate the aspect information into item representations for +effective retrieval by predicting the values associated with the item aspects. +Despite their efficacy, they treat the values as isolated classes (e.g., "Smart +Homes", "Home, Garden & Tools", and "Beauty & Health") and ignore their +fine-grained semantic relation. Furthermore, they either enforce the learning +of aspects into the CLS token, which could confuse it from its designated use +for representing the entire content semantics, or learn extra aspect embeddings +only with the value prediction objective, which could be insufficient +especially when there are no annotated values for an item aspect. Aware of +these limitations, we propose a MUlti-granulaRity-aware Aspect Learning model +(MURAL) for multi-aspect dense retrieval. It leverages aspect information +across various granularities to capture both coarse and fine-grained semantic +relations between values. Moreover, MURAL incorporates separate aspect +embeddings as input to transformer encoders so that the masked language model +objective can assist implicit aspect learning even without aspect-value +annotations. Extensive experiments on two real-world datasets of products and +mini-programs show that MURAL outperforms state-of-the-art baselines +significantly. + +
+
+ comment: Accepted by WSDM2024, update +
+
+
+
+
+ + ♻ ☆ Towards More Robust and Accurate Sequential Recommendation with + Cascade-guided Adversarial Training SDM24 + + +
+ Sequential recommendation models, models that learn from chronological +user-item interactions, outperform traditional recommendation models in many +settings. Despite the success of sequential recommendation models, their +robustness has recently come into question. Two properties unique to the nature +of sequential recommendation models may impair their robustness - the cascade +effects induced during training and the model's tendency to rely too heavily on +temporal information. To address these vulnerabilities, we propose +Cascade-guided Adversarial training, a new adversarial training procedure that +is specifically designed for sequential recommendation models. Our approach +harnesses the intrinsic cascade effects present in sequential modeling to +produce strategic adversarial perturbations to item embeddings during training. +Experiments on training state-of-the-art sequential models on four public +datasets from different domains show that our training approach produces +superior model ranking accuracy and superior model robustness to real item +replacement perturbations when compared to both standard model training and +generic adversarial training. + +
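A minimal sketch of adversarial training on item embeddings for a sequential recommender: compute the gradient of the training loss with respect to the embedded sequence, add an FGSM-style perturbation, and train on the clean and perturbed losses together. The cascade-guided weighting of perturbations across time steps is the paper's contribution and is not reproduced; `item_embedding` and `forward_from_embeddings` are assumed model hooks, not a specific library API.

```python
import torch
import torch.nn.functional as F

def adversarial_step(model, item_ids, targets, optimizer, epsilon=0.05):
    """One training step combining the clean loss and an embedding-space
    adversarial loss. Assumes the model exposes `item_embedding` (an
    nn.Embedding) and `forward_from_embeddings(emb) -> logits`."""
    emb = model.item_embedding(item_ids)          # (batch, seq_len, dim)
    emb.retain_grad()
    clean_loss = F.cross_entropy(model.forward_from_embeddings(emb), targets)
    clean_loss.backward(retain_graph=True)        # populates emb.grad

    delta = epsilon * emb.grad.sign()             # FGSM-style perturbation
    adv_loss = F.cross_entropy(model.forward_from_embeddings(emb + delta), targets)

    optimizer.zero_grad()
    (clean_loss + adv_loss).backward()
    optimizer.step()
    return clean_loss.item(), adv_loss.item()
```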
+
+ comment: Accepted to present at SIAM International Conference on Data Mining + (SDM24) +
+
+
+
+
+ + ♻ ☆ FedDCSR: Federated Cross-domain Sequential Recommendation via + Disentangled Representation Learning + + +
+ Cross-domain Sequential Recommendation (CSR) which leverages user sequence +data from multiple domains has received extensive attention in recent years. +However, the existing CSR methods require sharing origin user data across +domains, which violates the General Data Protection Regulation (GDPR). Thus, it +is necessary to combine federated learning (FL) and CSR to fully utilize +knowledge from different domains while preserving data privacy. Nonetheless, +the sequence feature heterogeneity across different domains significantly +impacts the overall performance of FL. In this paper, we propose FedDCSR, a +novel federated cross-domain sequential recommendation framework via +disentangled representation learning. Specifically, to address the sequence +feature heterogeneity across domains, we introduce an approach called +inter-intra domain sequence representation disentanglement (SRD) to disentangle +the user sequence features into domain-shared and domain-exclusive features. In +addition, we design an intra domain contrastive infomax (CIM) strategy to learn +richer domain-exclusive features of users by performing data augmentation on +user sequences. Extensive experiments on three real-world scenarios demonstrate +that FedDCSR achieves significant improvements over existing baselines. + +
+
+
+
+
+
+
+
+ + Machine Learning 89 + +
+
+
+ + ☆ Robotic Imitation of Human Actions + + +
+ Imitation can allow us to quickly gain an understanding of a new task. +Through a demonstration, we can gain direct knowledge about which actions need +to be performed and which goals they have. In this paper, we introduce a new +approach to imitation learning that tackles the challenges of a robot imitating +a human, such as the change in perspective and body schema. Our approach can +use a single human demonstration to abstract information about the demonstrated +task, and use that information to generalise and replicate it. We facilitate +this ability by a new integration of two state-of-the-art methods: a diffusion +action segmentation model to abstract temporal information from the +demonstration and an open vocabulary object detector for spatial information. +Furthermore, we refine the abstracted information and use symbolic reasoning to +create an action plan utilising inverse kinematics, to allow the robot to +imitate the demonstrated action. + +
+
+
+
+
+ + ☆ Sparse PCA with False Discovery Rate Controlled Variable Selection ICASSP 2024 + + +
+ Sparse principal component analysis (PCA) aims at mapping large dimensional +data to a linear subspace of lower dimension. By imposing loading vectors to be +sparse, it performs the double duty of dimension reduction and variable +selection. Sparse PCA algorithms are usually expressed as a trade-off between +explained variance and sparsity of the loading vectors (i.e., number of +selected variables). As a high explained variance is not necessarily synonymous +with relevant information, these methods are prone to select irrelevant +variables. To overcome this issue, we propose an alternative formulation of +sparse PCA driven by the false discovery rate (FDR). We then leverage the +Terminating-Random Experiments (T-Rex) selector to automatically determine an +FDR-controlled support of the loading vectors. A major advantage of the +resulting T-Rex PCA is that no sparsity parameter tuning is required. Numerical +experiments and a stock market data example demonstrate a significant +performance improvement. + +
+
+ comment: Published in ICASSP 2024 - 2024 IEEE International Conference on + Acoustics, Speech and Signal Processing (ICASSP), scheduled for 14-19 April + 2024 in Seoul, Korea +
+
+
+
+
+ + ☆ Weighted Spectral Filters for Kernel Interpolation on Spheres: Estimates + of Prediction Accuracy for Noisy Data + + +
+ Spherical radial-basis-based kernel interpolation abounds in image sciences +including geophysical image reconstruction, climate trends description and +image rendering due to its excellent spatial localization property and perfect +approximation performance. However, in dealing with noisy data, kernel +interpolation frequently behaves not so well due to the large condition number +of the kernel matrix and instability of the interpolation process. In this +paper, we introduce a weighted spectral filter approach to reduce the condition +number of the kernel matrix and then stabilize kernel interpolation. The main +building blocks of the proposed method are the well developed spherical +positive quadrature rules and high-pass spectral filters. Using a recently +developed integral operator approach for spherical data analysis, we +theoretically demonstrate that the proposed weighted spectral filter approach +succeeds in breaking through the bottleneck of kernel interpolation, especially +in fitting noisy data. We provide optimal approximation rates of the new method +to show that our approach does not compromise the predicting accuracy. +Furthermore, we conduct both toy simulations and two real-world data +experiments with synthetically added noise in geophysical image reconstruction +and climate image processing to verify our theoretical assertions and show the +feasibility of the weighted spectral filter approach. + +
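A minimal sketch of the general idea of spectrally filtered kernel interpolation: eigendecompose the symmetric kernel matrix, damp the small eigenvalues that make plain interpolation ill-conditioned on noisy data, and solve the filtered system for the coefficients. The particular quadrature weights and filter used in the paper are not reproduced; the ridge-type filter below is an illustrative choice.

```python
import numpy as np

def filtered_kernel_coefficients(K, y, filter_fn=None):
    """K: (n, n) symmetric positive semi-definite kernel matrix on the data
    sites; y: (n,) noisy samples. Returns coefficients c such that the
    predictor is f(x) = sum_i c[i] * k(x, x_i)."""
    if filter_fn is None:
        filter_fn = lambda lam: 1.0 / (lam + 1e-3)   # ridge-type spectral filter
    eigvals, eigvecs = np.linalg.eigh(K)
    # Apply the filter to the spectrum instead of inverting K directly.
    return eigvecs @ (filter_fn(eigvals) * (eigvecs.T @ y))

# With filter_fn = 1/lam this reduces to plain (unstable) kernel interpolation;
# damping the small eigenvalues trades a little bias for much better stability
# under noise.
```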
+
+
+
+
+ + ☆ Personalized Federated Learning of Probabilistic Models: A PAC-Bayesian + Approach + + +
+ Federated learning aims to infer a shared model from private and
+decentralized data stored locally by multiple clients. Personalized federated
+learning (PFL) goes one step further by adapting the global model to each
+client, enhancing the model's fit for different clients. A significant level of
+personalization is required for highly heterogeneous clients, but can be
+challenging to achieve, especially when they have small datasets. To address
+this problem, we propose a PFL algorithm named PAC-PFL for learning
+probabilistic models within a PAC-Bayesian framework that utilizes differential
+privacy to handle data-dependent priors. Our algorithm collaboratively learns a
+shared hyper-posterior and regards each client's posterior inference as the
+personalization step. By establishing and minimizing a generalization bound on
+the average true risk of clients, PAC-PFL effectively combats over-fitting.
+PAC-PFL achieves accurate and well-calibrated predictions, supported by
+experiments on a dataset of photovoltaic panel power generation, the FEMNIST
+dataset (Caldas et al., 2019), and a Dirichlet-partitioned EMNIST dataset
+(Cohen et al., 2017).
+
+
+
+
+
+ + ☆ We don't need no labels: Estimating post-deployment model performance + under covariate shift without ground truth + + +
+ The performance of machine learning models often degrades after deployment +due to data distribution shifts. In many use cases, it is impossible to +calculate the post-deployment performance because labels are unavailable or +significantly delayed. Proxy methods for evaluating model performance +stability, like drift detection techniques, do not properly quantify data +distribution shift impact. As a solution, we propose a robust and accurate +performance estimation method for evaluating ML classification models on +unlabeled data that accurately quantifies the impact of covariate shift on +model performance. We call it multi-calibrated confidence-based performance +estimation (M-CBPE). It is model and data-type agnostic and works for any +performance metric. It does not require access to the monitored model - it uses +the model predictions and probability estimates. M-CBPE does not need user +input on the nature of the covariate shift as it fully learns from the data. We +evaluate it with over 600 dataset-model pairs from US census data and compare +it with multiple benchmarks using several evaluation metrics. Results show that +M-CBPE is the best method to estimate the performance of classification models +in any evaluation context. + +
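+
+ M-CBPE's multi-calibration procedure is not spelled out in the abstract; as background, the sketch below shows the plain confidence-based performance estimation (CBPE) idea it builds on: if predicted probabilities are calibrated, expected accuracy on unlabeled data can be estimated from the prediction confidences alone. The probabilities are made-up stand-ins.
+
+ ```python
+ # Minimal confidence-based performance estimation (CBPE) sketch: this is the
+ # plain CBPE idea, not the multi-calibrated M-CBPE variant described above.
+ import numpy as np
+
+ def estimated_accuracy(proba: np.ndarray) -> float:
+     """proba: (n_samples, n_classes) calibrated predicted probabilities."""
+     # The model predicts the argmax class; under calibration, the probability
+     # that this prediction is correct equals the maximum class probability.
+     return float(np.max(proba, axis=1).mean())
+
+ # Example: post-deployment (unlabeled) predictions from a binary classifier.
+ proba = np.array([[0.9, 0.1], [0.6, 0.4], [0.2, 0.8], [0.45, 0.55]])
+ print(f"Estimated accuracy without labels: {estimated_accuracy(proba):.3f}")
+ ```
+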
+
+
+
+
+ + ☆ Boosting Gradient Ascent for Continuous DR-submodular Maximization ICML + 2022 + + +
+ Projected Gradient Ascent (PGA) is the most commonly used optimization scheme in machine learning and operations research. Nevertheless, numerous studies and examples have shown that PGA methods may fail to achieve the tight approximation ratio for continuous DR-submodular maximization problems. To address this challenge, we present a boosting technique in this paper, which can efficiently improve the approximation guarantee of the standard PGA to \emph{optimal} with only small modifications to the objective function. The fundamental idea of our boosting technique is to exploit non-oblivious search to derive a novel auxiliary function $F$, whose stationary points are excellent approximations to the global maximum of the original DR-submodular objective $f$. Specifically, when $f$ is monotone and $\gamma$-weakly DR-submodular, we propose an auxiliary function $F$ whose stationary points can provide a better $(1-e^{-\gamma})$-approximation than the $(\gamma^2/(1+\gamma^2))$-approximation guaranteed by the stationary points of $f$ itself. Similarly, for the non-monotone case, we devise another auxiliary function $F$ whose stationary points can achieve an optimal $\frac{1-\min_{\boldsymbol{x}\in\mathcal{C}}\|\boldsymbol{x}\|_{\infty}}{4}$-approximation guarantee where $\mathcal{C}$ is a convex constraint set. In contrast, the stationary points of the original non-monotone DR-submodular function can be arbitrarily bad~\citep{chen2023continuous}. Furthermore, we demonstrate the scalability of our boosting technique on four problems. On all four problems, the resulting boosted PGA variants outperform the standard PGA in several aspects, such as approximation ratio and efficiency. Finally, we corroborate our theoretical findings with numerical experiments, which demonstrate the effectiveness of our boosting PGA methods.
+
+
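+
+ As a reference point, here is a minimal vanilla projected gradient ascent loop on a box constraint, i.e., the baseline that the boosting technique improves on; the boosted variant would substitute the gradient of the auxiliary function $F$, which is not reproduced here. The toy objective and step size are illustrative assumptions.
+
+ ```python
+ # Vanilla projected gradient ascent (PGA) on a box constraint.
+ import numpy as np
+
+ def projected_gradient_ascent(grad_f, x0, lower, upper, step=0.05, iters=200):
+     x = np.clip(np.asarray(x0, dtype=float), lower, upper)
+     for _ in range(iters):
+         x = x + step * grad_f(x)             # ascent step
+         x = np.clip(x, lower, upper)         # projection onto the box constraint
+     return x
+
+ # Toy separable concave (hence DR-submodular) objective: f(x) = sum(log(1 + x)).
+ grad_f = lambda x: 1.0 / (1.0 + x)
+ x_star = projected_gradient_ascent(grad_f, x0=np.zeros(5), lower=0.0, upper=1.0)
+ print("stationary point:", np.round(x_star, 3))
+ ```
+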
+
+ comment: 74 pages, 6 figures and 9 tables. An extended version of Stochastic + Continuous Submodular Maximization: Boosting via Non-oblivious Function (ICML + 2022) +
+
+
+
+
+ + ☆ Learn What You Need in Personalized Federated Learning + + +
+ Personalized federated learning aims to address data heterogeneity across local clients in federated learning. However, current methods blindly incorporate either full model parameters or predefined partial parameters in personalized federated learning. They fail to customize the collaboration manner according to each local client's data characteristics, leading to undesirable aggregation results. To address this essential issue, we propose $\textit{Learn2pFed}$, a novel algorithm-unrolling-based personalized federated learning framework, enabling each client to adaptively select which part of its local model parameters should participate in collaborative training. The key novelty of the proposed $\textit{Learn2pFed}$ is to optimize each local model parameter's degree of participation in collaboration as a learnable parameter via algorithm unrolling methods. This approach brings two benefits: 1) mathematically determining the participation degree of local model parameters in the federated collaboration, and 2) obtaining more stable and improved solutions. Extensive experiments on various tasks, including regression, forecasting, and image classification, demonstrate that $\textit{Learn2pFed}$ significantly outperforms previous personalized federated learning methods.
+
+
+
+
+
+
+ + ☆ OpenDPD: An Open-Source End-to-End Learning & Benchmarking Framework for + Wideband Power Amplifier Modeling and Digital Pre-Distortion ISCA + + +
+ With the rise in communication capacity, deep neural networks (DNNs) for digital pre-distortion (DPD) to correct non-linearity in wideband power amplifiers (PAs) have become prominent. Yet, there is a void in open-source and measurement-setup-independent platforms for fast DPD exploration and objective DPD model comparison. This paper presents an open-source framework, OpenDPD, crafted in PyTorch, with an associated dataset for PA modeling and DPD learning. We introduce a Dense Gated Recurrent Unit (DGRU)-DPD, trained via a novel end-to-end learning architecture, outperforming previous DPD models on a digital power amplifier (DPA) in the new digital transmitter (DTX) architecture with unconventional transfer characteristics compared to analog PAs. Measurements show our DGRU-DPD achieves an ACPR of -44.69/-44.47 dBc and an EVM of -35.22 dB for 200 MHz OFDM signals. OpenDPD code, datasets, and documentation are publicly available at https://github.com/lab-emi/OpenDPD.
+
+
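+
+ For intuition, the snippet below sketches a GRU-based behavioral model over baseband I/Q samples, the kind of recurrent PA/DPD model such a framework trains. It is a plain-PyTorch simplification, not the Dense GRU (DGRU) architecture or the end-to-end DPD training loop of OpenDPD, and all layer sizes are placeholder values.
+
+ ```python
+ # Minimal GRU-based behavioral model for a power amplifier over I/Q samples.
+ import torch
+ import torch.nn as nn
+
+ class GRUPAModel(nn.Module):
+     def __init__(self, hidden_size: int = 16):
+         super().__init__()
+         self.gru = nn.GRU(input_size=2, hidden_size=hidden_size, batch_first=True)
+         self.out = nn.Linear(hidden_size, 2)   # predict output I/Q per time step
+
+     def forward(self, iq: torch.Tensor) -> torch.Tensor:
+         h, _ = self.gru(iq)                    # iq: (batch, time, 2)
+         return self.out(h)
+
+ model = GRUPAModel()
+ x = torch.randn(4, 128, 2)                     # toy batch of baseband I/Q frames
+ print(model(x).shape)                          # torch.Size([4, 128, 2])
+ ```
+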
+
+ comment: To be published at the 2024 IEEE International Symposium on Circuits + and Systems (ISCAS), Singapore +
+
+
+
+
+ + ☆ Anchor function: a type of benchmark functions for studying language + models + + +
+ Understanding transformer-based language models is becoming increasingly crucial, particularly as they play pivotal roles in advancing towards artificial general intelligence. However, language model research faces significant challenges, especially for academic research groups with constrained resources. These challenges include complex data structures, unknown target functions, high computational costs and memory requirements, and a lack of interpretability in the inference process, among others. Drawing a parallel to the use of simple models in scientific research, we propose the concept of an anchor function. This is a type of benchmark function designed for studying language models in learning tasks that follow an "anchor-key" pattern. By utilizing the concept of an anchor function, we can construct a series of functions to simulate various language tasks. The anchor function plays a role analogous to that of mice in diabetes research and is particularly suitable for academic research. We demonstrate the utility of the anchor function with an example, revealing two basic operations performed by attention structures in language models: shifting tokens and broadcasting one token from one position to many positions. These operations are also commonly observed in large language models. The anchor function framework therefore opens up a series of valuable and accessible research questions for further exploration, especially for theoretical study.
+
+
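+
+ To make the "anchor-key" pattern concrete, the snippet below generates a toy dataset in which an anchor token determines which operation is applied to the adjacent key token. The specific token ids and operations are assumptions for illustration, not the paper's exact construction.
+
+ ```python
+ # Illustrative "anchor-key" toy task generator.
+ import random
+
+ OPS = {101: lambda k: k + 1,      # anchor 101 means "key + 1"
+        102: lambda k: k + 2}      # anchor 102 means "key + 2"
+
+ def make_example(seq_len: int = 8, vocab: int = 50):
+     seq = [random.randrange(1, vocab) for _ in range(seq_len)]
+     anchor = random.choice(list(OPS))
+     key = random.randrange(1, vocab)
+     pos = random.randrange(seq_len - 1)
+     seq[pos], seq[pos + 1] = anchor, key      # place the anchor-key pair
+     return seq, OPS[anchor](key)              # target depends only on (anchor, key)
+
+ random.seed(0)
+ for _ in range(3):
+     print(make_example())
+ ```
+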
+
+
+
+
+ + ☆ On Quantum Natural Policy Gradients + + +
+ This research delves into the role of the quantum Fisher Information Matrix (FIM) in enhancing the performance of Parameterized Quantum Circuit (PQC)-based reinforcement learning agents. While previous studies have highlighted the effectiveness of PQC-based policies preconditioned with the quantum FIM in contextual bandits, its impact in broader reinforcement learning contexts, such as Markov Decision Processes, is less clear. Through a detailed analysis of L\"owner inequalities between quantum and classical FIMs, this study uncovers the nuanced distinctions and implications of using each type of FIM. Our results indicate that a PQC-based agent using the quantum FIM without additional insights typically incurs a larger approximation error and does not guarantee improved performance compared to the classical FIM. Empirical evaluations on classic control benchmarks suggest that, even though quantum FIM preconditioning outperforms standard gradient ascent, it is in general not superior to classical FIM preconditioning.
+
+
+
+
+
+
+ + ☆ Sum Throughput Maximization in Multi-BD Symbiotic Radio NOMA Network + Assisted by Active-STAR-RIS + + +
+ In this paper, we employ an active simultaneously transmitting and reflecting reconfigurable intelligent surface (ASRIS) to aid in establishing and enhancing communication within a commensal symbiotic radio (CSR) network. Unlike traditional RIS, ASRIS not only ensures coverage in an omnidirectional manner but also amplifies received signals, consequently elevating overall network performance. In the first phase, the base station (BS), equipped with active massive MIMO antennas, transmits ambient signals to the symbiotic backscatter devices (SBDs); after harvesting the energy and modulating their information onto the signal carrier, the SBDs send backscatter signals back to the BS. In this scheme, we employ a backscatter relay system to facilitate the transmission of information from the SBDs to the symbiotic User Equipments (SUEs) with the assistance of the BS. In the second phase, the BS transmits information signals to the SUEs after eliminating interference using the Successive Interference Cancellation (SIC) method. ASRIS is employed to establish communication among SUEs lacking a line of sight (LoS) and to amplify power signals for SUEs with a LoS connection to the BS. It is worth noting that we use NOMA for multiple access throughout the network. The main goal of this paper is to maximize the sum throughput across all users. To achieve this, we formulate an optimization problem whose variables include the active beamforming coefficients at the BS and ASRIS, as well as the phase adjustments of ASRIS and the scheduling parameters between the first and second phases. To solve this optimization problem, we employ three deep reinforcement learning (DRL) methods, namely PPO, TD3, and A3C. Finally, the mentioned methods are simulated and compared with each other.
+
+
+
+ comment: This article will be submitted to the Transactions journal +
+
+
+
+
+ + ☆ The Faiss library + + +
+ Vector databases manage large collections of embedding vectors. As AI applications are growing rapidly, so is the number of embeddings that need to be stored and indexed. The Faiss library is dedicated to vector similarity search, a core functionality of vector databases. Faiss is a toolkit of indexing methods and related primitives used to search, cluster, compress and transform vectors. This paper first describes the tradeoff space of vector search, then the design principles of Faiss in terms of structure, approach to optimization and interfacing. We benchmark key features of the library and discuss a few selected applications to highlight its broad applicability.
+
+
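+
+ A minimal example of the core Faiss workflow (build an index over embeddings, then search) is shown below; it uses the exact brute-force L2 index for simplicity, whereas the library also offers the compressed and approximate indexes discussed in the paper. The data and dimensions are arbitrary.
+
+ ```python
+ # Basic Faiss usage: index a set of embedding vectors and run a k-NN search.
+ import numpy as np
+ import faiss
+
+ d = 64                                     # embedding dimension
+ xb = np.random.random((10_000, d)).astype("float32")   # database vectors
+ xq = np.random.random((5, d)).astype("float32")        # query vectors
+
+ index = faiss.IndexFlatL2(d)               # exact (brute-force) L2 index
+ index.add(xb)                              # index the database
+ distances, ids = index.search(xq, 4)       # 4 nearest neighbors per query
+ print(ids)
+ ```
+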
+
+
+
+
+ + ☆ Large Language Models are Null-Shot Learners + + +
+ This paper presents null-shot prompting. Null-shot prompting exploits +hallucination in large language models (LLMs) by instructing LLMs to utilize +information from the "Examples" section that never exists within the provided +context to perform a task. While reducing hallucination is crucial and +non-negligible for daily and critical uses of LLMs, we propose that in the +current landscape in which these LLMs still hallucinate, it is possible, in +fact, to exploit hallucination to increase performance in performing tasks +compared to standard zero-shot prompting. Experiments with six LLMs show +improvements in performance across the majority of eight datasets, including +reading comprehension, arithmetic reasoning, and closed-book question +answering. The observed inconsistency in increased relative performance across +LLMs also potentially indicates a different degree of inherent hallucination in +each model. These differences show that it is possible to utilize null-shot +prompting as a way to detect degrees of hallucination in LLMs using existing +benchmarking datasets. We also perform ablation studies, including +experimenting with a modified version of null-shot prompting that incorporates +ideas from zero-shot chain-of-thought prompting, which shows different trends +of results. + +
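+
+ A sketch of what a null-shot prompt might look like is given below: the instruction refers to an "Examples" section that is never actually included in the context. The exact wording used in the paper may differ; this template is only an approximation for illustration.
+
+ ```python
+ # Sketch of the null-shot prompting idea (approximate wording).
+ def null_shot_prompt(task_instruction: str, question: str) -> str:
+     return (
+         f"{task_instruction}\n\n"
+         "Look at the examples in the \"Examples\" section and utilize them "
+         "to perform the task below.\n\n"       # no Examples section exists
+         f"Question: {question}\nAnswer:"
+     )
+
+ print(null_shot_prompt("Solve the arithmetic word problem.",
+                        "Tom has 3 apples and buys 4 more. How many does he have?"))
+ ```
+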
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Siamese Content-based Search Engine for a More Transparent Skin and + Breast Cancer Diagnosis through Histological Imaging + + +
+ Computer-Aided Diagnosis (CAD) has advanced digital pathology with Deep Learning (DL)-based tools to assist pathologists in decision-making. Content-Based Histopathological Image Retrieval (CBHIR) is a novel tool to seek highly correlated patches in terms of similarity in histopathological features. In this work, we propose two CBHIR approaches on breast (Breast-twins) and skin cancer (Skin-twins) data sets for robust and accurate patch-level retrieval, integrating a custom-built Siamese network as a feature extractor. The proposed Siamese network is able to generalize to unseen images by focusing on the similar histopathological features of the input pairs. The proposed CBHIR approaches are evaluated on the Breast (public) and Skin (private) data sets with top-K accuracy. Choosing the optimal K is challenging; moreover, as K increases, the dissimilarity between the query and the returned images increases, which might mislead pathologists. To the best of the authors' knowledge, this paper is the first to tackle this issue on histopathological images by evaluating the top-1 retrieved image. The Breast-twins model achieves an F1-score of 70% at top-1, exceeding other state-of-the-art methods even at larger values of K such as 5 and 400. The Skin-twins model surpasses the recently proposed Convolutional Auto-Encoder (CAE) by 67% in precision. Besides, the Skin-twins model tackles the challenges of Spitzoid Tumors of Uncertain Malignant Potential (STUMP) to assist pathologists by retrieving the top K images and their corresponding labels. This approach can therefore offer a more explainable CAD tool to pathologists in terms of transparency, trustworthiness, and reliability, among other characteristics.
+
+
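+
+ Below is a minimal Siamese feature extractor in PyTorch for patch-level comparison: two patches share one encoder and are ranked by embedding distance. This is a generic sketch, not the custom-built Breast-twins/Skin-twins architecture, and the layer sizes are placeholders.
+
+ ```python
+ # Minimal Siamese encoder: shared backbone, distance between patch embeddings.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class SiameseEncoder(nn.Module):
+     def __init__(self, embed_dim: int = 128):
+         super().__init__()
+         self.backbone = nn.Sequential(
+             nn.Conv2d(3, 16, 3, stride=2, padding=1), nn.ReLU(),
+             nn.Conv2d(16, 32, 3, stride=2, padding=1), nn.ReLU(),
+             nn.AdaptiveAvgPool2d(1), nn.Flatten(),
+             nn.Linear(32, embed_dim),
+         )
+
+     def forward(self, a, b):
+         za, zb = self.backbone(a), self.backbone(b)
+         return F.pairwise_distance(za, zb)     # small distance = similar patches
+
+ model = SiameseEncoder()
+ patch_a, patch_b = torch.randn(2, 3, 64, 64), torch.randn(2, 3, 64, 64)
+ print(model(patch_a, patch_b).shape)           # torch.Size([2])
+ ```
+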
+
+
+
+
+
+ ☆ An Explainable Proxy Model for Multilabel Audio Segmentation ICASSP 2024
+
+
+
+ Audio signal segmentation is a key task for automatic audio indexing. It +consists of detecting the boundaries of class-homogeneous segments in the +signal. In many applications, explainable AI is a vital process for +transparency of decision-making with machine learning. In this paper, we +propose an explainable multilabel segmentation model that solves speech +activity (SAD), music (MD), noise (ND), and overlapped speech detection (OSD) +simultaneously. This proxy uses the non-negative matrix factorization (NMF) to +map the embedding used for the segmentation to the frequency domain. +Experiments conducted on two datasets show similar performances as the +pre-trained black box model while showing strong explainability features. +Specifically, the frequency bins used for the decision can be easily identified +at both the segment level (local explanations) and global level (class +prototypes). + +
+
+ comment: Accepted at ICASSP 2024 +
+
+
+
+
+ + ☆ Fast Kernel Summation in High Dimensions via Slicing and Fourier + Transforms + + +
+ Kernel-based methods are heavily used in machine learning. However, they suffer from $O(N^2)$ complexity in the number $N$ of considered data points. In this paper, we propose an approximation procedure, which reduces this complexity to $O(N)$. Our approach is based on two ideas. First, we prove that any radial kernel with an analytic basis function can be represented as a sliced version of some one-dimensional kernel, and we derive an analytic formula for the one-dimensional counterpart. It turns out that the relation between one- and $d$-dimensional kernels is given by a generalized Riemann-Liouville fractional integral. Hence, we can reduce the $d$-dimensional kernel summation to a one-dimensional setting. Second, for solving these one-dimensional problems efficiently, we apply fast Fourier summations on non-equispaced data, a sorting algorithm, or a combination of both. Due to its practical importance, we pay special attention to the Gaussian kernel, where we show a dimension-independent error bound and represent its one-dimensional counterpart via a closed-form Fourier transform. We provide a runtime comparison and an error estimate of our fast kernel summations.
+
+
+
+
+
+
+ + ☆ Optimizing $k$ in $k$NN Graphs with Graph Learning Perspective + + +
+ In this paper, we propose a method, based on graph signal processing, to +optimize the choice of $k$ in $k$-nearest neighbor graphs ($k$NNGs). $k$NN is +one of the most popular approaches and is widely used in machine learning and +signal processing. The parameter $k$ represents the number of neighbors that +are connected to the target node; however, its appropriate selection is still a +challenging problem. Therefore, most $k$NNGs use ad hoc selection methods for +$k$. In the proposed method, we assume that a different $k$ can be chosen for +each node. We formulate a discrete optimization problem to seek the best $k$ +with a constraint on the sum of distances of the connected nodes. The optimal +$k$ values are efficiently obtained without solving a complex optimization. +Furthermore, we reveal that the proposed method is closely related to existing +graph learning methods. In experiments on real datasets, we demonstrate that +the $k$NNGs obtained with our method are sparse and can determine an +appropriate variable number of edges per node. We validate the effectiveness of +the proposed method for point cloud denoising, comparing our denoising +performance with achievable graph construction methods that can be scaled to +typical point cloud sizes (e.g., thousands of nodes). + +
+
+
+
+
+ + ☆ Enhancing Wind Speed and Wind Power Forecasting Using Shape-Wise Feature + Engineering: A Novel Approach for Improved Accuracy and Robustness + + +
+ Accurate prediction of wind speed and power is vital for enhancing the efficiency of wind energy systems. Numerous solutions have been implemented to date, demonstrating their potential to improve forecasting. Among these, deep learning is perceived as a revolutionary approach in the field. However, despite their effectiveness, the noise present in the collected data remains a significant challenge. This noise has the potential to diminish the performance of these algorithms, leading to inaccurate predictions. In response to this, this study explores a novel feature engineering approach. This approach involves altering the data input shape in both Convolutional Neural Network-Long Short-Term Memory (CNN-LSTM) and Autoregressive models for various forecasting horizons. The results reveal substantial enhancements in model resilience against noise resulting from step increases in data. The approach could achieve an impressive 83% accuracy in predicting unseen data up to 24 steps ahead. Furthermore, this method consistently provides high accuracy for short, mid, and long-term forecasts, outperforming individual models. These findings pave the way for further research on noise reduction strategies at different forecasting horizons through shape-wise feature engineering.
+
+
+
+
+
+
+ + ☆ Efficient and Mathematically Robust Operations for Certified Neural + Networks Inference + + +
+ In recent years, machine learning (ML) and neural networks (NNs) have gained +widespread use and attention across various domains, particularly in +transportation for achieving autonomy, including the emergence of flying taxis +for urban air mobility (UAM). However, concerns about certification have come +up, compelling the development of standardized processes encompassing the +entire ML and NN pipeline. This paper delves into the inference stage and the +requisite hardware, highlighting the challenges associated with IEEE 754 +floating-point arithmetic and proposing alternative number representations. By +evaluating diverse summation and dot product algorithms, we aim to mitigate +issues related to non-associativity. Additionally, our exploration of +fixed-point arithmetic reveals its advantages over floating-point methods, +demonstrating significant hardware efficiencies. Employing an empirical +approach, we ascertain the optimal bit-width necessary to attain an acceptable +level of accuracy, considering the inherent complexity of bit-width +optimization. + +
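+
+ The fixed-point versus floating-point trade-off can be illustrated with a toy dot product: inputs are scaled and rounded to integers, accumulated exactly, and rescaled. The fractional bit-widths below are arbitrary illustrative choices, not the optimal widths determined empirically in the paper.
+
+ ```python
+ # Toy comparison of a float dot product against a simple fixed-point version.
+ import numpy as np
+
+ def fixed_point_dot(x, w, frac_bits=8):
+     scale = 1 << frac_bits
+     xi = np.round(x * scale).astype(np.int64)       # quantize inputs
+     wi = np.round(w * scale).astype(np.int64)       # quantize weights
+     acc = int(np.dot(xi, wi))                       # exact integer accumulation
+     return acc / (scale * scale)                    # rescale back
+
+ rng = np.random.default_rng(0)
+ x, w = rng.standard_normal(256), rng.standard_normal(256)
+ ref = float(np.dot(x, w))
+ for bits in (4, 8, 16):
+     err = abs(fixed_point_dot(x, w, bits) - ref)
+     print(f"{bits:2d} fractional bits: abs error = {err:.2e}")
+ ```
+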
+
+
+
+
+ + ☆ Differentially Private Estimation of CATE in Adaptive Experiment + + +
+ Adaptive experiments are widely adopted to estimate the conditional average treatment effect (CATE) in clinical trials and many other scenarios. While the primary goal of the experiment is to maximize estimation accuracy, due to the imperative of social welfare it is also crucial to provide patients with treatments that have superior outcomes, which is measured by regret in the contextual bandit framework. These two objectives often lead to contrasting optimal allocation mechanisms. Furthermore, privacy concerns arise in clinical scenarios containing sensitive data like patients' health records. Therefore, it is essential for the treatment allocation mechanism to incorporate robust privacy protection measures. In this paper, we investigate the tradeoff between loss of social welfare and statistical power in contextual bandit experiments. We derive matched upper and lower bounds for the multi-objective optimization problem, and then adopt the concept of Pareto optimality to mathematically characterize the optimality condition. Furthermore, we propose differentially private algorithms that still match the lower bound, showing that privacy is "almost free". Additionally, we derive the asymptotic normality of the estimator, which is essential in statistical inference and hypothesis testing.
+
+
+
+
+
+
+ + ☆ Towards Causal Relationship in Indefinite Data: Baseline Model and New + Datasets + + +
+ Integrating deep learning and causal discovery has revealed that learning causal structures and representations in dialogue and video data is full of challenges. We define these data forms as "Indefinite Data", characterized by multi-structure data and multi-value representations. Unlike existing adaptable data forms, Indefinite Data still faces gaps in datasets and methods. To address the dataset gap, we release two high-quality datasets - Causalogue and Causaction, containing text dialogue samples and video action samples with causal annotations, respectively. Moreover, the method gap arises from the coexistence of multi-structure data and multi-value representations, breaking the assumptions of all current methods and rendering them infeasible on Indefinite Data. To this end, we propose a probabilistic framework as a baseline, incorporating three designed highlights for this gap: 1) establishing the causation condition of representations using the independence of noise terms under non-fixed causal structures, 2) treating causal strength as a latent variable and measuring the reconstruction loss in the correlation space, and 3) estimating the effects of latent confounders. These highlights make the probabilistic model capable of overcoming the challenges brought by the coexistence of multi-structure data and multi-value representations and pave the way for the extension of latent confounders. Comprehensive experiments have evaluated baseline results of causal structures, causal representations, and confounding disentanglement.
+
+
+
+ comment: If you are interested in the two new datasets, pls contact us by + email +
+
+
+
+
+ + ☆ Towards Efficient and Certified Recovery from Poisoning Attacks in + Federated Learning + + +
+ Federated learning (FL) is vulnerable to poisoning attacks, where malicious clients manipulate their updates to affect the global model. Although various methods exist for detecting those clients in FL, identifying malicious clients requires sufficient model updates, and hence by the time malicious clients are detected, FL models have already been poisoned. Thus, a method is needed to recover an accurate global model after malicious clients are identified. Current recovery methods rely on (i) all historical information from participating FL clients and (ii) the initial model unaffected by the malicious clients, leading to a high demand for storage and computational resources. In this paper, we show that highly effective recovery can still be achieved based on (i) selective historical information rather than all historical information and (ii) a historical model that has not been significantly affected by malicious clients rather than the initial model. In this scenario, while maintaining comparable recovery performance, we can accelerate the recovery speed and decrease memory consumption. Following this concept, we introduce Crab, an efficient and certified recovery method, which relies on selective information storage and adaptive model rollback. Theoretically, we demonstrate that the difference between the global model recovered by Crab and the one recovered by train-from-scratch can be bounded under certain assumptions. Our empirical evaluation, conducted across three datasets over multiple machine learning models and a variety of untargeted and targeted poisoning attacks, reveals that Crab is both accurate and efficient, and consistently outperforms previous approaches in terms of both recovery speed and memory consumption.
+
+
+
+
+
+
+
+ ☆ Matrix Completion with Hypergraphs: Sharp Thresholds and Efficient Algorithms
+
+
+
+ This paper considers the problem of completing a rating matrix based on +sub-sampled matrix entries as well as observed social graphs and hypergraphs. +We show that there exists a \emph{sharp threshold} on the sample probability +for the task of exactly completing the rating matrix -- the task is achievable +when the sample probability is above the threshold, and is impossible otherwise +-- demonstrating a phase transition phenomenon. The threshold can be expressed +as a function of the ``quality'' of hypergraphs, enabling us to \emph{quantify} +the amount of reduction in sample probability due to the exploitation of +hypergraphs. This also highlights the usefulness of hypergraphs in the matrix +completion problem. En route to discovering the sharp threshold, we develop a +computationally efficient matrix completion algorithm that effectively exploits +the observed graphs and hypergraphs. Theoretical analyses show that our +algorithm succeeds with high probability as long as the sample probability +exceeds the aforementioned threshold, and this theoretical result is further +validated by synthetic experiments. Moreover, our experiments on a real social +network dataset (with both graphs and hypergraphs) show that our algorithm +outperforms other state-of-the-art matrix completion algorithms. + +
+
+
+
+
+ + ☆ PRewrite: Prompt Rewriting with Reinforcement Learning + + +
+ Prompt engineering is critical for the development of LLM-based applications. However, it is usually done manually in a "trial and error" fashion. This manual procedure can be time-consuming and ineffective, and the generated prompts are often sub-optimal. Even for prompts that seemingly work well, there is always a lingering question: can the prompts be made better with further modifications?
+ To address these questions, in this paper, we investigate prompt engineering automation. We consider a specific use case scenario in which developers/users have drafted initial prompts, but lack the time/expertise to optimize them. We propose PRewrite, an automated tool to rewrite these drafts and to generate highly effective new prompts. PRewrite is based on the Reinforcement Learning (RL) framework, which allows for end-to-end optimization, and our design allows the RL search to happen in a large action space. The automated tool leverages manually crafted prompts as starting points, which makes the rewriting procedure more guided and efficient. The generated prompts are human-readable and self-explanatory, unlike some of those in previous works. We conducted extensive experiments on diverse datasets and found that the prompts generated with this new method not only outperform professionally crafted prompts, but also prompts generated with other previously proposed methods.
+
+
+
+
+
+
+ + ☆ Statistical Test for Attention Map in Vision Transformer + + +
+ The Vision Transformer (ViT) demonstrates exceptional performance in various +computer vision tasks. Attention is crucial for ViT to capture complex +wide-ranging relationships among image patches, allowing the model to weigh the +importance of image patches and aiding our understanding of the decision-making +process. However, when utilizing the attention of ViT as evidence in +high-stakes decision-making tasks such as medical diagnostics, a challenge +arises due to the potential of attention mechanisms erroneously focusing on +irrelevant regions. In this study, we propose a statistical test for ViT's +attentions, enabling us to use the attentions as reliable quantitative evidence +indicators for ViT's decision-making with a rigorously controlled error rate. +Using the framework called selective inference, we quantify the statistical +significance of attentions in the form of p-values, which enables the +theoretically grounded quantification of the false positive detection +probability of attentions. We demonstrate the validity and the effectiveness of +the proposed method through numerical experiments and applications to brain +image diagnoses. + +
+
+
+ comment: 42 pages, 17 figures
+
+
+
+
+
+ + ☆ Differentially Private Sliced Inverse Regression: Minimax Optimality and + Algorithm + + +
+ Privacy preservation has become a critical concern in high-dimensional data +analysis due to the growing prevalence of data-driven applications. Proposed by +Li (1991), sliced inverse regression has emerged as a widely utilized +statistical technique for reducing covariate dimensionality while maintaining +sufficient statistical information. In this paper, we propose optimally +differentially private algorithms specifically designed to address privacy +concerns in the context of sufficient dimension reduction. We proceed to +establish lower bounds for differentially private sliced inverse regression in +both the low and high-dimensional settings. Moreover, we develop differentially +private algorithms that achieve the minimax lower bounds up to logarithmic +factors. Through a combination of simulations and real data analysis, we +illustrate the efficacy of these differentially private algorithms in +safeguarding privacy while preserving vital information within the reduced +dimension space. As a natural extension, we can readily offer analogous lower +and upper bounds for differentially private sparse principal component +analysis, a topic that may also be of potential interest to the statistical and +machine learning community. + +
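+
+ For background, the snippet below is a minimal non-private sliced inverse regression sketch: slice the response, average the standardized covariates within each slice, and take the leading eigenvectors of the slice-mean covariance. The differentially private mechanism proposed in the paper (noise calibration, privacy accounting) is omitted, and per-coordinate standardization is used instead of full whitening for brevity.
+
+ ```python
+ # Minimal (non-private) sliced inverse regression sketch.
+ import numpy as np
+
+ def sir_directions(X, y, n_slices=10, n_dirs=1):
+     Xc = (X - X.mean(0)) / X.std(0)                    # simplified standardization
+     order = np.argsort(y)
+     slices = np.array_split(order, n_slices)           # slice samples by response
+     means = np.stack([Xc[idx].mean(axis=0) for idx in slices])
+     weights = np.array([len(idx) for idx in slices]) / len(y)
+     M = (means * weights[:, None]).T @ means           # weighted slice-mean covariance
+     evals, evecs = np.linalg.eigh(M)
+     return evecs[:, ::-1][:, :n_dirs]                  # leading e.d.r. directions
+
+ rng = np.random.default_rng(0)
+ X = rng.standard_normal((500, 5))
+ y = X[:, 0] ** 3 + 0.1 * rng.standard_normal(500)      # depends on coordinate 0 only
+ print(np.round(sir_directions(X, y).ravel(), 2))        # dominated by coordinate 0
+ ```
+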
+
+
+
+
+ + ☆ Machine Learning on Dynamic Graphs: A Survey on Applications + + +
+ Dynamic graph learning has gained significant attention as it offers a powerful means to model intricate interactions among entities across various real-world and scientific domains. Notably, graphs serve as effective representations for diverse networks such as transportation, brain, social, and internet networks. Furthermore, the rapid advancements in machine learning have expanded the scope of dynamic graph applications beyond the aforementioned domains. In this paper, we present a review of lesser-explored applications of dynamic graph learning. This study reveals the potential of machine learning on dynamic graphs in addressing challenges across diverse domains, including those with limited levels of association with the field.
+
+
+
+
+
+
+ + ☆ Transferring Core Knowledge via Learngenes + + +
+ The pre-training paradigm fine-tunes models trained on large-scale datasets to downstream tasks with enhanced performance. It transfers all knowledge to downstream tasks without discriminating which part is necessary or unnecessary, which may lead to negative transfer. In comparison, knowledge transfer in nature is much more efficient. When passing genetic information to descendants, ancestors encode only the essential knowledge into genes, which act as the medium. Inspired by that, we adopt a recent concept called ``learngene'' and refine its structures by mimicking the structures of natural genes. We propose Genetic Transfer Learning (GTL), a framework that copies the evolutionary process of organisms into neural networks. GTL trains a population of networks, selects superior learngenes by tournaments, performs learngene mutations, and passes the learngenes to the next generations. Finally, we successfully extract the learngenes of VGG11 and ResNet12. We show that the learngenes bring the descendant networks instincts and strong learning ability: with 20% of the parameters, the learngenes bring 12% and 16% accuracy improvements on CIFAR-FS and miniImageNet. Besides, the learngenes scale and adapt well to downstream network structures and datasets. Overall, we offer a novel insight that transferring core knowledge via learngenes may be sufficient and efficient for neural networks.
+
+
+
+
+
+
+ + ☆ Machine Learning-Based Malicious Vehicle Detection for Security Threats + and Attacks in Vehicle Ad-hoc Network (VANET) Communications + + +
+ With the rapid growth of Vehicle Ad-hoc Networks (VANETs) as a promising technology for efficient and reliable communication among vehicles and infrastructure, the security and integrity of VANET communications have become a critical concern. One of the significant threats to VANETs is the presence of blackhole attacks, where malicious nodes disrupt the network's functionality and compromise data confidentiality, integrity, and availability. In this paper, we propose a machine learning-based approach for blackhole detection in VANETs. To achieve this task, we first create a comprehensive dataset comprising normal and malicious traffic flows. Afterward, we study and define a promising set of features to discriminate the blackhole attacks. Finally, we evaluate various machine learning algorithms, including Gradient Boosting, Random Forest, Support Vector Machines, k-Nearest Neighbors, Gaussian Naive Bayes, and Logistic Regression. Experimental results demonstrate the effectiveness of these algorithms in distinguishing between normal and malicious nodes. Our findings also highlight the potential of machine learning-based approaches in enhancing the security of VANETs by detecting and mitigating blackhole attacks.
+
+
+
+ comment: In the 2023 RIVF International Conference on Computing and + Communication Technologies, Hanoi, Vietnam +
+
+
+
+
+ + ☆ CycLight: learning traffic signal cooperation with a cycle-level + strategy + + +
+ This study introduces CycLight, a novel cycle-level deep reinforcement learning (RL) approach for network-level adaptive traffic signal control (NATSC) systems. Unlike most traditional RL-based traffic controllers that focus on step-by-step decision making, CycLight adopts a cycle-level strategy, optimizing cycle length and splits simultaneously using the Parameterized Deep Q-Network (PDQN) algorithm. This cycle-level approach effectively reduces the computational burden associated with frequent data communication, while enhancing the practicality and safety of real-world applications. A decentralized framework is formulated for multi-agent cooperation, while an attention mechanism is integrated to accurately assess the impact of the surroundings on the current intersection. CycLight is tested in a large synthetic traffic grid using the microscopic traffic simulation tool, SUMO. Experimental results not only demonstrate the superiority of CycLight over other state-of-the-art approaches but also showcase its robustness against information transmission delays.
+
+
+
+
+
+
+ + ☆ SpecSTG: A Fast Spectral Diffusion Framework for Probabilistic + Spatio-Temporal Traffic Forecasting + + +
+ Traffic forecasting, a crucial application of spatio-temporal graph (STG) +learning, has traditionally relied on deterministic models for accurate point +estimations. Yet, these models fall short of identifying latent risks of +unexpected volatility in future observations. To address this gap, +probabilistic methods, especially variants of diffusion models, have emerged as +uncertainty-aware solutions. However, existing diffusion methods typically +focus on generating separate future time series for individual sensors in the +traffic network, resulting in insufficient involvement of spatial network +characteristics in the probabilistic learning process. To better leverage +spatial dependencies and systematic patterns inherent in traffic data, we +propose SpecSTG, a novel spectral diffusion framework. Our method generates the +Fourier representation of future time series, transforming the learning process +into the spectral domain enriched with spatial information. Additionally, our +approach incorporates a fast spectral graph convolution designed for Fourier +input, alleviating the computational burden associated with existing models. +Numerical experiments show that SpecSTG achieves outstanding performance with +traffic flow and traffic speed datasets compared to state-of-the-art baselines. +The source code for SpecSTG is available at +https://anonymous.4open.science/r/SpecSTG. + +
+
+
+
+
+ + ☆ A Survey of Resource-efficient LLM and Multimodal Foundation Models + + +
+ Large foundation models, including large language models (LLMs), vision +transformers (ViTs), diffusion, and LLM-based multimodal models, are +revolutionizing the entire machine learning lifecycle, from training to +deployment. However, the substantial advancements in versatility and +performance these models offer come at a significant cost in terms of hardware +resources. To support the growth of these large models in a scalable and +environmentally sustainable way, there has been a considerable focus on +developing resource-efficient strategies. This survey delves into the critical +importance of such research, examining both algorithmic and systemic aspects. +It offers a comprehensive analysis and valuable insights gleaned from existing +literature, encompassing a broad array of topics from cutting-edge model +architectures and training/serving algorithms to practical system designs and +implementations. The goal of this survey is to provide an overarching +understanding of how current approaches are tackling the resource challenges +posed by large foundation models and to potentially inspire future +breakthroughs in this field. + +
+
+
+
+
+ + ☆ Predicting Next Useful Location With Context-Awareness: The + State-Of-The-Art + + +
+ Predicting the future location of mobile objects reinforces location-aware +services with proactive intelligence and helps businesses and decision-makers +with better planning and near real-time scheduling in different applications +such as traffic congestion control, location-aware advertisements, and +monitoring public health and well-being. The recent developments in the +smartphone and location sensors technology and the prevalence of using +location-based social networks alongside the improvements in artificial +intelligence and machine learning techniques provide an excellent opportunity +to exploit massive amounts of historical and real-time contextual information +to recognise mobility patterns and achieve more accurate and intelligent +predictions. This survey provides a comprehensive overview of the next useful +location prediction problem with context-awareness. First, we explain the +concepts of context and context-awareness and define the next location +prediction problem. Then we analyse nearly thirty studies in this field +concerning the prediction method, the challenges addressed, the datasets and +metrics used for training and evaluating the model, and the types of context +incorporated. Finally, we discuss the advantages and disadvantages of different +approaches, focusing on the usefulness of the predicted location and +identifying the open challenges and future work on this subject by introducing +two potential use cases of next location prediction in the automotive industry. + +
+
+
+
+
+ + ☆ Transformer-based approach for Ethereum Price Prediction Using + Crosscurrency correlation and Sentiment Analysis + + +
+ The research delves into the capabilities of a transformer-based neural network for Ethereum cryptocurrency price forecasting. The experiments are built around the hypothesis that cryptocurrency prices are strongly correlated with other cryptocurrencies and the sentiments around the cryptocurrency. The model employs a transformer architecture for several setups, from single-feature scenarios to complex configurations incorporating volume, sentiment, and correlated cryptocurrency prices. Despite a smaller dataset and less complex architecture, the transformer model surpasses ANN and MLP counterparts on some parameters. The conclusion presents a hypothesis on the illusion of causality in cryptocurrency price movements driven by sentiments.
+
+
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Augmenting Ground-Level PM2.5 Prediction via Kriging-Based Pseudo-Label + Generation NeurIPS 2023 + + +
+ Fusing abundant satellite data with sparse ground measurements constitutes a +major challenge in climate modeling. To address this, we propose a strategy to +augment the training dataset by introducing unlabeled satellite images paired +with pseudo-labels generated through a spatial interpolation technique known as +ordinary kriging, thereby making full use of the available satellite data +resources. We show that the proposed data augmentation strategy helps enhance +the performance of the state-of-the-art convolutional neural network-random +forest (CNN-RF) model by a reasonable amount, resulting in a noteworthy +improvement in spatial correlation and a reduction in prediction error. + +
+
+ comment: 8 pages, 4 figures, NeurIPS 2023 Workshop: Tackling Climate Change + with Machine Learning +
+
+
+
+
+ + ☆ Incremental Extractive Opinion Summarization Using Cover Trees + + +
+ Extractive opinion summarization involves automatically producing a summary +of text about an entity (e.g., a product's reviews) by extracting +representative sentences that capture prevalent opinions in the review set. +Typically, in online marketplaces user reviews accrue over time, and opinion +summaries need to be updated periodically to provide customers with up-to-date +information. In this work, we study the task of extractive opinion +summarization in an incremental setting, where the underlying review set +evolves over time. Many of the state-of-the-art extractive opinion +summarization approaches are centrality-based, such as CentroidRank. +CentroidRank performs extractive summarization by selecting a subset of review +sentences closest to the centroid in the representation space as the summary. +However, these methods are not capable of operating efficiently in an +incremental setting, where reviews arrive one at a time. In this paper, we +present an efficient algorithm for accurately computing the CentroidRank +summaries in an incremental setting. Our approach, CoverSumm, relies on +indexing review representations in a cover tree and maintaining a reservoir of +candidate summary review sentences. CoverSumm's efficacy is supported by a +theoretical and empirical analysis of running time. Empirically, on a diverse +collection of data (both real and synthetically created to illustrate scaling +considerations), we demonstrate that CoverSumm is up to 25x faster than +baseline methods, and capable of adapting to nuanced changes in data +distribution. We also conduct human evaluations of the generated summaries and +find that CoverSumm is capable of producing informative summaries consistent +with the underlying review set. + +
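+
+ For context, here is the plain batch CentroidRank selection that CoverSumm accelerates: embed the review sentences, compute their centroid, and return the k sentences closest to it. The incremental cover-tree index and candidate reservoir, which are the paper's contribution, are not reproduced; the embeddings below are random stand-ins for sentence-encoder outputs.
+
+ ```python
+ # Batch CentroidRank sketch: pick the k sentences closest to the centroid.
+ import numpy as np
+
+ def centroid_rank(sentence_embeddings: np.ndarray, k: int = 3) -> np.ndarray:
+     centroid = sentence_embeddings.mean(axis=0)
+     dists = np.linalg.norm(sentence_embeddings - centroid, axis=1)
+     return np.argsort(dists)[:k]              # indices of the summary sentences
+
+ rng = np.random.default_rng(0)
+ embeddings = rng.standard_normal((100, 384))  # stand-in sentence embeddings
+ print("summary sentence indices:", centroid_rank(embeddings))
+ ```
+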
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Calpric: Inclusive and Fine-grain Labeling of Privacy Policies with + Crowdsourcing and Active Learning USENIX Security 2023 + + +
+ A significant challenge to training accurate deep learning models on privacy policies is the cost and difficulty of obtaining a large and comprehensive set of training data. To address these challenges, we present Calpric, which combines automatic text selection and segmentation, active learning and the use of crowdsourced annotators to generate a large, balanced training set for privacy policies at low cost. Automated text selection and segmentation simplifies the labeling task, enabling untrained annotators from crowdsourcing platforms, like Amazon's Mechanical Turk, to be competitive with trained annotators, such as law students, and also reduces inter-annotator agreement, which decreases labeling cost. Having reliable labels for training enables the use of active learning, which uses fewer training samples to efficiently cover the input space, further reducing cost and improving class and data category balance in the data set. The combination of these techniques allows Calpric to produce models that are accurate over a wider range of data categories, and provide more detailed, fine-grain labels than previous work. Our crowdsourcing process enables Calpric to attain reliable labeled data at a cost of roughly $0.92-$1.71 per labeled text segment. Calpric's training process also generates a labeled data set of 16K privacy policy text segments across 9 data categories with balanced positive and negative samples.
+
+
+
+ comment: published at USENIX Security 2023; associated website: + https://www.usenix.org/conference/usenixsecurity23/presentation/qiu +
+
+
+
+
+ + ☆ BanglaNet: Bangla Handwritten Character Recognition using Ensembling of + Convolutional Neural Network + + +
+ Handwritten character recognition is a crucial task because of its abundant applications. The recognition task of Bangla handwritten characters is especially challenging because of the cursive nature of Bangla characters and the presence of compound characters with more than one way of writing. In this paper, a classification model based on the ensembling of several Convolutional Neural Networks (CNNs), named BanglaNet, is proposed to classify Bangla basic characters, compound characters, numerals, and modifiers. Three different models based on the ideas of state-of-the-art CNN models like Inception, ResNet, and DenseNet have been trained with both augmented and non-augmented inputs. Finally, all these models are averaged or ensembled to obtain the final model. Rigorous experimentation on three benchmark Bangla handwritten character datasets, namely CMATERdb, BanglaLekha-Isolated, and Ekush, has exhibited significant recognition accuracies compared to some recent CNN-based research. The top-1 recognition accuracies obtained are 98.40%, 97.65%, and 97.32%, and the top-3 accuracies are 99.79%, 99.74%, and 99.56% for the CMATERdb, BanglaLekha-Isolated, and Ekush datasets, respectively.
+
+
+
+
+
+
+ + ☆ Structure-based out-of-distribution (OOD) materials property prediction: + a benchmark study + + +
+ In real-world material research, machine learning (ML) models are usually +expected to predict and discover novel exceptional materials that deviate from +the known materials. It is thus a pressing question to provide an objective +evaluation of ML model performances in property prediction of +out-of-distribution (OOD) materials that are different from the training set +distribution. Traditional performance evaluation of materials property +prediction models through random splitting of the dataset frequently results in +artificially high performance assessments due to the inherent redundancy of +typical material datasets. Here we present a comprehensive benchmark study of +structure-based graph neural networks (GNNs) for extrapolative OOD materials +property prediction. We formulate five different categories of OOD ML problems +for three benchmark datasets from the MatBench study. Our extensive experiments +show that current state-of-the-art GNN algorithms significantly underperform +for the OOD property prediction tasks on average compared to their baselines in +the MatBench study, demonstrating a crucial generalization gap in realistic +material prediction tasks. We further examine the latent physical spaces of +these GNN models and identify the sources of CGCNN, ALIGNN, and DeeperGATGNN's +significantly more robust OOD performance than those of the current best models +in the MatBench study (coGN and coNGN), and provide insights to improve their +performance. + +
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Self-Imagine: Effective Unimodal Reasoning with Multimodal Models using + Self-Imagination + + +
+ The potential of Vision-Language Models (VLMs) often remains underutilized in handling complex text-based problems, particularly when these problems could benefit from visual representation. Resonating with humans' ability to solve complex text-based problems by (1) creating a visual diagram from the problem and (2) deducing what steps they need to take to solve it, we propose Self-Imagine. We leverage a single Vision-Language Model (VLM) to generate a structured representation of the question using HTML, then render the HTML as an image, and finally use the same VLM to answer the question using both the question and the image. Our approach does not require any additional training data or training. We evaluate our approach on three mathematics tasks and nine general-purpose reasoning tasks using a state-of-the-art VLM. Our approach boosts the performance of the VLM on all math tasks (GSM8K: +4.62%; ASDiv: +4.49%; SVAMP: +9.30%) and on the majority of the general-purpose reasoning tasks by 0.4% to 13.20%, while achieving comparable performance on the remaining tasks.
+ Code and data are available at https://github.com/snat1505027/self-imagine.
+
+
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ RiemannONets: Interpretable Neural Operators for Riemann Problems + + +
+ Developing the proper representations for simulating high-speed flows with +strong shock waves, rarefactions, and contact discontinuities has been a +long-standing question in numerical analysis. Herein, we employ neural +operators to solve Riemann problems encountered in compressible flows for +extreme pressure jumps (up to $10^{10}$ pressure ratio). In particular, we +first consider the DeepONet that we train in a two-stage process, following the +recent work of Lee and Shin, wherein the first stage, a basis is extracted from +the trunk net, which is orthonormalized and subsequently is used in the second +stage in training the branch net. This simple modification of DeepONet has a +profound effect on its accuracy, efficiency, and robustness and leads to very +accurate solutions to Riemann problems compared to the vanilla version. It also +enables us to interpret the results physically as the hierarchical data-driven +produced basis reflects all the flow features that would otherwise be +introduced using ad hoc feature expansion layers. We also compare the results +with another neural operator based on the U-Net for low, intermediate, and very +high-pressure ratios that are very accurate for Riemann problems, especially +for large pressure ratios, due to their multiscale nature but computationally +more expensive. Overall, our study demonstrates that simple neural network +architectures, if properly pre-trained, can achieve very accurate solutions of +Riemann problems for real-time forecasting. + +
+
+
+
+
+ + ☆ Evaluating the Utility of Conformal Prediction Sets for AI-Advised Image + Labeling + + +
+ As deep neural networks are more commonly deployed in high-stakes domains, their lack of interpretability makes uncertainty quantification challenging. We investigate the effects of presenting conformal prediction sets, a method for generating valid confidence sets in distribution-free uncertainty quantification, to express uncertainty in AI-advised decision-making. Through a large pre-registered experiment, we compare the utility of conformal prediction sets to displays of Top-1 and Top-k predictions for AI-advised image labeling. We find that the utility of prediction sets for accuracy varies with the difficulty of the task: while they result in accuracy on par with or less than Top-1 and Top-k displays for easy images, prediction sets excel at assisting humans in labeling out-of-distribution (OOD) images, especially when the set size is small. Our results empirically pinpoint the practical challenges of conformal prediction sets and provide implications on how to incorporate them for real-world decision-making.
+
+
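+
+ For readers unfamiliar with the method being evaluated, below is a minimal split conformal prediction set sketch using the simple softmax-threshold score; the study above may rely on a different conformal score, and the calibration data here are synthetic stand-ins rather than real classifier outputs.
+
+ ```python
+ # Split conformal prediction sets with the "1 - true-class probability" score.
+ import numpy as np
+
+ def calibrate_threshold(cal_proba, cal_labels, alpha=0.1):
+     # Nonconformity score: 1 - probability assigned to the true class.
+     scores = 1.0 - cal_proba[np.arange(len(cal_labels)), cal_labels]
+     n = len(scores)
+     return np.quantile(scores, np.ceil((n + 1) * (1 - alpha)) / n, method="higher")
+
+ def prediction_set(test_proba, qhat):
+     # Include every class whose score clears the calibrated threshold.
+     return [np.where(1.0 - p <= qhat)[0] for p in test_proba]
+
+ rng = np.random.default_rng(0)
+ cal_proba = rng.dirichlet(np.ones(5), size=200)      # stand-in probabilities
+ cal_labels = rng.integers(0, 5, size=200)
+ qhat = calibrate_threshold(cal_proba, cal_labels)
+ print(prediction_set(rng.dirichlet(np.ones(5), size=3), qhat))
+ ```
+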
+
+ comment: 28 pages, 11 figures, 8 tables +
+
+
+
+
+ + ☆ DCRMTA: Unbiased Causal Representation for Multi-touch Attribution + + +
+ Multi-touch attribution (MTA) currently plays a pivotal role in achieving a fair estimation of the contributions of each advertising touchpoint towards conversion behavior, deeply influencing budget allocation and advertising recommendation. Traditional multi-touch attribution methods initially build a conversion prediction model, anticipating learning the inherent relationship between touchpoint sequences and user purchasing behavior from historical data. Based on this, counterfactual touchpoint sequences are constructed from the original sequence subset, and conversions are estimated using the prediction model, thus calculating advertising contributions. A covert assumption of these methods is the unbiased nature of conversion prediction models. However, due to confounding factors arising from user preferences and internet recommendation mechanisms, such as the homogenization of ad recommendations resulting from past shopping records, bias can easily occur in conversion prediction models trained on observational data. This paper redefines the causal effect of user features on conversions and proposes a novel end-to-end approach, Deep Causal Representation for MTA (DCRMTA). Our model, while eliminating confounding variables, extracts features with causal relations to conversions from users. Furthermore, extensive experiments on both synthetic and real-world Criteo data demonstrate DCRMTA's superior performance in conversion prediction across varying data distributions, while also effectively attributing value across different advertising channels.
+
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ MambaTab: A Simple Yet Effective Approach for Handling Tabular Data + + +
+ Tabular data remains ubiquitous across domains despite growing use of images +and texts for machine learning. While deep learning models like convolutional +neural networks and transformers achieve strong performance on tabular data, +they require extensive data preprocessing, tuning, and resources, limiting +accessibility and scalability. This work develops an innovative approach based +on a structured state-space model (SSM), MambaTab, for tabular data. SSMs have +strong capabilities for efficiently extracting effective representations from +data with long-range dependencies. MambaTab leverages Mamba, an emerging SSM +variant, for end-to-end supervised learning on tables. Compared to +state-of-the-art baselines, MambaTab delivers superior performance while +requiring significantly fewer parameters and minimal preprocessing, as +empirically validated on diverse benchmark datasets. MambaTab's efficiency, +scalability, generalizability, and predictive gains signify it as a +lightweight, "out-of-the-box" solution for diverse tabular data with promise +for enabling wider practical applications. + +
+
+
+
+
+ + ☆ The Effect of Intrinsic Dataset Properties on Generalization: Unraveling + Learning Differences Between Natural and Medical Images ICLR 2024 + + +
+ This paper investigates discrepancies in how neural networks learn from +different imaging domains, which are commonly overlooked when adopting computer +vision techniques from the domain of natural images to other specialized +domains such as medical images. Recent works have found that the generalization +error of a trained network typically increases with the intrinsic dimension +($d_{data}$) of its training set. Yet, the steepness of this relationship +varies significantly between medical (radiological) and natural imaging +domains, with no existing theoretical explanation. We address this gap in +knowledge by establishing and empirically validating a generalization scaling +law with respect to $d_{data}$, and propose that the substantial scaling +discrepancy between the two considered domains may be at least partially +attributed to the higher intrinsic "label sharpness" ($K_F$) of medical imaging +datasets, a metric which we propose. Next, we demonstrate an additional benefit +of measuring the label sharpness of a training set: it is negatively correlated +with the trained model's adversarial robustness, which notably leads to models +for medical images having a substantially higher vulnerability to adversarial +attack. Finally, we extend our $d_{data}$ formalism to the related metric of +learned representation intrinsic dimension ($d_{repr}$), derive a +generalization scaling law with respect to $d_{repr}$, and show that $d_{data}$ +serves as an upper bound for $d_{repr}$. Our theoretical results are supported +by thorough experiments with six models and eleven natural and medical imaging +datasets over a range of training set sizes. Our findings offer insights into +the influence of intrinsic dataset properties on generalization, representation +learning, and robustness in deep neural networks. + +
+
+ comment: ICLR 2024. Code: + https://github.com/mazurowski-lab/intrinsic-properties +
+
+
+
+
+ + ☆ Binaural Angular Separation Network ICASSP 2024 + + +
+ We propose a neural network model that can separate target speech sources +from interfering sources at different angular regions using two microphones. +The model is trained with simulated room impulse responses (RIRs) using +omni-directional microphones without needing to collect real RIRs. By relying +on specific angular regions and multiple room simulations, the model utilizes +consistent time difference of arrival (TDOA) cues, or what we call delay +contrast, to separate target and interference sources while remaining robust in +various reverberation environments. We demonstrate the model is not only +generalizable to a commercially available device with a slightly different +microphone geometry, but also outperforms our previous work which uses one +additional microphone on the same device. The model runs in real-time on-device +and is suitable for low-latency streaming applications such as telephony and +video conferencing. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ☆ Robust Localization of Key Fob Using Channel Impulse Response of Ultra + Wide Band Sensors for Keyless Entry Systems + + +
+ Using neural networks to localize a key fob within and around a car as a
+security feature for keyless entry is fast emerging. In this paper we: 1) study
+the performance of pre-computed features for neural-network-based UWB (ultra
+wide band) localization classification, which forms the baseline of our
+experiments; 2) investigate the inherent robustness of various neural networks,
+studying their robustness to adversarial examples without any adversarial
+training; and 3) propose a multi-head self-supervised neural network
+architecture which outperforms the baseline neural networks without any
+adversarial training. The model's performance improved by 67% at certain ranges
+of adversarial magnitude for the fast gradient sign method and by 37% each for
+the basic iterative method and the projected gradient descent method.
+
+
+
+
+
+
+ + ☆ Semi-Supervised Learning Approach for Efficient Resource Allocation with + Network Slicing in O-RAN + + +
+ The Open Radio Access Network (O-RAN) technology has emerged as a promising +solution for network operators, providing them with an open and favorable +environment. Ensuring effective coordination of x-applications (xAPPs) is +crucial to enhance flexibility and optimize network performance within the +O-RAN. In this paper, we introduce an innovative approach to the resource +allocation problem, aiming to coordinate multiple independent xAPPs for network +slicing and resource allocation in O-RAN. Our proposed method focuses on +maximizing the weighted throughput among user equipments (UE), as well as +allocating physical resource blocks (PRBs). We prioritize two service types, +namely enhanced Mobile Broadband and Ultra Reliable Low Latency Communication. +To achieve this, we have designed two xAPPs: a power control xAPP for each UE +and a PRB allocation xAPP. The proposed method consists of a two-part training +phase, where the first part uses supervised learning with a Variational +Autoencoder trained to regress the power transmission as well as the user +association and PRB allocation decisions, and the second part uses unsupervised +learning with a contrastive loss approach to improve the generalization and +robustness of the model. We evaluate the performance of our proposed method by +comparing its results to those obtained from an exhaustive search algorithm, +deep Q-network algorithm, and by reporting performance metrics for the +regression task. We also evaluate the proposed model's performance in different +scenarios among the service types. The results show that the proposed method is +a more efficient and effective solution for network slicing problems compared +to state-of-the-art methods. + +
+
+ comment: Submitted to IEEE Transactions on Network and Service Management +
+
+
+
+
+ + ☆ Shabari: Delayed Decision-Making for Faster and Efficient Serverless + Function + + +
+ Serverless computing relieves developers from the burden of resource +management, thus providing ease-of-use to the users and the opportunity to +optimize resource utilization for the providers. However, today's serverless +systems lack performance guarantees for function invocations, thus limiting +support for performance-critical applications: we observed severe performance +variability (up to 6x). Providers lack visibility into user functions and hence +find it challenging to right-size them: we observed heavy resource +underutilization (up to 80%). To understand the causes behind the performance +variability and underutilization, we conducted a measurement study of commonly +deployed serverless functions and learned that the function performance and +resource utilization depend crucially on function semantics and inputs. Our key +insight is to delay making resource allocation decisions until after the +function inputs are available. We introduce Shabari, a resource management +framework for serverless systems that makes decisions as late as possible to +right-size each invocation to meet functions' performance objectives (SLOs) and +improve resource utilization. Shabari uses an online learning agent to +right-size each function invocation based on the features of the function input +and makes cold-start-aware scheduling decisions. For a range of serverless +functions and inputs, Shabari reduces SLO violations by 11-73% while not +wasting any vCPUs and reducing wasted memory by 64-94% in the median case, +compared to state-of-the-art systems, including Aquatope, Parrotfish, and +Cypress. + +
+
+ comment: 17 pages, 14 figures +
+
+
+
+
+ + ☆ Using i-vectors for subject-independent cross-session EEG transfer + learning + + +
+ Cognitive load classification is the task of automatically determining an +individual's utilization of working memory resources during performance of a +task based on physiologic measures such as electroencephalography (EEG). In +this paper, we follow a cross-disciplinary approach, where tools and +methodologies from speech processing are used to tackle this problem. The +corpus we use was released publicly in 2021 as part of the first passive +brain-computer interface competition on cross-session workload estimation. We +present our approach which used i-vector-based neural network classifiers to +accomplish inter-subject cross-session EEG transfer learning, achieving 18% +relative improvement over equivalent subject-dependent models. We also report +experiments showing how our subject-independent models perform competitively on +held-out subjects and improve with additional subject data, suggesting that +subject-dependent training is not required for effective cognitive load +determination. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ REValueD: Regularised Ensemble Value-Decomposition for Factorisable + Markov Decision Processes ICLR 2024 + + +
+ Discrete-action reinforcement learning algorithms often falter in tasks with +high-dimensional discrete action spaces due to the vast number of possible +actions. A recent advancement leverages value-decomposition, a concept from +multi-agent reinforcement learning, to tackle this challenge. This study delves +deep into the effects of this value-decomposition, revealing that whilst it +curtails the over-estimation bias inherent to Q-learning algorithms, it +amplifies target variance. To counteract this, we present an ensemble of +critics to mitigate target variance. Moreover, we introduce a regularisation +loss that helps to mitigate the effects that exploratory actions in one +dimension can have on the value of optimal actions in other dimensions. Our +novel algorithm, REValueD, tested on discretised versions of the DeepMind +Control Suite tasks, showcases superior performance, especially in the +challenging humanoid and dog tasks. We further dissect the factors influencing +REValueD's performance, evaluating the significance of the regularisation loss +and the scalability of REValueD with increasing sub-actions per dimension. + +
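To make the value-decomposition idea concrete, each action dimension can receive its own utility head whose mean forms the joint Q-value, with an ensemble of such critics averaged when forming targets to damp variance. The sketch below is a hedged approximation; the network sizes, mean-based ensembling, and names are assumptions for illustration, not the REValueD release.

```python
import torch
import torch.nn as nn

class DecomposedCritic(nn.Module):
    """One critic: a utility head per action dimension; Q is their mean."""
    def __init__(self, state_dim, n_dims, n_bins, hidden=256):
        super().__init__()
        self.trunk = nn.Sequential(nn.Linear(state_dim, hidden), nn.ReLU())
        self.heads = nn.ModuleList(nn.Linear(hidden, n_bins) for _ in range(n_dims))

    def forward(self, state):
        h = self.trunk(state)
        # (batch, n_dims, n_bins): one utility vector per action dimension.
        return torch.stack([head(h) for head in self.heads], dim=1)

def ensemble_target(critics, next_state, reward, gamma=0.99):
    """Average greedy per-dimension values across an ensemble to reduce target variance."""
    with torch.no_grad():
        per_critic = torch.stack([c(next_state).max(dim=-1).values.mean(dim=-1)
                                  for c in critics])        # (ensemble, batch)
        return reward + gamma * per_critic.mean(dim=0)       # (batch,)
```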
+
+ comment: To appear in ICLR 2024 +
+
+
+
+
+ + ☆ RIDGE: Reproducibility, Integrity, Dependability, Generalizability, and + Efficiency Assessment of Medical Image Segmentation Models + + +
+ Deep learning techniques, despite their potential, often suffer from a lack +of reproducibility and generalizability, impeding their clinical adoption. +Image segmentation is one of the critical tasks in medical image analysis, in +which one or several regions/volumes of interest should be annotated. This +paper introduces the RIDGE checklist, a framework for assessing the +Reproducibility, Integrity, Dependability, Generalizability, and Efficiency of +deep learning-based medical image segmentation models. The checklist serves as +a guide for researchers to enhance the quality and transparency of their work, +ensuring that segmentation models are not only scientifically sound but also +clinically relevant. + +
+
+ comment: 20 pages, 1 Figure, 1 Table +
+
+
+
+
+ + ☆ Stochastic Subnetwork Annealing: A Regularization Technique for Fine + Tuning Pruned Subnetworks ICLR-2024 + + +
+ Pruning methods have recently grown in popularity as an effective way to
+reduce the size and computational complexity of deep neural networks. Large
+numbers of parameters can be removed from trained models with little
+discernible loss in accuracy after a small number of continued training epochs.
+However, pruning too many parameters at once often causes an initial steep drop
+in accuracy which can undermine convergence quality. Iterative pruning
+approaches mitigate this by gradually removing a small number of parameters
+over multiple epochs. However, this can still lead to subnetworks that overfit
+local regions of the loss landscape. We introduce a novel and effective
+approach to tuning subnetworks through a regularization technique we call
+Stochastic Subnetwork Annealing. Instead of removing parameters in a discrete
+manner, we represent subnetworks with stochastic masks in which each parameter
+has a probabilistic chance of being included or excluded on any given forward
+pass. We anneal these probabilities over time such that the subnetwork
+structure slowly evolves as mask values become more deterministic, allowing for
+a smoother and more robust optimization of subnetworks at high levels of
+sparsity.
+
+
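A minimal sketch of what annealed stochastic masking could look like during fine-tuning; the linear schedule, starting probability, and variable names are assumptions rather than the paper's exact recipe.

```python
import torch

def annealed_keep_prob(step, total_steps, start_prob=0.5):
    """Inclusion probability of parameters slated for pruning, annealed toward 0."""
    frac = min(step / total_steps, 1.0)
    return start_prob * (1.0 - frac)

def sample_mask(prune_targets, keep_prob):
    """prune_targets: bool tensor marking weights targeted for removal.
    Targeted weights survive a given forward pass with probability keep_prob;
    all other weights are always kept."""
    random_keep = torch.bernoulli(torch.full(prune_targets.shape, keep_prob))
    return torch.where(prune_targets, random_keep, torch.ones_like(random_keep))

# Example: mask resampled and applied to a weight tensor on each forward pass.
weight = torch.randn(256, 256)
prune_targets = torch.rand_like(weight) < 0.9           # 90% sparsity target
p = annealed_keep_prob(step=300, total_steps=1000)
masked_weight = weight * sample_mask(prune_targets, p)
```

As the keep probability shrinks, the mask becomes effectively deterministic and the surviving subnetwork can be fine-tuned as usual.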
+
+ comment: 9 pages, 2 figures; Rejected at ICLR-2024; Revised and updated with + new experiments; Submitted to WCCI-2024 +
+
+
+
+
+ + ♻ ☆ The Memory Perturbation Equation: Understanding Model's Sensitivity to + Data NeurIPS + 2023 + + +
+ Understanding model's sensitivity to its training data is crucial but can +also be challenging and costly, especially during training. To simplify such +issues, we present the Memory-Perturbation Equation (MPE) which relates model's +sensitivity to perturbation in its training data. Derived using Bayesian +principles, the MPE unifies existing sensitivity measures, generalizes them to +a wide-variety of models and algorithms, and unravels useful properties +regarding sensitivities. Our empirical results show that sensitivity estimates +obtained during training can be used to faithfully predict generalization on +unseen test data. The proposed equation is expected to be useful for future +research on robust and adaptive learning. + +
+
+ comment: 37th Conference on Neural Information Processing Systems (NeurIPS + 2023) +
+
+
+
+
+ + ♻ ☆ GAIA: Delving into Gradient-based Attribution Abnormality for + Out-of-distribution Detection NeurIPS2023 + + +
+ Detecting out-of-distribution (OOD) examples is crucial to guarantee the +reliability and safety of deep neural networks in real-world settings. In this +paper, we offer an innovative perspective on quantifying the disparities +between in-distribution (ID) and OOD data -- analyzing the uncertainty that +arises when models attempt to explain their predictive decisions. This +perspective is motivated by our observation that gradient-based attribution +methods encounter challenges in assigning feature importance to OOD data, +thereby yielding divergent explanation patterns. Consequently, we investigate +how attribution gradients lead to uncertain explanation outcomes and introduce +two forms of abnormalities for OOD detection: the zero-deflation abnormality +and the channel-wise average abnormality. We then propose GAIA, a simple and +effective approach that incorporates Gradient Abnormality Inspection and +Aggregation. The effectiveness of GAIA is validated on both commonly utilized +(CIFAR) and large-scale (ImageNet-1k) benchmarks. Specifically, GAIA reduces +the average FPR95 by 23.10% on CIFAR10 and by 45.41% on CIFAR100 compared to +advanced post-hoc methods. + +
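The abstract leaves the abnormality measures at a high level; one plausible reading of the zero-deflation abnormality is the fraction of exactly-zero entries in the attribution gradients of an intermediate feature map, as sketched below. The hook placement, scoring choice, and names are assumptions, not the released GAIA code.

```python
import torch

def zero_deflation_score(model, feature_layer, x):
    """Per-sample fraction of zero attribution gradients at a chosen layer;
    larger fractions are taken as a hint that the input is out-of-distribution."""
    feats = {}
    handle = feature_layer.register_forward_hook(lambda m, i, o: feats.update(h=o))
    logits = model(x)
    handle.remove()
    h = feats["h"]
    # Gradient of the predicted-class score w.r.t. the intermediate feature map.
    score = logits.max(dim=1).values.sum()
    grads = torch.autograd.grad(score, h)[0]
    return (grads == 0).float().mean(dim=tuple(range(1, grads.dim())))
```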
+
+ comment: Accepted by NeurIPS2023 +
+
+
+
+
+ + ♻ ☆ CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language + Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insights into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023; updated with CLadder dataset v1.5 +
+
+
+
+
+ + ♻ ☆ RLPlanner: Reinforcement Learning based Floorplanning for Chiplets with + Fast Thermal Analysis + + +
+ Chiplet-based systems have gained significant attention in recent years due +to their low cost and competitive performance. As the complexity and +compactness of a chiplet-based system increase, careful consideration must be +given to microbump assignments, interconnect delays, and thermal limitations +during the floorplanning stage. This paper introduces RLPlanner, an efficient +early-stage floorplanning tool for chiplet-based systems with a novel fast +thermal evaluation method. RLPlanner employs advanced reinforcement learning to +jointly minimize total wirelength and temperature. To alleviate the +time-consuming thermal calculations, RLPlanner incorporates the developed fast +thermal evaluation method to expedite the iterations and optimizations. +Comprehensive experiments demonstrate that our proposed fast thermal evaluation +method achieves a mean absolute error (MAE) of 0.25 K and delivers over 120x +speed-up compared to the open-source thermal solver HotSpot. When integrated +with our fast thermal evaluation method, RLPlanner achieves an average +improvement of 20.28\% in minimizing the target objective (a combination of +wirelength and temperature), within a similar running time, compared to the +classic simulated annealing method with HotSpot. + +
+
+
+
+
+ + ♻ ☆ Deep learning based Image Compression for Microscopy Images: An + Empirical Study + + +
+ With the fast development of modern microscopes and bioimaging techniques, an
+unprecedentedly large amount of imaging data is being generated, stored,
+analyzed, and even shared through networks. The size of the data poses great
+challenges for current data infrastructure. One common way to reduce the data
+size is by image compression. The present study analyzes classic and deep
+learning based image compression methods, and their impact on deep learning
+based image processing models. Deep learning based label-free prediction models
+(i.e., predicting fluorescent images from bright field images) are used as an
+example application for comparison and analysis. Effective image compression
+methods could help reduce the data size significantly without losing necessary
+information, and therefore reduce the burden on data management infrastructure
+and permit fast transmission through the network for data sharing or cloud
+computing. To compress images in such a desirable way, multiple classical lossy
+image compression techniques are compared to several AI-based compression
+models provided by and trained with the CompressAI toolbox in Python. These
+different compression techniques are compared in terms of compression ratio,
+multiple image similarity measures and, most importantly, the prediction
+accuracy of label-free models on compressed images. We found that AI-based
+compression techniques largely outperform the classic ones and will minimally
+affect the downstream label-free task in 2D cases. In the end, we hope the
+present study could shed light on the potential of deep learning based image
+compression and the impact of image compression on downstream deep learning
+based image analysis models.
+
+
+
+ comment: - Update github link; - correct the author name; - update the table + (correct some errors during calculation); - update the implementation detail + section and the discussion section +
+
+
+
+
+ + ♻ ☆ ENN: A Neural Network with DCT Adaptive Activation Functions SP + + +
+ The expressiveness of neural networks highly depends on the nature of the
+activation function, although these are usually assumed predefined and fixed
+during the training stage. Under a signal processing perspective, in this paper
+we present the Expressive Neural Network (ENN), a novel model in which the
+non-linear activation functions are modeled using the Discrete Cosine Transform
+(DCT) and adapted using backpropagation during training. This parametrization
+keeps the number of trainable parameters low, is appropriate for gradient-based
+schemes, and adapts to different learning tasks. This is the first non-linear
+model for activation functions that relies on a signal processing perspective,
+providing high flexibility and expressiveness to the network. We contribute
+insights into the explainability of the network at convergence by recovering
+the concept of a bump, that is, the response of each activation function in the
+output space. Finally, through exhaustive experiments we show that the model
+can adapt to classification and regression tasks. ENN outperforms
+state-of-the-art benchmarks, providing an accuracy gap of more than 40% in some
+scenarios.
+
+
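A hedged sketch of an activation function parameterized by a truncated cosine (DCT-style) expansion with coefficients trained by backpropagation; the basis size, input normalization, and initialization below are illustrative assumptions, not the paper's exact construction.

```python
import math
import torch
import torch.nn as nn

class DCTActivation(nn.Module):
    """Nonlinearity expressed as a truncated DCT series with trainable coefficients."""
    def __init__(self, n_coeffs=16, x_range=4.0):
        super().__init__()
        self.coeffs = nn.Parameter(0.1 * torch.randn(n_coeffs))
        self.x_range = x_range  # inputs clipped to [-x_range, x_range], mapped to [0, 1]

    def forward(self, x):
        t = (x.clamp(-self.x_range, self.x_range) / self.x_range + 1.0) / 2.0
        k = torch.arange(self.coeffs.numel(), device=x.device, dtype=x.dtype)
        basis = torch.cos(math.pi * k * t.unsqueeze(-1))   # cosine basis at each input
        return (basis * self.coeffs).sum(dim=-1)

# Example: drop-in replacement for a fixed activation in a small MLP.
net = nn.Sequential(nn.Linear(8, 32), DCTActivation(), nn.Linear(32, 1))
```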
+
+ comment: Paper accepted in IEEE Journal of Selected Topics in Signal + Processing (JSTSP) Special Series on AI in Signal & Data Science - Toward + Explainable, Reliable, and Sustainable Machine Learning +
+
+
+
+
+ + ♻ ☆ AQuA: A Benchmarking Tool for Label Quality Assessment NeurIPS 2023 + + +
+ Machine learning (ML) models are only as good as the data they are trained +on. But recent studies have found datasets widely used to train and evaluate ML +models, e.g. ImageNet, to have pervasive labeling errors. Erroneous labels on +the train set hurt ML models' ability to generalize, and they impact evaluation +and model selection using the test set. Consequently, learning in the presence +of labeling errors is an active area of research, yet this field lacks a +comprehensive benchmark to evaluate these methods. Most of these methods are +evaluated on a few computer vision datasets with significant variance in the +experimental protocols. With such a large pool of methods and inconsistent +evaluation, it is also unclear how ML practitioners can choose the right models +to assess label quality in their data. To this end, we propose a benchmarking +environment AQuA to rigorously evaluate methods that enable machine learning in +the presence of label noise. We also introduce a design space to delineate +concrete design choices of label error detection models. We hope that our +proposed design space and benchmark enable practitioners to choose the right +tools to improve their label quality and that our benchmark enables objective +and rigorous evaluation of machine learning tools facing mislabeled data. + +
+
+ comment: Accepted at the 37th Conference on Neural Information Processing + Systems (NeurIPS 2023) Track on Datasets and Benchmarks. Source code can be + found at www.github.com/autonlab/aqua/ +
+
+
+
+
+ + ♻ ☆ How to Turn Your Knowledge Graph Embeddings into Generative Models + + +
+ Some of the most successful knowledge graph embedding (KGE) models for link
+prediction -- CP, RESCAL, TuckER, ComplEx -- can be interpreted as energy-based
+models. Under this perspective, they are not amenable to exact
+maximum-likelihood estimation (MLE) or sampling, and they struggle to integrate
+logical constraints. This work re-interprets the score functions of these KGEs
+as circuits -- constrained computational graphs allowing efficient
+marginalisation. Then, we design two recipes to obtain efficient generative
+circuit models by either restricting their activations to be non-negative or
+squaring their outputs. Our interpretation comes with little or no loss of
+performance for link prediction, while the circuits framework unlocks exact
+learning by MLE, efficient sampling of new triples, and the guarantee that
+logical constraints are satisfied by design. Furthermore, our models scale more
+gracefully than the original KGEs on graphs with millions of entities.
+
+
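To illustrate the "squaring" recipe, squaring a CP-style score yields a distribution whose normalizer factorizes over the embedding Gram matrices and can be computed exactly; the snippet below sketches that computation under assumed tensor names and is not the authors' implementation.

```python
import torch

def cp_score(E_h, W_r, E_t, h, r, t):
    """Trilinear CP score for a triple (h, r, t)."""
    return (E_h[h] * W_r[r] * E_t[t]).sum()

def squared_cp_log_prob(E_h, W_r, E_t, h, r, t):
    """log p(h, r, t) with p proportional to the squared CP score.

    The partition function Z = sum_{h,r,t} score(h,r,t)^2 factorizes as
    sum_{k,l} (E_h^T E_h)_{kl} (W_r^T W_r)_{kl} (E_t^T E_t)_{kl},
    so normalization costs O(d^2) instead of enumerating all triples.
    """
    log_z = torch.log(((E_h.T @ E_h) * (W_r.T @ W_r) * (E_t.T @ E_t)).sum())
    return 2.0 * torch.log(cp_score(E_h, W_r, E_t, h, r, t).abs() + 1e-12) - log_z
```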
+
+
+
+
+ + ♻ ☆ Diffusion Language Models Generation Can Be Halted Early + + +
+ Diffusion Language models (DLMs) are a promising avenue for text generation
+due to their tractable and controllable generation. They also have the
+advantage of not having to predict text autoregressively. However, despite
+these notable features, DLMs have not yet reached the performance levels of
+their autoregressive counterparts. One of the ways to reduce the performance
+gap between these two types of language models is to speed up the generation of
+DLMs. Therefore, we propose a pioneering methodology to address this issue in
+this work. It enables the execution of more generation steps within a given
+time frame, potentially leading to higher-quality outputs. Specifically, our
+methods estimate the completeness of a DLM's text generation and allow adaptive
+halting of the generation process. We test and refine our methods on Plaid,
+SSD, and CDCD DLMs and create a cohesive perspective on their generation
+workflows. Finally, we confirm that our methods allow halting Plaid, SSD, and
+CDCD models and decrease the generation time by $10$-$40$% without a drop in
+the quality of model samples.
+
+
+
+
+
+
+ + ♻ ☆ Iterative Regularization with k-support Norm: An Important Complement to + Sparse Recovery AAAI 2024 + + +
+ Sparse recovery is ubiquitous in machine learning and signal processing. Due +to the NP-hard nature of sparse recovery, existing methods are known to suffer +either from restrictive (or even unknown) applicability conditions, or high +computational cost. Recently, iterative regularization methods have emerged as +a promising fast approach because they can achieve sparse recovery in one pass +through early stopping, rather than the tedious grid-search used in the +traditional methods. However, most of those iterative methods are based on the +$\ell_1$ norm which requires restrictive applicability conditions and could +fail in many cases. Therefore, achieving sparse recovery with iterative +regularization methods under a wider range of conditions has yet to be further +explored. To address this issue, we propose a novel iterative regularization +algorithm, IRKSN, based on the $k$-support norm regularizer rather than the +$\ell_1$ norm. We provide conditions for sparse recovery with IRKSN, and +compare them with traditional conditions for recovery with $\ell_1$ norm +regularizers. Additionally, we give an early stopping bound on the model error +of IRKSN with explicit constants, achieving the standard linear rate for sparse +recovery. Finally, we illustrate the applicability of our algorithm on several +experiments, including a support recovery experiment with a correlated design +matrix. + +
+
+ comment: Accepted at AAAI 2024. Code at + https://github.com/wdevazelhes/IRKSN_AAAI2024 +
+
+
+
+
+ + ♻ ☆ Interpolation of mountain weather forecasts by machine learning + + +
+ Recent advances in numerical simulation methods based on physical models and
+their combination with machine learning have improved the accuracy of weather
+forecasts. However, the accuracy decreases in complex terrains such as
+mountainous regions because these methods usually use grids of several
+kilometers square and simple machine learning models. While deep learning has
+also made significant progress in recent years, it is difficult for direct
+applications of deep learning to utilize the physical knowledge used in the
+simulations. This paper proposes a method that uses machine learning to
+interpolate future weather in mountainous regions using forecast data from
+surrounding plains and past observed data, in order to improve weather
+forecasts in mountainous regions. We focus on mountainous regions in Japan and
+predict temperature and precipitation mainly using LightGBM as a machine
+learning model. Despite the use of a small dataset, through feature engineering
+and model tuning, our method partially achieves improvements in the RMSE with
+significantly less training time.
+
+
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Attention, Distillation, and Tabularization: Towards Practical Neural + Network-Based Prefetching + + +
+ Attention-based Neural Networks (NN) have demonstrated their effectiveness in +accurate memory access prediction, an essential step in data prefetching. +However, the substantial computational overheads associated with these models +result in high inference latency, limiting their feasibility as practical +prefetchers. To close the gap, we propose a new approach based on +tabularization that significantly reduces model complexity and inference +latency without sacrificing prediction accuracy. Our novel tabularization +methodology takes as input a distilled, yet highly accurate attention-based +model for memory access prediction and efficiently converts its expensive +matrix multiplications into a hierarchy of fast table lookups. As an exemplar +of the above approach, we develop DART, a prefetcher comprised of a simple +hierarchy of tables. With a modest 0.09 drop in F1-score, DART reduces 99.99% +of arithmetic operations from the large attention-based model and 91.83% from +the distilled model. DART accelerates the large model inference by 170x and the +distilled model by 9.4x. DART has comparable latency and storage costs as +state-of-the-art rule-based prefetcher BO but surpasses it by 6.1% in IPC +improvement. DART outperforms state-of-the-art NN-based prefetchers TransFetch +by 33.1% and Voyager by 37.2% in terms of IPC improvement, primarily due to its +low prefetching latency. + +
+
+
+
+
+ + ♻ ☆ StemGen: A music generation model that listens ICASSP 2024 + + +
+ End-to-end generation of musical audio using deep learning techniques has +seen an explosion of activity recently. However, most models concentrate on +generating fully mixed music in response to abstract conditioning information. +In this work, we present an alternative paradigm for producing music generation +models that can listen and respond to musical context. We describe how such a +model can be constructed using a non-autoregressive, transformer-based model +architecture and present a number of novel architectural and sampling +improvements. We train the described architecture on both an open-source and a +proprietary dataset. We evaluate the produced models using standard quality +metrics and a new approach based on music information retrieval descriptors. +The resulting model reaches the audio quality of state-of-the-art +text-conditioned models, as well as exhibiting strong musical coherence with +its context. + +
+
+ comment: Accepted for publication at ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ FreqFed: A Frequency Analysis-Based Approach for Mitigating Poisoning + Attacks in Federated Learning NDSS + + +
+ Federated learning (FL) is a collaborative learning paradigm allowing
+multiple clients to jointly train a model without sharing their training data.
+However, FL is susceptible to poisoning attacks, in which the adversary injects
+manipulated model updates into the federated model aggregation process to
+corrupt or destroy predictions (untargeted poisoning) or implant hidden
+functionalities (targeted poisoning or backdoors). Existing defenses against
+poisoning attacks in FL have several limitations, such as relying on specific
+assumptions about attack types, strategies, or data distributions, or failing
+to remain sufficiently robust against advanced injection techniques and
+strategies while simultaneously maintaining the utility of the aggregated
+model. To address the deficiencies of existing defenses, we take a generic and
+completely different approach to detect poisoning (targeted and untargeted)
+attacks. We present FreqFed, a novel aggregation mechanism that transforms the
+model updates (i.e., weights) into the frequency domain, where we can identify
+the core frequency components that carry sufficient information about the
+weights. This allows us to effectively filter out malicious updates during
+local training on the clients, regardless of attack types, strategies, and
+clients' data distributions. We extensively evaluate the efficiency and
+effectiveness of FreqFed in different application domains, including image
+classification, word prediction, IoT intrusion detection, and speech
+recognition. We demonstrate that FreqFed can mitigate poisoning attacks
+effectively with a negligible impact on the utility of the aggregated model.
+
+
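A hedged sketch of the frequency-domain idea: each flattened client update is transformed with a DCT, its low-frequency components are clustered, and the majority cluster is aggregated. The clustering algorithm, number of retained coefficients, and names are assumptions for illustration rather than the FreqFed implementation.

```python
import numpy as np
from scipy.fft import dct
from sklearn.cluster import KMeans

def frequency_filtered_aggregate(client_updates, n_low=1000):
    """client_updates: list of 1-D numpy arrays holding flattened model deltas."""
    # Keep only low-frequency DCT coefficients, which carry most weight information.
    spectra = np.stack([dct(u, norm="ortho")[:n_low] for u in client_updates])
    # Partition the updates and treat the larger cluster as presumed benign.
    labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(spectra)
    majority = np.argmax(np.bincount(labels))
    benign = [u for u, lab in zip(client_updates, labels) if lab == majority]
    return np.mean(benign, axis=0)
```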
+
+ comment: To appear in the Network and Distributed System Security (NDSS) + Symposium 2024. 16 pages, 8 figures, 12 tables, 1 algorithm, 3 equations +
+
+
+
+
+ + ♻ ☆ How do Minimum-Norm Shallow Denoisers Look in Function Space? + + +
+ Neural network (NN) denoisers are an essential building block in many common +tasks, ranging from image reconstruction to image generation. However, the +success of these models is not well understood from a theoretical perspective. +In this paper, we aim to characterize the functions realized by shallow ReLU NN +denoisers -- in the common theoretical setting of interpolation (i.e., zero +training loss) with a minimal representation cost (i.e., minimal $\ell^2$ norm +weights). First, for univariate data, we derive a closed form for the NN +denoiser function, find it is contractive toward the clean data points, and +prove it generalizes better than the empirical MMSE estimator at a low noise +level. Next, for multivariate data, we find the NN denoiser functions in a +closed form under various geometric assumptions on the training data: data +contained in a low-dimensional subspace, data contained in a union of one-sided +rays, or several types of simplexes. These functions decompose into a sum of +simple rank-one piecewise linear interpolations aligned with edges and/or faces +connecting training samples. We empirically verify this alignment phenomenon on +synthetic data and real images. + +
+
+ comment: Thirty-seventh Conference on Neural Information Processing Systems +
+
+
+
+
+ + ♻ ☆ Tiny-VBF: Resource-Efficient Vision Transformer based Lightweight + Beamformer for Ultrasound Single-Angle Plane Wave Imaging DATE 2024 + + +
+ Accelerating compute intensive non-real-time beam-forming algorithms in +ultrasound imaging using deep learning architectures has been gaining momentum +in the recent past. Nonetheless, the complexity of the state-of-the-art deep +learning techniques poses challenges for deployment on resource-constrained +edge devices. In this work, we propose a novel vision transformer based tiny +beamformer (Tiny-VBF), which works on the raw radio-frequency channel data +acquired through single-angle plane wave insonification. The output of our +Tiny-VBF provides fast envelope detection requiring very low frame rate, i.e. +0.34 GOPs/Frame for a frame size of 368 x 128 in comparison to the +state-of-the-art deep learning models. It also exhibited an 8% increase in +contrast and gains of 5% and 33% in axial and lateral resolution respectively +when compared to Tiny-CNN on in-vitro dataset. Additionally, our model showed a +4.2% increase in contrast and gains of 4% and 20% in axial and lateral +resolution respectively when compared against conventional Delay-and-Sum (DAS) +beamformer. We further propose an accelerator architecture and implement our +Tiny-VBF model on a Zynq UltraScale+ MPSoC ZCU104 FPGA using a hybrid +quantization scheme with 50% less resource consumption compared to the +floating-point implementation, while preserving the image quality. + +
+
+ comment: 6 pages, DATE 2024 +
+
+
+
+
+ + ♻ ☆ Translatotron 3: Speech to Speech Translation with Monolingual Data ICASSP 2024 + + +
+ This paper presents Translatotron 3, a novel approach to unsupervised direct +speech-to-speech translation from monolingual speech-text datasets by combining +masked autoencoder, unsupervised embedding mapping, and back-translation. +Experimental results in speech-to-speech translation tasks between Spanish and +English show that Translatotron 3 outperforms a baseline cascade system, +reporting $18.14$ BLEU points improvement on the synthesized +Unpaired-Conversational dataset. In contrast to supervised approaches that +necessitate real paired data, or specialized modeling to replicate +para-/non-linguistic information such as pauses, speaking rates, and speaker +identity, Translatotron 3 showcases its capability to retain it. Audio samples +can be found at http://google-research.github.io/lingvo-lab/translatotron3 + +
+
+ comment: To appear in ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Design of Two-Level Incentive Mechanisms for Hierarchical Federated + Learning + + +
+ Hierarchical Federated Learning (HFL) is a distributed machine learning
+paradigm tailored for multi-tiered computation architectures, which supports
+massive access of devices' models simultaneously. To enable efficient HFL, it
+is crucial to design suitable incentive mechanisms to ensure that devices
+actively participate in local training. However, there are few studies on
+incentive mechanism design for HFL. In this paper, we design two-level
+incentive mechanisms for HFL with a two-tiered computing structure to encourage
+the participation of entities in each tier in the HFL training. In the
+lower-level game, we propose a coalition formation game to jointly optimize the
+edge association and bandwidth allocation problem, and obtain efficient
+coalition partitions via the proposed preference rule, which can be proven
+stable by an exact potential game. In the upper-level game, we design a
+Stackelberg game algorithm, which not only determines the optimal number of
+edge aggregations for edge servers to maximize their utility, but also
+optimizes the unit reward provided for the edge aggregation performance to
+protect the interests of cloud servers. Furthermore, numerical results indicate
+that the proposed algorithms can achieve better performance than the benchmark
+schemes.
+
+
+
+
+
+
+ + ♻ ☆ Managing Temporal Resolution in Continuous Value Estimation: A + Fundamental Trade-off NeurIPS 2023 + + +
+ A default assumption in reinforcement learning (RL) and optimal control is +that observations arrive at discrete time points on a fixed clock cycle. Yet, +many applications involve continuous-time systems where the time +discretization, in principle, can be managed. The impact of time discretization +on RL methods has not been fully characterized in existing theory, but a more +detailed analysis of its effect could reveal opportunities for improving +data-efficiency. We address this gap by analyzing Monte-Carlo policy evaluation +for LQR systems and uncover a fundamental trade-off between approximation and +statistical error in value estimation. Importantly, these two errors behave +differently to time discretization, leading to an optimal choice of temporal +resolution for a given data budget. These findings show that managing the +temporal resolution can provably improve policy evaluation efficiency in LQR +systems with finite data. Empirically, we demonstrate the trade-off in +numerical simulations of LQR instances and standard RL benchmarks for +non-linear continuous control. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Constrained Reweighting of Distributions: an Optimal Transport Approach + + +
+ We commonly encounter the problem of identifying an optimally weight adjusted +version of the empirical distribution of observed data, adhering to predefined +constraints on the weights. Such constraints often manifest as restrictions on +the moments, tail behaviour, shapes, number of modes, etc., of the resulting +weight adjusted empirical distribution. In this article, we substantially +enhance the flexibility of such methodology by introducing a nonparametrically +imbued distributional constraints on the weights, and developing a general +framework leveraging the maximum entropy principle and tools from optimal +transport. The key idea is to ensure that the maximum entropy weight adjusted +empirical distribution of the observed data is close to a pre-specified +probability distribution in terms of the optimal transport metric while +allowing for subtle departures. The versatility of the framework is +demonstrated in the context of three disparate applications where data +re-weighting is warranted to satisfy side constraints on the optimization +problem at the heart of the statistical task: namely, portfolio allocation, +semi-parametric inference for complex surveys, and ensuring algorithmic +fairness in machine learning algorithms. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2303.10085 +
+
+
+
+
+ + ♻ ☆ Restless Bandits with Average Reward: Breaking the Uniform Global + Attractor Assumption NeurIPS 2023 + + +
+ We study the infinite-horizon restless bandit problem with the average reward +criterion, in both discrete-time and continuous-time settings. A fundamental +goal is to efficiently compute policies that achieve a diminishing optimality +gap as the number of arms, $N$, grows large. Existing results on asymptotic +optimality all rely on the uniform global attractor property (UGAP), a complex +and challenging-to-verify assumption. In this paper, we propose a general, +simulation-based framework, Follow-the-Virtual-Advice, that converts any +single-armed policy into a policy for the original $N$-armed problem. This is +done by simulating the single-armed policy on each arm and carefully steering +the real state towards the simulated state. Our framework can be instantiated +to produce a policy with an $O(1/\sqrt{N})$ optimality gap. In the +discrete-time setting, our result holds under a simpler synchronization +assumption, which covers some problem instances that violate UGAP. More +notably, in the continuous-time setting, we do not require \emph{any} +additional assumptions beyond the standard unichain condition. In both +settings, our work is the first asymptotic optimality result that does not +require UGAP. + +
+
+ comment: NeurIPS 2023. 35 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Characteristic Guidance: Non-linear Correction for Diffusion Model at + Large Guidance Scale + + +
+ Popular guidance for denoising diffusion probabilistic model (DDPM) linearly +combines distinct conditional models together to provide enhanced control over +samples. However, this approach overlooks nonlinear effects that become +significant when guidance scale is large. To address this issue, we propose +characteristic guidance, a sampling method that provides first-principle +non-linear correction for classifier-free guided DDPMs. Such correction forces +the guided DDPMs to respect the Fokker-Planck equation of their underlying +diffusion process, in a way that is training-free, derivative-free, and +compatible with existing sampling methods. Experiments show that characteristic +guidance enhances control and reduces color and exposure issues in image +generation, proving effective in diverse applications ranging from latent space +sampling to solving physics problems like magnet phase transitions. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Learning to solve Bayesian inverse problems: An amortized variational + inference approach using Gaussian and Flow guides + + +
+ Inverse problems, i.e., estimating parameters of physical models from +experimental data, are ubiquitous in science and engineering. The Bayesian +formulation is the gold standard because it alleviates ill-posedness issues and +quantifies epistemic uncertainty. Since analytical posteriors are not typically +available, one resorts to Markov chain Monte Carlo sampling or approximate +variational inference. However, inference needs to be rerun from scratch for +each new set of data. This drawback limits the applicability of the Bayesian +formulation to real-time settings, e.g., health monitoring of engineered +systems, and medical diagnosis. The objective of this paper is to develop a +methodology that enables real-time inference by learning the Bayesian inverse +map, i.e., the map from data to posteriors. Our approach is as follows. We +parameterize the posterior distribution as a function of data. This work +outlines two distinct approaches to do this. The first method involves +parameterizing the posterior using an amortized full-rank Gaussian guide, +implemented through neural networks. The second method utilizes a Conditional +Normalizing Flow guide, employing conditional invertible neural networks for +cases where the target posterior is arbitrarily complex. In both approaches, we +learn the network parameters by amortized variational inference which involves +maximizing the expectation of evidence lower bound over all possible datasets +compatible with the model. We demonstrate our approach by solving a set of +benchmark problems from science and engineering. Our results show that the +posterior estimates of our approach are in agreement with the corresponding +ground truth obtained by Markov chain Monte Carlo. Once trained, our approach +provides the posterior distribution for a given observation just at the cost of +a forward pass of the neural network. + +
+
+
+
+
+ + ♻ ☆ Sampling from Gaussian Process Posteriors using Stochastic Gradient + Descent + + +
+ Gaussian processes are a powerful framework for quantifying uncertainty and +for sequential decision-making but are limited by the requirement of solving +linear systems. In general, this has a cubic cost in dataset size and is +sensitive to conditioning. We explore stochastic gradient algorithms as a +computationally efficient method of approximately solving these linear systems: +we develop low-variance optimization objectives for sampling from the posterior +and extend these to inducing points. Counterintuitively, stochastic gradient +descent often produces accurate predictions, even in cases where it does not +converge quickly to the optimum. We explain this through a spectral +characterization of the implicit bias from non-convergence. We show that +stochastic gradient descent produces predictive distributions close to the true +posterior both in regions with sufficient data coverage, and in regions +sufficiently far away from the data. Experimentally, stochastic gradient +descent achieves state-of-the-art performance on sufficiently large-scale or +ill-conditioned regression tasks. Its uncertainty estimates match the +performance of significantly more expensive baselines on a large-scale Bayesian +optimization task. + +
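The computational core, replacing the cubic-cost linear solve with stochastic gradient descent on a quadratic objective, can be sketched compactly for the posterior mean; the kernel choice, step sizes, and minibatching below are illustrative assumptions, not the paper's algorithm in full.

```python
import torch

def rbf_kernel(a, b, lengthscale=1.0):
    return torch.exp(-0.5 * torch.cdist(a, b).pow(2) / lengthscale**2)

def gp_posterior_mean_sgd(X, y, X_star, noise=0.1, steps=2000, lr=1e-2, batch=64):
    """Estimate K(X*, X) v, where v solves (K + noise*I) v = y, by SGD on the
    quadratic objective 0.5 v^T (K + noise*I) v - v^T y."""
    n = X.shape[0]
    K = rbf_kernel(X, X)
    v = torch.zeros(n, requires_grad=True)
    opt = torch.optim.SGD([v], lr=lr, momentum=0.9)
    for _ in range(steps):
        idx = torch.randint(0, n, (batch,))
        with torch.no_grad():
            # Unbiased minibatch estimate of the gradient (K + noise*I) v - y.
            grad = (n / batch) * (K[:, idx] @ v[idx]) + noise * v - y
        v.grad = grad
        opt.step()
    return rbf_kernel(X_star, X) @ v.detach()
```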
+
+
+
+
+ + ♻ ☆ RanPAC: Random Projections and Pre-trained Models for Continual Learning + + +
+ Continual learning (CL) aims to incrementally learn different tasks (such as +classification) in a non-stationary data stream without forgetting old ones. +Most CL works focus on tackling catastrophic forgetting under a +learning-from-scratch paradigm. However, with the increasing prominence of +foundation models, pre-trained models equipped with informative representations +have become available for various downstream requirements. Several CL methods +based on pre-trained models have been explored, either utilizing pre-extracted +features directly (which makes bridging distribution gaps challenging) or +incorporating adaptors (which may be subject to forgetting). In this paper, we +propose a concise and effective approach for CL with pre-trained models. Given +that forgetting occurs during parameter updating, we contemplate an alternative +approach that exploits training-free random projectors and class-prototype +accumulation, which thus bypasses the issue. Specifically, we inject a frozen +Random Projection layer with nonlinear activation between the pre-trained +model's feature representations and output head, which captures interactions +between features with expanded dimensionality, providing enhanced linear +separability for class-prototype-based CL. We also demonstrate the importance +of decorrelating the class-prototypes to reduce the distribution disparity when +using pre-trained representations. These techniques prove to be effective and +circumvent the problem of forgetting for both class- and domain-incremental +continual learning. Compared to previous methods applied to pre-trained +ViT-B/16 models, we reduce final error rates by between 20% and 62% on seven +class-incremental benchmarks, despite not using any rehearsal memory. We +conclude that the full potential of pre-trained models for simple, effective, +and fast CL has not hitherto been fully tapped. Code is at +github.com/RanPAC/RanPAC. + +
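A compressed sketch of the random-projection head: frozen pretrained features are expanded through a fixed nonlinear random projection, class prototypes are accumulated, and a ridge-style Gram inverse decorrelates them. The projection width, nonlinearity, and ridge strength are assumptions, not the exact RanPAC recipe.

```python
import torch
import torch.nn.functional as F

def fit_ranpac_head(features, labels, n_classes, proj_dim=2000, ridge=1e-3, seed=0):
    """features: (N, d) frozen pretrained embeddings; labels: (N,) integer classes."""
    g = torch.Generator().manual_seed(seed)
    W = torch.randn(features.shape[1], proj_dim, generator=g) / features.shape[1] ** 0.5
    H = torch.relu(features @ W)                   # frozen, expanded representation
    Y = F.one_hot(labels, n_classes).float()
    G = H.T @ H + ridge * torch.eye(proj_dim)      # Gram matrix, accumulable per task
    C = H.T @ Y                                    # class-prototype accumulator
    W_out = torch.linalg.solve(G, C)               # decorrelated prototypes
    return W, W_out

def predict(features, W, W_out):
    return (torch.relu(features @ W) @ W_out).argmax(dim=1)
```

Because G and C are additive over samples, in this sketch they can be updated task by task without storing rehearsal data, which is what keeps the head training-free on an incremental stream.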
+
+ comment: 32 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Accelerated Optimization Landscape of Linear-Quadratic Regulator + + +
+ Linear-quadratic regulator (LQR) is a landmark problem in the field of
+optimal control, which is the concern of this paper. Generally, LQR is
+classified into state-feedback LQR (SLQR) and output-feedback LQR (OLQR) based
+on whether the full state is obtained. It has been suggested in the existing
+literature that both SLQR and OLQR could be viewed as \textit{constrained
+nonconvex matrix optimization} problems in which the only variable to be
+optimized is the feedback gain matrix. In this paper, we introduce a
+first-order accelerated optimization framework for handling the LQR problem,
+and give its convergence analysis for the cases of SLQR and OLQR, respectively.
+ Specifically, a Lipschitz Hessian property of the LQR performance criterion is
+presented, which turns out to be a crucial property for the application of
+modern optimization techniques. For the SLQR problem, a continuous-time hybrid
+dynamic system is introduced, whose solution trajectory is shown to converge
+exponentially to the optimal feedback gain with Nesterov-optimal order
+$1-\frac{1}{\sqrt{\kappa}}$ ($\kappa$ the condition number). Then, the
+symplectic Euler scheme is utilized to discretize the hybrid dynamic system,
+and a Nesterov-type method with a restarting rule is proposed that preserves
+the continuous-time convergence rate, i.e., the discretized algorithm admits
+the Nesterov-optimal convergence order. For the OLQR problem, a Hessian-free
+accelerated framework is proposed, which is a two-procedure method consisting
+of semiconvex function optimization and negative curvature exploitation. In a
+time $\mathcal{O}(\epsilon^{-7/4}\log(1/\epsilon))$, the method can find an
+$\epsilon$-stationary point of the performance criterion; this entails that the
+method improves upon the $\mathcal{O}(\epsilon^{-2})$ complexity of vanilla
+gradient descent. Moreover, our method provides a second-order guarantee of the
+stationary point.
+
+
+
+
+
+
+ + ♻ ☆ Dialogue for Prompting: a Policy-Gradient-Based Discrete Prompt + Generation for Few-shot Learning AAAI 2024 + + +
+ The prompt-based paradigm for pre-trained language models (PLMs) has
+succeeded substantially in few-shot natural language processing (NLP) tasks.
+However, prior discrete prompt optimization methods require expert knowledge to
+design the base prompt set and identify high-quality prompts, which is costly,
+inefficient, and subjective. Meanwhile, existing continuous prompt optimization
+methods improve the performance by learning the ideal prompts through the
+gradient information of PLMs, but their high computational cost and low
+readability and generalizability are often concerning. To address the research
+gap, we propose a Dialogue-comprised Policy-gradient-based Discrete Prompt
+Optimization ($DP_2O$) method. We first design a multi-round dialogue alignment
+strategy, based on GPT-4, for generating a readable prompt set. Furthermore, we
+propose an efficient prompt screening metric to identify high-quality prompts
+with linear complexity. Finally, we construct a reinforcement learning (RL)
+framework based on policy gradients to match the prompts to inputs optimally.
+By training a policy network with only 0.67% of the PLM parameter size on the
+tasks in the few-shot setting, $DP_2O$ outperforms the state-of-the-art (SOTA)
+method by 1.52% in accuracy on average on four open-source datasets. Moreover,
+subsequent experiments also demonstrate that $DP_2O$ has good universality,
+robustness, and generalization ability.
+
+
+
+ comment: AAAI 2024 Main Track +
+
+
+
+
+ + ♻ ☆ MGTBench: Benchmarking Machine-Generated Text Detection + + +
+ Nowadays, powerful large language models (LLMs) such as ChatGPT have +demonstrated revolutionary power in a variety of tasks. Consequently, the +detection of machine-generated texts (MGTs) is becoming increasingly crucial as +LLMs become more advanced and prevalent. These models have the ability to +generate human-like language, making it challenging to discern whether a text +is authored by a human or a machine. This raises concerns regarding +authenticity, accountability, and potential bias. However, existing methods for +detecting MGTs are evaluated using different model architectures, datasets, and +experimental settings, resulting in a lack of a comprehensive evaluation +framework that encompasses various methodologies. Furthermore, it remains +unclear how existing detection methods would perform against powerful LLMs. In +this paper, we fill this gap by proposing the first benchmark framework for MGT +detection against powerful LLMs, named MGTBench. Extensive evaluations on +public datasets with curated texts generated by various powerful LLMs such as +ChatGPT-turbo and Claude demonstrate the effectiveness of different detection +methods. Our ablation study shows that a larger number of words in general +leads to better performance and most detection methods can achieve similar +performance with much fewer training samples. Moreover, we delve into a more +challenging task: text attribution. Our findings indicate that the model-based +detection methods still perform well in the text attribution task. To +investigate the robustness of different detection methods, we consider three +adversarial attacks, namely paraphrasing, random spacing, and adversarial +perturbations. We discover that these attacks can significantly diminish +detection effectiveness, underscoring the critical need for the development of +more robust detection methods. + +
+
+
+
+
+ + ♻ ☆ Understanding CNNs from excitations + + +
+ Saliency maps have proven to be a highly efficacious approach for explicating +the decisions of Convolutional Neural Networks. However, extant methodologies +predominantly rely on gradients, which constrain their ability to explicate +complex models. Furthermore, such approaches are not fully adept at leveraging +negative gradient information to improve interpretive veracity. In this study, +we present a novel concept, termed positive and negative excitation, which +enables the direct extraction of positive and negative excitation for each +layer, thus enabling complete layer-by-layer information utilization sans +gradients. To organize these excitations into final saliency maps, we introduce +a double-chain backpropagation procedure. A comprehensive experimental +evaluation, encompassing both binary classification and multi-classification +tasks, was conducted to gauge the effectiveness of our proposed method. +Encouragingly, the results evince that our approach offers a significant +improvement over the state-of-the-art methods in terms of salient pixel +removal, minor pixel removal, and inconspicuous adversarial perturbation +generation guidance. Additionally, we verify the correlation between positive +and negative excitations. + +
+
+
+
+
+ + ♻ ☆ Contextual Pandora's Box + + +
+ Pandora's Box is a fundamental stochastic optimization problem, where the +decision-maker must find a good alternative while minimizing the search cost of +exploring the value of each alternative. In the original formulation, it is +assumed that accurate distributions are given for the values of all the +alternatives, while recent work studies the online variant of Pandora's Box +where the distributions are originally unknown. In this work, we study +Pandora's Box in the online setting, while incorporating context. At every +round, we are presented with a number of alternatives each having a context, an +exploration cost and an unknown value drawn from an unknown distribution that +may change at every round. Our main result is a no-regret algorithm that +performs comparably well to the optimal algorithm which knows all prior +distributions exactly. Our algorithm works even in the bandit setting where the +algorithm never learns the values of the alternatives that were not explored. +The key technique that enables our result is a novel modification of the +realizability condition in contextual bandits that connects a context to a +sufficient statistic of each alternative's distribution (its "reservation +value") rather than its mean. + +
+
+
+
+
+ + ♻ ☆ Modelling Cellular Perturbations with the Sparse Additive Mechanism + Shift Variational Autoencoder NeurIPS 2023 + + +
+ Generative models of observations under interventions have been a vibrant +topic of interest across machine learning and the sciences in recent years. For +example, in drug discovery, there is a need to model the effects of diverse +interventions on cells in order to characterize unknown biological mechanisms +of action. We propose the Sparse Additive Mechanism Shift Variational +Autoencoder, SAMS-VAE, to combine compositionality, disentanglement, and +interpretability for perturbation models. SAMS-VAE models the latent state of a +perturbed sample as the sum of a local latent variable capturing +sample-specific variation and sparse global variables of latent intervention +effects. Crucially, SAMS-VAE sparsifies these global latent variables for +individual perturbations to identify disentangled, perturbation-specific latent +subspaces that are flexibly composable. We evaluate SAMS-VAE both +quantitatively and qualitatively on a range of tasks using two popular single +cell sequencing datasets. In order to measure perturbation-specific +model-properties, we also introduce a framework for evaluation of perturbation +models based on average treatment effects with links to posterior predictive +checks. SAMS-VAE outperforms comparable models in terms of generalization +across in-distribution and out-of-distribution tasks, including a combinatorial +reasoning task under resource paucity, and yields interpretable latent +structures which correlate strongly to known biological mechanisms. Our results +suggest SAMS-VAE is an interesting addition to the modeling toolkit for machine +learning-driven scientific discovery. + +
+
+ comment: Presented at the 37th Conference on Neural Information Processing + Systems (NeurIPS 2023) (Post-NeurIPS fixes: cosmetic fixes, updated + references, added simulation to appendix) +
+
+
+
+
+ + ♻ ☆ Should Under-parameterized Student Networks Copy or Average Teacher + Weights? NeurIPS 2023 + + +
+ Any continuous function $f^*$ can be approximated arbitrarily well by a +neural network with sufficiently many neurons $k$. We consider the case when +$f^*$ itself is a neural network with one hidden layer and $k$ neurons. +Approximating $f^*$ with a neural network with $n< k$ neurons can thus be seen +as fitting an under-parameterized "student" network with $n$ neurons to a +"teacher" network with $k$ neurons. As the student has fewer neurons than the +teacher, it is unclear, whether each of the $n$ student neurons should copy one +of the teacher neurons or rather average a group of teacher neurons. For +shallow neural networks with erf activation function and for the standard +Gaussian input distribution, we prove that "copy-average" configurations are +critical points if the teacher's incoming vectors are orthonormal and its +outgoing weights are unitary. Moreover, the optimum among such configurations +is reached when $n-1$ student neurons each copy one teacher neuron and the +$n$-th student neuron averages the remaining $k-n+1$ teacher neurons. For the +student network with $n=1$ neuron, we provide additionally a closed-form +solution of the non-trivial critical point(s) for commonly used activation +functions through solving an equivalent constrained optimization problem. +Empirically, we find for the erf activation function that gradient flow +converges either to the optimal copy-average critical point or to another point +where each student neuron approximately copies a different teacher neuron. +Finally, we find similar results for the ReLU activation function, suggesting +that the optimal solution of underparameterized networks has a universal +structure. + +
+
+ comment: 41 pages, presented at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ On sparse regression, Lp-regularization, and automated model discovery + + +
+ Sparse regression and feature extraction are the cornerstones of knowledge +discovery from massive data. Their goal is to discover interpretable and +predictive models that provide simple relationships among scientific variables. +While the statistical tools for model discovery are well established in the +context of linear regression, their generalization to nonlinear regression in +material modeling is highly problem-specific and insufficiently understood. +Here we explore the potential of neural networks for automatic model discovery +and induce sparsity by a hybrid approach that combines two strategies: +regularization and physical constraints. We integrate the concept of Lp +regularization for subset selection with constitutive neural networks that +leverage our domain knowledge in kinematics and thermodynamics. We train our +networks with both, synthetic and real data, and perform several thousand +discovery runs to infer common guidelines and trends: L2 regularization or +ridge regression is unsuitable for model discovery; L1 regularization or lasso +promotes sparsity, but induces strong bias; only L0 regularization allows us to +transparently fine-tune the trade-off between interpretability and +predictability, simplicity and accuracy, and bias and variance. With these +insights, we demonstrate that Lp regularized constitutive neural networks can +simultaneously discover both, interpretable models and physically meaningful +parameters. We anticipate that our findings will generalize to alternative +discovery techniques such as sparse and symbolic regression, and to other +domains such as biology, chemistry, or medicine. Our ability to automatically +discover material models from data could have tremendous applications in +generative material design and open new opportunities to manipulate matter, +alter properties of existing materials, and discover new materials with +user-defined properties. + +
+
+ comment: 35 pages, 15 figures, 2 tables, 62 references +
+
+
+
+
+ + ♻ ☆ Exploring Contextual Representation and Multi-Modality for End-to-End + Autonomous Driving + + +
+ Learning contextual and spatial environmental representations enhances an
+autonomous vehicle's hazard anticipation and decision-making in complex
+scenarios. Recent perception systems enhance spatial understanding with
+sensor fusion but often lack full environmental context. Humans, when
+driving, naturally employ neural maps that integrate various factors such as
+historical data, situational subtleties, and behavioral predictions of other
+road users to form a rich contextual understanding of their surroundings.
+This neural map-based comprehension is integral to making informed decisions
+on the road. In contrast, even with their significant advancements,
+autonomous systems have yet to fully harness this depth of human-like
+contextual understanding. Motivated by this, our work draws inspiration from
+human driving patterns and seeks to formalize the sensor fusion approach
+within an end-to-end autonomous driving framework. We introduce a framework
+that integrates three cameras (left, right, and center) to emulate the human
+field of view, coupled with top-down bird's-eye-view semantic data to enhance
+contextual representation. The sensor data is fused and encoded using a
+self-attention mechanism, leading to an auto-regressive waypoint prediction
+module. We treat feature representation as a sequential problem, employing a
+vision transformer to distill the contextual interplay between sensor
+modalities. The efficacy of the proposed method is experimentally evaluated
+in both open and closed-loop settings. Our method achieves a displacement
+error of 0.67 m in open-loop settings, surpassing current methods by 6.9% on
+the nuScenes dataset. In closed-loop evaluations on CARLA's Town05 Long and
+Longest6 benchmarks, the proposed method improves driving performance and
+route completion, and reduces infractions.
+
&#x0D;
+
+
+
+
+ + ♻ ☆ DiarizationLM: Speaker Diarization Post-Processing with Large Language + Models + + +
+ In this paper, we introduce DiarizationLM, a framework to leverage large +language models (LLM) to post-process the outputs from a speaker diarization +system. Various goals can be achieved with the proposed framework, such as +improving the readability of the diarized transcript, or reducing the word +diarization error rate (WDER). In this framework, the outputs of the automatic +speech recognition (ASR) and speaker diarization systems are represented as a +compact textual format, which is included in the prompt to an optionally +finetuned LLM. The outputs of the LLM can be used as the refined diarization +results with the desired enhancement. As a post-processing step, this framework +can be easily applied to any off-the-shelf ASR and speaker diarization systems +without retraining existing components. Our experiments show that a finetuned +PaLM 2-S model can reduce the WDER by rel. 55.5% on the Fisher telephone +conversation dataset, and rel. 44.9% on the Callhome English dataset. + +
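As a rough illustration of the kind of post-processing the abstract describes, the sketch below serializes ASR words and diarization speaker labels into a compact textual prompt for an LLM; the speaker-tag format and prompt wording are guesses for illustration only, not the paper's actual format.

```python
def build_prompt(words, speakers):
    """Serialize ASR words plus diarization speaker labels into one prompt."""
    segments, cur_spk, cur_words = [], speakers[0], []
    for word, spk in zip(words, speakers):
        if spk != cur_spk:                       # speaker turn boundary
            segments.append(f"<spk:{cur_spk}> " + " ".join(cur_words))
            cur_spk, cur_words = spk, []
        cur_words.append(word)
    segments.append(f"<spk:{cur_spk}> " + " ".join(cur_words))
    return ("Correct the speaker labels in the transcript below; change only "
            "the labels, never the words:\n" + " ".join(segments))

print(build_prompt(["good", "morning", "hi", "there"], [1, 1, 2, 2]))
```

The LLM's refined transcript would then be parsed back into word-level speaker labels to compute WDER.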
+
+
+
+
+ + ♻ ☆ How Safe Am I Given What I See? Calibrated Prediction of Safety Chances + for Image-Controlled Autonomy + + +
+ End-to-end learning has emerged as a major paradigm for developing autonomous +systems. Unfortunately, with its performance and convenience comes an even +greater challenge of safety assurance. A key factor of this challenge is the +absence of the notion of a low-dimensional and interpretable dynamical state, +around which traditional assurance methods revolve. Focusing on the online +safety prediction problem, this paper proposes a configurable family of +learning pipelines based on generative world models, which do not require +low-dimensional states. To implement these pipelines, we overcome the +challenges of learning safety-informed latent representations and missing +safety labels under prediction-induced distribution shift. These pipelines come +with statistical calibration guarantees on their safety chance predictions +based on conformal prediction. We perform an extensive evaluation of the +proposed learning pipelines on two case studies of image-controlled systems: a +racing car and a cartpole. + +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ End-to-End Optimized Image Compression with the Frequency-Oriented + Transform + + +
+ Image compression constitutes a significant challenge amidst the era of +information explosion. Recent studies employing deep learning methods have +demonstrated the superior performance of learning-based image compression +methods over traditional codecs. However, an inherent challenge associated with +these methods lies in their lack of interpretability. Following an analysis of +the varying degrees of compression degradation across different frequency +bands, we propose the end-to-end optimized image compression model facilitated +by the frequency-oriented transform. The proposed end-to-end image compression +model consists of four components: spatial sampling, frequency-oriented +transform, entropy estimation, and frequency-aware fusion. The +frequency-oriented transform separates the original image signal into distinct +frequency bands, aligning with the human-interpretable concept. Leveraging the +non-overlapping hypothesis, the model enables scalable coding through the +selective transmission of arbitrary frequency components. Extensive experiments +are conducted to demonstrate that our model outperforms all traditional codecs +including next-generation standard H.266/VVC on MS-SSIM metric. Moreover, +visual analysis tasks (i.e., object detection and semantic segmentation) are +conducted to verify the proposed compression method could preserve semantic +fidelity besides signal-level precision. + +
+
+ comment: 25 pages, accepted by MVAP +
+
+
+
+
+ + ☆ E2HQV: High-Quality Video Generation from Event Camera via + Theory-Inspired Model-Aided Deep Learning AAAI2024 + + +
+ The bio-inspired event cameras or dynamic vision sensors are capable of
+asynchronously capturing per-pixel brightness changes (called event-streams)
+in high temporal resolution and high dynamic range. However, the
+non-structural spatial-temporal event-streams make it challenging to provide
+intuitive visualization with rich semantic information for human vision. This
+calls for events-to-video (E2V) solutions which take event-streams as input
+and generate high-quality video frames for intuitive visualization. However,
+current solutions are predominantly data-driven without considering the prior
+knowledge of the underlying statistics relating event-streams and video
+frames. They rely heavily on the non-linearity and generalization capability
+of deep neural networks and thus struggle to reconstruct detailed textures
+when the scenes are complex. In this work, we propose E2HQV, a novel E2V
+paradigm designed to produce high-quality video frames from events. This
+approach leverages a model-aided deep learning framework, underpinned by a
+theory-inspired E2V model, which is meticulously derived from the fundamental
+imaging principles of event cameras. To deal with the issue of state-reset in
+the recurrent components of E2HQV, we also design a temporal shift embedding
+module to further improve the quality of the video frames. Comprehensive
+evaluations on real-world event camera datasets validate our approach, with
+E2HQV notably outperforming state-of-the-art approaches, e.g., surpassing the
+second best by over 40% for some evaluation metrics.
+
&#x0D;
+
+ comment: Accepted in AAAI2024 +
+
+
+
+
+ + ☆ Deep Shape-Texture Statistics for Completely Blind Image Quality + Evaluation + + +
+ Opinion-Unaware Blind Image Quality Assessment (OU-BIQA) models aim to
+predict image quality without training on reference images and subjective
+quality scores. Among these approaches, image statistical comparison is a
+classic paradigm, but its performance is limited by the representation
+ability of visual descriptors. Deep features as visual descriptors have
+advanced IQA in recent research, but they have been found to be highly
+texture-biased and to lack shape bias. On this basis, we find that image
+shape and texture cues respond differently towards distortions, and the
+absence of either one results in an incomplete image representation.
+Therefore, to formulate a well-rounded statistical description for images, we
+utilize the shape-biased and texture-biased deep features produced by Deep
+Neural Networks (DNNs) simultaneously. More specifically, we design a
+Shape-Texture Adaptive Fusion (STAF) module to merge shape and texture
+information, based on which we formulate quality-relevant image statistics.
+The perceptual quality is quantified by the variant Mahalanobis Distance
+between the inner and outer Shape-Texture Statistics (DSTS), wherein the
+inner and outer statistics respectively describe the quality fingerprints of
+the distorted image and natural images. The proposed DSTS delicately utilizes
+shape-texture statistical relations between different data scales in the deep
+domain, and achieves state-of-the-art (SOTA) quality prediction performance
+on images with artificial and authentic distortions.
+
&#x0D;
+
+
+
+
+ + ☆ CLIPRerank: An Extremely Simple Method for Improving Ad-hoc Video Search ICASSP 2024 + + +
+ Ad-hoc Video Search (AVS) enables users to search for unlabeled video content +using on-the-fly textual queries. Current deep learning-based models for AVS +are trained to optimize holistic similarity between short videos and their +associated descriptions. However, due to the diversity of ad-hoc queries, even +for a short video, its truly relevant part w.r.t. a given query can be of +shorter duration. In such a scenario, the holistic similarity becomes +suboptimal. To remedy the issue, we propose in this paper CLIPRerank, a +fine-grained re-scoring method. We compute cross-modal similarities between +query and video frames using a pre-trained CLIP model, with multi-frame scores +aggregated by max pooling. The fine-grained score is weightedly added to the +initial score for search result reranking. As such, CLIPRerank is agnostic to +the underlying video retrieval models and extremely simple, making it a handy +plug-in for boosting AVS. Experiments on the challenging TRECVID AVS benchmarks +(from 2016 to 2021) justify the effectiveness of the proposed strategy. +CLIPRerank consistently improves the TRECVID top performers and multiple +existing models including SEA, W2VV++, Dual Encoding, Dual Task, LAFF, +CLIP2Video, TS2-Net and X-CLIP. Our method also works when substituting BLIP-2 +for CLIP. + +
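For intuition, here is a hedged sketch of the re-scoring recipe the abstract describes: score sampled frames of a candidate video against the query with a pre-trained CLIP model, max-pool the frame scores, and add the result, with a weight, to the initial retrieval score. The checkpoint name, the weight `alpha`, and the absence of score normalization are illustrative choices, not the paper's settings.

```python
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def cliprerank_score(query: str, frames: list, initial_score: float,
                     alpha: float = 0.5) -> float:
    """Add a max-pooled query-frame CLIP similarity to the initial score.

    frames: a list of PIL images sampled from the candidate video.
    """
    inputs = processor(text=[query], images=frames,
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        out = model(**inputs)
    # logits_per_text has shape (1, num_frames): query-to-frame similarities.
    fine_grained = out.logits_per_text.max().item()  # max pooling over frames
    return initial_score + alpha * fine_grained      # weighted combination
```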
+
+ comment: Accepted by ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Understanding How People with Binge Eating Disorder and Bulimia Interact + with Digital Food Content + + +
+ A large body of research has focused on understanding how online content and +disordered eating behaviors are associated. However, there is a lack of +comprehensive studies investigating digital food content's influence on +individuals with eating disorders. We conducted two rounds of studies (N=23 and +22, respectively) with individuals with binge eating disorder (BED) or bulimia +nervosa (BN) to understand their motivations and practices of consuming digital +food content. Our study reveals that individuals with BED and BN anticipate +positive effects from food media to overcome their condition, but in practice, +it often exacerbates their disorder. We also discovered that many individuals +have experienced a cycle of quitting and returning to digital food content +consumption. Based on these findings, we articulate design implications for +digital food content and multimedia platforms to support vulnerable individuals +in everyday online platform interactions. + +
+
+ comment: 28 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ 3DMIT: 3D Multi-modal Instruction Tuning for Scene Understanding + + +
+ The remarkable potential of multi-modal large language models (MLLMs) in
+comprehending both vision and language information has been widely
+acknowledged. However, the scarcity of 3D scene-language pairs in comparison
+to their 2D counterparts, coupled with the inadequacy of existing approaches
+at enabling LLMs to understand 3D scenes, poses a significant challenge. In
+response, we collect and construct an extensive dataset comprising 75K
+instruction-response pairs tailored for 3D scenes. This dataset addresses
+tasks related to 3D VQA, 3D grounding, and 3D conversation. To further
+enhance the integration of 3D spatial information into LLMs, we introduce a
+novel and efficient prompt tuning paradigm, 3DMIT. This paradigm eliminates
+the alignment stage between 3D scenes and language and extends the
+instruction prompt with the 3D modality information, including the entire
+scene and segmented objects. We evaluate the effectiveness of our method
+across diverse tasks in the 3D scene domain and find that our approach serves
+as a strategic means to enrich LLMs' comprehension of the 3D world. Our code
+is available at https://github.com/staymylove/3DMIT.
+
&#x0D;
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 62 + +
+
+
+ + ☆ A Novel Approach for Automatic Program Repair using Round-Trip + Translation with Large Language Models + + +
+ Research shows that grammatical mistakes in a sentence can be corrected by +translating it to another language and back using neural machine translation +with language models. We investigate whether this correction capability of +Large Language Models (LLMs) extends to Automatic Program Repair (APR). Current +generative models for APR are pre-trained on source code and fine-tuned for +repair. This paper proposes bypassing the fine-tuning step and using Round-Trip +Translation (RTT): translation of code from one programming language to another +programming or natural language, and back. We hypothesize that RTT with LLMs +restores the most commonly seen patterns in code during pre-training, i.e., +performs a regression toward the mean, which removes bugs as they are a form of +noise w.r.t. the more frequent, natural, bug-free code in the training data. To +test this hypothesis, we employ eight recent LLMs pre-trained on code, +including the latest GPT versions, and four common program repair benchmarks in +Java. We find that RTT with English as an intermediate language repaired 101 of +164 bugs with GPT-4 on the HumanEval-Java dataset. Moreover, 46 of these are +unique bugs that are not repaired by other LLMs fine-tuned for APR. Our +findings highlight the viability of round-trip translation with LLMs as a +technique for automated program repair and its potential for research in +software engineering. + Keywords: automated program repair, large language model, machine translation + +
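As a minimal sketch of the round-trip translation idea summarized above (code to natural language and back through an LLM), consider the snippet below; the `chat` helper is a hypothetical stand-in for any chat-style LLM client and the prompts are illustrative, not the paper's.

```python
def chat(prompt: str) -> str:
    """Stand-in for any chat-style LLM call (e.g., an API client)."""
    raise NotImplementedError("plug in an LLM client here")

def round_trip_repair(buggy_java: str) -> str:
    """Translate buggy code to English and back, hoping the bug washes out."""
    description = chat(
        "Describe precisely, in English, what the following Java method "
        "should do:\n" + buggy_java)
    candidate = chat(
        "Write a Java method that implements this description:\n" + description)
    return candidate  # candidate patch, to be validated against the test suite
```

The intuition matches the abstract's regression-toward-the-mean argument: the reconstructed code gravitates toward frequent, natural, bug-free patterns seen in pre-training.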
+
+
+
+
+ + ☆ Leveraging External Knowledge Resources to Enable Domain-Specific + Comprehension + + +
+ Machine Reading Comprehension (MRC) has been a long-standing problem in NLP
+and, with the recent introduction of the BERT family of transformer based
+language models, it has come a long way toward being solved. Unfortunately,
+however, when BERT variants trained on general text corpora are applied to
+domain-specific text, their performance inevitably degrades on account of the
+domain shift, i.e., the genre/subject-matter discrepancy between the training
+and downstream application data. Knowledge graphs act as reservoirs for
+either open or closed domain information and prior studies have shown that
+they can be used to improve the performance of general-purpose transformers
+in domain-specific applications. Building on existing work, we introduce a
+method using Multi-Layer Perceptrons (MLPs) for aligning and integrating
+embeddings extracted from knowledge graphs with the embedding spaces of
+pre-trained language models (LMs). We fuse the aligned embeddings with
+open-domain LMs BERT and RoBERTa, and fine-tune them for two MRC tasks,
+namely span detection (COVID-QA) and multiple-choice questions (PubMedQA). On
+the COVID-QA dataset, we see that our approach allows these models to perform
+similarly to their domain-specific counterparts, Bio/Sci-BERT, as evidenced
+by the Exact Match (EM) metric. With regard to PubMedQA, we observe an
+overall improvement in accuracy while the F1 score stays relatively on par
+with the domain-specific models.
+
&#x0D;
+
+
+
+
+ + ☆ MCMChaos: Improvising Rap Music with MCMC Methods and Chaos Theory + + +
+ A novel freestyle rap software, MCMChaos 0.0.1, based on rap music
+transcriptions created in previous research is presented. The software has
+three different versions, each making use of different mathematical
+simulation methods: a collapsed Gibbs sampler and Lorenz attractor
+simulation. As far as we know, these simulation methods have never been used
+in rap music generation before. The software implements Python text-to-speech
+processing (pyttxs) to convert text wrangled from the MCFlow corpus into
+English speech. In each version, values simulated from each respective
+mathematical model alter the rate of speech, volume, and (in the multiple
+voice case) the voice of the text-to-speech engine on a line-by-line basis.
+The user of the software is presented with a real-time graphical user
+interface (GUI) which instantaneously changes the initial values read into
+the mathematical simulation methods. Future research might attempt to allow
+for more user control and autonomy.
+
&#x0D;
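Below is a hedged sketch of how the Lorenz-attractor variant described above might drive a text-to-speech engine line by line; it assumes the standard pyttsx3 package and made-up mappings from attractor state to rate and volume, so it illustrates the idea rather than reproducing MCMChaos itself.

```python
import pyttsx3

def lorenz_step(x, y, z, dt=0.01, sigma=10.0, rho=28.0, beta=8.0 / 3.0):
    """One Euler step of the Lorenz system."""
    return (x + dt * sigma * (y - x),
            y + dt * (x * (rho - z) - y),
            z + dt * (x * y - beta * z))

def rap(lines, base_rate=180):
    engine = pyttsx3.init()
    x, y, z = 1.0, 1.0, 1.0
    for line in lines:
        x, y, z = lorenz_step(x, y, z)
        engine.setProperty("rate", int(base_rate + 4 * x))        # vary tempo
        engine.setProperty("volume", min(1.0, max(0.2, z / 50)))  # vary volume
        engine.say(line)
    engine.runAndWait()

rap(["First line of the verse", "second line rides the attractor"])
```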
+
+
+
+
+ + ☆ AI-as-exploration: Navigating intelligence space + + +
+ Artificial Intelligence is a field that lives many lives, and the term has
+come to encompass a motley collection of scientific and commercial
+endeavours. In this paper, I articulate the contours of a rather neglected
+but central scientific role that AI has to play, which I dub
+`AI-as-exploration'. The basic thrust of AI-as-exploration is that of
+creating and studying systems that can reveal candidate building blocks of
+intelligence that may differ from the forms of human and animal intelligence
+we are familiar with. In other words, I suggest that AI is one of the best
+tools we have for exploring intelligence space, namely the space of possible
+intelligent systems. I illustrate the value of AI-as-exploration by focusing
+on a specific case study, i.e., recent work on the capacity to combine novel
+and invented concepts in humans and Large Language Models. I show that the
+latter, despite showing human-level accuracy in such a task, most probably
+solve it in ways radically different from those hypothesised for humans, but
+no less relevant to intelligence research.
+
&#x0D;
+
+
+
+
+ + ☆ A Study on Large Language Models' Limitations in Multiple-Choice + Question Answering + + +
+ The widespread adoption of Large Language Models (LLMs) has become
+commonplace, particularly with the emergence of open-source models. More
+importantly, smaller models are well-suited for integration into consumer
+devices and are frequently employed either as standalone solutions or as
+subroutines in various AI tasks. Despite their ubiquitous use, there is no
+systematic analysis of their specific capabilities and limitations. In this
+study, we tackle one of the most widely used tasks - answering Multiple
+Choice Questions (MCQs). We analyze 26 small open-source models and find that
+65% of the models do not understand the task, only 4 models properly select
+an answer from the given choices, and only 5 of these models are choice order
+independent. These results are rather alarming given the extensive use of MCQ
+tests with these models. We recommend exercising caution and testing task
+understanding before using MCQs to evaluate LLMs in any field whatsoever.
+
&#x0D;
+
+
+
+
+ + ☆ SciGLM: Training Scientific Language Models with Self-Reflective + Instruction Annotation and Tuning + + +
+ Large Language Models (LLMs) have shown promise in assisting scientific
+discovery. However, such applications are currently limited by LLMs'
+deficiencies in understanding intricate scientific concepts, deriving
+symbolic equations, and solving advanced numerical calculations. To bridge
+these gaps, we introduce SciGLM, a suite of scientific language models able
+to conduct college-level scientific reasoning. Central to our approach is a
+novel self-reflective instruction annotation framework to address the data
+scarcity challenge in the science domain. This framework leverages existing
+LLMs to generate step-by-step reasoning for unlabelled scientific questions,
+followed by a process of self-reflective critic-and-revise. Applying this
+framework, we curated SciInstruct, a diverse and high-quality dataset
+encompassing mathematics, physics, chemistry, and formal proofs. We
+fine-tuned the ChatGLM family of language models with SciInstruct, enhancing
+their capabilities in scientific and mathematical reasoning. Remarkably,
+SciGLM consistently improves both the base model (ChatGLM3-6B-Base) and
+larger-scale models (12B and 32B), without sacrificing the language
+understanding capabilities of the base model. This makes SciGLM a suitable
+foundational model to facilitate diverse scientific discovery tasks. For the
+benefit of the wider research community, we release SciInstruct, SciGLM,
+alongside a self-reflective framework and fine-tuning code at
+https://github.com/THUDM/SciGLM.
+
&#x0D;
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ SemEval-2017 Task 4: Sentiment Analysis in Twitter using BERT + + +
+ This paper uses the BERT model, which is a transformer-based architecture,
+to solve task 4A, English Language, Sentiment Analysis in Twitter of
+SemEval2017. BERT is a very powerful large language model for classification
+tasks when the amount of training data is small. For this experiment, we have
+used the BERT-base model, which has 12 hidden layers. This model provides
+better accuracy, precision, recall, and F1 score than the Naive Bayes
+baseline model. It performs better in binary classification subtasks than in
+the multi-class classification subtasks. We also considered all kinds of
+ethical issues during this experiment, as Twitter data contains personal and
+sensitive information. The dataset and code used in our experiment can be
+found in this GitHub repository.
+
&#x0D;
+
+
+
+
+ + ☆ A Lexicon for Studying Radicalization in Incel Communities + + +
+ Incels are an extremist online community of men who believe in an ideology +rooted in misogyny, racism, the glorification of violence, and dehumanization. +In their online forums, they use an extensive, evolving cryptolect - a set of +ingroup terms that have meaning within the group, reflect the ideology, +demonstrate membership in the community, and are difficult for outsiders to +understand. This paper presents a lexicon with terms and definitions for common +incel root words, prefixes, and affixes. The lexicon is text-based for use in +automated analysis and is derived via a Qualitative Content Analysis of the +most frequent incel words, their structure, and their meaning on five of the +most active incel communities from 2016 to 2023. This lexicon will support +future work examining radicalization and deradicalization/disengagement within +the community. + +
+
+ comment: 6 pages, 1 figure +
+
+
+
+
+ + ☆ Can Large Language Models Explain Themselves? + + +
+ Instruction-tuned large language models (LLMs) excel at many tasks, and
+will even provide explanations for their behavior. Since these models are
+directly accessible to the public, there is a risk that convincing and wrong
+explanations can lead to unsupported confidence in LLMs. Therefore,
+interpretability-faithfulness of self-explanations is an important
+consideration for AI Safety. Assessing the interpretability-faithfulness of
+these explanations, termed self-explanations, is challenging as the models
+are too complex for humans to annotate what is a correct explanation. To
+address this, we propose employing self-consistency checks as a measure of
+faithfulness. For example, if an LLM says a set of words is important for
+making a prediction, then it should not be able to make the same prediction
+without these words. While self-consistency checks are a common approach to
+faithfulness, they have not previously been applied to LLMs'
+self-explanations. We apply self-consistency checks to three types of
+self-explanations: counterfactuals, importance measures, and redactions. Our
+work demonstrates that faithfulness is both task and model dependent, e.g.,
+for sentiment classification, counterfactual explanations are more faithful
+for Llama2, importance measures for Mistral, and redaction for Falcon 40B.
+Finally, our findings are robust to prompt variations.
+
&#x0D;
+
+
+
+
+ + ☆ Word Boundary Information Isn't Useful for Encoder Language Models + + +
+ All existing transformer-based approaches to NLP using subword tokenisation +algorithms encode whitespace (word boundary information) through the use of +special space symbols (such as \#\# or \_) forming part of tokens. These +symbols have been shown to a) lead to reduced morphological validity of +tokenisations, and b) give substantial vocabulary redundancy. As such, removing +these symbols has been shown to have a beneficial effect on the processing of +morphologically complex words for transformer encoders in the pretrain-finetune +paradigm. In this work, we explore whether word boundary information is at all +useful to such models. In particular, we train transformer encoders across four +different training scales, and investigate several alternative approaches to +including word boundary information, evaluating on a range of tasks across +different domains and problem set-ups: GLUE (for sentence-level +classification), NER (for token-level classification), and two classification +datasets involving complex words (Superbizarre and FLOTA). Overall, through an +extensive experimental setup that includes the pre-training of 29 models, we +find no substantial improvements from our alternative approaches, suggesting +that modifying tokenisers to remove word boundary information isn't leading to +a loss of useful information. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ The Pitfalls of Defining Hallucination + + +
+ Despite impressive advances in Natural Language Generation (NLG) and Large
+Language Models (LLMs), researchers are still unclear about important aspects
+of NLG evaluation. To substantiate this claim, I examine current
+classifications of hallucination and omission in Data-text NLG, and I propose
+a logic-based synthesis of these classifications. I conclude by highlighting
+some remaining limitations of all current thinking about hallucination and by
+discussing implications for LLMs.
+
&#x0D;
+
+ comment: Accepted for publication in Computational Linguistics on 30 Dec. + 2023. (9 Pages.) +
+
+
+
+
+ + ☆ Learned Best-Effort LLM Serving + + +
+ Many applications must provide low-latency LLM service to users or risk
+unacceptable user experience. However, over-provisioning resources to serve
+fluctuating request patterns is often prohibitively expensive. In this work,
+we present a best-effort serving system that employs deep reinforcement
+learning to adjust service quality based on the task distribution and system
+load. Our best-effort system can maintain availability with over 10x higher
+client request rates, serves above 96% of peak performance 4.1x more often,
+and serves above 98% of peak performance 2.3x more often than static serving
+on unpredictable workloads. Our learned router is robust to shifts in both
+the arrival and task distribution. Compared to static serving, learned
+best-effort serving allows for cost-efficient serving through increased
+hardware utility. Additionally, we argue that learned best-effort LLM serving
+is applicable in a wide variety of settings and provides application
+developers with great flexibility to meet their specific needs.
+
&#x0D;
+
+
+
+
+ + ☆ The Chronicles of RAG: The Retriever, the Chunk and the Generator + + +
+ Retrieval Augmented Generation (RAG) has become one of the most popular
+paradigms for enabling LLMs to access external data, and also serves as a
+grounding mechanism to mitigate hallucinations. When implementing RAG you can
+face several challenges like effective integration of retrieval models,
+efficient representation learning, data diversity, computational efficiency
+optimization, evaluation, and quality of text generation. Given all these
+challenges, every day a new technique to improve RAG appears, making it
+unfeasible to experiment with all combinations for your problem. In this
+context, this paper presents good practices to implement, optimize, and
+evaluate RAG for the Brazilian Portuguese language, focusing on the
+establishment of a simple pipeline for inference and experiments. We explored
+a diverse set of methods to answer questions about the first Harry Potter
+book. To generate the answers we used OpenAI's gpt-4, gpt-4-1106-preview,
+gpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the
+retriever, our approach achieved an improvement in MRR@10 of 35.4% compared
+to the baseline. When optimizing the input size in the application, we
+observed that it is possible to further enhance it by 2.4%. Finally, we
+present the complete architecture of the RAG with our recommendations. As a
+result, we moved from a baseline of 57.88% to a maximum relative score of
+98.61%.
+
&#x0D;
+
+ comment: 16 pages, 15 figures, 9 tables +
+
+
+
+
+ + ☆ EMBRE: Entity-aware Masking for Biomedical Relation Extraction + + +
+ Information extraction techniques, including named entity recognition (NER)
+and relation extraction (RE), are crucial in many domains to support making
+sense of vast amounts of unstructured text data by identifying and connecting
+relevant information. Such techniques can assist researchers in extracting
+valuable insights. In this paper, we introduce the Entity-aware Masking for
+Biomedical Relation Extraction (EMBRE) method for biomedical relation
+extraction, as applied in the context of the BioRED challenge Task 1, in
+which human-annotated entities are provided as input. Specifically, we
+integrate entity knowledge into a deep neural network by pretraining the
+backbone model with an entity masking objective. We randomly mask named
+entities for each instance and let the model identify the masked entity along
+with its type. In this way, the model is capable of learning more specific
+knowledge and more robust representations. Then, we utilize the pre-trained
+model as our backbone to encode language representations and feed these
+representations into two multilayer perceptrons (MLPs) to predict the logits
+for relation and novelty, respectively. The experimental results demonstrate
+that our proposed method can improve the performance of entity pair,
+relation, and novelty extraction over our baseline.
+
&#x0D;
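To make the entity-masking objective above concrete, here is a small, hedged sketch of how annotated entity mentions could be replaced by mask tokens so that the backbone must recover them; the span format and mask token are illustrative assumptions, not the paper's exact implementation.

```python
def mask_entities(tokens, entity_spans, mask_token="[MASK]"):
    """Replace annotated entity spans with mask tokens.

    Returns the masked token sequence and the entity mentions (targets)
    the model is trained to recover; spans are (start, end), end exclusive.
    """
    masked, targets = list(tokens), []
    for start, end in entity_spans:
        targets.append(tokens[start:end])
        for i in range(start, end):
            masked[i] = mask_token
    return masked, targets

masked, targets = mask_entities(
    ["Aspirin", "inhibits", "COX-1", "."], [(0, 1), (2, 3)])
# masked  -> ['[MASK]', 'inhibits', '[MASK]', '.']
# targets -> [['Aspirin'], ['COX-1']]
```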
+
+ comment: 5 pages, 1 figure +
+
+
+
+
+ + ☆ The What, Why, and How of Context Length Extension Techniques in Large + Language Models -- A Detailed Survey + + +
+ The advent of Large Language Models (LLMs) represents a notable breakthrough +in Natural Language Processing (NLP), contributing to substantial progress in +both text comprehension and generation. However, amidst these advancements, it +is noteworthy that LLMs often face a limitation in terms of context length +extrapolation. Understanding and extending the context length for LLMs is +crucial in enhancing their performance across various NLP applications. In this +survey paper, we delve into the multifaceted aspects of exploring why it is +essential, and the potential transformations that superior techniques could +bring to NLP applications. We study the inherent challenges associated with +extending context length and present an organized overview of the existing +strategies employed by researchers. Additionally, we discuss the intricacies of +evaluating context extension techniques and highlight the open challenges that +researchers face in this domain. Furthermore, we explore whether there is a +consensus within the research community regarding evaluation standards and +identify areas where further agreement is needed. This comprehensive survey +aims to serve as a valuable resource for researchers, guiding them through the +nuances of context length extension techniques and fostering discussions on +future advancements in this evolving field. + +
+
+
+
+
+ + ☆ JumpCoder: Go Beyond Autoregressive Coder via Online Modification + + +
+ While existing code large language models (code LLMs) exhibit impressive
+capabilities in code generation, their autoregressive sequential generation
+inherently lacks reversibility. This limitation hinders them from timely
+correcting previous missing statements during coding as humans do, often
+leading to error propagation and suboptimal performance. We introduce
+JumpCoder, a novel model-agnostic framework that enables online modification
+and non-sequential generation to augment the code LLMs. The key idea behind
+JumpCoder is to insert new code into the currently generated code when
+necessary during generation, which is achieved through an auxiliary infilling
+model that works in tandem with the code LLM. Since identifying the best
+infill position beforehand is intractable, we adopt an infill-first,
+judge-later strategy, which experiments with filling at the $k$ most critical
+positions following the generation of each line, and uses an Abstract Syntax
+Tree (AST) parser alongside the Generation Model Scoring to effectively judge
+the validity of each potential infill. Extensive experiments using six
+state-of-the-art code LLMs across multiple benchmarks consistently indicate
+significant improvements over all baselines. Notably, JumpCoder assists code
+LLMs in achieving up to a 3.6% increase in Pass@1 for Python, 6.3% for Java,
+and 3.7% for C++ in the multilingual HumanEval benchmarks. Our code is public
+at https://github.com/Keytoyze/JumpCoder.
+
&#x0D;
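As a rough illustration of the infill-first, judge-later idea above, the sketch below accepts a candidate infill only if the patched code still parses and a generator-scoring function prefers it; `generation_score` is a hypothetical stand-in for the scoring criterion, not the paper's exact procedure.

```python
import ast

def accept_infill(code_without: str, code_with: str, generation_score) -> bool:
    """Keep an infill only if the patched code parses and scores higher."""
    try:
        ast.parse(code_with)       # validity check via the AST parser
    except SyntaxError:
        return False
    return generation_score(code_with) > generation_score(code_without)
```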
+
+
+
+
+ + ☆ Consolidating Trees of Robotic Plans Generated Using Large Language + Models to Improve Reliability + + +
+ The inherent probabilistic nature of Large Language Models (LLMs) introduces
+an element of unpredictability, raising concerns about potential
+discrepancies in their output. This paper introduces an innovative approach
+that aims to generate correct and optimal robotic task plans for diverse
+real-world demands and scenarios. LLMs have been used to generate task plans,
+but they are unreliable and may contain wrong, questionable, or high-cost
+steps. The proposed approach uses an LLM to generate a number of task plans
+as trees and amalgamates them into a graph by removing questionable paths.
+Then an optimal task tree can be retrieved to circumvent questionable and
+high-cost nodes, thereby improving planning accuracy and execution
+efficiency. The approach is further improved by incorporating a large
+knowledge network. Leveraging GPT-4 further, the high-level task plan is
+converted into a low-level Planning Domain Definition Language (PDDL) plan
+executable by a robot. Evaluation results highlight the superior accuracy and
+efficiency of our approach compared to previous methodologies in the field of
+task planning.
+
&#x0D;
+
+
+
+
+ + ☆ Authorship Obfuscation in Multilingual Machine-Generated Text Detection + + +
+ High-quality text generation capability of latest Large Language Models +(LLMs) causes concerns about their misuse (e.g., in massive generation/spread +of disinformation). Machine-generated text (MGT) detection is important to cope +with such threats. However, it is susceptible to authorship obfuscation (AO) +methods, such as paraphrasing, which can cause MGTs to evade detection. So far, +this was evaluated only in monolingual settings. Thus, the susceptibility of +recently proposed multilingual detectors is still unknown. We fill this gap by +comprehensively benchmarking the performance of 10 well-known AO methods, +attacking 37 MGT detection methods against MGTs in 11 languages (i.e., 10 +$\times$ 37 $\times$ 11 = 4,070 combinations). We also evaluate the effect of +data augmentation on adversarial robustness using obfuscated texts. The results +indicate that all tested AO methods can cause detection evasion in all tested +languages, where homoglyph attacks are especially successful. + +
+
+
+
+
+ + ☆ Unlocking Efficiency in Large Language Model Inference: A Comprehensive + Survey of Speculative Decoding + + +
+ To mitigate the high inference latency stemming from autoregressive decoding +in Large Language Models (LLMs), Speculative Decoding has emerged as a novel +decoding paradigm for LLM inference. In each decoding step, this method first +efficiently drafts several future tokens and then verifies them in parallel. +Unlike autoregressive decoding, Speculative Decoding facilitates the +simultaneous decoding of multiple tokens per step, thereby accelerating +inference. This paper presents a comprehensive overview and analysis of this +promising decoding paradigm. We begin by providing a formal definition and +formulation of Speculative Decoding. Then, we organize in-depth discussions on +its key facets, including current leading techniques, the challenges faced, and +potential future directions in this field. We aim for this work to serve as a +catalyst for further research on Speculative Decoding, ultimately contributing +to more efficient LLM inference. + +
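For readers new to the paradigm, here is a hedged, model-free sketch of the draft-then-verify loop summarized above, in its simple greedy-verification form; `draft_next` and `target_argmax_batch` are hypothetical stand-ins for a cheap draft model and the large target model, and the loop is a simplification rather than any specific method from the survey.

```python
def speculative_decode(prefix, draft_next, target_argmax_batch,
                       gamma=4, max_new_tokens=64):
    """Greedy draft-then-verify decoding loop.

    draft_next(tokens) -> next token from the cheap draft model.
    target_argmax_batch(prefix, drafted) -> the target model's greedy choice
    at each drafted position plus one bonus position (gamma + 1 tokens),
    computed in a single parallel forward pass.
    """
    out = list(prefix)
    while len(out) - len(prefix) < max_new_tokens:
        # 1) draft gamma future tokens autoregressively with the cheap model
        drafted, ctx = [], list(out)
        for _ in range(gamma):
            token = draft_next(ctx)
            drafted.append(token)
            ctx.append(token)
        # 2) verify all drafted positions with one pass of the target model
        target_choices = target_argmax_batch(out, drafted)
        accepted = 0
        while accepted < gamma and drafted[accepted] == target_choices[accepted]:
            accepted += 1
        out.extend(drafted[:accepted])
        # 3) append the target's own token: a correction on mismatch,
        #    or a free bonus token if every drafted token was accepted
        out.append(target_choices[accepted])
    return out
```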
+
+
+
+
+ + ☆ Milestones in Bengali Sentiment Analysis leveraging Transformer-models: + Fundamentals, Challenges and Future Directions + + +
+ Sentiment Analysis (SA) refers to the task of associating a view polarity +(usually, positive, negative, or neutral; or even fine-grained such as slightly +angry, sad, etc.) to a given text, essentially breaking it down to a supervised +(since we have the view labels apriori) classification task. Although heavily +studied in resource-rich languages such as English thus pushing the SOTA by +leaps and bounds, owing to the arrival of the Transformer architecture, the +same cannot be said for resource-poor languages such as Bengali (BN). For a +language spoken by roughly 300 million people, the technology enabling them to +run trials on their favored tongue is severely lacking. In this paper, we +analyze the SOTA for SA in Bengali, particularly, Transformer-based models. We +discuss available datasets, their drawbacks, the nuances associated with +Bengali i.e. what makes this a challenging language to apply SA on, and finally +provide insights for future direction to mitigate the limitations in the field. + +
+
+
+
+
+ + ☆ Question Translation Training for Better Multilingual Reasoning + + +
+ Large language models show compelling performance on reasoning tasks but
+they tend to perform much worse in languages other than English. This is
+unsurprising given that their training data largely consists of English text
+and instructions. A typical solution is to translate instruction data into
+all languages of interest, and then train on the resulting multilingual data,
+which is called translate-training. This approach not only incurs high cost,
+but also results in poorly translated data due to the non-standard formatting
+of chain-of-thought and mathematical reasoning instructions. In this paper,
+we explore the benefits of question alignment, where we train the model to
+translate reasoning questions into English by finetuning on X-English
+question data. In this way we perform targeted, in-domain language alignment
+which makes the best use of English instruction data to unlock the LLMs'
+multilingual reasoning abilities. Experimental results on LLaMA2-13B show
+that question alignment leads to consistent improvements over the
+translate-training approach: an average improvement of 11.3% and 16.1%
+accuracy across ten languages on the MGSM and MSVAMP maths reasoning
+benchmarks (the project will be available at:
+https://github.com/NJUNLP/QAlign).
+
&#x0D;
+
+
+
+
+ + ☆ Wikidata as a seed for Web Extraction + + +
+ Wikidata has grown to a knowledge graph with an impressive size. To date,
+it contains more than 17 billion triples collecting information about people,
+places, films, stars, publications, proteins, and many more. On the other
+side, most of the information on the Web is not published in highly
+structured data repositories like Wikidata, but rather as unstructured and
+semi-structured content, more concretely in HTML pages containing text and
+tables. Finding, monitoring, and organizing this data in a knowledge graph
+requires considerable work from human editors. The volume and complexity of
+the data make this task difficult and time-consuming. In this work, we
+present a framework that is able to identify and extract new facts that are
+published under multiple Web domains so that they can be proposed for
+validation by Wikidata editors. The framework relies on question-answering
+technologies. We take inspiration from ideas that are used to extract facts
+from textual collections and adapt them to extract facts from Web pages. For
+achieving this, we demonstrate that language models can be adapted to extract
+facts not only from textual collections but also from Web pages. By
+exploiting the information already contained in Wikidata, the proposed
+framework can be trained without the need for any additional learning signals
+and can extract new facts for a wide range of properties and domains.
+Following this path, Wikidata can be used as a seed to extract facts on the
+Web. Our experiments show that we can achieve a mean F1-score of 84.07.
+Moreover, our estimations show that we can potentially extract millions of
+facts that can be proposed for human validation. The goal is to help editors
+in their daily tasks and contribute to the completion of the Wikidata
+knowledge graph.
+
&#x0D;
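As a loose illustration of the question-answering-based extraction the abstract outlines, the sketch below phrases a property as a question and runs an off-the-shelf extractive QA model over a page's text; the model checkpoint and question template are our own assumptions, not the paper's trained components.

```python
from transformers import pipeline

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

def propose_fact(page_text: str, subject: str, prop_label: str) -> dict:
    """Phrase a property as a question and extract a candidate value."""
    answer = qa(question=f"What is the {prop_label} of {subject}?",
                context=page_text)
    return {"subject": subject, "property": prop_label,
            "value": answer["answer"], "score": answer["score"]}

page = ("Interstellar is a 2014 epic science fiction film directed by "
        "Christopher Nolan.")
print(propose_fact(page, "Interstellar", "director"))
```

Low-confidence proposals would simply be dropped, and the rest routed to human editors for validation.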
+
+
+
+
+ + ☆ Consolidating Strategies for Countering Hate Speech Using Persuasive + Dialogues + + +
+ Hateful comments are prevalent on social media platforms. Although tools for +automatically detecting, flagging, and blocking such false, offensive, and +harmful content online have lately matured, such reactive and brute force +methods alone provide short-term and superficial remedies while the +perpetrators persist. With the public availability of large language models +which can generate articulate synthetic and engaging content at scale, there +are concerns about the rapid growth of dissemination of such malicious content +on the web. There is now a need to focus on deeper, long-term solutions that +involve engaging with the human perpetrator behind the source of the content to +change their viewpoint or at least bring down the rhetoric using persuasive +means. To do that, we propose defining and experimenting with controllable +strategies for generating counter-arguments to hateful comments in online +conversations. We experiment with controlling response generation using +features based on (i) argument structure and reasoning-based Walton argument +schemes, (ii) counter-argument speech acts, and (iii) human +characteristics-based qualities such as Big-5 personality traits and human +values. Using automatic and human evaluations, we determine the best +combination of features that generate fluent, argumentative, and logically +sound arguments for countering hate. We further share the developed +computational models for automatically annotating text with such features, and +a silver-standard annotated version of an existing hate speech dialog corpora. + +
+
+
+
+
+ + ☆ Flexibly Scaling Large Language Models Contexts Through Extensible + Tokenization + + +
+ Large language models (LLMs) are in need of sufficient contexts to handle
+many critical applications, such as retrieval augmented generation and
+few-shot learning. However, due to the constrained window size, the LLMs can
+only access the information within a limited context. Although the size of
+the context window can be extended by fine-tuning, it will result in a
+substantial cost in both the training and inference stages. In this paper, we
+present Extensible Tokenization as an alternative method which realizes the
+flexible scaling of LLMs' context. Extensible Tokenization stands as a
+midware between the tokenized context and the LLM, which transforms the raw
+token embeddings into extensible embeddings. Such embeddings provide a more
+compact representation for the long context, on top of which the LLM is able
+to perceive more information with the same context window. Extensible
+Tokenization is also featured by its flexibility: the scaling factor can be
+flexibly determined within a feasible scope, leading to the extension of an
+arbitrary context length at inference time. Besides, Extensible Tokenization
+is introduced as a drop-in component, which can be seamlessly plugged into
+not only the LLM itself but also its fine-tuned derivatives, bringing in the
+extended contextual information while fully preserving the LLM's existing
+capabilities. We perform comprehensive experiments on long-context language
+modeling and understanding tasks, which verify Extensible Tokenization as an
+effective, efficient, flexible, and compatible method to extend LLMs'
+context. Our model and source code will be made publicly available.
+
&#x0D;
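Purely as intuition for how a "midware" that compresses token embeddings could let more raw tokens fit into a fixed window, here is a heavily simplified sketch that mean-pools groups of k embeddings; the paper learns this mapping, so this pooling is our own stand-in, not the proposed method.

```python
import torch

def compress_context(token_embeddings: torch.Tensor, k: int) -> torch.Tensor:
    """Mean-pool every k token embeddings into one compact embedding."""
    seq_len, hidden = token_embeddings.shape
    pad = (-seq_len) % k                      # pad so seq_len divides by k
    if pad:
        token_embeddings = torch.cat(
            [token_embeddings, token_embeddings.new_zeros(pad, hidden)])
    return token_embeddings.view(-1, k, hidden).mean(dim=1)

emb = torch.randn(1000, 4096)              # a long tokenized context
print(compress_context(emb, k=8).shape)    # torch.Size([125, 4096])
```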
+
+
+
+
+ + ☆ Quantum Transfer Learning for Acceptability Judgements + + +
+ Hybrid quantum-classical classifiers promise to positively impact critical +aspects of natural language processing tasks, particularly +classification-related ones. Among the possibilities currently investigated, +quantum transfer learning, i.e., using a quantum circuit for fine-tuning +pre-trained classical models for a specific task, is attracting significant +attention as a potential platform for proving quantum advantage. + This work shows potential advantages, both in terms of performance and +expressiveness, of quantum transfer learning algorithms trained on embedding +vectors extracted from a large language model to perform classification on a +classical Linguistics task: acceptability judgments. Acceptability judgment is +the ability to determine whether a sentence is considered natural and +well-formed by a native speaker. The approach has been tested on sentences +extracted from ItaCoLa, a corpus that collects Italian sentences labeled with +their acceptability judgment. The evaluation phase shows results for the +quantum transfer learning pipeline comparable to state-of-the-art classical +transfer learning algorithms, proving current quantum computers' capabilities +to tackle NLP tasks for ready-to-use applications. Furthermore, a qualitative +linguistic analysis, aided by explainable AI methods, reveals the capabilities +of quantum transfer learning algorithms to correctly classify complex and more +structured sentences, compared to their classical counterpart. This finding +sets the ground for a quantifiable quantum advantage in NLP in the near future. + +
+
+
+
+
+ + ☆ On the importance of Data Scale in Pretraining Arabic Language Models + + +
+ Pretraining monolingual language models has been proven to be vital for
+performance in Arabic Natural Language Processing (NLP) tasks. In this paper,
+we conduct a comprehensive study on the role of data in Arabic Pretrained
+Language Models (PLMs). More precisely, we reassess the performance of a
+suite of state-of-the-art Arabic PLMs by retraining them on massive-scale,
+high-quality Arabic corpora. We have significantly improved the performance
+of the leading Arabic encoder-only BERT-base and encoder-decoder T5-base
+models on the ALUE and ORCA leaderboards, thereby reporting state-of-the-art
+results in their respective model categories. In addition, our analysis
+strongly suggests that pretraining data is by far the primary contributor to
+performance, surpassing other factors. Our models and source code are
+publicly available at
+https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/JABER-PyTorch.
+
&#x0D;
+
+
+
+
+ + ☆ Prompting open-source and commercial language models for grammatical + error correction of English learner text + + +
+ Thanks to recent advances in generative AI, we are able to prompt large +language models (LLMs) to produce texts which are fluent and grammatical. In +addition, it has been shown that we can elicit attempts at grammatical error +correction (GEC) from LLMs when prompted with ungrammatical input sentences. We +evaluate how well LLMs can perform at GEC by measuring their performance on +established benchmark datasets. We go beyond previous studies, which only +examined GPT* models on a selection of English GEC datasets, by evaluating +seven open-source and three commercial LLMs on four established GEC benchmarks. +We investigate model performance and report results against individual error +types. Our results indicate that LLMs do not always outperform supervised +English GEC models except in specific contexts -- namely commercial LLMs on +benchmarks annotated with fluency corrections as opposed to minimal edits. We +find that several open-source models outperform commercial ones on minimal edit +benchmarks, and that in some settings zero-shot prompting is just as +competitive as few-shot prompting. + +
+
+ comment: 8 pages with appendices +
+
+
+
+
+ + ☆ Assisted Knowledge Graph Authoring: Human-Supervised Knowledge Graph + Construction from Natural Language + + +
+ Encyclopedic knowledge graphs, such as Wikidata, host an extensive repository +of millions of knowledge statements. However, domain-specific knowledge from +fields such as history, physics, or medicine is significantly underrepresented +in those graphs. Although few domain-specific knowledge graphs exist (e.g., +Pubmed for medicine), developing specialized retrieval applications for many +domains still requires constructing knowledge graphs from scratch. To +facilitate knowledge graph construction, we introduce WAKA: a Web application +that allows domain experts to create knowledge graphs through the medium with +which they are most familiar: natural language. + +
+
+ comment: accepted at CHIIR 2024 +
+
+
+
+
+ + ☆ MAPLE: Multilingual Evaluation of Parameter Efficient Finetuning of + Large Language Models + + +
+ Parameter efficient finetuning has emerged as a viable solution for improving +the performance of Large Language Models without requiring massive resources +and compute. Prior work on multilingual evaluation has shown that there is a +large gap between the performance of LLMs on English and other languages. +Further, there is also a large gap between the performance of smaller +open-source models and larger LLMs. Finetuning can be an effective way to +bridge this gap and make language models more equitable. In this work, we +finetune the LLaMA-7B and Mistral-7B models on synthetic multilingual +instruction tuning data to determine its effect on model performance on five +downstream tasks covering twenty three languages in all. Additionally, we +experiment with various parameters, such as rank for low-rank adaptation and +values of quantisation to determine their effects on downstream performance and +find that higher rank and higher quantisation values benefit low-resource +languages. We find that parameter efficient finetuning of smaller open source +models sometimes bridges the gap between the performance of these models and +the larger ones, however, English performance can take a hit. We also find that +finetuning sometimes improves performance on low-resource languages, while +degrading performance on high-resource languages. + +
+
+ comment: 23 pages, 23 figures, 14 tables +
+
+
+
+
+ + ☆ Cascaded Cross-Modal Transformer for Audio-Textual Classification + + +
+ Speech classification tasks often require powerful language understanding +models to grasp useful features, which becomes problematic when limited +training data is available. To attain superior classification performance, we +propose to harness the inherent value of multimodal representations by +transcribing speech using automatic speech recognition (ASR) models and +translating the transcripts into different languages via pretrained translation +models. We thus obtain an audio-textual (multimodal) representation for each +data sample. Subsequently, we combine language-specific Bidirectional Encoder +Representations from Transformers (BERT) with Wav2Vec2.0 audio features via a +novel cascaded cross-modal transformer (CCMT). Our model is based on two +cascaded transformer blocks. The first one combines text-specific features from +distinct languages, while the second one combines acoustic features with +multilingual features previously learned by the first transformer block. We +employed our system in the Requests Sub-Challenge of the ACM Multimedia 2023 +Computational Paralinguistics Challenge. CCMT was declared the winning +solution, obtaining an unweighted average recall (UAR) of 65.41% and 85.87% for +complaint and request detection, respectively. Moreover, we applied our +framework on the Speech Commands v2 and HarperValleyBank dialog data sets, +surpassing previous studies reporting results on these benchmarks. Our code is +freely available for download at: https://github.com/ristea/ccmt. + +
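The cascaded design can be pictured with a rough PyTorch sketch; the dimensions, block counts, and pooling below are placeholders rather than the published CCMT configuration. The first block mixes language-specific text features, and the second fuses them with acoustic features.

```python
import torch
import torch.nn as nn

class CascadedCrossModalSketch(nn.Module):
    """Two cascaded transformer blocks: text features first, then fusion with audio."""
    def __init__(self, d_model=256, n_heads=4, n_classes=2):
        super().__init__()
        self.text_block = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True), num_layers=1)
        self.fusion_block = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True), num_layers=1)
        self.classifier = nn.Linear(d_model, n_classes)

    def forward(self, text_feats, audio_feats):
        # text_feats: (batch, n_languages, d_model), one embedding per transcript language
        # audio_feats: (batch, n_audio_tokens, d_model), e.g. projected audio features
        text = self.text_block(text_feats)                                  # combine text-specific features
        fused = self.fusion_block(torch.cat([text, audio_feats], dim=1))    # add acoustic features
        return self.classifier(fused.mean(dim=1))                           # pooled multimodal prediction
```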
+
+
+
+
+ + ☆ Exploiting GPT-4 Vision for Zero-shot Point Cloud Understanding + + +
+ In this study, we tackle the challenge of classifying the object category in +point clouds, which previous works like PointCLIP struggle to address due to +the inherent limitations of the CLIP architecture. Our approach leverages GPT-4 +Vision (GPT-4V) to overcome these challenges by employing its advanced +generative abilities, enabling a more adaptive and robust classification +process. We adapt the application of GPT-4V to process complex 3D data, +enabling it to achieve zero-shot recognition capabilities without altering the +underlying model architecture. Our methodology also includes a systematic +strategy for point cloud image visualization, mitigating domain gap and +enhancing GPT-4V's efficiency. Experimental validation demonstrates our +approach's superiority in diverse scenarios, setting a new benchmark in +zero-shot point cloud classification. + +
+
+
+
+
+ + ☆ Safe Reinforcement Learning with Free-form Natural Language Constraints + and Pre-Trained Language Models + + +
+ Safe reinforcement learning (RL) agents accomplish given tasks while adhering
+to specific constraints. Employing constraints expressed via
+easily-understandable human language offers considerable potential for
+real-world applications due to its accessibility and non-reliance on domain
+expertise. Previous safe RL methods with natural language constraints typically
+adopt a recurrent neural network, which leads to limited capabilities when
+dealing with various forms of human language input. Furthermore, these methods
+often require a ground-truth cost function, necessitating domain expertise for
+the conversion of language constraints into a well-defined cost function that
+determines constraint violation. To address these issues, we propose to use
+pre-trained language models (LM) to facilitate RL agents' comprehension of
+natural language constraints and allow them to infer costs for safe policy
+learning. Through the use of pre-trained LMs and the elimination of the need
+for a ground-truth cost, our method enhances safe policy learning under a
+diverse set of human-derived free-form natural language constraints.
+Experiments on grid-world navigation and robot control show that the proposed
+method can achieve strong performance while adhering to given constraints. The
+usage of pre-trained LMs allows our method to comprehend complicated
+constraints and learn safe policies without the need for ground-truth cost at
+any stage of training or evaluation. Extensive ablation studies are conducted
+to demonstrate the efficacy of each part of our method.
+
+
+
+
+
+ + ☆ See the Unseen: Better Context-Consistent Knowledge-Editing by Noises + + +
+ Knowledge-editing updates knowledge of large language models (LLMs) and
+contributes to the interpretability and application of LLMs. However, applying
+knowledge is context-consistent: LLMs can recall the same knowledge in different
+contexts. Existing works ignore this property and the editing lacks
+generalization. In this paper, we empirically find that the effects of
+different contexts upon LLMs in recalling the same knowledge follow a
+Gaussian-like distribution. We then sample Gaussian noises to simulate the
+effects of different contexts when updating LLMs. In this way, we can make LLMs see
+the unseen contexts where the edited knowledge will be applied, therefore
+improving the editing generalization. Experimental results on three LLMs
+demonstrate the effectiveness of our methods and also distinguish our methods
+from other approaches that fine-tune LLMs with noises.
+
+
+
+
+
+ + ☆ MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of + Multimodal Large Language Models in Perception + + +
+ Multimodal Large Language Models (MLLMs) have shown their remarkable +abilities in visual perception and understanding recently. However, how to +comprehensively evaluate the capabilities of MLLMs remains a challenge. Most of +the existing benchmarks predominantly focus on assessing perception, cognition, +and reasoning, neglecting the abilities of self-awareness, referring to the +model's recognition of its own capability boundary. In our study, we focus on +self-awareness in image perception and introduce the knowledge quadrant for +MLLMs, which clearly defines the knowns and unknowns in perception. Based on +this, we propose a novel benchmark specifically designed to evaluate the +Self-Aware capabilities in Perception for MLLMs(MM-SAP). MM-SAP encompasses +three distinct sub-datasets, each focusing on different aspects of +self-awareness. We evaluated eight well-known MLLMs using MM-SAP, analyzing +their self-awareness and providing detailed insights. Code and data are +available at https://github.com/YHWmz/MM-SAP + +
+
+
+
+
+ + ☆ Editing Arbitrary Propositions in LLMs without Subject Labels + + +
+ Large Language Model (LLM) editing modifies factual information in LLMs.
+Locate-and-Edit (L\&E) methods accomplish this by finding where relevant
+information is stored within the neural network, and editing the weights at
+that location. The goal of editing is to modify the response of an LLM to a
+proposition independently of its phrasing, while not modifying its response to
+other related propositions. Existing methods are limited to binary
+propositions, which represent straightforward binary relations between a
+subject and an object. Furthermore, existing methods rely on semantic subject
+labels, which may not be available or even be well-defined in practice. In this
+paper, we show that both of these issues can be effectively skirted with a
+simple and fast localization method called Gradient Tracing (GT). This
+localization method allows editing arbitrary propositions instead of just
+binary ones, and does so without the need for subject labels. As propositions
+always have a truth value, our experiments prompt an LLM as a boolean
+classifier, and edit its T/F response to propositions. Our method applies GT
+for location tracing, and then edits the model at that location using a mild
+variant of Rank-One Model Editing (ROME). On datasets of binary propositions
+derived from the CounterFact dataset, we show that our method -- without access
+to subject labels -- performs close to state-of-the-art L\&E methods which have
+access to subject labels. We then introduce a new dataset, Factual Accuracy
+Classification Test (FACT), which includes non-binary propositions and for
+which subject labels are not generally applicable, and therefore is beyond the
+scope of existing L\&E methods. Nevertheless, we show that with our method
+editing is possible on FACT.
+
+
+
+
+
+ + ☆ TAROT: A Hierarchical Framework with Multitask Co-Pretraining on + Semi-Structured Data towards Effective Person-Job Fit ICASSP 2024 + + +
+ Person-job fit is an essential part of online recruitment platforms in
+serving various downstream applications like Job Search and Candidate
+Recommendation. Recently, pretrained large language models have further
+enhanced the effectiveness by leveraging richer textual information in user
+profiles and job descriptions apart from user behavior features and job
+metadata. However, the general domain-oriented design struggles to capture the
+unique structural information within user profiles and job descriptions,
+leading to a loss of latent semantic correlations. We propose TAROT, a
+hierarchical multitask co-pretraining framework, to better utilize structural
+and semantic information for informative text embeddings. TAROT targets
+semi-structured text in profiles and jobs, and it is co-pretrained with
+multi-grained pretraining tasks to constrain the acquired semantic information
+at each level. Experiments on a real-world LinkedIn dataset show significant
+performance improvements, proving its effectiveness in person-job fit tasks.
+
+
+ comment: ICASSP 2024 camera ready. 5 pages, 1 figure, 3 tables +
+
+
+
+
+ + ☆ Survey of Natural Language Processing for Education: Taxonomy, + Systematic Review, and Future Trends + + +
+ Natural Language Processing (NLP) aims to analyze text using techniques from
+the computer science field. It serves applications in the healthcare, commerce,
+and education domains. Particularly, NLP has been applied to the education
+domain to help teaching and learning. In this survey, we review recent advances
+in NLP with a focus on solving problems related to the education domain. In
+detail, we begin by introducing the relevant background. Then, we present the
+taxonomy of NLP in the education domain. Next, we illustrate the task
+definition, challenges, and corresponding techniques based on the above
+taxonomy. After that, we showcase some off-the-shelf demonstrations in this
+domain and conclude with future directions.
+
+
+
+
+
+ + ☆ Developing ChatGPT for Biology and Medicine: A Complete Review of + Biomedical Question Answering + + +
+ ChatGPT explores a strategic blueprint of question answering (QA) in
+delivering medical diagnosis, treatment recommendations, and other healthcare
+support. This is achieved through the increasing incorporation of medical
+domain data via natural language processing (NLP) and multimodal paradigms. By
+transitioning the distribution of text, images, videos, and other modalities
+from the general domain to the medical domain, these techniques have expedited
+the progress of medical domain question answering (MDQA). They bridge the gap
+between human natural language and sophisticated medical domain knowledge or
+expert manual annotations, handling large-scale, diverse, unbalanced, or even
+unlabeled data analysis scenarios in medical contexts. Central to our focus is
+the utilization of language models and multimodal paradigms for medical question
+answering, aiming to guide the research community in selecting appropriate
+mechanisms for their specific medical research requirements. Specialized tasks
+such as unimodal-related question answering, reading comprehension, reasoning,
+diagnosis, relation extraction, probability modeling, and others, as well as
+multimodal-related tasks like vision question answering, image captioning,
+cross-modal retrieval, report summarization, and generation, are discussed in
+detail. Each section delves into the intricate specifics of the respective
+method under consideration. This paper highlights the structures and
+advancements of medical domain explorations against general domain methods,
+emphasizing their applications across different tasks and datasets. It also
+outlines current challenges and opportunities for future medical domain
+research, paving the way for continued innovation and application in this
+rapidly evolving field.
+
+
+ comment: 50 pages, 3 figures, Biophysics Reports +
+
+
+
+
+ + ☆ GWPT: A Green Word-Embedding-based POS Tagger + + +
+ As a fundamental tool for natural language processing (NLP), the +part-of-speech (POS) tagger assigns the POS label to each word in a sentence. A +novel lightweight POS tagger based on word embeddings is proposed and named +GWPT (green word-embedding-based POS tagger) in this work. Following the green +learning (GL) methodology, GWPT contains three modules in cascade: 1) +representation learning, 2) feature learning, and 3) decision learning modules. +The main novelty of GWPT lies in representation learning. It uses +non-contextual or contextual word embeddings, partitions embedding dimension +indices into low-, medium-, and high-frequency sets, and represents them with +different N-grams. It is shown by experimental results that GWPT offers +state-of-the-art accuracies with fewer model parameters and significantly lower +computational complexity in both training and inference as compared with +deep-learning-based methods. + +
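A hedged sketch of the partitioning step follows; the abstract does not state the exact frequency measure, so ranking embedding dimensions by their variance across a corpus is purely an illustrative assumption, and the split fractions are placeholders.

```python
import numpy as np

def partition_embedding_dims(embeddings, low_frac=1/3, high_frac=1/3):
    """Split embedding dimension indices into low-, medium-, and high-frequency groups.

    embeddings: (n_tokens, dim) array of word embeddings.
    The ranking criterion (per-dimension variance) is an illustrative
    assumption; the paper's actual frequency measure may differ.
    """
    variance = embeddings.var(axis=0)
    order = np.argsort(variance)                 # ascending "frequency" proxy
    dim = embeddings.shape[1]
    n_low, n_high = int(dim * low_frac), int(dim * high_frac)
    low = order[:n_low]
    high = order[-n_high:]
    medium = order[n_low:dim - n_high]
    return low, medium, high
```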
+
+
+
+
+ + ☆ Utilizing deep learning models for the identification of enhancers and + super-enhancers based on genomic and epigenomic features + + +
+ This paper provides an extensive examination of a sizable dataset of English
+tweets focusing on nine widely recognized cryptocurrencies, specifically
+Cardano, Binance, Bitcoin, Dogecoin, Ethereum, Fantom, Matic, Shiba, and
+Ripple. Our primary objective was to conduct a psycholinguistic and emotion
+analysis of social media content associated with these cryptocurrencies,
+enabling investigators to make more informed decisions. The study involved
+comparing linguistic characteristics across the diverse digital coins, shedding
+light on the distinctive linguistic patterns that emerge within each coin's
+community. To achieve this, we utilized advanced text analysis techniques.
+Additionally, our work unveiled an intriguing understanding of the interplay
+between these digital assets within the cryptocurrency community. By examining
+which coin pairs are mentioned together most frequently in the dataset, we
+established correlations between different cryptocurrencies. To ensure the
+reliability of our findings, we initially gathered a total of 832,559 tweets
+from Twitter. These tweets underwent a rigorous preprocessing stage, resulting
+in a refined dataset of 115,899 tweets that were used for our analysis.
+Overall, our research offers valuable insight into the linguistic nuances of
+various digital coins' online communities and provides a deeper understanding
+of their interactions in the cryptocurrency space.
+
+
+ comment: 13 pages, 7 figures, 6 Tables +
+
+
+
+
+ + ☆ Only Send What You Need: Learning to Communicate Efficiently in + Federated Multilingual Machine Translation + + +
+ Federated learning (FL) is a promising approach for solving multilingual +tasks, potentially enabling clients with their own language-specific data to +collaboratively construct a high-quality neural machine translation (NMT) +model. However, communication constraints in practical network systems present +challenges for exchanging large-scale NMT engines between FL parties. In this +paper, we propose a meta-learning-based adaptive parameter selection +methodology, MetaSend, that improves the communication efficiency of model +transmissions from clients during FL-based multilingual NMT training. Our +approach learns a dynamic threshold for filtering parameters prior to +transmission without compromising the NMT model quality, based on the tensor +deviations of clients between different FL rounds. Through experiments on two +NMT datasets with different language distributions, we demonstrate that +MetaSend obtains substantial improvements over baselines in translation quality +in the presence of a limited communication budget. + +
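One way to picture the filtering step is the sketch below; the real method meta-learns a dynamic threshold, whereas here the threshold is a fixed placeholder and tensor deviation is reduced to a mean absolute difference between rounds.

```python
import torch

def select_parameters_to_send(current_state, previous_state, threshold):
    """Keep only tensors whose deviation from the previous FL round exceeds a threshold.

    current_state / previous_state: dicts of parameter tensors (state_dict style).
    `threshold` is a fixed placeholder used only to illustrate the filtering step;
    the described method learns it dynamically.
    """
    payload = {}
    for name, tensor in current_state.items():
        deviation = (tensor - previous_state[name]).abs().mean().item()
        if deviation > threshold:
            payload[name] = tensor          # worth transmitting this round
    return payload
```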
+
+
+
+
+ + ☆ Model Editing at Scale leads to Gradual and Catastrophic Forgetting + + +
+ Editing knowledge in large language models is an attractive capability to +have which allows us to correct incorrectly learnt facts during pre-training, +as well as update the model with an ever-growing list of new facts. While +existing model editing techniques have shown promise, they are usually +evaluated using metrics for reliability, specificity and generalization over +one or few edits. We argue that for model editing to have practical utility, we +must be able to make multiple edits to the same model. With this in mind, we +evaluate the current model editing methods at scale, focusing on two state of +the art methods: ROME and MEMIT. We find that as the model is edited +sequentially with multiple facts, it continually forgets previously edited +facts and the ability to perform downstream tasks. This forgetting happens in +two phases -- an initial gradual but progressive forgetting phase followed by +abrupt or catastrophic forgetting phase. Both gradual and catastrophic +forgetting limit the usefulness of model editing methods at scale -- the former +making model editing less effective as multiple edits are made to the model +while the latter caps the scalability of such model editing methods. Our +analysis also highlights other key limitations of ROME and MEMIT at scale. With +our work, we push for the development and evaluation of model editing methods +keeping scalability in mind. + +
+
+
+
+
+ + ☆ Taec: a Manually annotated text dataset for trait and phenotype + extraction and entity linking in wheat breeding literature + + +
+ Wheat varieties show a large diversity of traits and phenotypes. Linking them
+to genetic variability is essential for shorter and more efficient wheat
+breeding programs. Newly desirable wheat variety traits include disease
+resistance to reduce pesticide use, adaptation to climate change, resistance to
+heat and drought stresses, or low gluten content of grains. Wheat breeding
+experiments are documented by a large body of scientific literature and
+observational data obtained in-field and under controlled conditions. The
+cross-referencing of complementary information from the literature and
+observational data is essential to the study of the genotype-phenotype
+relationship and to the improvement of wheat selection. The scientific
+literature on genetic marker-assisted selection describes much information
+about the genotype-phenotype relationship. However, the variety of expressions
+used to refer to traits and phenotype values in scientific articles is a hindrance
+to finding information and cross-referencing it. When trained adequately on
+annotated examples, recent text mining methods perform well in named entity
+recognition and linking in the scientific domain. While several corpora contain
+annotations of human and animal phenotypes, currently, no corpus is available
+for training and evaluating named entity recognition and entity-linking methods
+in plant phenotype literature. The Triticum aestivum trait Corpus is a new gold
+standard for traits and phenotypes of wheat. It consists of 540 PubMed
+references fully annotated for trait, phenotype, and species named entities
+using the Wheat Trait and Phenotype Ontology and the species taxonomy of the
+National Center for Biotechnology Information. A study of the performance of
+tools trained on the Triticum aestivum trait Corpus shows that the corpus is
+suitable for the training and evaluation of named entity recognition and
+linking.
+
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Stability Analysis of ChatGPT-based Sentiment Analysis in AI Quality + Assurance + + +
+ In the era of large AI models, e.g., large language models (LLMs), the complex
+architectures and vast parameters present substantial challenges for effective
+AI quality management (AIQM). This paper focuses on investigating the quality
+assurance of a specific LLM-based AI product--a ChatGPT-based sentiment
+analysis system. The study delves into stability issues related to both the
+operation and robustness of the expansive AI model on which ChatGPT is based.
+Experimental analysis is conducted using benchmark datasets for sentiment
+analysis. The results reveal that the constructed ChatGPT-based sentiment
+analysis system exhibits uncertainty, which is attributed to various
+operational factors. The results also demonstrate that the system exhibits
+stability issues when handling conventional small text attacks, indicating
+limited robustness.
+
+
+
+
+
+ + ☆ Leveraging the power of transformers for guilt detection in text + + +
+ In recent years, language models and deep learning techniques have
+revolutionized natural language processing tasks, including emotion detection.
+However, the specific emotion of guilt has received limited attention in this
+field. In this research, we explore the applicability of three
+transformer-based language models for detecting guilt in text and compare their
+performance for general emotion detection and guilt detection. Our proposed
+model outperformed BERT and RoBERTa models by two and one points, respectively.
+Additionally, we analyze the challenges in developing accurate guilt-detection
+models and evaluate our model's effectiveness in detecting related emotions
+like "shame" through qualitative analysis of results.
+
+
+
+
+
+ + ♻ ☆ ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via + Tool Embeddings NeurIPS 2023 + + +
+ Augmenting large language models (LLMs) with external tools has emerged as a +promising approach to solving complex problems. However, traditional methods, +which finetune LLMs with tool demonstration data, can be both costly and +restricted to a predefined set of tools. Recent in-context learning paradigm +alleviates these issues, but the limited context length only allows for a few +shots of demonstrations, leading to suboptimal understandings of the tools. +Moreover, when there are numerous tools to choose from, in-context learning +could completely fail to work. In this paper, we propose an alternative +approach, $\textbf{ToolkenGPT}$, which combines the benefits of both sides. Our +approach represents each $\underline{tool}$ as a to$\underline{ken}$ +($\textit{toolken}$) and learns an embedding for it, enabling tool calls in the +same way as generating a regular word token. Once a toolken is triggered, the +LLM is prompted to complete arguments for the tool to execute. ToolkenGPT +offers the flexibility to plug in an arbitrary number of tools by expanding the +set of toolkens on the fly. In addition, it improves tool use by allowing +extensive demonstration data for learning the toolken embeddings. In diverse +domains, including numerical reasoning, knowledge-based question answering, and +embodied plan generation, our approach effectively augments LLMs with tools and +substantially outperforms various latest baselines. ToolkenGPT demonstrates the +promising ability to use relevant tools from a large tool set in complex +scenarios. + +
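The tool-as-token idea can be sketched by appending trainable embeddings to a frozen word-prediction head, as in the following PyTorch illustration; this is not the released ToolkenGPT code, and the initialization and head layout are assumptions made for the sketch.

```python
import torch
import torch.nn as nn

class ToolkenHead(nn.Module):
    """Frozen word-prediction head extended with trainable 'toolken' embeddings."""
    def __init__(self, word_head: nn.Linear, num_tools: int):
        super().__init__()
        self.word_head = word_head
        for p in self.word_head.parameters():
            p.requires_grad = False                       # the LM itself stays frozen
        hidden = word_head.in_features
        self.toolkens = nn.Parameter(torch.randn(num_tools, hidden) * 0.02)

    def forward(self, hidden_states):
        word_logits = self.word_head(hidden_states)       # (..., vocab_size)
        tool_logits = hidden_states @ self.toolkens.T     # (..., num_tools)
        # A toolken is predicted exactly like a regular token; only the
        # toolken embeddings receive gradients during training.
        return torch.cat([word_logits, tool_logits], dim=-1)
```

Adding a tool then amounts to growing `self.toolkens` by one row, which matches the plug-in flexibility described in the abstract.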
+
+ comment: NeurIPS 2023 (oral). Code: https://github.com/Ber666/ToolkenGPT +
+
+
+
+
+ + ♻ ☆ QuIP: 2-Bit Quantization of Large Language Models With Guarantees + + +
+ This work studies post-training parameter quantization in large language +models (LLMs). We introduce quantization with incoherence processing (QuIP), a +new method based on the insight that quantization benefits from +$\textit{incoherent}$ weight and Hessian matrices, i.e., from the weights being +even in magnitude and the directions in which it is important to round them +accurately being unaligned with the coordinate axes. QuIP consists of two +steps: (1) an adaptive rounding procedure minimizing a quadratic proxy +objective; (2) efficient pre- and post-processing that ensures weight and +Hessian incoherence via multiplication by random orthogonal matrices. We +complement QuIP with the first theoretical analysis for an LLM-scale +quantization algorithm, and show that our theory also applies to an existing +method, OPTQ. Empirically, we find that our incoherence preprocessing improves +several existing quantization algorithms and yields the first LLM quantization +methods that produce viable results using only two bits per weight. Our code +can be found at https://github.com/Cornell-RelaxML/QuIP. + +
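A stripped-down illustration of the incoherence processing step follows; it substitutes naive nearest rounding for QuIP's adaptive rounding procedure and is only a sketch, not the released implementation.

```python
import torch

def random_orthogonal(n):
    q, _ = torch.linalg.qr(torch.randn(n, n))
    return q

def quantize_with_incoherence(W, bits=2):
    """Rotate the weight matrix with random orthogonal matrices, round to a
    uniform grid, then rotate back. Naive nearest rounding stands in for the
    adaptive rounding step; this is an illustration, not the actual algorithm."""
    U = random_orthogonal(W.shape[0])
    V = random_orthogonal(W.shape[1])
    W_rot = U @ W @ V.T                           # incoherence processing
    levels = 2 ** bits
    scale = W_rot.abs().max() / (levels / 2)
    W_q = torch.clamp((W_rot / scale).round(), -(levels // 2), levels // 2 - 1) * scale
    return U.T @ W_q @ V                          # undo the rotations
```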
+
+
+
+
+ + ♻ ☆ Has Your Pretrained Model Improved? A Multi-head Posterior Based + Approach + + +
+ The emergence of pretrained models has significantly impacted fields ranging
+from Natural Language Processing (NLP) and Computer Vision to relational datasets.
+Traditionally, these models are assessed through fine-tuned downstream tasks.
+However, this raises the question of how to evaluate these models more
+efficiently and more effectively. In this study, we explore a novel approach
+where we leverage the meta features associated with each entity as a source of
+worldly knowledge and employ entity representations from the models. We propose
+using the consistency between these representations and the meta features as a
+metric for evaluating pretrained models. Our method's effectiveness is
+demonstrated across various domains, including models with relational datasets,
+large language models and image models.
+
+
+
+
+
+ + ♻ ☆ Do LLMs exhibit human-like response biases? A case study in survey + design + + +
+ As large language models (LLMs) become more capable, there is growing +excitement about the possibility of using LLMs as proxies for humans in +real-world tasks where subjective labels are desired, such as in surveys and +opinion polling. One widely-cited barrier to the adoption of LLMs is their +sensitivity to prompt wording - but interestingly, humans also display +sensitivities to instruction changes in the form of response biases. As such, +we argue that if LLMs are going to be used to approximate human opinions, it is +necessary to investigate the extent to which LLMs also reflect human response +biases, if at all. In this work, we use survey design as a case study, where +human response biases caused by permutations in wordings of "prompts" have been +extensively studied. Drawing from prior work in social psychology, we design a +dataset and propose a framework to evaluate whether LLMs exhibit human-like +response biases in survey questionnaires. Our comprehensive evaluation of nine +models shows that popular open and commercial LLMs generally fail to reflect +human-like behavior. These inconsistencies tend to be more prominent in models +that have been instruction fine-tuned. Furthermore, even if a model shows a +significant change in the same direction as humans, we find that perturbations +that are not meant to elicit significant changes in humans may also result in a +similar change. These results highlight the potential pitfalls of using LLMs to +substitute humans in parts of the annotation pipeline, and further underscore +the importance of finer-grained characterizations of model behavior. Our code, +dataset, and collected samples are available at +https://github.com/lindiatjuatja/BiasMonkey + +
+
+
+
+
+ + ♻ ☆ Lightning Attention-2: A Free Lunch for Handling Unlimited Sequence + Lengths in Large Language Models + + +
+ Linear attention is an efficient attention mechanism that has recently +emerged as a promising alternative to conventional softmax attention. With its +ability to process tokens in linear computational complexities, linear +attention, in theory, can handle sequences of unlimited length without +sacrificing speed, i.e., maintaining a constant training speed for various +sequence lengths with a fixed memory consumption. However, due to the issue +with cumulative summation (cumsum), current linear attention algorithms cannot +demonstrate their theoretical advantage in a causal setting. In this paper, we +present Lightning Attention-2, the first linear attention implementation that +enables linear attention to realize its theoretical computational benefits. To +achieve this, we leverage the thought of tiling, separately handling the +intra-block and inter-block components in linear attention calculation. +Specifically, we utilize the conventional attention computation mechanism for +the intra-blocks and apply linear attention kernel tricks for the inter-blocks. +A tiling technique is adopted through both forward and backward procedures to +take full advantage of the GPU hardware. We implement our algorithm in Triton +to make it IO-aware and hardware-friendly. Various experiments are conducted on +different model sizes and sequence lengths. Lightning Attention-2 retains +consistent training and inference speed regardless of input sequence length and +is significantly faster than other attention mechanisms. The source code is +available at https://github.com/OpenNLPLab/lightning-attention. + +
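The intra-/inter-block split can be pictured with an un-normalized causal linear attention sketch in plain PyTorch; the actual Lightning Attention-2 kernel is written in Triton and handles normalization, decay, and IO-awareness, all of which this toy version omits.

```python
import torch

def tiled_causal_linear_attention(Q, K, V, block=64):
    """Causal linear attention computed tile by tile (un-normalized sketch).

    Inter-block contributions come from a running K^T V state; intra-block
    contributions use an explicit lower-triangular mask, mirroring the
    intra/inter split described in the abstract.
    Q, K: (seq, d); V: (seq, d_v).
    """
    seq, d = Q.shape
    kv_state = torch.zeros(d, V.shape[1])
    out = torch.zeros_like(V)
    for s in range(0, seq, block):
        q, k, v = Q[s:s + block], K[s:s + block], V[s:s + block]
        inter = q @ kv_state                      # attention to all earlier blocks
        intra = torch.tril(q @ k.T) @ v           # causal attention inside the block
        out[s:s + block] = inter + intra
        kv_state = kv_state + k.T @ v             # fold this block into the running state
    return out
```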
+
+ comment: Technical Report. Yiran Zhong is the corresponding author. The source + code is available at https://github.com/OpenNLPLab/lightning-attention +
+
+
+
+
+ + ♻ ☆ An Autoregressive Text-to-Graph Framework for Joint Entity and Relation + Extraction AAAI 2024 + + +
+ In this paper, we propose a novel method for joint entity and relation +extraction from unstructured text by framing it as a conditional sequence +generation problem. In contrast to conventional generative information +extraction models that are left-to-right token-level generators, our approach +is \textit{span-based}. It generates a linearized graph where nodes represent +text spans and edges represent relation triplets. Our method employs a +transformer encoder-decoder architecture with pointing mechanism on a dynamic +vocabulary of spans and relation types. Our model can capture the structural +characteristics and boundaries of entities and relations through span +representations while simultaneously grounding the generated output in the +original text thanks to the pointing mechanism. Evaluation on benchmark +datasets validates the effectiveness of our approach, demonstrating competitive +results. Code is available at https://github.com/urchade/ATG. + +
+
+ comment: AAAI 2024 (camera ready version) +
+
+
+
+
+ + ♻ ☆ Advancing Italian Biomedical Information Extraction with + Transformers-based Models: Methodological Insights and Multicenter Practical + Application + + +
+ The introduction of computerized medical records in hospitals has reduced +burdensome activities like manual writing and information fetching. However, +the data contained in medical records are still far underutilized, primarily +because extracting data from unstructured textual medical records takes time +and effort. Information Extraction, a subfield of Natural Language Processing, +can help clinical practitioners overcome this limitation by using automated +text-mining pipelines. In this work, we created the first Italian +neuropsychiatric Named Entity Recognition dataset, PsyNIT, and used it to +develop a Transformers-based model. Moreover, we collected and leveraged three +external independent datasets to implement an effective multicenter model, with +overall F1-score 84.77%, Precision 83.16%, Recall 86.44%. The lessons learned +are: (i) the crucial role of a consistent annotation process and (ii) a +fine-tuning strategy that combines classical methods with a "low-resource" +approach. This allowed us to establish methodological guidelines that pave the +way for Natural Language Processing studies in less-resourced languages. + +
+
+ comment: 2 figures, 6 tables, Supplementary Notes included +
+
+
+
+
+ + ♻ ☆ ChatGPT's One-year Anniversary: Are Open-Source Large Language Models + Catching up? + + +
+ Upon its release in late 2022, ChatGPT has brought a seismic shift in the
+entire landscape of AI, both in research and commerce. Through
+instruction-tuning a large language model (LLM) with supervised fine-tuning and
+reinforcement learning from human feedback, it showed that a model could answer
+human questions and follow instructions on a broad panel of tasks. Following
+this success, interest in LLMs has intensified, with new LLMs flourishing at
+frequent intervals across academia and industry, including many start-ups
+focused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's
+Claude) generally outperform their open-source counterparts, the progress on
+the latter has been rapid with claims of achieving parity or even better
+performance on certain tasks. This has crucial implications not only for research
+but also for business. In this work, on the first anniversary of ChatGPT, we provide an
+exhaustive overview of this success, surveying all tasks where an open-source
+LLM has claimed to be on par or better than ChatGPT.
+
+
+ comment: version v4, included latest top-performing open-sourced LLMs +
+
+
+
+
+ + ♻ ☆ AMPLIFY:Attention-based Mixup for Performance Improvement and Label + Smoothing in Transformer + + +
+ Mixup is an effective data augmentation method that generates new augmented
+samples by aggregating linear combinations of different original samples.
+However, if there are noises or aberrant features in the original samples,
+Mixup may propagate them to the augmented samples, leading to over-sensitivity
+of the model to these outliers. To solve this problem, this paper proposes a
+new Mixup method called AMPLIFY. This method uses the Attention mechanism of
+Transformer itself to reduce the influence of noises and aberrant values in the
+original samples on the prediction results, without increasing additional
+trainable parameters, and the computational cost is very low, thereby avoiding
+the problem of high resource consumption in common Mixup methods such as
+Sentence Mixup. The experimental results show that, under a smaller
+computational resource cost, AMPLIFY outperforms other Mixup methods in text
+classification tasks on 7 benchmark datasets, providing new ideas and new ways
+to further improve the performance of pre-trained models based on the Attention
+mechanism, such as BERT, ALBERT, RoBERTa, and GPT. Our code can be obtained at
+https://github.com/kiwi-lilo/AMPLIFY.
+
+
+
+
+
+ + ♻ ☆ Improving Low-resource Prompt-based Relation Representation with + Multi-view Decoupling Learning AAAI 2024 + + +
+ Recently, prompt-tuning with pre-trained language models (PLMs) has
+demonstrated a significant ability to enhance relation extraction (RE)
+tasks. However, in low-resource scenarios, where the available training data is
+scarce, previous prompt-based methods may still perform poorly for prompt-based
+representation learning due to a superficial understanding of the relation. To
+this end, we highlight the importance of learning high-quality relation
+representation in low-resource scenarios for RE, and propose a novel
+prompt-based relation representation method, named MVRE
+(\underline{M}ulti-\underline{V}iew \underline{R}elation
+\underline{E}xtraction), to better leverage the capacity of PLMs to improve the
+performance of RE within the low-resource prompt-tuning paradigm. Specifically,
+MVRE decouples each relation into different perspectives to encompass
+multi-view relation representations for maximizing the likelihood during
+relation inference. Furthermore, we also design a Global-Local loss and a
+Dynamic-Initialization method for better alignment of the multi-view
+relation-representing virtual words, containing the semantics of relation
+labels during the optimization learning process and initialization. Extensive
+experiments on three benchmark datasets show that our method can achieve
+state-of-the-art performance in low-resource settings.
+
+
+ comment: Accepted to AAAI 2024 +
+
+
+
+
+ + ♻ ☆ LEGO:Language Enhanced Multi-modal Grounding Model + + +
+ Multi-modal large language models have demonstrated impressive performance
+across various tasks in different modalities. However, existing multi-modal
+models primarily emphasize capturing global information within each modality
+while neglecting the importance of perceiving local information across
+modalities. Consequently, these models lack the ability to effectively
+understand the fine-grained details of input data, limiting their performance
+in tasks that require a more nuanced understanding. To address this limitation,
+there is a compelling need to develop models that enable fine-grained
+understanding across multiple modalities, thereby enhancing their applicability
+to a wide range of tasks. In this paper, we propose LEGO, a language enhanced
+multi-modal grounding model. Beyond capturing global information like other
+multi-modal models, our proposed model excels at tasks demanding a detailed
+understanding of local information within the input. It demonstrates precise
+identification and localization of specific regions in images or moments in
+videos. To achieve this objective, we design a diversified dataset construction
+pipeline, resulting in a multi-modal, multi-granularity dataset for model
+training. The code, dataset, and demo of our model can be found at
+https://github.com/lzw-lzw/LEGO.
+
+
+
+
+
+ + ♻ ☆ LLaVA-Phi: Efficient Multi-Modal Assistant with Small Language Model + + +
+ In this paper, we introduce LLaVA-$\phi$ (LLaVA-Phi), an efficient
+multi-modal assistant that harnesses the power of the recently advanced small
+language model, Phi-2, to facilitate multi-modal dialogues. LLaVA-Phi marks a
+notable advancement in the realm of compact multi-modal models. It demonstrates
+that even smaller language models, with as few as 2.7B parameters, can
+effectively engage in intricate dialogues that integrate both textual and
+visual elements, provided they are trained with high-quality corpora. Our model
+delivers commendable performance on publicly available benchmarks that
+encompass visual comprehension, reasoning, and knowledge-based perception.
+Beyond its remarkable performance in multi-modal dialogue tasks, our model
+opens new avenues for applications in time-sensitive environments and systems
+that require real-time interaction, such as embodied agents. It highlights the
+potential of smaller language models to achieve sophisticated levels of
+understanding and interaction, while maintaining greater resource
+efficiency. The project is available at https://github.com/zhuyiche/llava-phi.
+
+
+ comment: technique report +
+
+
+
+
+ + ♻ ☆ Dynamic Fault Characteristics Evaluation in Power Grid + + +
+ To enhance the level of intelligence in operation and maintenance, a novel
+method for fault detection in power grids is proposed. The proposed GNN-based
+approach first identifies fault nodes through a specialized feature extraction
+method coupled with a knowledge graph. By incorporating temporal data, the
+method leverages the status of nodes from preceding and subsequent time periods
+to help current fault detection. To validate the effectiveness of the node
+features, a correlation analysis of the output features from each node was
+conducted. The results from experiments show that this method can locate
+fault nodes in simulation scenarios with remarkable accuracy.
+Additionally, the graph neural network based feature modeling allows for a
+qualitative examination of how faults spread across nodes, which provides
+valuable insights for analyzing fault nodes.
+
+
+
+
+
+ + ♻ ☆ Can Text-based Knowledge Graph Completion Benefit From Zero-Shot Large + Language Models? + + +
+ Text-based knowledge graph completion (KGC) methods, which leverage textual
+entity descriptions, are at the research forefront. The efficacy of these models
+hinges on the quality of the textual data. This study explores whether enriched
+or more efficient textual descriptions can amplify model performance. Recently,
+Large Language Models (LLMs) have shown remarkable improvements in NLP tasks,
+attributed to their sophisticated text generation and conversational
+capabilities. LLMs assimilate linguistic patterns and integrate knowledge from
+their training data. Compared to traditional databases like Wikipedia, LLMs
+provide several advantages, facilitating broader information querying and
+content augmentation. We hypothesize that LLMs, without fine-tuning, can refine
+entity descriptions, serving as an auxiliary knowledge source. An in-depth
+analysis was conducted to verify this hypothesis. We found that (1) without
+fine-tuning, LLMs have the capability to further improve the quality of entity
+text descriptions. We validated this through experiments on the FB15K-237 and
+WN18RR datasets. (2) LLMs exhibit text generation hallucination issues and
+selectively output words with multiple meanings. This was mitigated by
+contextualizing prompts to constrain LLM outputs. (3) Larger model sizes do not
+necessarily guarantee better performance; even the 7B model can achieve
+optimized results in this comparative task. These findings underscore the
+untapped potential of large models in text-based KGC, which is a promising
+direction for further research in KGC. The code and datasets are accessible at
+https://github.com/sjlmg/CP-KGC.
+
+
+ comment: new version
+
+
+
+
+
+ + ♻ ☆ T-Eval: Evaluating the Tool Utilization Capability of Large Language + Models Step by Step + + +
+ Large language models (LLM) have achieved remarkable performance on various +NLP tasks and are augmented by tools for broader applications. Yet, how to +evaluate and analyze the tool-utilization capability of LLMs is still +under-explored. In contrast to previous works that evaluate models +holistically, we comprehensively decompose the tool utilization into multiple +sub-processes, including instruction following, planning, reasoning, retrieval, +understanding, and review. Based on that, we further introduce T-Eval to +evaluate the tool utilization capability step by step. T-Eval disentangles the +tool utilization evaluation into several sub-domains along model capabilities, +facilitating the inner understanding of both holistic and isolated competency +of LLMs. We conduct extensive experiments on T-Eval and in-depth analysis of +various LLMs. T-Eval not only exhibits consistency with the outcome-oriented +evaluation but also provides a more fine-grained analysis of the capabilities +of LLMs, providing a new perspective in LLM evaluation on tool-utilization +ability. The benchmark will be available at +https://github.com/open-compass/T-Eval. + +
+
+ comment: Project: https://open-compass.github.io/T-Eval +
+
+
+
+
+ + ♻ ☆ MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and + Uni-Modal Adapter EMNLP + + +
+ Language Models (LMs) have demonstrated impressive molecule understanding +ability on various 1D text-related tasks. However, they inherently lack 2D +graph perception - a critical ability of human professionals in comprehending +molecules' topological structures. To bridge this gap, we propose MolCA: +Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal +Adapter. MolCA enables an LM (e.g., Galactica) to understand both text- and +graph-based molecular contents via the cross-modal projector. Specifically, the +cross-modal projector is implemented as a Q-Former to connect a graph encoder's +representation space and an LM's text space. Further, MolCA employs a uni-modal +adapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks. +Unlike previous studies that couple an LM with a graph encoder via cross-modal +contrastive learning, MolCA retains the LM's ability of open-ended text +generation and augments it with 2D graph information. To showcase its +effectiveness, we extensively benchmark MolCA on tasks of molecule captioning, +IUPAC name prediction, and molecule-text retrieval, on which MolCA +significantly outperforms the baselines. Our codes and checkpoints can be found +at https://github.com/acharkq/MolCA. + +
+
+ comment: EMNLP main conference. 9 pages +
+
+
+
+
+ + ♻ ☆ Dynamic Fault Analysis in Substations Based on Knowledge Graphs + + +
+ To address the challenge of identifying hidden dangers in substations from
+unstructured text, a novel dynamic analysis method is proposed. We first
+extract relevant information from the unstructured text, and then leverage a
+flexible distributed search engine built on Elastic-Search to handle the data.
+Following this, the hidden Markov model is employed to train the data within
+the engine. The Viterbi algorithm is integrated to decipher the hidden state
+sequences, facilitating the segmentation and labeling of entities related to
+hidden dangers. The final step involves using the Neo4j graph database to
+dynamically create a knowledge graph that visualizes hidden dangers in the
+substation. The effectiveness of the proposed method is demonstrated through a
+case analysis from a specific substation with hidden dangers revealed in the
+text records.
+
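For readers unfamiliar with the decoding step, a minimal NumPy Viterbi routine is sketched below; the probability tables are placeholders, since in the described system they would be learned from the substation text records.

```python
import numpy as np

def viterbi(obs, start_p, trans_p, emit_p):
    """Most likely hidden-state (label) sequence for an observation sequence.

    obs: list of observation indices; start_p: (S,), trans_p: (S, S),
    emit_p: (S, V) probability tables (placeholders, learned from text in practice).
    """
    S, T = len(start_p), len(obs)
    logv = np.full((T, S), -np.inf)               # best log-probability per state
    back = np.zeros((T, S), dtype=int)            # backpointers for recovery
    logv[0] = np.log(start_p) + np.log(emit_p[:, obs[0]])
    for t in range(1, T):
        for s in range(S):
            scores = logv[t - 1] + np.log(trans_p[:, s])
            back[t, s] = scores.argmax()
            logv[t, s] = scores.max() + np.log(emit_p[s, obs[t]])
    path = [int(logv[-1].argmax())]
    for t in range(T - 1, 0, -1):                 # trace the best path backwards
        path.append(int(back[t, path[-1]]))
    return path[::-1]
```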
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 88 + +
+
+
+ + ☆ Convolutional Neural Network Compression via Dynamic Parameter Rank + Pruning + + +
+ While Convolutional Neural Networks (CNNs) excel at learning complex +latent-space representations, their over-parameterization can lead to +overfitting and reduced performance, particularly with limited data. This, +alongside their high computational and memory demands, limits the applicability +of CNNs for edge deployment. Low-rank matrix approximation has emerged as a +promising approach to reduce CNN parameters, but its application presents +challenges including rank selection and performance loss. To address these +issues, we propose an efficient training method for CNN compression via dynamic +parameter rank pruning. Our approach integrates efficient matrix factorization +and novel regularization techniques, forming a robust framework for dynamic +rank reduction and model compression. We use Singular Value Decomposition (SVD) +to model low-rank convolutional filters and dense weight matrices and we +achieve model compression by training the SVD factors with back-propagation in +an end-to-end way. We evaluate our method on an array of modern CNNs, including +ResNet-18, ResNet-20, and ResNet-32, and datasets like CIFAR-10, CIFAR-100, and +ImageNet (2012), showcasing its applicability in computer vision. Our +experiments show that the proposed method can yield substantial storage savings +while maintaining or even enhancing classification performance. + +
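The SVD re-parameterization can be illustrated on a dense layer as follows; convolutional filters would first be reshaped to a 2D matrix, and the fixed rank here is a placeholder rather than the dynamically pruned rank described in the abstract.

```python
import torch
import torch.nn as nn

class LowRankLinear(nn.Module):
    """Linear layer re-parameterized by truncated SVD factors that remain trainable."""
    def __init__(self, linear: nn.Linear, rank: int):
        super().__init__()
        U, S, Vh = torch.linalg.svd(linear.weight.data, full_matrices=False)
        # Split the singular values between the two factors so both are well scaled.
        self.U = nn.Parameter(U[:, :rank] * S[:rank].sqrt())                  # (out, rank)
        self.V = nn.Parameter(S[:rank].sqrt().unsqueeze(1) * Vh[:rank])       # (rank, in)
        self.bias = nn.Parameter(linear.bias.data.clone()) if linear.bias is not None else None

    def forward(self, x):
        # The factors are trained with back-propagation end to end.
        return nn.functional.linear(x, self.U @ self.V, self.bias)

# A 512x512 layer stored as two rank-64 factors: 512*64*2 parameters instead of 512*512.
compressed = LowRankLinear(nn.Linear(512, 512), rank=64)
```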
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Jewelry Recognition via Encoder-Decoder Models + + +
+ Jewelry recognition is a complex task due to the different styles and designs
+of accessories. Precise descriptions of the various accessories are something
+that today can only be achieved by experts in the field of jewelry. In this
+work, we propose an approach for jewelry recognition using computer vision
+techniques and image captioning, trying to simulate this expert human behavior
+of analyzing accessories. The proposed methodology consists of using different
+image captioning models to detect the jewels from an image and generate a
+natural language description of the accessory. Then, this description is also
+utilized to classify the accessories at different levels of detail. The
+generated caption includes details such as the type of jewel, color, material,
+and design. To demonstrate the effectiveness of the proposed method in
+accurately recognizing different types of jewels, a dataset consisting of
+images of accessories belonging to jewelry stores in Córdoba (Spain) has been
+created. After testing the different image captioning architectures designed,
+the final model achieves a captioning accuracy of 95%. The proposed
+methodology has the potential to be used in various applications such as
+jewelry e-commerce, inventory management or automatic jewel recognition to
+analyze people's tastes and social status.
+
+
+ comment: 6 pages, 5 figures, MetroXRAINE 2023 Conference +
+
+
+
+
+ + ☆ How does self-supervised pretraining improve robustness against noisy + labels across various medical image classification datasets? + + +
+ Noisy labels can significantly impact medical image classification, +particularly in deep learning, by corrupting learned features. Self-supervised +pretraining, which doesn't rely on labeled data, can enhance robustness against +noisy labels. However, this robustness varies based on factors like the number +of classes, dataset complexity, and training size. In medical images, subtle +inter-class differences and modality-specific characteristics add complexity. +Previous research hasn't comprehensively explored the interplay between +self-supervised learning and robustness against noisy labels in medical image +classification, considering all these factors. In this study, we address three +key questions: i) How does label noise impact various medical image +classification datasets? ii) Which types of medical image datasets are more +challenging to learn and more affected by label noise? iii) How do different +self-supervised pretraining methods enhance robustness across various medical +image datasets? Our results show that DermNet, among five datasets (Fetal +plane, DermNet, COVID-DU-Ex, MURA, NCT-CRC-HE-100K), is the most challenging +but exhibits greater robustness against noisy labels. Additionally, contrastive +learning stands out among the eight self-supervised methods as the most +effective approach to enhance robustness against noisy labels. + +
+
+
+
+
+ + ☆ Cesium Tiles for High-realism Simulation and Comparing SLAM Results in + Corresponding Virtual and Real-world Environments + + +
+ This article discusses the use of a simulated environment to predict +algorithm results in the real world. Simulators are crucial in allowing +researchers to test algorithms, sensor integration, and navigation systems +without deploying expensive hardware. This article examines how the AirSim +simulator, Unreal Engine, and Cesium plugin can be used to generate simulated +digital twin models of real-world locations. Several technical challenges in +completing the analysis are discussed and the technical solutions are detailed +in this article. Work investigates how to assess mapping results for a +real-life experiment using Cesium Tiles provided by digital twins of the +experimental location. This is accompanied by a description of a process for +duplicating real-world flights in simulation. The performance of these methods +is evaluated by analyzing real-life and experimental image telemetry with the +Direct Sparse Odometry (DSO) mapping algorithm. Results indicate that Cesium +Tiles environments can provide highly accurate models of ground truth geometry +after careful alignment. Further, results from real-life and simulated +telemetry analysis indicate that the virtual simulation results accurately +predict real-life results. Findings indicate that the algorithm results in real +life and in the simulated duplicate exhibited a high degree of similarity. This +indicates that the use of Cesium Tiles environments as a virtual digital twin +for real-life experiments will provide representative results for such +algorithms. The impact of this can be significant, potentially allowing +expansive virtual testing of robotic systems at specific deployment locations +to develop solutions that are tailored to the environment and potentially +outperforming solutions meant to work in completely generic environments. + +
+
+
+
+
+ + ☆ GD-CAF: Graph Dual-stream Convolutional Attention Fusion for + Precipitation Nowcasting + + +
+ Accurate precipitation nowcasting is essential for various purposes, +including flood prediction, disaster management, optimizing agricultural +activities, managing transportation routes and renewable energy. While several +studies have addressed this challenging task from a sequence-to-sequence +perspective, most of them have focused on a single area without considering the +existing correlation between multiple disjoint regions. In this paper, we +formulate precipitation nowcasting as a spatiotemporal graph sequence +nowcasting problem. In particular, we introduce Graph Dual-stream Convolutional +Attention Fusion (GD-CAF), a novel approach designed to learn from historical +spatiotemporal graph of precipitation maps and nowcast future time step ahead +precipitation at different spatial locations. GD-CAF consists of +spatio-temporal convolutional attention as well as gated fusion modules which +are equipped with depthwise-separable convolutional operations. This +enhancement enables the model to directly process the high-dimensional +spatiotemporal graph of precipitation maps and exploits higher-order +correlations between the data dimensions. We evaluate our model on seven years +of precipitation maps across Europe and its neighboring areas collected from +the ERA5 dataset, provided by Copernicus. The model receives a fully connected +graph in which each node represents historical observations from a specific +region on the map. Consequently, each node contains a 3D tensor with time, +height, and width dimensions. Experimental results demonstrate that the +proposed GD-CAF model outperforms the other examined models. Furthermore, the +averaged seasonal spatial and temporal attention scores over the test set are +visualized to provide additional insights about the strongest connections +between different regions or time steps. These visualizations shed light on the +decision-making process of our model. + +
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ☆ Machine Perceptual Quality: Evaluating the Impact of Severe Lossy + Compression on Audio and Image Models + + +
+ In the field of neural data compression, the prevailing focus has been on +optimizing algorithms for either classical distortion metrics, such as PSNR or +SSIM, or human perceptual quality. With increasing amounts of data consumed by +machines rather than humans, a new paradigm of machine-oriented +compression$\unicode{x2013}$which prioritizes the retention of features salient +for machine perception over traditional human-centric +criteria$\unicode{x2013}$has emerged, creating several new challenges to the +development, evaluation, and deployment of systems utilizing lossy compression. +In particular, it is unclear how different approaches to lossy compression will +affect the performance of downstream machine perception tasks. To address this +under-explored area, we evaluate various perception +models$\unicode{x2013}$including image classification, image segmentation, +speech recognition, and music source separation$\unicode{x2013}$under severe +lossy compression. We utilize several popular codecs spanning conventional, +neural, and generative compression architectures. Our results indicate three +key findings: (1) using generative compression, it is feasible to leverage +highly compressed data while incurring a negligible impact on machine +perceptual quality; (2) machine perceptual quality correlates strongly with +deep similarity metrics, indicating a crucial role of these metrics in the +development of machine-oriented codecs; and (3) using lossy compressed +datasets, (e.g. ImageNet) for pre-training can lead to counter-intuitive +scenarios where lossy compression increases machine perceptual quality rather +than degrading it. To encourage engagement on this growing area of research, +our code and experiments are available at: +https://github.com/danjacobellis/MPQ. + +
+
+ comment: 10 pages; abridged version published in IEEE Data Compression + Conference 2024 +
+
+
+
+
+ + ☆ Image Similarity using An Ensemble of Context-Sensitive Models + + +
+ Image similarity has been extensively studied in computer vision. In recent
+years, machine-learned models have shown their ability to encode more semantics
+than traditional multivariate metrics. However, in labelling similarity,
+assigning a numerical score to a pair of images is less intuitive than
+determining if an image A is closer to a reference image R than another image
+B. In this work, we present a novel approach for building an image similarity
+model based on labelled data in the form of A:R vs B:R. We address the
+challenges of sparse sampling in the image space (R, A, B) and biases in the
+models trained with context-based data by using an ensemble model. In
+particular, we employed two ML techniques to construct such an ensemble model,
+namely dimensionality reduction and MLP regressors. Our testing results show
+that the ensemble model constructed performs ~5% better than the best
+individual context-sensitive models. It also performed better than the model
+trained with mixed imagery data as well as existing similarity models, e.g.,
+CLIP and DINO. This work demonstrates that context-based labelling and model
+training can be effective when an appropriate ensemble approach is used to
+alleviate the limitation due to sparse sampling.
+
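A rough scikit-learn sketch of the ensemble idea follows; the triplet features and targets are synthetic placeholders, and the actual feature construction from the (R, A, B) embeddings may differ from this assumed layout.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor

# Placeholder triplet features: for each (R, A, B) triplet we assume a vector built
# from embedding differences, e.g. concat(|A - R|, |B - R|), and a preference target
# in [0, 1] (1 means A is closer to R than B). Real data would replace these.
rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 512))
y = rng.uniform(size=1000)

# Ensemble of dimensionality-reduced MLP regressors with different configurations.
ensemble = []
for seed, n_components in [(0, 32), (1, 64), (2, 128)]:
    pca = PCA(n_components=n_components).fit(X)
    mlp = MLPRegressor(hidden_layer_sizes=(64,), random_state=seed, max_iter=500)
    mlp.fit(pca.transform(X), y)
    ensemble.append((pca, mlp))

def predict(x):
    """Average the members' scores; x must be a 2D array of triplet features."""
    return np.mean([mlp.predict(pca.transform(x)) for pca, mlp in ensemble], axis=0)
```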
+
+
+
+
+ + ☆ Transformer-based Video Saliency Prediction with High Temporal Dimension + Decoding + + +
+ In recent years, finding an effective and efficient strategy for exploiting +spatial and temporal information has been a hot research topic in video +saliency prediction (VSP). With the emergence of spatio-temporal transformers, +the weakness of the prior strategies, e.g., 3D convolutional networks and +LSTM-based networks, for capturing long-range dependencies has been effectively +compensated. While VSP has drawn benefits from spatio-temporal transformers, +finding the most effective way for aggregating temporal features is still +challenging. To address this concern, we propose a transformer-based video +saliency prediction approach with high temporal dimension decoding network +(THTD-Net). This strategy accounts for the lack of complex hierarchical +interactions between features that are extracted from the transformer-based +spatio-temporal encoder: in particular, it does not require multiple decoders +and aims at gradually reducing temporal features' dimensions in the decoder. +This decoder-based architecture yields comparable performance to multi-branch +and over-complicated models on common benchmarks such as DHF1K, UCF-sports and +Hollywood-2. + +
+
+ comment: 8 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Vertical Federated Image Segmentation + + +
+ With the popularization of AI solutions for image-based problems, there has
+been a growing concern for both data privacy and acquisition. In a large number
+of cases, information is located in separate data silos and it can be difficult
+for a developer to consolidate all of it in a fashion that is appropriate for
+machine learning model development. Alongside this, a portion of these localized
+data regions may not have access to a labelled ground truth. This means they can
+reach conclusions numerically but are unable to assign classifications in the
+absence of the pertinent label information. Such numerical-only conclusions are
+often of negligible value, especially when developing image-based solutions,
+which typically require classification capability. With this being the case, we
+propose an innovative vertical federated learning (VFL) model architecture that
+can operate under this common set of conditions. This is the first (and
+currently the only) implementation of a system that can work under the
+constraints of a VFL environment and perform image segmentation while
+maintaining nominal accuracies. We achieved this by utilizing an FCN that can
+operate on federates lacking labelled data and privately share the respective
+weights with a central server, which hosts the necessary features for
+classification. Tests were conducted on the CamVid dataset in order to determine
+the impact of the heavy feature compression required for the transfer of
+information between federates, as well as to reach nominal conclusions about the
+overall performance metrics when working under such constraints.
+
+
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Machine Learning Based Object Tracking + + +
+ In this paper, machine learning based object detection and tracking of the
+detected object are performed. The authors set a region of interest (ROI) around
+an object using Open Computer Vision, better known as OpenCV. Next, a tracking
+algorithm is used to maintain a lock on the object while simultaneously
+operating two servo motors to keep the object centered in the frame. The
+detailed procedure and code are included in this paper.
+
+
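A rough, hypothetical sketch of the workflow described above (not the paper's code): select an ROI with OpenCV, track it frame by frame, and turn the tracked box's offset from the frame centre into pan/tilt corrections. The servo commands are left as a print placeholder because they are hardware specific, and the CSRT tracker assumes an opencv-contrib build; the webcam index is an assumption.

```python
# Minimal OpenCV ROI selection + tracking loop with a centring correction.
import cv2

cap = cv2.VideoCapture(0)                 # assumed webcam source
ok, frame = cap.read()
roi = cv2.selectROI("select object", frame, showCrosshair=True)

tracker = cv2.TrackerCSRT_create()        # requires opencv-contrib-python
tracker.init(frame, roi)

while True:
    ok, frame = cap.read()
    if not ok:
        break
    ok, (x, y, w, h) = tracker.update(frame)
    if ok:
        # Offset of the tracked box centre from the frame centre.
        dx = (x + w / 2) - frame.shape[1] / 2
        dy = (y + h / 2) - frame.shape[0] / 2
        # A real system would convert dx, dy into pan/tilt servo commands here.
        print(f"pan correction: {dx:+.0f}px, tilt correction: {dy:+.0f}px")
        cv2.rectangle(frame, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)
    cv2.imshow("tracking", frame)
    if cv2.waitKey(1) & 0xFF == 27:        # Esc to quit
        break

cap.release()
cv2.destroyAllWindows()
```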
+
+
+
+
+ + ☆ Information hiding cameras: optical concealment of object information + into ordinary images + + +
+ Data protection methods like cryptography, despite being effective, +inadvertently signal the presence of secret communication, thereby drawing +undue attention. Here, we introduce an optical information hiding camera +integrated with an electronic decoder, optimized jointly through deep learning. +This information hiding-decoding system employs a diffractive optical processor +as its front-end, which transforms and hides input images in the form of +ordinary-looking patterns that deceive/mislead human observers. This +information hiding transformation is valid for infinitely many combinations of +secret messages, all of which are transformed into ordinary-looking output +patterns, achieved all-optically through passive light-matter interactions +within the optical processor. By processing these ordinary-looking output +images, a jointly-trained electronic decoder neural network accurately +reconstructs the original information hidden within the deceptive output +pattern. We numerically demonstrated our approach by designing an information +hiding diffractive camera along with a jointly-optimized convolutional decoder +neural network. The efficacy of this system was demonstrated under various +lighting conditions and noise levels, showing its robustness. We further +extended this information hiding camera to multi-spectral operation, allowing +the concealment and decoding of multiple images at different wavelengths, all +performed simultaneously in a single feed-forward operation. The feasibility of +our framework was also demonstrated experimentally using THz radiation. This +optical encoder-electronic decoder-based co-design provides a novel information +hiding camera interface that is both high-speed and energy-efficient, offering +an intriguing solution for visual information security. + +
+
+ comment: 26 Pages, 8 Figures +
+
+
+
+
+ + ☆ $M^{2}$Fusion: Bayesian-based Multimodal Multi-level Fusion on + Colorectal Cancer Microsatellite Instability Prediction + + +
+ Colorectal cancer (CRC) micro-satellite instability (MSI) prediction on
+histopathology images is a challenging weakly supervised learning task that
+involves multi-instance learning on gigapixel images. Radiology images have been
+shown to carry CRC MSI information and offer an efficient patient imaging
+modality. Integrating different data modalities offers the opportunity to
+increase the accuracy and robustness of MSI prediction. Despite progress in
+representation learning from whole slide images (WSI) and in exploiting
+radiology data, fusing information from multiple data modalities (e.g.,
+pathology WSI and radiology CT images) for CRC MSI prediction remains a
+challenge. In this paper, we propose $M^{2}$Fusion: a Bayesian-based multimodal
+multi-level fusion pipeline for CRC MSI. The proposed fusion model $M^{2}$Fusion
+discovers patterns within and across modalities that are more beneficial for
+predicting MSI than those obtained from a single modality alone or from other
+fusion methods. The contribution of the paper is three-fold: (1) $M^{2}$Fusion
+is the first pipeline of multi-level fusion on pathology WSI and 3D radiology CT
+images for MSI prediction; (2) CT images are integrated into multimodal fusion
+for CRC MSI prediction for the first time; (3) the feature-level fusion strategy
+is evaluated with both Transformer-based and CNN-based methods. Our approach is
+validated with cross-validation on 352 cases and outperforms both the
+feature-level (0.8177 vs. 0.7908) and decision-level (0.8177 vs. 0.7289) fusion
+strategies in AUC score.
+
+
+
+
+
+
+ + ☆ VeCAF: VLM-empowered Collaborative Active Finetuning with Training + Objective Awareness + + +
+ Finetuning a pretrained vision model (PVM) is a common technique for learning +downstream vision tasks. The conventional finetuning process with the randomly +sampled data points results in diminished training efficiency. To address this +drawback, we propose a novel approach, VLM-empowered Collaborative Active +Finetuning (VeCAF). VeCAF optimizes a parametric data selection model by +incorporating the training objective of the model being tuned. Effectively, +this guides the PVM towards the performance goal with improved data and +computational efficiency. As vision-language models (VLMs) have achieved +significant advancements by establishing a robust connection between image and +language domains, we exploit the inherent semantic richness of the text +embedding space and utilize text embedding of pretrained VLM models to augment +PVM image features for better data selection and finetuning. Furthermore, the +flexibility of text-domain augmentation gives VeCAF a unique ability to handle +out-of-distribution scenarios without external augmented data. Extensive +experiments show the leading performance and high efficiency of VeCAF that is +superior to baselines in both in-distribution and out-of-distribution image +classification tasks. On ImageNet, VeCAF needs up to 3.3x less training batches +to reach the target performance compared to full finetuning and achieves 2.8% +accuracy improvement over SOTA methods with the same number of batches. + +
+
+ comment: 12 pages +
+
+
+
+
+ + ☆ Phenotyping calcification in vascular tissues using artificial + intelligence + + +
+ Vascular calcification is implicated as an important factor in major adverse +cardiovascular events (MACE), including heart attack and stroke. A controversy +remains over how to integrate the diverse forms of vascular calcification into +clinical risk assessment tools. Even the commonly used calcium score for +coronary arteries, which assumes risk scales positively with total +calcification, has important inconsistencies. Fundamental studies are needed to +determine how risk is influenced by the diverse calcification phenotypes. +However, studies of these kinds are hindered by the lack of high-throughput, +objective, and non-destructive tools for classifying calcification in imaging +data sets. Here, we introduce a new classification system for phenotyping +calcification along with a semi-automated, non-destructive pipeline that can +distinguish these phenotypes in even atherosclerotic tissues. The pipeline +includes a deep-learning-based framework for segmenting lipid pools in noisy +micro-CT images and an unsupervised clustering framework for categorizing +calcification based on size, clustering, and topology. This approach is +illustrated for five vascular specimens, providing phenotyping for thousands of +calcification particles across as many as 3200 images in less than seven hours. +Average Dice Similarity Coefficients of 0.96 and 0.87 could be achieved for +tissue and lipid pool, respectively, with training and validation needed on +only 13 images despite the high heterogeneity in these tissues. By introducing +an efficient and comprehensive approach to phenotyping calcification, this work +enables large-scale studies to identify a more reliable indicator of the risk +of cardiovascular events, a leading cause of global mortality and morbidity. + +
+
+
+
+
+ + ☆ Uncovering the Full Potential of Visual Grounding Methods in VQA + + +
+ Visual Grounding (VG) methods in Visual Question Answering (VQA) attempt to +improve VQA performance by strengthening a model's reliance on +question-relevant visual information. The presence of such relevant information +in the visual input is typically assumed in training and testing. This +assumption, however, is inherently flawed when dealing with imperfect image +representations common in large-scale VQA, where the information carried by +visual features frequently deviates from expected ground-truth contents. As a +result, training and testing of VG-methods is performed with largely inaccurate +data, which obstructs proper assessment of their potential benefits. + In this work, we demonstrate that current evaluation schemes for VG-methods +are problematic due to the flawed assumption of availability of relevant visual +information. Our experiments show that the potential benefits of these methods +are severely underestimated as a result. + +
+
+
+
+
+ + ☆ Pedestrian Detection in Low-Light Conditions: A Comprehensive Survey + + +
+ Pedestrian detection remains a critical problem in various domains, such as +computer vision, surveillance, and autonomous driving. In particular, accurate +and instant detection of pedestrians in low-light conditions and reduced +visibility is of utmost importance for autonomous vehicles to prevent accidents +and save lives. This paper aims to comprehensively survey various pedestrian +detection approaches, baselines, and datasets that specifically target +low-light conditions. The survey discusses the challenges faced in detecting +pedestrians at night and explores state-of-the-art methodologies proposed in +recent years to address this issue. These methodologies encompass a diverse +range, including deep learning-based, feature-based, and hybrid approaches, +which have shown promising results in enhancing pedestrian detection +performance under challenging lighting conditions. Furthermore, the paper +highlights current research directions in the field and identifies potential +solutions that merit further investigation by researchers. By thoroughly +examining pedestrian detection techniques in low-light conditions, this survey +seeks to contribute to the advancement of safer and more reliable autonomous +driving systems and other applications related to pedestrian safety. +Accordingly, most of the current approaches in the field use deep +learning-based image fusion methodologies (i.e., early, halfway, and late +fusion) for accurate and reliable pedestrian detection. Moreover, the majority +of the works in the field (approximately 48%) have been evaluated on the KAIST +dataset, while the real-world video feeds recorded by authors have been used in +less than six percent of the works. + +
+
+ comment: 23 pages, 3 tables, 10 figures +
+
+
+
+
+ + ☆ Fusing Echocardiography Images and Medical Records for Continuous + Patient Stratification + + +
+ Deep learning now enables automatic and robust extraction of cardiac function +descriptors from echocardiographic sequences, such as ejection fraction or +strain. These descriptors provide fine-grained information that physicians +consider, in conjunction with more global variables from the clinical record, +to assess patients' condition. Drawing on novel transformer models applied to +tabular data (e.g., variables from electronic health records), we propose a +method that considers all descriptors extracted from medical records and +echocardiograms to learn the representation of a difficult-to-characterize +cardiovascular pathology, namely hypertension. Our method first projects each +variable into its own representation space using modality-specific approaches. +These standardized representations of multimodal data are then fed to a +transformer encoder, which learns to merge them into a comprehensive +representation of the patient through a pretext task of predicting a clinical +rating. This pretext task is formulated as an ordinal classification to enforce +a pathological continuum in the representation space. We observe the major +trends along this continuum for a cohort of 239 hypertensive patients to +describe, with unprecedented gradation, the effect of hypertension on a number +of cardiac function descriptors. Our analysis shows that i) pretrained weights +from a foundation model allow to reach good performance (83% accuracy) even +with limited data (less than 200 training samples), ii) trends across the +population are reproducible between trainings, and iii) for descriptors whose +interactions with hypertension are well documented, patterns are consistent +with prior physiological knowledge. + +
+
+ comment: 10 pages, submitted to IEEE TMI +
+
+
+
+
+ + ☆ Improving OCR Quality in 19th Century Historical Documents Using a + Combined Machine Learning Based Approach + + +
+ This paper addresses a major challenge to historical research on the 19th
+century. Large quantities of sources have become digitally available for the
+first time, while extraction techniques are lagging behind. Therefore, we
+researched machine learning (ML) models to recognise and extract complex data
+structures in a high-value historical primary source, the Schematismus. It
+records every single person in the Habsburg civil service above a certain
+hierarchical level between 1702 and 1918 and documents the genesis of the
+central administration over two centuries. Its complex and intricate structure
+as well as its enormous size have so far made any more comprehensive analysis of
+the administrative and social structure of the later Habsburg Empire on the
+basis of this source impossible. We pursued two central objectives: primarily,
+improving the OCR quality, for which we considered improved structure
+recognition essential; in the course of the work, it turned out that this also
+made extraction of the data structure possible. We chose Faster R-CNN as the
+basis of the ML architecture for structure recognition. In order to obtain the
+required amount of training data quickly and economically, we synthesised Hof-
+und Staatsschematismus-style data, which we used to train our model. The model
+was then fine-tuned with a smaller set of manually annotated historical source
+data. We then used Tesseract-OCR, which was further optimised for the style of
+our documents, to complete the combined structure extraction and OCR process.
+Results show a significant decrease in the two standard parameters of OCR
+performance, WER and CER (where lower values are better). Combined structure
+detection and fine-tuned OCR improved CER and WER values by a remarkable 71.98
+percent (CER) and 52.49 percent (WER), respectively.
+
+
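For reference, the reported gains are in CER and WER (character and word error rate); below is a self-contained sketch of how these standard metrics are computed from an OCR hypothesis and a ground-truth transcription. The sample strings are invented for illustration.

```python
# Edit-distance based CER/WER, as commonly used to score OCR output.
def levenshtein(a, b):
    """Edit distance between two sequences (substitution, insertion, deletion)."""
    dp = list(range(len(b) + 1))
    for i, x in enumerate(a, 1):
        prev, dp[0] = dp[0], i
        for j, y in enumerate(b, 1):
            prev, dp[j] = dp[j], min(dp[j] + 1, dp[j - 1] + 1, prev + (x != y))
    return dp[-1]

def cer(hypothesis, reference):
    """Character error rate: char-level edit distance / reference length."""
    return levenshtein(list(hypothesis), list(reference)) / max(len(reference), 1)

def wer(hypothesis, reference):
    """Word error rate: word-level edit distance / reference word count."""
    ref_words = reference.split()
    return levenshtein(hypothesis.split(), ref_words) / max(len(ref_words), 1)

reference  = "k. k. Hof- und Staatsschematismus 1856"
hypothesis = "k. k. Hof- und Staatsschematisnus 1856"   # one character misread
print(f"CER: {cer(hypothesis, reference):.3f}, WER: {wer(hypothesis, reference):.3f}")
```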
+
+ comment: 29 pages, 23 figures, 7 tables +
+
+
+
+
+ + ☆ Exploring Masked Autoencoders for Sensor-Agnostic Image Retrieval in + Remote Sensing + + +
+ Self-supervised learning through masked autoencoders (MAEs) has recently +attracted great attention for remote sensing (RS) image representation +learning, and thus embodies a significant potential for content-based image +retrieval (CBIR) from ever-growing RS image archives. However, the existing +studies on MAEs in RS assume that the considered RS images are acquired by a +single image sensor, and thus are only suitable for uni-modal CBIR problems. +The effectiveness of MAEs for cross-sensor CBIR, which aims to search +semantically similar images across different image modalities, has not been +explored yet. In this paper, we take the first step to explore the +effectiveness of MAEs for sensor-agnostic CBIR in RS. To this end, we present a +systematic overview on the possible adaptations of the vanilla MAE to exploit +masked image modeling on multi-sensor RS image archives (denoted as +cross-sensor masked autoencoders [CSMAEs]). Based on different adjustments +applied to the vanilla MAE, we introduce different CSMAE models. We also +provide an extensive experimental analysis of these CSMAE models. We finally +derive a guideline to exploit masked image modeling for uni-modal and +cross-modal CBIR problems in RS. The code of this work is publicly available at +https://github.com/jakhac/CSMAE. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Our code is available at https://github.com/jakhac/CSMAE +
+
+
+
+
+ + ☆ Towards A Better Metric for Text-to-Video Generation + + +
+ Generative models have demonstrated remarkable capability in synthesizing +high-quality text, images, and videos. For video generation, contemporary +text-to-video models exhibit impressive capabilities, crafting visually +stunning videos. Nonetheless, evaluating such videos poses significant +challenges. Current research predominantly employs automated metrics such as +FVD, IS, and CLIP Score. However, these metrics provide an incomplete analysis, +particularly in the temporal assessment of video content, thus rendering them +unreliable indicators of true video quality. Furthermore, while user studies +have the potential to reflect human perception accurately, they are hampered by +their time-intensive and laborious nature, with outcomes that are often tainted +by subjective bias. In this paper, we investigate the limitations inherent in +existing metrics and introduce a novel evaluation pipeline, the Text-to-Video +Score (T2VScore). This metric integrates two pivotal criteria: (1) Text-Video +Alignment, which scrutinizes the fidelity of the video in representing the +given text description, and (2) Video Quality, which evaluates the video's +overall production caliber with a mixture of experts. Moreover, to evaluate the +proposed metrics and facilitate future improvements on them, we present the +TVGE dataset, collecting human judgements of 2,543 text-to-video generated +videos on the two criteria. Experiments on the TVGE dataset demonstrate the +superiority of the proposed T2VScore on offering a better metric for +text-to-video generation. + +
+
+ comment: Project page: https://showlab.github.io/T2VScore/ +
+
+
+
+
+ + ☆ Seeing the Unseen: Visual Common Sense for Semantic Placement + + +
+ Computer vision tasks typically involve describing what is present in an image
+(e.g. classification, detection, segmentation, and captioning). We study a
+visual common sense task that requires understanding what is not present.
+Specifically, given an image (e.g. of a living room) and the name of an object
+("cushion"), a vision system is asked to predict semantically-meaningful regions
+(masks or bounding boxes) in the image where that object could be placed or is
+likely to be placed by humans (e.g. on the sofa). We call this task Semantic
+Placement (SP) and believe that such common-sense visual understanding is
+critical for assistive robots (tidying a house) and AR devices (automatically
+rendering an object in the user's space). Studying the invisible is hard.
+Datasets for image description are typically constructed by curating relevant
+images and asking humans to annotate the contents of the image; neither of those
+two steps is straightforward for objects not present in the image. We overcome
+this challenge by operating in the opposite direction: we start with an image of
+an object in context from the web, and then remove that object from the image
+via inpainting. This automated pipeline converts unstructured web data into a
+dataset comprising pairs of images with/without the object. Using this, we
+collect a novel dataset, with ${\sim}1.3$M images across $9$ object categories,
+and train a SP prediction model called CLIP-UNet. CLIP-UNet outperforms existing
+VLMs and baselines that combine semantic priors with object detectors on
+real-world and simulated images. In our user studies, we find that the SP masks
+predicted by CLIP-UNet are favored $43.7\%$ and $31.3\%$ of the time when
+compared against the $4$ SP baselines on real and simulated images,
+respectively. In addition, we demonstrate that leveraging SP mask predictions
+from CLIP-UNet enables downstream applications like building tidying robots in
+indoor environments.
+
+
+
+
+
+
+ + ☆ Low-light Stereo Image Enhancement and De-noising in the Low-frequency + Information Enhanced Image Space + + +
+ Unlike the single-image task, stereo image enhancement can use information from
+another view, and its key stage is how to perform cross-view feature interaction
+to extract useful information from the other view. However, the complex noise in
+low-light images and its impact on subsequent feature encoding and interaction
+are ignored by existing methods. In this paper, a method is proposed to perform
+enhancement and de-noising simultaneously. First, to reduce unwanted noise
+interference, a low-frequency information enhanced module (IEM) is proposed to
+suppress noise and produce a new image space. Additionally, a cross-channel and
+spatial context information mining module (CSM) is proposed to encode long-range
+spatial dependencies and to enhance inter-channel feature interaction. Relying
+on CSM, an encoder-decoder structure is constructed, incorporating cross-view
+and cross-scale feature interactions to perform enhancement in the new image
+space. Finally, the network is trained with the constraints of both spatial and
+frequency domain losses. Extensive experiments on both synthesized and real
+datasets show that our method obtains better detail recovery and noise removal
+compared with state-of-the-art methods. In addition, a real stereo image
+enhancement dataset is captured with the ZED2 stereo camera. The code and
+dataset are publicly available at: https://www.github.com/noportraits/LFENet.
+
+
+
+
+
+
+ + ☆ DeepThalamus: A novel deep learning method for automatic segmentation of + brain thalamic nuclei from multimodal ultra-high resolution MRI + + +
+ The implication of the thalamus in multiple neurological pathologies makes it a
+structure of interest for volumetric analysis. In the present work, we have
+designed and implemented a multimodal volumetric deep neural network for the
+segmentation of thalamic nuclei at ultra-high resolution (0.125 mm3). Current
+tools either operate at standard resolution (1 mm3) or use monomodal data. To
+achieve the proposed objective, first, a database of semiautomatically segmented
+thalamic nuclei was created using ultra-high resolution T1, T2 and White Matter
+nulled (WMn) images. Then, a novel deep-learning-based strategy was designed to
+obtain the automatic segmentations and was trained to improve its robustness and
+accuracy using a semisupervised approach. The proposed method was compared with
+a related state-of-the-art method, showing competitive results in terms of both
+segmentation quality and efficiency. To make the proposed method fully available
+to the scientific community, a full pipeline able to work with monomodal
+standard-resolution T1 images is also proposed.
+
+
+
+
+
+
+ + ☆ Sparsity-based background removal for STORM super-resolution images + + +
+ Single-molecule localization microscopy techniques, like stochastic optical
+reconstruction microscopy (STORM), visualize biological specimens by
+stochastically exciting sparse blinking emitters. The raw images suffer from
+unwanted background fluorescence, which must be removed to achieve
+super-resolution. We introduce a sparsity-based background removal method by
+adapting a neural network (SLNet) from a different microscopy domain. The SLNet
+computes a low-rank representation of the images; subtracting it from the raw
+images yields the sparse component, representing the frames without the
+background. We compared our approach with widely used background removal
+methods, such as median background removal or the rolling ball algorithm, on two
+commonly used STORM datasets, one glial cell dataset and one microtubule
+dataset. The SLNet delivers STORM frames with less background, leading to higher
+emitter localization precision and higher-resolution reconstructed images than
+commonly used methods. Notably, the SLNet is lightweight and easily trainable
+(<5 min). Since it is trained in an unsupervised manner, no prior information is
+required and it can be applied to any STORM dataset. We uploaded a pre-trained
+SLNet to the Bioimage Model Zoo, easily accessible through ImageJ. Our results
+show that our sparse decomposition method could be an essential and efficient
+STORM pre-processing tool.
+
+
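The core decomposition idea, estimating a low-rank background and keeping the sparse residual as the emitter signal, can be illustrated with a plain truncated SVD standing in for the trained SLNet; the synthetic "movie", rank, and sizes below are arbitrary placeholders.

```python
# Classical low-rank + sparse split as a stand-in for the learned decomposition.
import numpy as np

rng = np.random.default_rng(0)
n_frames, h, w = 200, 64, 64

background = np.outer(np.ones(n_frames), rng.random(h * w))        # static, low rank
emitters = (rng.random((n_frames, h * w)) > 0.999).astype(float)   # sparse blinking
movie = background + emitters                                      # synthetic acquisition

# Truncated SVD gives the low-rank (background) component ...
U, s, Vt = np.linalg.svd(movie, full_matrices=False)
rank = 1
low_rank = (U[:, :rank] * s[:rank]) @ Vt[:rank]

# ... and subtracting it from the raw frames leaves the sparse emitter signal.
sparse = movie - low_rank
error = np.linalg.norm(sparse - emitters) / np.linalg.norm(emitters)
print("relative error of recovered sparse component:", round(error, 3))
```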
+
+
+
+
+ + ☆ MaskClustering: View Consensus based Mask Graph Clustering for + Open-Vocabulary 3D Instance Segmentation + + +
+ Open-vocabulary 3D instance segmentation has emerged as a frontier topic due to
+its capability to segment 3D instances beyond a predefined set of categories.
+However, compared to significant progress in the 2D domain, methods for 3D
+open-vocabulary instance segmentation are hindered by the limited scale of
+high-quality annotated 3D data. To harness the capabilities of 2D models, recent
+efforts have focused on merging 2D masks based on metrics such as geometric and
+semantic similarity to form 3D instances. In contrast to these local metrics, we
+propose a novel metric called view consensus to better exploit multi-view
+observations. The key insight is that two 2D masks should be considered as
+belonging to the same instance if a considerable number of 2D masks from other
+views contain both of them. Based on this metric, we build a global mask graph
+and iteratively cluster masks, prioritizing mask pairs with solid view
+consensus. The 3D point clusters corresponding to these 2D mask clusters can be
+regarded as 3D instances, along with the fused open-vocabulary features from the
+clustered 2D masks. Through this multi-view verification and fusion mechanism,
+our method effectively leverages the prior instance knowledge from massive 2D
+masks predicted by visual foundation models, eliminating the need for training
+on 3D data. Experiments on publicly available datasets, including ScanNet200 and
+MatterPort3D, demonstrate that our method achieves state-of-the-art performance
+in both open-vocabulary instance segmentation and class-agnostic mask
+generation. Our project page is at https://pku-epic.github.io/MaskClustering.
+
+
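A toy illustration (not the authors' implementation) of the view-consensus criterion: two 2D masks are merged when many masks from other views contain the 3D points of both. Masks are represented here simply as sets of hypothetical 3D point ids, and the containment threshold is an assumed value.

```python
# Minimal view-consensus scoring between two candidate 2D masks.
def contains(container, mask, thresh=0.9):
    """A mask 'contains' another if it covers most of that mask's 3D points."""
    return len(container & mask) >= thresh * len(mask)

def view_consensus(mask_a, mask_b, other_view_masks):
    """Fraction of other-view masks that contain both mask_a and mask_b."""
    supporting = [m for m in other_view_masks
                  if contains(m, mask_a) and contains(m, mask_b)]
    return len(supporting) / max(len(other_view_masks), 1)

# Hypothetical masks: a and b are two partial views of the same object.
mask_a = {1, 2, 3, 4}
mask_b = {3, 4, 5, 6}
others = [{1, 2, 3, 4, 5, 6, 7}, {1, 2, 3, 4, 5, 6}, {10, 11, 12}]

print(view_consensus(mask_a, mask_b, others))   # 2/3 of other views support merging
```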
+
+
+
+
+ + ☆ SSL-Interactions: Pretext Tasks for Interactive Trajectory Prediction + + +
+ This paper addresses motion forecasting in multi-agent environments, pivotal +for ensuring safety of autonomous vehicles. Traditional as well as recent +data-driven marginal trajectory prediction methods struggle to properly learn +non-linear agent-to-agent interactions. We present SSL-Interactions that +proposes pretext tasks to enhance interaction modeling for trajectory +prediction. We introduce four interaction-aware pretext tasks to encapsulate +various aspects of agent interactions: range gap prediction, closest distance +prediction, direction of movement prediction, and type of interaction +prediction. We further propose an approach to curate interaction-heavy +scenarios from datasets. This curated data has two advantages: it provides a +stronger learning signal to the interaction model, and facilitates generation +of pseudo-labels for interaction-centric pretext tasks. We also propose three +new metrics specifically designed to evaluate predictions in interactive +scenes. Our empirical evaluations indicate SSL-Interactions outperforms +state-of-the-art motion forecasting methods quantitatively with up to 8% +improvement, and qualitatively, for interaction-heavy scenarios. + +
+
+ comment: 13 pages, 5 figures, submitted to IV-2024 +
+
+
+
+
+ + ☆ HexaGen3D: StableDiffusion is just one step away from Fast and Diverse + Text-to-3D Generation + + +
+ Despite the latest remarkable advances in generative modeling, efficient +generation of high-quality 3D assets from textual prompts remains a difficult +task. A key challenge lies in data scarcity: the most extensive 3D datasets +encompass merely millions of assets, while their 2D counterparts contain +billions of text-image pairs. To address this, we propose a novel approach +which harnesses the power of large, pretrained 2D diffusion models. More +specifically, our approach, HexaGen3D, fine-tunes a pretrained text-to-image +model to jointly predict 6 orthographic projections and the corresponding +latent triplane. We then decode these latents to generate a textured mesh. +HexaGen3D does not require per-sample optimization, and can infer high-quality +and diverse objects from textual prompts in 7 seconds, offering significantly +better quality-to-latency trade-offs when comparing to existing approaches. +Furthermore, HexaGen3D demonstrates strong generalization to new objects or +compositions. + +
+
+ comment: 9 pages, 8 figures, 2 tables +
+
+
+
+
+ + ☆ Graph Transformer GANs with Graph Masked Modeling for Architectural + Layout Generation CVPR2023 + + +
+ We present a novel graph Transformer generative adversarial network (GTGAN) +to learn effective graph node relations in an end-to-end fashion for +challenging graph-constrained architectural layout generation tasks. The +proposed graph-Transformer-based generator includes a novel graph Transformer +encoder that combines graph convolutions and self-attentions in a Transformer +to model both local and global interactions across connected and non-connected +graph nodes. Specifically, the proposed connected node attention (CNA) and +non-connected node attention (NNA) aim to capture the global relations across +connected nodes and non-connected nodes in the input graph, respectively. The +proposed graph modeling block (GMB) aims to exploit local vertex interactions +based on a house layout topology. Moreover, we propose a new node +classification-based discriminator to preserve the high-level semantic and +discriminative node features for different house components. To maintain the +relative spatial relationships between ground truth and predicted graphs, we +also propose a novel graph-based cycle-consistency loss. Finally, we propose a +novel self-guided pre-training method for graph representation learning. This +approach involves simultaneous masking of nodes and edges at an elevated mask +ratio (i.e., 40%) and their subsequent reconstruction using an asymmetric +graph-centric autoencoder architecture. This method markedly improves the +model's learning proficiency and expediency. Experiments on three challenging +graph-constrained architectural layout generation tasks (i.e., house layout +generation, house roof generation, and building layout generation) with three +public datasets demonstrate the effectiveness of the proposed method in terms +of objective quantitative scores and subjective visual realism. New +state-of-the-art results are established by large margins on these three tasks. + +
+
+ comment: Accepted to TPAMI, an extended version of a paper published in + CVPR2023. arXiv admin note: substantial text overlap with arXiv:2303.08225 +
+
+
+
+
+ + ☆ Towards Efficient Diffusion-Based Image Editing with Instant Attention + Masks AAAI2024 + + +
+ Diffusion-based Image Editing (DIE) is an emerging research hot-spot, which
+often applies a semantic mask to control the target area for diffusion-based
+editing. However, most existing solutions obtain these masks via manual
+operations or off-line processing, greatly reducing their efficiency. In this
+paper, we propose a novel and efficient image editing method for Text-to-Image
+(T2I) diffusion models, termed Instant Diffusion Editing (InstDiffEdit). In
+particular, InstDiffEdit aims to employ the cross-modal attention ability of
+existing diffusion models to achieve instant mask guidance during the diffusion
+steps. To reduce the noise in the attention maps and achieve full automation, we
+equip InstDiffEdit with a training-free refinement scheme to adaptively
+aggregate the attention distributions for automatic yet accurate mask
+generation. Meanwhile, to supplement the existing evaluations of DIE, we propose
+a new benchmark called Editing-Mask to examine the mask accuracy and local
+editing ability of existing methods. To validate InstDiffEdit, we also conduct
+extensive experiments on ImageNet and Imagen and compare it with a range of SOTA
+methods. The experimental results show that InstDiffEdit not only outperforms
+the SOTA methods in both image quality and editing results, but also has a much
+faster inference speed, i.e., 5 to 6 times faster. Our code is available at
+https://anonymous.4open.science/r/InstDiffEdit-C306/
+
+
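In the spirit of the instant mask guidance described above, here is a simplified sketch of turning cross-attention maps for the edited token into a binary mask; this is not the actual InstDiffEdit refinement scheme, and the attention maps, grid size, and threshold are random placeholders.

```python
# Aggregate per-head cross-attention maps, normalise, and threshold into a mask.
import numpy as np

rng = np.random.default_rng(0)
n_heads, h, w = 8, 16, 16

# Cross-attention maps for the token being edited, one per attention head.
attn_maps = rng.random((n_heads, h, w))

# Average over heads, rescale to [0, 1], and threshold into a binary mask.
agg = attn_maps.mean(axis=0)
agg = (agg - agg.min()) / (agg.max() - agg.min() + 1e-8)
mask = (agg > 0.7).astype(np.uint8)

print(f"mask covers {mask.mean() * 100:.1f}% of the latent grid")
```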
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ FiGCLIP: Fine-Grained CLIP Adaptation via Densely Annotated Videos + + +
+ While contrastive language-image pretraining (CLIP) has exhibited impressive
+performance by learning highly semantic and generalized representations, recent
+works have exposed a fundamental drawback in its syntactic abilities: it
+struggles to interpret fine-grained attributes, actions, spatial relations,
+states, and details that require compositional reasoning. One reason for this is
+that natural captions often do not capture all the visual details of a scene.
+This leads to unaddressed visual concepts being misattributed to the wrong
+words, and the pooled image and text features end up acting as a bag of words,
+hence losing the syntactic information. In this work, we ask: Is it possible to
+enhance CLIP's fine-grained and syntactic abilities without compromising its
+semantic properties? We show that this is possible by adapting CLIP efficiently
+on a high-quality, comprehensive, and relatively small dataset. We demonstrate
+our adaptation strategy on VidSitu, a video situation recognition dataset
+annotated with verbs and rich semantic role labels (SRL). We use the SRL and
+verb information to create rule-based detailed captions, making sure they
+capture most of the visual concepts. Combined with hard negatives and
+hierarchical losses, these annotations allow us to learn a powerful visual
+representation, dubbed Fine-Grained CLIP (FiGCLIP), that preserves semantic
+understanding while being detail-oriented. We evaluate on five diverse
+vision-language tasks in both fine-tuning and zero-shot settings, achieving
+consistent improvements over the base CLIP model.
+
+
+
+
+
+
+ + ☆ Foundation Models for Biomedical Image Segmentation: A Survey + + +
+ Recent advancements in biomedical image analysis have been significantly +driven by the Segment Anything Model (SAM). This transformative technology, +originally developed for general-purpose computer vision, has found rapid +application in medical image processing. Within the last year, marked by over +100 publications, SAM has demonstrated its prowess in zero-shot learning +adaptations for medical imaging. The fundamental premise of SAM lies in its +capability to segment or identify objects in images without prior knowledge of +the object type or imaging modality. This approach aligns well with tasks +achievable by the human visual system, though its application in non-biological +vision contexts remains more theoretically challenging. A notable feature of +SAM is its ability to adjust segmentation according to a specified resolution +scale or area of interest, akin to semantic priming. This adaptability has +spurred a wave of creativity and innovation in applying SAM to medical imaging. +Our review focuses on the period from April 1, 2023, to September 30, 2023, a +critical first six months post-initial publication. We examine the adaptations +and integrations of SAM necessary to address longstanding clinical challenges, +particularly in the context of 33 open datasets covered in our analysis. While +SAM approaches or achieves state-of-the-art performance in numerous +applications, it falls short in certain areas, such as segmentation of the +carotid artery, adrenal glands, optic nerve, and mandible bone. Our survey +delves into the innovative techniques where SAM's foundational approach excels +and explores the core concepts in translating and applying these models +effectively in diverse medical imaging scenarios. + +
+
+ comment: 22 pages, 4 figures, 7 tables +
+
+
+
+
+ + ☆ SwinTextSpotter v2: Towards Better Synergy for Scene Text Spotting + + +
+ End-to-end scene text spotting, which aims to read the text in natural +images, has garnered significant attention in recent years. However, recent +state-of-the-art methods usually incorporate detection and recognition simply +by sharing the backbone, which does not directly take advantage of the feature +interaction between the two tasks. In this paper, we propose a new end-to-end +scene text spotting framework termed SwinTextSpotter v2, which seeks to find a +better synergy between text detection and recognition. Specifically, we enhance +the relationship between two tasks using novel Recognition Conversion and +Recognition Alignment modules. Recognition Conversion explicitly guides text +localization through recognition loss, while Recognition Alignment dynamically +extracts text features for recognition through the detection predictions. This +simple yet effective design results in a concise framework that requires +neither an additional rectification module nor character-level annotations for +the arbitrarily-shaped text. Furthermore, the parameters of the detector are +greatly reduced without performance degradation by introducing a Box Selection +Schedule. Qualitative and quantitative experiments demonstrate that +SwinTextSpotter v2 achieved state-of-the-art performance on various +multilingual (English, Chinese, and Vietnamese) benchmarks. The code will be +available at +\href{https://github.com/mxin262/SwinTextSpotterv2}{SwinTextSpotter v2}. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2203.10209 +
+
+
+
+
+ + ☆ Fine-Grained Prototypes Distillation for Few-Shot Object Detection AAAI2024 + + +
+ Few-shot object detection (FSOD) aims at extending a generic detector for novel
+object detection with only a few training examples. It has attracted great
+attention recently due to its practical significance. Meta-learning has been
+demonstrated to be an effective paradigm for this task. In general, methods
+based on meta-learning employ an additional support branch to encode novel
+examples (a.k.a. support images) into class prototypes, which are then fused
+with the query branch to facilitate the model prediction. However, the
+class-level prototypes are difficult to generate precisely, and they also lack
+detailed information, leading to instability in performance. New methods are
+required to capture the distinctive local context for more robust novel object
+detection. To this end, we propose to distill the most representative support
+features into fine-grained prototypes. These prototypes are then assigned to
+query feature maps based on the matching results, modeling the detailed feature
+relations between the two branches. This process is realized by our Fine-Grained
+Feature Aggregation (FFA) module. Moreover, in terms of high-level feature
+fusion, we propose a Balanced Class-Agnostic Sampling (B-CAS) strategy and a
+Non-Linear Fusion (NLF) module from different perspectives. They are
+complementary to each other and depict the high-level feature relations more
+effectively. Extensive experiments on the PASCAL VOC and MS COCO benchmarks show
+that our method sets new state-of-the-art performance in most settings. Our code
+is available at https://github.com/wangchen1801/FPD.
+
+
+
+ comment: Accepted by AAAI2024 +
+
+
+
+
+ + ☆ Multimodal Crowd Counting with Pix2Pix GANs + + +
+ Most state-of-the-art crowd counting methods use color (RGB) images to learn
+the density map of the crowd. However, these methods often struggle to achieve
+higher accuracy in densely crowded scenes with poor illumination. Recently, some
+studies have reported improvement in the accuracy of crowd counting models using
+a combination of RGB and thermal images. Although multimodal data can lead to
+better predictions, such data might not always be available beforehand. In this
+paper, we propose the use of generative adversarial networks (GANs) to
+automatically generate thermal infrared (TIR) images from color (RGB) images and
+use both to train crowd counting models to achieve higher accuracy. We first use
+a Pix2Pix GAN to translate RGB images to TIR images. Our experiments on several
+state-of-the-art crowd counting models and benchmark crowd datasets show a
+significant improvement in accuracy.
+
+
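A sketch of the proposed data flow under stated assumptions: a pix2pix-style generator translates an RGB crowd image into a synthetic TIR image, and both are stacked as input to a counting model. The tiny generator below is only a stand-in for a trained Pix2Pix network, and all shapes are illustrative.

```python
# RGB -> synthetic TIR -> stacked RGB+TIR input for a crowd counting model.
import torch
import torch.nn as nn

class TinyGenerator(nn.Module):
    """Placeholder for a trained pix2pix generator (RGB -> 1-channel TIR)."""
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
            nn.Conv2d(16, 1, 3, padding=1), nn.Sigmoid(),
        )

    def forward(self, rgb):
        return self.net(rgb)

generator = TinyGenerator().eval()        # would be loaded from a trained checkpoint
rgb = torch.rand(1, 3, 256, 256)          # one RGB crowd image

with torch.no_grad():
    tir = generator(rgb)                  # synthesised thermal image

# The counting model then receives a 4-channel RGB+TIR tensor.
rgb_tir = torch.cat([rgb, tir], dim=1)
print(rgb_tir.shape)                      # torch.Size([1, 4, 256, 256])
```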
+
+ comment: Accepted version of the paper in 19th International Conference on + Computer Vision Theory and Applications (VISAPP), Rome, Italy, 27-29 Feb, + 2024, +
+
+
+
+
+ + ☆ Curriculum for Crowd Counting -- Is it Worthy? + + +
+ Recent advances in deep learning techniques have achieved remarkable +performance in several computer vision problems. A notably intuitive technique +called Curriculum Learning (CL) has been introduced recently for training deep +learning models. Surprisingly, curriculum learning achieves significantly +improved results in some tasks but marginal or no improvement in others. Hence, +there is still a debate about its adoption as a standard method to train +supervised learning models. In this work, we investigate the impact of +curriculum learning in crowd counting using the density estimation method. We +performed detailed investigations by conducting 112 experiments using six +different CL settings using eight different crowd models. Our experiments show +that curriculum learning improves the model learning performance and shortens +the convergence time. + +
+
+ comment: Accepted version of the paper in 19th International Conference on + Computer Vision Theory and Applications (VISAPP), Rome, Italy, 27-19 February + 2024 +
+
+
+
+
+ + ☆ Collaboratively Self-supervised Video Representation Learning for Action + Recognition + + +
+ Considering the close connection between action recognition and human pose +estimation, we design a Collaboratively Self-supervised Video Representation +(CSVR) learning framework specific to action recognition by jointly considering +generative pose prediction and discriminative context matching as pretext +tasks. Specifically, our CSVR consists of three branches: a generative pose +prediction branch, a discriminative context matching branch, and a video +generating branch. Among them, the first one encodes dynamic motion feature by +utilizing Conditional-GAN to predict the human poses of future frames, and the +second branch extracts static context features by pulling the representations +of clips and compressed key frames from the same video together while pushing +apart the pairs from different videos. The third branch is designed to recover +the current video frames and predict the future ones, for the purpose of +collaboratively improving dynamic motion features and static context features. +Extensive experiments demonstrate that our method achieves state-of-the-art +performance on the UCF101 and HMDB51 datasets. + +
+
+
+
+
+ + ☆ Geo-locating Road Objects using Inverse Haversine Formula with NVIDIA + Driveworks + + +
+ Geolocation is integral to the seamless functioning of autonomous vehicles +and advanced traffic monitoring infrastructures. This paper introduces a +methodology to geolocate road objects using a monocular camera, leveraging the +NVIDIA DriveWorks platform. We use the Centimeter Positioning Service (CPOS) +and the inverse Haversine formula to geo-locate road objects accurately. The +real-time algorithm processing capability of the NVIDIA DriveWorks platform +enables instantaneous object recognition and spatial localization for Advanced +Driver Assistance Systems (ADAS) and autonomous driving platforms. We present a +measurement pipeline suitable for autonomous driving (AD) platforms and provide +detailed guidelines for calibrating cameras using NVIDIA DriveWorks. +Experiments were carried out to validate the accuracy of the proposed method +for geolocating targets in both controlled and dynamic settings. We show that +our approach can locate targets with less than 1m error when the AD platform is +stationary and less than 4m error at higher speeds (i.e. up to 60km/h) within a +15m radius. + +
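The inverse (direct) Haversine step mentioned above can be written in a few lines: given the camera's GNSS position, the bearing to a detected object, and its estimated range, it returns the object's latitude and longitude on a spherical Earth. The sample inputs are made up; in the described pipeline they would come from CPOS and the DriveWorks detections.

```python
# Destination point on a sphere from origin, bearing, and distance.
import math

EARTH_RADIUS_M = 6_371_000.0

def inverse_haversine(lat_deg, lon_deg, bearing_deg, distance_m):
    lat1 = math.radians(lat_deg)
    lon1 = math.radians(lon_deg)
    bearing = math.radians(bearing_deg)
    d = distance_m / EARTH_RADIUS_M        # angular distance

    lat2 = math.asin(math.sin(lat1) * math.cos(d) +
                     math.cos(lat1) * math.sin(d) * math.cos(bearing))
    lon2 = lon1 + math.atan2(math.sin(bearing) * math.sin(d) * math.cos(lat1),
                             math.cos(d) - math.sin(lat1) * math.sin(lat2))
    return math.degrees(lat2), math.degrees(lon2)

# e.g. an object detected 12 m away at a 45 degree bearing from the vehicle
print(inverse_haversine(59.3293, 18.0686, 45.0, 12.0))
```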
+
+
+
+
+ + ☆ PMFSNet: Polarized Multi-scale Feature Self-attention Network For + Lightweight Medical Image Segmentation + + +
+ Current state-of-the-art medical image segmentation methods prioritize +accuracy but often at the expense of increased computational demands and larger +model sizes. Applying these large-scale models to the relatively limited scale +of medical image datasets tends to induce redundant computation, complicating +the process without the necessary benefits. This approach not only adds +complexity but also presents challenges for the integration and deployment of +lightweight models on edge devices. For instance, recent transformer-based +models have excelled in 2D and 3D medical image segmentation due to their +extensive receptive fields and high parameter count. However, their +effectiveness comes with a risk of overfitting when applied to small datasets +and often neglects the vital inductive biases of Convolutional Neural Networks +(CNNs), essential for local feature representation. In this work, we propose +PMFSNet, a novel medical imaging segmentation model that effectively balances +global and local feature processing while avoiding the computational redundancy +typical in larger models. PMFSNet streamlines the UNet-based hierarchical +structure and simplifies the self-attention mechanism's computational +complexity, making it suitable for lightweight applications. It incorporates a +plug-and-play PMFS block, a multi-scale feature enhancement module based on +attention mechanisms, to capture long-term dependencies. Extensive +comprehensive results demonstrate that even with a model (less than 1 million +parameters), our method achieves superior performance in various segmentation +tasks across different data scales. It achieves (IoU) metrics of 84.68%, +82.02%, and 78.82% on public datasets of teeth CT (CBCT), ovarian tumors +ultrasound(MMOTU), and skin lesions dermoscopy images (ISIC 2018), +respectively. The source code is available at +https://github.com/yykzjh/PMFSNet. + +
+
+
+
+
+ + ☆ Exploiting GPT-4 Vision for Zero-shot Point Cloud Understanding + + +
+ In this study, we tackle the challenge of classifying the object category in +point clouds, which previous works like PointCLIP struggle to address due to +the inherent limitations of the CLIP architecture. Our approach leverages GPT-4 +Vision (GPT-4V) to overcome these challenges by employing its advanced +generative abilities, enabling a more adaptive and robust classification +process. We adapt the application of GPT-4V to process complex 3D data, +enabling it to achieve zero-shot recognition capabilities without altering the +underlying model architecture. Our methodology also includes a systematic +strategy for point cloud image visualization, mitigating domain gap and +enhancing GPT-4V's efficiency. Experimental validation demonstrates our +approach's superiority in diverse scenarios, setting a new benchmark in +zero-shot point cloud classification. + +
+
+
+
+
+ + ☆ A Bi-Pyramid Multimodal Fusion Method for the Diagnosis of Bipolar + Disorders ICASSP 2024 + + +
+ Previous research on the diagnosis of Bipolar disorder has mainly focused on +resting-state functional magnetic resonance imaging. However, their accuracy +can not meet the requirements of clinical diagnosis. Efficient multimodal +fusion strategies have great potential for applications in multimodal data and +can further improve the performance of medical diagnosis models. In this work, +we utilize both sMRI and fMRI data and propose a novel multimodal diagnosis +model for bipolar disorder. The proposed Patch Pyramid Feature Extraction +Module extracts sMRI features, and the spatio-temporal pyramid structure +extracts the fMRI features. Finally, they are fused by a fusion module to +output diagnosis results with a classifier. Extensive experiments show that our +proposed method outperforms others in balanced accuracy from 0.657 to 0.732 on +the OpenfMRI dataset, and achieves the state of the art. + +
+
+ comment: Accepted by IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ Bias-Conflict Sample Synthesis and Adversarial Removal Debias Strategy + for Temporal Sentence Grounding in Video AAAI 2024 + + +
+ Temporal Sentence Grounding in Video (TSGV) is troubled by dataset bias +issue, which is caused by the uneven temporal distribution of the target +moments for samples with similar semantic components in input videos or query +texts. Existing methods resort to utilizing prior knowledge about bias to +artificially break this uneven distribution, which only removes a limited +amount of significant language biases. In this work, we propose the +bias-conflict sample synthesis and adversarial removal debias strategy +(BSSARD), which dynamically generates bias-conflict samples by explicitly +leveraging potentially spurious correlations between single-modality features +and the temporal position of the target moments. Through adversarial training, +its bias generators continuously introduce biases and generate bias-conflict +samples to deceive its grounding model. Meanwhile, the grounding model +continuously eliminates the introduced biases, which requires it to model +multi-modality alignment information. BSSARD will cover most kinds of coupling +relationships and disrupt language and visual biases simultaneously. Extensive +experiments on Charades-CD and ActivityNet-CD demonstrate the promising +debiasing capability of BSSARD. Source codes are available at +https://github.com/qzhb/BSSARD. + +
+
+ comment: accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Combining Image- and Geometric-based Deep Learning for Shape Regression: + A Comparison to Pixel-level Methods for Segmentation in Chest X-Ray + + +
+ When solving a segmentation task, shape-based methods can be beneficial
+compared to pixelwise classification due to their geometric understanding of the
+target object as a shape, preventing the generation of anatomically implausible
+predictions, in particular for corrupted data. In this work, we propose a novel
+hybrid method that combines a lightweight CNN backbone with a geometric neural
+network (Point Transformer) for shape regression. Using the same CNN encoder,
+the Point Transformer reaches segmentation quality on par with current
+state-of-the-art convolutional decoders ($4\pm1.9$ vs $3.9\pm2.9$ error in mm
+and $85\pm13$ vs $88\pm10$ Dice), but crucially, is more stable w.r.t. image
+distortion, starting to outperform them at a corruption level of 30%.
+Furthermore, we include the nnU-Net as an upper baseline, which has $3.7\times$
+more trainable parameters than our proposed method.
+
+
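The comparison above is reported as a point error in mm and as Dice; the following is a small self-contained sketch of how these two metrics are typically computed from predicted and ground-truth masks or contours. The arrays are dummy examples and the mm-per-pixel scale is an assumed constant.

```python
# Dice similarity coefficient and mean contour-point error in millimetres.
import numpy as np

def dice(pred_mask, gt_mask):
    """Dice similarity coefficient between two binary masks."""
    inter = np.logical_and(pred_mask, gt_mask).sum()
    return 2.0 * inter / (pred_mask.sum() + gt_mask.sum() + 1e-8)

def mean_point_error(pred_pts, gt_pts, mm_per_pixel=1.0):
    """Mean Euclidean distance (in mm) between corresponding contour points."""
    return np.linalg.norm(pred_pts - gt_pts, axis=1).mean() * mm_per_pixel

pred = np.zeros((64, 64), bool); pred[10:40, 10:40] = True
gt   = np.zeros((64, 64), bool); gt[12:42, 10:40] = True
pts_pred = np.argwhere(pred)[:50].astype(float)
pts_gt   = pts_pred + np.array([1.0, 0.5])      # shifted correspondences

print(f"Dice: {dice(pred, gt):.3f}, error: {mean_point_error(pts_pred, pts_gt):.2f} mm")
```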
+
+ comment: Submitted to German Conference on Medical Image Computing 2024 +
+
+
+
+
+ + ☆ MM-SAP: A Comprehensive Benchmark for Assessing Self-Awareness of + Multimodal Large Language Models in Perception + + +
+ Multimodal Large Language Models (MLLMs) have shown their remarkable +abilities in visual perception and understanding recently. However, how to +comprehensively evaluate the capabilities of MLLMs remains a challenge. Most of +the existing benchmarks predominantly focus on assessing perception, cognition, +and reasoning, neglecting the abilities of self-awareness, referring to the +model's recognition of its own capability boundary. In our study, we focus on +self-awareness in image perception and introduce the knowledge quadrant for +MLLMs, which clearly defines the knowns and unknowns in perception. Based on +this, we propose a novel benchmark specifically designed to evaluate the +Self-Aware capabilities in Perception for MLLMs(MM-SAP). MM-SAP encompasses +three distinct sub-datasets, each focusing on different aspects of +self-awareness. We evaluated eight well-known MLLMs using MM-SAP, analyzing +their self-awareness and providing detailed insights. Code and data are +available at https://github.com/YHWmz/MM-SAP + +
+
+
+
+
+ + ☆ One for All: Toward Unified Foundation Models for Earth Vision + + +
+ Foundation models characterized by extensive parameters and trained on +large-scale datasets have demonstrated remarkable efficacy across various +downstream tasks for remote sensing data. Current remote sensing foundation +models typically specialize in a single modality or a specific spatial +resolution range, limiting their versatility for downstream datasets. While +there have been attempts to develop multi-modal remote sensing foundation +models, they typically employ separate vision encoders for each modality or +spatial resolution, necessitating a switch in backbones contingent upon the +input data. To address this issue, we introduce a simple yet effective method, +termed OFA-Net (One-For-All Network): employing a single, shared Transformer +backbone for multiple data modalities with different spatial resolutions. Using +the masked image modeling mechanism, we pre-train a single Transformer backbone +on a curated multi-modal dataset with this simple design. Then the backbone +model can be used in different downstream tasks, thus forging a path towards a +unified foundation backbone model in Earth vision. The proposed method is +evaluated on 12 distinct downstream tasks and demonstrates promising +performance. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ InstantID: Zero-shot Identity-Preserving Generation in Seconds + + +
+ There has been significant progress in personalized image synthesis with +methods such as Textual Inversion, DreamBooth, and LoRA. Yet, their real-world +applicability is hindered by high storage demands, lengthy fine-tuning +processes, and the need for multiple reference images. Conversely, existing ID +embedding-based methods, while requiring only a single forward inference, face +challenges: they either necessitate extensive fine-tuning across numerous model +parameters, lack compatibility with community pre-trained models, or fail to +maintain high face fidelity. Addressing these limitations, we introduce +InstantID, a powerful diffusion model-based solution. Our plug-and-play module +adeptly handles image personalization in various styles using just a single +facial image, while ensuring high fidelity. To achieve this, we design a novel +IdentityNet by imposing strong semantic and weak spatial conditions, +integrating facial and landmark images with textual prompts to steer the image +generation. InstantID demonstrates exceptional performance and efficiency, +proving highly beneficial in real-world applications where identity +preservation is paramount. Moreover, our work seamlessly integrates with +popular pre-trained text-to-image diffusion models like SD1.5 and SDXL, serving +as an adaptable plugin. Our codes and pre-trained checkpoints will be available +at https://github.com/InstantID/InstantID. + +
+
+ comment: Technical Report, project page available at + https://instantid.github.io/ +
+
+
+
+
+ + ☆ PolMERLIN: Self-Supervised Polarimetric Complex SAR Image Despeckling + with Masked Networks + + +
+ Despeckling is a crucial noise reduction task in improving the quality of +synthetic aperture radar (SAR) images. Directly obtaining noise-free SAR images +is a challenging task that has hindered the development of accurate despeckling +algorithms. The advent of deep learning has facilitated the study of denoising +models that learn from only noisy SAR images. However, existing methods deal +solely with single-polarization images and cannot handle the multi-polarization +images captured by modern satellites. In this work, we present an extension of +the existing model for generating single-polarization SAR images to handle +multi-polarization SAR images. Specifically, we propose a novel self-supervised +despeckling approach called channel masking, which exploits the relationship +between polarizations. Additionally, we utilize a spatial masking method that +addresses pixel-to-pixel correlations to further enhance the performance of our +approach. By effectively incorporating multiple polarization information, our +method surpasses current state-of-the-art methods in quantitative evaluation in +both synthetic and real-world scenarios. + +
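A minimal sketch of the channel-masking idea the abstract describes: one polarimetric channel is hidden and a network is asked to predict it from the remaining channels. The network architecture and channel count below are placeholders, not the PolMERLIN design.

```python
# Hypothetical sketch of "channel masking" self-supervision for multi-polarization SAR:
# hide one polarimetric channel and ask a network to predict it from the remaining ones.
import torch
import torch.nn as nn

def channel_masking_loss(net, x):
    """x: (B, C, H, W) stack of polarimetric channels."""
    B, C, H, W = x.shape
    c = torch.randint(0, C, (1,)).item()          # channel to mask at this step
    masked = x.clone()
    masked[:, c] = 0.0                            # remove the selected polarization
    pred = net(masked)                            # network predicts all channels
    return ((pred[:, c] - x[:, c]) ** 2).mean()   # supervise only the hidden channel

net = nn.Sequential(nn.Conv2d(4, 32, 3, padding=1), nn.ReLU(), nn.Conv2d(32, 4, 3, padding=1))
x = torch.rand(2, 4, 64, 64)
print(channel_masking_loss(net, x).item())
```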
+
+ comment: To appear on IEEE Geoscience and Remote Sensing Letters +
+
+
+
+
+ + ☆ Compositional Oil Spill Detection Based on Object Detector and Adapted + Segment Anything Model from SAR Images + + +
+ Semantic segmentation-based methods have attracted extensive attention in oil spill detection from SAR images. However, the existing approaches require a large number of finely annotated segmentation samples in the training stage. To alleviate this issue, we propose a composite oil spill detection framework, SAM-OIL, comprising an object detector (e.g., YOLOv8), an adapted Segment Anything Model (SAM), and an Ordered Mask Fusion (OMF) module. SAM-OIL is the first application of the powerful SAM in oil spill detection. Specifically, the SAM-OIL strategy uses YOLOv8 to obtain the categories and bounding boxes of oil spill-related objects, then inputs the bounding boxes into the adapted SAM to retrieve category-agnostic masks, and finally adopts the Ordered Mask Fusion (OMF) module to fuse the masks and categories. The adapted SAM, combining a frozen SAM with a learnable Adapter module, can enhance SAM's ability to segment ambiguous objects. The OMF module, a parameter-free method, can effectively resolve pixel category conflicts within SAM. Experimental results demonstrate that SAM-OIL surpasses existing semantic segmentation-based oil spill detection methods, achieving an mIoU of 69.52%. The results also indicate that both the OMF and Adapter modules effectively improve the accuracy of SAM-OIL.
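The ordered, parameter-free mask fusion can be pictured as painting instance masks into a single label map in a fixed order so that later masks resolve pixel conflicts; the confidence-based ordering below is an assumption for illustration, not necessarily the exact OMF rule.

```python
# Sketch of a parameter-free ordered mask fusion: masks are painted into a single label map
# in ascending confidence order, so higher-confidence detections overwrite conflicting pixels.
import numpy as np

def ordered_mask_fusion(masks, categories, scores, shape):
    """masks: list of (H, W) bool arrays; categories: class ids; scores: detector confidences."""
    label_map = np.zeros(shape, dtype=np.int64)      # 0 = background
    order = np.argsort(scores)                       # low confidence first
    for i in order:
        label_map[masks[i]] = categories[i]          # later (higher-confidence) masks win conflicts
    return label_map

H = W = 8
m1 = np.zeros((H, W), bool); m1[2:6, 2:6] = True
m2 = np.zeros((H, W), bool); m2[4:8, 4:8] = True
print(ordered_mask_fusion([m1, m2], categories=[1, 2], scores=[0.6, 0.9], shape=(H, W)))
```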
+
+ comment: 5 pages, 4 figures +
+
+
+
+
+ + ☆ Harnessing Deep Learning and Satellite Imagery for Post-Buyout Land + Cover Mapping + + +
+ Environmental disasters such as floods, hurricanes, and wildfires have increasingly threatened communities worldwide, prompting various mitigation strategies. Among these, property buyouts have emerged as a prominent approach to reducing vulnerability to future disasters. This strategy involves governments purchasing at-risk properties from willing sellers and converting the land into open space, ostensibly reducing future disaster risk and impact. However, the aftermath of these buyouts, particularly concerning land-use patterns and community impacts, remains under-explored. This research aims to fill this gap by employing innovative techniques like satellite imagery analysis and deep learning to study these patterns. To achieve this goal, we employed FEMA's Hazard Mitigation Grant Program (HMGP) buyout dataset, encompassing over 41,004 addresses of buyout properties from 1989 to 2017. Leveraging Google's Maps Static API, we gathered 40,053 satellite images corresponding to these buyout lands. Subsequently, we implemented five cutting-edge machine learning models to evaluate their performance in classifying land cover types. Notably, this task involved multi-class classification, and our model achieved an outstanding ROC-AUC score of 98.86%.
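For reference, a multi-class ROC-AUC such as the one reported can be computed as follows; the exact averaging scheme used in the paper is not stated, so a one-vs-rest macro average is shown here on synthetic data.

```python
# Computing a multi-class ROC-AUC for land-cover predictions (one-vs-rest, macro average).
import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 1, 2, 2, 1, 0, 2, 1])                  # land-cover class per parcel
y_prob = np.random.dirichlet(np.ones(3), size=len(y_true))   # model class probabilities
auc = roc_auc_score(y_true, y_prob, multi_class="ovr", average="macro")
print(f"multi-class ROC-AUC: {auc:.4f}")
```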
+
+
+
+
+ + ☆ Robo-ABC: Affordance Generalization Beyond Categories via Semantic + Correspondence for Robot Manipulation + + +
+ Enabling robotic manipulation that generalizes to out-of-distribution scenes +is a crucial step toward open-world embodied intelligence. For human beings, +this ability is rooted in the understanding of semantic correspondence among +objects, which naturally transfers the interaction experience of familiar +objects to novel ones. Although robots lack such a reservoir of interaction +experience, the vast availability of human videos on the Internet may serve as +a valuable resource, from which we extract an affordance memory including the +contact points. Inspired by the natural way humans think, we propose Robo-ABC: +when confronted with unfamiliar objects that require generalization, the robot +can acquire affordance by retrieving objects that share visual or semantic +similarities from the affordance memory. The next step is to map the contact +points of the retrieved objects to the new object. While establishing this +correspondence may present formidable challenges at first glance, recent +research finds it naturally arises from pre-trained diffusion models, enabling +affordance mapping even across disparate object categories. Through the +Robo-ABC framework, robots may generalize to manipulate out-of-category objects +in a zero-shot manner without any manual annotation, additional training, part +segmentation, pre-coded knowledge, or viewpoint restrictions. Quantitatively, +Robo-ABC significantly enhances the accuracy of visual affordance retrieval by +a large margin of 31.6% compared to state-of-the-art (SOTA) end-to-end +affordance models. We also conduct real-world experiments of cross-category +object-grasping tasks. Robo-ABC achieved a success rate of 85.7%, proving its +capacity for real-world tasks. + +
+
+
+
+
+ + ☆ CascadeV-Det: Cascade Point Voting for 3D Object Detection + + +
+ Anchor-free object detectors are highly efficient in performing point-based prediction without the need for extra post-processing of anchors. However, different from the 2D grids, the 3D points used in these detectors are often far from the ground truth center, making it challenging to accurately regress the bounding boxes. To address this issue, we propose a Cascade Voting (CascadeV) strategy that provides high-quality 3D object detection with point-based prediction. Specifically, CascadeV performs cascade detection using a novel Cascade Voting decoder that combines two new components: Instance Aware Voting (IA-Voting) and a Cascade Point Assignment (CPA) module. The IA-Voting module updates the object features of proposal points within the bounding box using conditional inverse distance weighting. This approach prevents features from being aggregated outside the instance and helps improve the accuracy of object detection. Additionally, since model training can suffer from a lack of proposal points with high centerness, we have developed the CPA module to narrow down the positive assignment threshold with cascade stages. This approach relaxes the dependence on proposal centerness in the early stages while ensuring an ample quantity of positives with high centerness in the later stages. Experiments show that FCAF3D with our CascadeV achieves state-of-the-art 3D object detection results with 70.4% mAP@0.25 and 51.6% mAP@0.5 on SUN RGB-D and competitive results on ScanNet. Code will be released at https://github.com/Sharpiless/CascadeV-Det
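A small sketch of inverse-distance-weighted feature aggregation restricted to points inside a predicted box, in the spirit of the IA-Voting description above; the exact conditioning and normalisation in the paper may differ.

```python
# Sketch of inverse-distance-weighted feature aggregation limited to points inside a 3D box.
import numpy as np

def idw_aggregate(points, feats, center, box_min, box_max, eps=1e-6):
    """points: (N, 3); feats: (N, C); only points inside [box_min, box_max] contribute."""
    inside = np.all((points >= box_min) & (points <= box_max), axis=1)
    if not inside.any():
        return np.zeros(feats.shape[1])
    d = np.linalg.norm(points[inside] - center, axis=1)
    w = 1.0 / (d + eps)                       # closer points get larger weights
    w = w / w.sum()
    return (w[:, None] * feats[inside]).sum(axis=0)

pts = np.random.rand(100, 3)
feats = np.random.rand(100, 16)
agg = idw_aggregate(pts, feats, center=np.array([0.5, 0.5, 0.5]),
                    box_min=np.zeros(3), box_max=np.ones(3))
print(agg.shape)
```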
+
+
+
+
+ + ☆ A Deep Hierarchical Feature Sparse Framework for Occluded Person + Re-Identification + + +
+ Most existing methods tackle the problem of occluded person re-identification +(ReID) by utilizing auxiliary models, resulting in a complicated and +inefficient ReID framework that is unacceptable for real-time applications. In +this work, a speed-up person ReID framework named SUReID is proposed to +mitigate occlusion interference while speeding up inference. The SUReID +consists of three key components: hierarchical token sparsification (HTS) +strategy, non-parametric feature alignment knowledge distillation (NPKD), and +noise occlusion data augmentation (NODA). The HTS strategy works by pruning the +redundant tokens in the vision transformer to achieve highly effective +self-attention computation and eliminate interference from occlusions or +background noise. However, the pruned tokens may contain human part features +that contaminate the feature representation and degrade the performance. To +solve this problem, the NPKD is employed to supervise the HTS strategy, +retaining more discriminative tokens and discarding meaningless ones. +Furthermore, the NODA is designed to introduce more noisy samples, which +further trains the ability of the HTS to disentangle different tokens. +Experimental results show that the SUReID achieves superior performance with +surprisingly fast inference. + +
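Token sparsification of the kind described can be sketched as keeping only the patch tokens that receive the most attention from the class token; the scoring rule and keep ratio below are assumptions, not the exact HTS strategy.

```python
# Sketch of token pruning in a vision transformer: keep the k most-attended patch tokens.
import torch

def prune_tokens(tokens, cls_attn, keep_ratio=0.5):
    """tokens: (B, N, D) patch tokens; cls_attn: (B, N) attention from the [CLS] token."""
    B, N, D = tokens.shape
    k = max(1, int(N * keep_ratio))
    idx = cls_attn.topk(k, dim=1).indices                        # indices of the most attended tokens
    return torch.gather(tokens, 1, idx.unsqueeze(-1).expand(B, k, D))

tokens = torch.randn(2, 196, 384)
cls_attn = torch.rand(2, 196)
print(prune_tokens(tokens, cls_attn).shape)   # (2, 98, 384): half the tokens remain
```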
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Semantic Segmentation in Multiple Adverse Weather Conditions with Domain + Knowledge Retention + + +
+ Semantic segmentation's performance is often compromised when applied to unlabeled adverse weather conditions. Unsupervised domain adaptation is a potential approach to enhancing the model's adaptability and robustness to adverse weather. However, existing methods encounter difficulties when sequentially adapting the model to multiple unlabeled adverse weather conditions: they struggle to acquire new knowledge while also retaining previously learned knowledge. To address these problems, we propose a semantic segmentation method for multiple adverse weather conditions that incorporates adaptive knowledge acquisition, pseudo-label blending, and weather composition replay. Our adaptive knowledge acquisition enables the model to avoid learning from extreme images that could potentially cause the model to forget. In our approach to blending pseudo-labels, we not only utilize the current model but also integrate the previously learned model into the ongoing learning process. This collaboration between the current teacher and the previous model enhances the robustness of the pseudo-labels for the current target. Our weather composition replay mechanism allows the model to continuously refine its previously learned weather information while simultaneously learning from the new target domain. Our method consistently outperforms the state-of-the-art methods, obtaining the best performance with an averaged mIoU of 65.7% and the lowest forgetting of 3.6%, against 60.1% and 11.3%, on the ACDC dataset for four-target continual multi-target domain adaptation.
+
+
+
+
+ + ☆ Concept-Guided Prompt Learning for Generalization in Vision-Language + Models AAAI 2024 + + +
+ Contrastive Language-Image Pretraining (CLIP) model has exhibited remarkable +efficacy in establishing cross-modal connections between texts and images, +yielding impressive performance across a broad spectrum of downstream +applications through fine-tuning. However, for generalization tasks, the +current fine-tuning methods for CLIP, such as CoOp and CoCoOp, demonstrate +relatively low performance on some fine-grained datasets. We recognize the +underlying reason is that these previous methods only projected global features +into the prompt, neglecting the various visual concepts, such as colors, +shapes, and sizes, which are naturally transferable across domains and play a +crucial role in generalization tasks. To address this issue, in this work, we +propose Concept-Guided Prompt Learning (CPL) for vision-language models. +Specifically, we leverage the well-learned knowledge of CLIP to create a visual +concept cache to enable concept-guided prompting. In order to refine the text +features, we further develop a projector that transforms multi-level visual +features into text features. We observe that this concept-guided prompt +learning approach is able to achieve enhanced consistency between visual and +linguistic modalities. Extensive experimental results demonstrate that our CPL +method significantly improves generalization capabilities compared to the +current state-of-the-art methods. + +
+
+ comment: Accepted by AAAI 2024 +
+
+
+
+
+ + ☆ Hierarchical Fashion Design with Multi-stage Diffusion Models + + +
+ Cross-modal fashion synthesis and editing offer intelligent support to fashion designers by enabling the automatic generation and local modification of design drafts. While current diffusion models demonstrate commendable stability and controllability in image synthesis, they still face significant challenges in generating fashion designs from abstract design elements and in fine-grained editing. Abstract sensory expressions, e.g., office, business, and party, form the high-level design concepts, while measurable aspects like sleeve length, collar type, and pant length are considered the low-level attributes of clothing. Controlling and editing fashion images using lengthy text descriptions poses a difficulty. In this paper, we propose HieraFashDiff, a novel fashion design method using a shared multi-stage diffusion model that encompasses high-level design concepts and low-level clothing attributes in a hierarchical structure. Specifically, we categorize the input text into different levels and feed them at different time steps to the diffusion model according to the criteria of professional clothing designers. HieraFashDiff allows designers to add low-level attributes after high-level prompts for incremental interactive editing. In addition, we design a differentiable loss function in the sampling process with a mask to keep non-edit areas unchanged. Comprehensive experiments performed on our newly constructed hierarchical fashion dataset demonstrate that our proposed method outperforms other state-of-the-art competitors.
+
+
+
+
+ + ♻ ☆ Online Class-Incremental Learning For Real-World Food Image + Classification WACV 2024 + + +
+ Food image classification is essential for monitoring health and tracking dietary intake in image-based dietary assessment methods. However, conventional systems often rely on static datasets with fixed classes and uniform distribution. In contrast, real-world food consumption patterns, shaped by cultural, economic, and personal influences, involve dynamic and evolving data, thus requiring the classification system to cope with continuously evolving data. Online Class Incremental Learning (OCIL) addresses the challenge of learning continuously from a single-pass data stream while adapting to new knowledge and reducing catastrophic forgetting. Experience Replay (ER) based OCIL methods store a small portion of previous data and have shown encouraging performance. However, most existing OCIL works assume that the distribution of encountered data is perfectly balanced, which rarely happens in real-world scenarios. In this work, we explore OCIL for real-world food image classification by first introducing a probabilistic framework to simulate realistic food consumption scenarios. Subsequently, we present an attachable Dynamic Model Update (DMU) module designed for existing ER methods, which enables the selection of relevant images for model training, addressing challenges arising from data repetition and imbalanced sample occurrences inherent in realistic food consumption patterns within the OCIL framework. Our performance evaluation demonstrates significant enhancements compared to established ER methods, showing great potential for lifelong learning in real-world food image classification scenarios. The code of our method is publicly accessible at https://gitlab.com/viper-purdue/OCIL-real-world-food-image-classification
+
+ comment: Accepted at IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV 2024) +
+
+
+
+
+ + ♻ ☆ Morphological Profiling for Drug Discovery in the Era of Deep Learning + + +
+ Morphological profiling is a valuable tool in phenotypic drug discovery. The advent of high-throughput automated imaging has enabled the capturing of a wide range of morphological features of cells or organisms in response to perturbations at the single-cell resolution. Concurrently, significant advances in machine learning and deep learning, especially in computer vision, have led to substantial improvements in analyzing large-scale high-content images at high throughput. These efforts have facilitated understanding of compound mechanism-of-action (MOA), drug repurposing, and characterization of cell morphodynamics under perturbation, ultimately contributing to the development of novel therapeutics. In this review, we provide a comprehensive overview of the recent advances in the field of morphological profiling. We summarize the image profiling analysis workflow, survey a broad spectrum of analysis strategies encompassing feature engineering- and deep learning-based approaches, and introduce publicly available benchmark datasets. We place a particular emphasis on the application of deep learning in this pipeline, covering cell segmentation, image representation learning, and multimodal learning. Additionally, we illuminate the application of morphological profiling in phenotypic drug discovery and highlight potential challenges and opportunities in this field.
+
+ comment: 44 pages, 5 figure, 5 tables +
+
+
+
+
+ + ♻ ☆ Learning to Transform for Generalizable Instance-wise Invariance ICCV 2023 + + +
+ Computer vision research has long aimed to build systems that are robust to +spatial transformations found in natural data. Traditionally, this is done +using data augmentation or hard-coding invariances into the architecture. +However, too much or too little invariance can hurt, and the correct amount is +unknown a priori and dependent on the instance. Ideally, the appropriate +invariance would be learned from data and inferred at test-time. + We treat invariance as a prediction problem. Given any image, we use a +normalizing flow to predict a distribution over transformations and average the +predictions over them. Since this distribution only depends on the instance, we +can align instances before classifying them and generalize invariance across +classes. The same distribution can also be used to adapt to out-of-distribution +poses. This normalizing flow is trained end-to-end and can learn a much larger +range of transformations than Augerino and InstaAug. When used as data +augmentation, our method shows accuracy and robustness gains on CIFAR 10, +CIFAR10-LT, and TinyImageNet. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ SCTNet: Single-Branch CNN with Transformer Semantic Information for + Real-Time Segmentation AAAI 2024 + + +
+ Recent real-time semantic segmentation methods usually adopt an additional semantic branch to pursue rich long-range context. However, the additional branch incurs undesirable computational overhead and slows inference speed. To resolve this dilemma, we propose SCTNet, a single-branch CNN with transformer semantic information for real-time segmentation. SCTNet enjoys the rich semantic representations of an inference-free semantic branch while retaining the high efficiency of a lightweight single-branch CNN. SCTNet utilizes a transformer as the training-only semantic branch considering its superb ability to extract long-range context. With the help of the proposed transformer-like CNN block CFBlock and the semantic information alignment module, SCTNet can capture the rich semantic information from the transformer branch during training. During inference, only the single-branch CNN needs to be deployed. We conduct extensive experiments on Cityscapes, ADE20K, and COCO-Stuff-10K, and the results show that our method achieves new state-of-the-art performance. The code and models are available at https://github.com/xzz777/SCTNet
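The training-only semantic branch can be illustrated with a simple feature alignment loss: the CNN branch's features are pushed toward the transformer branch's features, and the transformer is dropped at inference. The actual semantic information alignment module is more involved; this only shows the idea.

```python
# Sketch of a training-only alignment loss between a CNN branch and a transformer branch.
import torch
import torch.nn.functional as F

def alignment_loss(cnn_feat, transformer_feat):
    """cnn_feat, transformer_feat: (B, C, H, W); resize if the spatial sizes differ."""
    if cnn_feat.shape[-2:] != transformer_feat.shape[-2:]:
        transformer_feat = F.interpolate(transformer_feat, size=cnn_feat.shape[-2:],
                                         mode="bilinear", align_corners=False)
    return F.mse_loss(cnn_feat, transformer_feat.detach())   # gradients flow only into the CNN

cnn_feat = torch.randn(2, 64, 32, 32, requires_grad=True)
tr_feat = torch.randn(2, 64, 16, 16)
print(alignment_loss(cnn_feat, tr_feat).item())
```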
+
+ comment: Accepted by AAAI 2024; typos corrected; code and models have been + released at https://github.com/xzz777/SCTNet +
+
+
+
+
+ ♻ ☆ US & MRI Image Fusion Based on Markerless Skin Registration
+ This paper presents an innovative automatic fusion imaging system that combines 3D CT/MR images with real-time ultrasound (US) acquisition. The system eliminates the need for external physical markers and complex training, making image fusion feasible for physicians with different experience levels. The integrated system involves a portable 3D camera for patient-specific surface acquisition, an electromagnetic tracking system, and US components. The fusion algorithm comprises two main parts: skin segmentation and rigid co-registration, both integrated into the US machine. The co-registration software aligns the surface extracted from CT/MR images with patient-specific coordinates, facilitating rapid and effective fusion. Experimental testing in different settings, including the clinical environment, validates the system's accuracy, computational efficiency, noise robustness, and operator independence. The co-registration error remains under the acceptable limit of 1 cm.
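The rigid co-registration step can be illustrated with a Kabsch/Procrustes alignment of two paired point sets; the deployed system likely uses a more robust surface registration, so this is only the core geometric idea.

```python
# Illustrative rigid alignment of two paired point clouds via the Kabsch algorithm.
import numpy as np

def rigid_align(src, dst):
    """Find R, t minimising ||R @ src_i + t - dst_i|| for paired points (N, 3)."""
    src_c, dst_c = src.mean(0), dst.mean(0)
    H = (src - src_c).T @ (dst - dst_c)
    U, _, Vt = np.linalg.svd(H)
    d = np.sign(np.linalg.det(Vt.T @ U.T))       # avoid reflections
    R = Vt.T @ np.diag([1, 1, d]) @ U.T
    t = dst_c - R @ src_c
    return R, t

src = np.random.rand(50, 3)
theta = np.deg2rad(20)
R_true = np.array([[np.cos(theta), -np.sin(theta), 0],
                   [np.sin(theta),  np.cos(theta), 0],
                   [0, 0, 1]])
dst = src @ R_true.T + np.array([0.1, -0.2, 0.05])
R, t = rigid_align(src, dst)
print(np.allclose(R, R_true, atol=1e-6), np.round(t, 3))   # recovers the applied transform
```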
+
+
+
+
+ + ♻ ☆ Client-Level Differential Privacy via Adaptive Intermediary in Federated + Medical Imaging MICCAI'23 + + +
+ Despite recent progress in enhancing the privacy of federated learning (FL) via differential privacy (DP), the trade-off of DP between privacy protection and performance is still underexplored for real-world medical scenarios. In this paper, we propose to optimize the trade-off under the context of client-level DP, which focuses on privacy during communications. However, FL for medical imaging typically involves much fewer participants (hospitals) than other domains (e.g., mobile devices), thus ensuring that clients are differentially private is much more challenging. To tackle this problem, we propose an adaptive intermediary strategy to improve performance without harming privacy. Specifically, we theoretically find that splitting clients into sub-clients, which serve as intermediaries between hospitals and the server, can mitigate the noises introduced by DP without harming privacy. Our proposed approach is empirically evaluated on both classification and segmentation tasks using two public datasets, and its effectiveness is demonstrated with significant performance improvements and comprehensive analytical studies. Code is available at: https://github.com/med-air/Client-DP-FL.
+
+ comment: Accepted by 26th International Conference on Medical Image Computing + and Computer Assisted Intervention (MICCAI'23) +
+
+
+
+
+ + ♻ ☆ PhilEO Bench: Evaluating Geo-Spatial Foundation Models + + +
+ Massive amounts of unlabelled data are captured by Earth Observation (EO) satellites, with the Sentinel-2 constellation generating 1.6 TB of data daily. This makes Remote Sensing a data-rich domain well suited to Machine Learning (ML) solutions. However, a bottleneck in applying ML models to EO is the lack of annotated data, as annotation is a labour-intensive and costly process. As a result, research in this domain has focused on Self-Supervised Learning and Foundation Model approaches. This paper addresses the need to evaluate different Foundation Models on a fair and uniform benchmark by introducing the PhilEO Bench, a novel evaluation framework for EO Foundation Models. The framework comprises a testbed and a novel 400 GB Sentinel-2 dataset containing labels for three downstream tasks: building density estimation, road segmentation, and land cover classification. We present experiments using our framework evaluating different Foundation Models, including Prithvi and SatMAE, at multiple n-shots and convergence rates.
+
+ comment: 6 pages, 5 figures, Submitted to IGARSS 2024 +
+
+
+
+
+ + ♻ ☆ WidthFormer: Toward Efficient Transformer-based BEV View Transformation + + +
+ In this work, we present WidthFormer, a novel transformer-based +Bird's-Eye-View (BEV) 3D detection method tailored for real-time +autonomous-driving applications. WidthFormer is computationally efficient, +robust and does not require any special engineering effort to deploy. In this +work, we propose a novel 3D positional encoding mechanism capable of accurately +encapsulating 3D geometric information, which enables our model to generate +high-quality BEV representations with only a single transformer decoder layer. +This mechanism is also beneficial for existing sparse 3D object detectors. +Inspired by the recently-proposed works, we further improve our model's +efficiency by vertically compressing the image features when serving as +attention keys and values. We also introduce two modules to compensate for +potential information loss due to feature compression. Experimental evaluation +on the widely-used nuScenes 3D object detection benchmark demonstrates that our +method outperforms previous approaches across different 3D detection +architectures. More importantly, our model is highly efficient. For example, +when using $256\times 704$ input images, it achieves 1.5 ms and 2.8 ms latency +on NVIDIA 3090 GPU and Horizon Journey-5 computation solutions, respectively. +Furthermore, WidthFormer also exhibits strong robustness to different degrees +of camera perturbations. Our study offers valuable insights into the deployment +of BEV transformation methods in real-world, complex road environments. Code is +available at https://github.com/ChenhongyiYang/WidthFormer . + +
+
+
+
+
+ + ♻ ☆ DiffSketcher: Text Guided Vector Sketch Synthesis through Latent + Diffusion Models NIPS 2023 + + +
+ Even though trained mainly on images, we discover that pretrained diffusion +models show impressive power in guiding sketch synthesis. In this paper, we +present DiffSketcher, an innovative algorithm that creates \textit{vectorized} +free-hand sketches using natural language input. DiffSketcher is developed +based on a pre-trained text-to-image diffusion model. It performs the task by +directly optimizing a set of B\'ezier curves with an extended version of the +score distillation sampling (SDS) loss, which allows us to use a raster-level +diffusion model as a prior for optimizing a parametric vectorized sketch +generator. Furthermore, we explore attention maps embedded in the diffusion +model for effective stroke initialization to speed up the generation process. +The generated sketches demonstrate multiple levels of abstraction while +maintaining recognizability, underlying structure, and essential visual details +of the subject drawn. Our experiments show that DiffSketcher achieves greater +quality than prior work. The code and demo of DiffSketcher can be found at +https://ximinng.github.io/DiffSketcher-project/. + +
+
+ comment: Accepted by NIPS 2023. Project page: + https://ximinng.github.io/DiffSketcher-project/ +
+
+
+
+
+ + ♻ ☆ Multi-task convolutional neural network for image aesthetic assessment + + +
+ As people's aesthetic preferences for images are far from understood, image +aesthetic assessment is a challenging artificial intelligence task. The range +of factors underlying this task is almost unlimited, but we know that some +aesthetic attributes affect those preferences. In this study, we present a +multi-task convolutional neural network that takes into account these +attributes. The proposed neural network jointly learns the attributes along +with the overall aesthetic scores of images. This multi-task learning framework +allows for effective generalization through the utilization of shared +representations. Our experiments demonstrate that the proposed method +outperforms the state-of-the-art approaches in predicting overall aesthetic +scores for images in one benchmark of image aesthetics. We achieve near-human +performance in terms of overall aesthetic scores when considering the +Spearman's rank correlations. Moreover, our model pioneers the application of +multi-tasking in another benchmark, serving as a new baseline for future +research. Notably, our approach achieves this performance while using fewer +parameters compared to existing multi-task neural networks in the literature, +and consequently makes our method more efficient in terms of computational +complexity. + +
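A minimal sketch of a multi-task objective of this kind, plus the Spearman rank correlation commonly used to report agreement with human aesthetic scores; the head sizes and loss weights are illustrative, not the paper's configuration.

```python
# Sketch of a multi-task objective (overall aesthetic score + auxiliary attribute scores)
# and the Spearman rank correlation used for evaluation; weights are illustrative.
import torch
import torch.nn as nn
from scipy.stats import spearmanr

class MultiTaskHead(nn.Module):
    def __init__(self, feat_dim=512, n_attributes=8):
        super().__init__()
        self.score = nn.Linear(feat_dim, 1)              # overall aesthetic score
        self.attrs = nn.Linear(feat_dim, n_attributes)   # shared features, separate heads

    def forward(self, feats, y_score, y_attrs, attr_weight=0.5):
        loss_score = nn.functional.mse_loss(self.score(feats).squeeze(-1), y_score)
        loss_attrs = nn.functional.mse_loss(self.attrs(feats), y_attrs)
        return loss_score + attr_weight * loss_attrs

head = MultiTaskHead()
feats = torch.randn(16, 512)
print(head(feats, torch.rand(16), torch.rand(16, 8)).item())

# Evaluation: rank agreement between predicted and ground-truth mean opinion scores.
pred, gt = torch.rand(100).numpy(), torch.rand(100).numpy()
print(spearmanr(pred, gt).correlation)
```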
+
+
+
+
+ + ♻ ☆ Diffusion models meet image counter-forensics + + +
+ From its acquisition in the camera sensors to its storage, different +operations are performed to generate the final image. This pipeline imprints +specific traces into the image to form a natural watermark. Tampering with an +image disturbs these traces; these disruptions are clues that are used by most +methods to detect and locate forgeries. In this article, we assess the +capabilities of diffusion models to erase the traces left by forgers and, +therefore, deceive forensics methods. Such an approach has been recently +introduced for adversarial purification, achieving significant performance. We +show that diffusion purification methods are well suited for counter-forensics +tasks. Such approaches outperform already existing counter-forensics techniques +both in deceiving forensics methods and in preserving the natural look of the +purified images. The source code is publicly available at +https://github.com/mtailanian/diff-cf. + +
+
+
+
+
+ + ♻ ☆ Decomposition, Compression, and Synthesis (DCS)-based Video Coding: A + Neural Exploration via Resolution-Adaptive Learning + + +
+ Inspired by the facts that retinal cells actually segregate the visual scene +into different attributes (e.g., spatial details, temporal motion) for +respective neuronal processing, we propose to first decompose the input video +into respective spatial texture frames (STF) at its native spatial resolution +that preserve the rich spatial details, and the other temporal motion frames +(TMF) at a lower spatial resolution that retain the motion smoothness; then +compress them together using any popular video coder; and finally synthesize +decoded STFs and TMFs for high-fidelity video reconstruction at the same +resolution as its native input. This work simply applies the bicubic resampling +in decomposition and HEVC compliant codec in compression, and puts the focus on +the synthesis part. For resolution-adaptive synthesis, a motion compensation +network (MCN) is devised on TMFs to efficiently align and aggregate temporal +motion features that will be jointly processed with corresponding STFs using a +non-local texture transfer network (NL-TTN) to better augment spatial details, +by which the compression and resolution resampling noises can be effectively +alleviated with better rate-distortion efficiency. Such "Decomposition, +Compression, Synthesis (DCS)" based scheme is codec agnostic, currently +exemplifying averaged $\approx$1 dB PSNR gain or $\approx$25% BD-rate saving, +against the HEVC anchor using reference software. In addition, experimental +comparisons to the state-of-the-art methods and ablation studies are conducted +to further report the efficiency and generalization of DCS algorithm, promising +an encouraging direction for future video coding. + +
+
+
+
+
+ + ♻ ☆ StageInteractor: Query-based Object Detector with Cross-stage + Interaction + + +
+ Previous object detectors make predictions based on dense grid points or +numerous preset anchors. Most of these detectors are trained with one-to-many +label assignment strategies. On the contrary, recent query-based object +detectors depend on a sparse set of learnable queries and a series of decoder +layers. The one-to-one label assignment is independently applied on each layer +for the deep supervision during training. Despite the great success of +query-based object detection, however, this one-to-one label assignment +strategy demands the detectors to have strong fine-grained discrimination and +modeling capacity. To solve the above problems, in this paper, we propose a new +query-based object detector with cross-stage interaction, coined as +StageInteractor. During the forward propagation, we come up with an efficient +way to improve this modeling ability by reusing dynamic operators with +lightweight adapters. As for the label assignment, a cross-stage label assigner +is applied subsequent to the one-to-one label assignment. With this assigner, +the training target class labels are gathered across stages and then +reallocated to proper predictions at each decoder layer. On MS COCO benchmark, +our model improves the baseline by 2.2 AP, and achieves 44.8 AP with ResNet-50 +as backbone, 100 queries and 12 training epochs. With longer training time and +300 queries, StageInteractor achieves 51.1 AP and 52.2 AP with ResNeXt-101-DCN +and Swin-S, respectively. + +
+
+
+
+
+ + ♻ ☆ Uncertainty-based Detection of Adversarial Attacks in Semantic + Segmentation + + +
+ State-of-the-art deep neural networks have proven to be highly powerful in a +broad range of tasks, including semantic image segmentation. However, these +networks are vulnerable against adversarial attacks, i.e., non-perceptible +perturbations added to the input image causing incorrect predictions, which is +hazardous in safety-critical applications like automated driving. Adversarial +examples and defense strategies are well studied for the image classification +task, while there has been limited research in the context of semantic +segmentation. First works however show that the segmentation outcome can be +severely distorted by adversarial attacks. In this work, we introduce an +uncertainty-based approach for the detection of adversarial attacks in semantic +segmentation. We observe that uncertainty as for example captured by the +entropy of the output distribution behaves differently on clean and perturbed +images and leverage this property to distinguish between the two cases. Our +method works in a light-weight and post-processing manner, i.e., we do not +modify the model or need knowledge of the process used for generating +adversarial examples. In a thorough empirical analysis, we demonstrate the +ability of our approach to detect perturbed images across multiple types of +adversarial attacks. + +
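The uncertainty signal described above can be sketched as the mean per-pixel entropy of the softmax output; the detector built on top of such features is simplified here to a single threshold comparison.

```python
# Sketch of the uncertainty signal: per-pixel entropy of the segmentation softmax output,
# aggregated over the image, tends to differ between clean and perturbed inputs.
import numpy as np

def mean_prediction_entropy(probs, eps=1e-12):
    """probs: (C, H, W) softmax output of the segmentation network."""
    entropy = -(probs * np.log(probs + eps)).sum(axis=0)   # (H, W) per-pixel entropy
    return entropy.mean()

rng = np.random.default_rng(0)
softmax = lambda z: np.exp(z - z.max(0)) / np.exp(z - z.max(0)).sum(0)
logits_clean = rng.normal(0, 3.0, size=(19, 64, 64))        # confident-looking logits
logits_perturbed = rng.normal(0, 0.5, size=(19, 64, 64))    # flatter, more uncertain logits
score_clean = mean_prediction_entropy(softmax(logits_clean))
score_perturbed = mean_prediction_entropy(softmax(logits_perturbed))
print(score_clean < score_perturbed)   # flag an image when entropy exceeds a validation threshold
```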
+
+
+
+
+ + ♻ ☆ CAMP-Net: Consistency-Aware Multi-Prior Network for Accelerated MRI + Reconstruction + + +
+ Undersampling k-space data in MRI reduces scan time but poses challenges in image reconstruction. Considerable progress has been made in reconstructing accelerated MRI. However, restoration of high-frequency image details in highly undersampled data remains challenging. To address this issue, we propose CAMP-Net, an unrolling-based Consistency-Aware Multi-Prior Network for accelerated MRI reconstruction. CAMP-Net leverages complementary multi-prior knowledge and multi-slice information from various domains to enhance reconstruction quality. Specifically, CAMP-Net comprises three interleaved modules for image enhancement, k-space restoration, and calibration consistency, respectively. These modules jointly learn priors from data in the image domain, k-space domain, and calibration region, respectively, in a data-driven manner during each unrolled iteration. Notably, the encoded calibration prior knowledge extracted from auto-calibrating signals implicitly guides the learning of consistency-aware k-space correlation for reliable interpolation of missing k-space data. To maximize the benefits of image domain and k-domain prior knowledge, the reconstructions are aggregated in a frequency fusion module, exploiting their complementary properties to optimize the trade-off between artifact removal and fine detail preservation. Additionally, we incorporate a surface data fidelity layer during the learning of k-domain and calibration domain priors to prevent degradation of the reconstruction caused by padding-induced data imperfections. We evaluate the generalizability and robustness of our method on three large public datasets with varying acceleration factors and sampling patterns. The experimental results demonstrate that our method outperforms state-of-the-art approaches in terms of both reconstruction quality and $T_2$ mapping estimation, particularly in scenarios with high acceleration factors.
+
+
+
+
+ + ♻ ☆ A Generative Multi-Resolution Pyramid and Normal-Conditioning 3D Cloth + Draping WACV24 + + +
+ RGB cloth generation has been deeply studied in the related literature, +however, 3D garment generation remains an open problem. In this paper, we build +a conditional variational autoencoder for 3D garment generation and draping. We +propose a pyramid network to add garment details progressively in a canonical +space, i.e. unposing and unshaping the garments w.r.t. the body. We study +conditioning the network on surface normal UV maps, as an intermediate +representation, which is an easier problem to optimize than 3D coordinates. Our +results on two public datasets, CLOTH3D and CAPE, show that our model is +robust, controllable in terms of detail generation by the use of +multi-resolution pyramids, and achieves state-of-the-art results that can +highly generalize to unseen garments, poses, and shapes even when training with +small amounts of data. + +
+
+ comment: WACV24, IEEE copyright +
+
+
+
+
+ + ♻ ☆ Lagrangian Motion Magnification with Double Sparse Optical Flow + Decomposition + + +
+ Microexpressions are fast and spatially small facial expressions that are difficult to detect. Therefore, motion magnification techniques, which aim at amplifying and hence revealing subtle motion in videos, appear useful for handling such expressions. There are basically two main approaches, namely via Eulerian or Lagrangian techniques. While the first one magnifies motion implicitly by operating directly on image pixels, the Lagrangian approach uses optical flow (OF) techniques to extract and magnify pixel trajectories. In this paper, we propose a novel approach for local Lagrangian motion magnification of facial micro-motions. Our contribution is three-fold: first, we fine-tune the recurrent all-pairs field transforms (RAFT) deep learning approach for OF estimation on faces by adding ground truth obtained from the variational dense inverse search (DIS) OF algorithm applied to the CASME II video set of facial micro-expressions. This enables us to produce OFs of facial videos in an efficient and sufficiently accurate way. Second, since facial micro-motions are both local in space and time, we propose to approximate the OF field by sparse components both in space and time, leading to a double sparse decomposition. Third, we use this decomposition to magnify micro-motions in specific areas of the face, where we introduce a new forward warping strategy using a triangular splitting of the image grid and barycentric interpolation of the RGB vectors at the corners of the transformed triangles. We demonstrate the feasibility of our approach by various examples.
+
+
+
+
+ + ♻ ☆ Graph Attention Transformer Network for Multi-Label Image Classification + + +
+ Multi-label classification aims to recognize multiple objects or attributes +from images. However, it is challenging to learn from proper label graphs to +effectively characterize such inter-label correlations or dependencies. Current +methods often use the co-occurrence probability of labels based on the training +set as the adjacency matrix to model this correlation, which is greatly limited +by the dataset and affects the model's generalization ability. In this paper, +we propose a Graph Attention Transformer Network (GATN), a general framework +for multi-label image classification that can effectively mine complex +inter-label relationships. First, we use the cosine similarity based on the +label word embedding as the initial correlation matrix, which can represent +rich semantic information. Subsequently, we design the graph attention +transformer layer to transfer this adjacency matrix to adapt to the current +domain. Our extensive experiments have demonstrated that our proposed methods +can achieve state-of-the-art performance on three datasets. + +
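The initial label-correlation matrix described above can be sketched as the cosine similarity between label word embeddings; random vectors stand in here for, e.g., GloVe embeddings of the label names.

```python
# Sketch of the initial label-correlation matrix: cosine similarity between label embeddings.
import numpy as np

def cosine_adjacency(label_embeddings):
    """label_embeddings: (L, D) one embedding vector per label."""
    normed = label_embeddings / np.linalg.norm(label_embeddings, axis=1, keepdims=True)
    return normed @ normed.T                     # (L, L) initial correlation matrix

emb = np.random.randn(20, 300)                   # 20 labels, 300-d word vectors
A = cosine_adjacency(emb)
print(A.shape, np.allclose(np.diag(A), 1.0))
```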
+
+
+
+
+ + ♻ ☆ Hierarchical Randomized Smoothing + + +
+ Real-world data is complex and often consists of objects that can be +decomposed into multiple entities (e.g. images into pixels, graphs into +interconnected nodes). Randomized smoothing is a powerful framework for making +models provably robust against small changes to their inputs - by guaranteeing +robustness of the majority vote when randomly adding noise before +classification. Yet, certifying robustness on such complex data via randomized +smoothing is challenging when adversaries do not arbitrarily perturb entire +objects (e.g. images) but only a subset of their entities (e.g. pixels). As a +solution, we introduce hierarchical randomized smoothing: We partially smooth +objects by adding random noise only on a randomly selected subset of their +entities. By adding noise in a more targeted manner than existing methods we +obtain stronger robustness guarantees while maintaining high accuracy. We +initialize hierarchical smoothing using different noising distributions, +yielding novel robustness certificates for discrete and continuous domains. We +experimentally demonstrate the importance of hierarchical smoothing in image +and node classification, where it yields superior robustness-accuracy +trade-offs. Overall, hierarchical smoothing is an important contribution +towards models that are both - certifiably robust to perturbations and +accurate. + +
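The partial-smoothing step can be sketched as adding Gaussian noise only on a randomly selected subset of entities (pixels here); the certification machinery built on top of this smoothing is omitted.

```python
# Sketch of partial smoothing: Gaussian noise is applied only to a random subset of pixels.
import numpy as np

def hierarchical_noise(image, entity_ratio=0.3, sigma=0.25, rng=None):
    """image: (H, W, C) in [0, 1]; noise is applied to a random subset of pixels only."""
    rng = rng if rng is not None else np.random.default_rng()
    H, W, C = image.shape
    selected = rng.random((H, W)) < entity_ratio            # which pixels get smoothed
    noise = rng.normal(0.0, sigma, size=image.shape)
    return np.where(selected[..., None], image + noise, image), selected

img = np.random.rand(32, 32, 3)
noised, sel = hierarchical_noise(img, rng=np.random.default_rng(0))
print(noised.shape, sel.mean())   # roughly 30% of pixels were perturbed
```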
+
+
+
+
+ + ♻ ☆ B-cos Alignment for Inherently Interpretable CNNs and Vision + Transformers CVPR 2022 + + +
+ We present a new direction for increasing the interpretability of deep neural +networks (DNNs) by promoting weight-input alignment during training. For this, +we propose to replace the linear transformations in DNNs by our novel B-cos +transformation. As we show, a sequence (network) of such transformations +induces a single linear transformation that faithfully summarises the full +model computations. Moreover, the B-cos transformation is designed such that +the weights align with relevant signals during optimisation. As a result, those +induced linear transformations become highly interpretable and highlight +task-relevant features. Importantly, the B-cos transformation is designed to be +compatible with existing architectures and we show that it can easily be +integrated into virtually all of the latest state of the art models for +computer vision - e.g. ResNets, DenseNets, ConvNext models, as well as Vision +Transformers - by combining the B-cos-based explanations with normalisation and +attention layers, all whilst maintaining similar accuracy on ImageNet. Finally, +we show that the resulting explanations are of high visual quality and perform +well under quantitative interpretability metrics. + +
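A simplified, single-unit reading of the B-cos transformation described above: the linear response is scaled by a power of the cosine between input and weight, so large outputs require the weight to align with the input. Details of the full formulation (norm handling, MaxOut, convolutional form) are omitted, so treat this as an illustrative approximation rather than the exact layer.

```python
# Simplified single-unit B-cos sketch: scale the linear response by |cos(x, w)|**(B-1).
import torch

def b_cos(x, w, B=2.0, eps=1e-9):
    """x: (batch, D) inputs; w: (D,) weight vector; returns (batch,) responses."""
    w_hat = w / (w.norm() + eps)                  # unit-norm weight
    lin = x @ w_hat                               # w_hat^T x
    cos = lin / (x.norm(dim=1) + eps)             # cosine of the angle between x and w_hat
    return cos.abs() ** (B - 1) * lin

x = torch.randn(4, 16)
w = torch.randn(16)
print(b_cos(x, w))   # aligned inputs keep their magnitude, misaligned ones are damped
```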
+
+ comment: Extension of B-cos Networks: Alignment is All We Need for + Interpretability (B\"ohle et al., CVPR 2022). Accepted for publication in + IEEE Transactions on Pattern Analysis and Machine Intelligence. arXiv admin + note: substantial text overlap with arXiv:2205.10268 +
+
+
+
+
+ + ♻ ☆ Multi-Depth Branch Network for Efficient Image Super-Resolution + + +
+ A longstanding challenge in Super-Resolution (SR) is how to efficiently +enhance high-frequency details in Low-Resolution (LR) images while maintaining +semantic coherence. This is particularly crucial in practical applications +where SR models are often deployed on low-power devices. To address this issue, +we propose an innovative asymmetric SR architecture featuring Multi-Depth +Branch Module (MDBM). These MDBMs contain branches of different depths, +designed to capture high- and low-frequency information simultaneously and +efficiently. The hierarchical structure of MDBM allows the deeper branch to +gradually accumulate fine-grained local details under the contextual guidance +of the shallower branch. We visualize this process using feature maps, and +further demonstrate the rationality and effectiveness of this design using +proposed novel Fourier spectral analysis methods. Moreover, our model exhibits +more significant spectral differentiation between branches than existing branch +networks. This suggests that MDBM reduces feature redundancy and offers a more +effective method for integrating high- and low-frequency information. Extensive +qualitative and quantitative evaluations on various datasets show that our +model can generate structurally consistent and visually realistic HR images. It +achieves state-of-the-art (SOTA) results at a very fast inference speed. Our +code is available at https://github.com/thy960112/MDBN. + +
+
+
+
+
+ + ♻ ☆ Optimising for Interpretability: Convolutional Dynamic Alignment + Networks CVPR 2021 + + +
+ We introduce a new family of neural network models called Convolutional +Dynamic Alignment Networks (CoDA Nets), which are performant classifiers with a +high degree of inherent interpretability. Their core building blocks are +Dynamic Alignment Units (DAUs), which are optimised to transform their inputs +with dynamically computed weight vectors that align with task-relevant +patterns. As a result, CoDA Nets model the classification prediction through a +series of input-dependent linear transformations, allowing for linear +decomposition of the output into individual input contributions. Given the +alignment of the DAUs, the resulting contribution maps align with +discriminative input patterns. These model-inherent decompositions are of high +visual quality and outperform existing attribution methods under quantitative +metrics. Further, CoDA Nets constitute performant classifiers, achieving on par +results to ResNet and VGG models on e.g. CIFAR-10 and TinyImagenet. Lastly, +CoDA Nets can be combined with conventional neural network models to yield +powerful classifiers that more easily scale to complex datasets such as +Imagenet whilst exhibiting an increased interpretable depth, i.e., the output +can be explained well in terms of contributions from intermediate layers within +the network. + +
+
+ comment: Extension of "Convolutional Dynamic Alignment Networks for + Interpretable Classifications" (B\"ohle et al., CVPR 2021). arXiv admin note: + substantial text overlap with arXiv:2104.00032 +
+
+
+
+
+ + ♻ ☆ Adversarial Examples are Misaligned in Diffusion Model Manifolds + + +
+ In recent years, diffusion models (DMs) have drawn significant attention for +their success in approximating data distributions, yielding state-of-the-art +generative results. Nevertheless, the versatility of these models extends +beyond their generative capabilities to encompass various vision applications, +such as image inpainting, segmentation, adversarial robustness, among others. +This study is dedicated to the investigation of adversarial attacks through the +lens of diffusion models. However, our objective does not involve enhancing the +adversarial robustness of image classifiers. Instead, our focus lies in +utilizing the diffusion model to detect and analyze the anomalies introduced by +these attacks on images. To that end, we systematically examine the alignment +of the distributions of adversarial examples when subjected to the process of +transformation using diffusion models. The efficacy of this approach is +assessed across CIFAR-10 and ImageNet datasets, including varying image sizes +in the latter. The results demonstrate a notable capacity to discriminate +effectively between benign and attacked images, providing compelling evidence +that adversarial instances do not align with the learned manifold of the DMs. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ SoundCam: A Dataset for Finding Humans Using Room Acoustics NeurIPS 2023 + + +
+ A room's acoustic properties are a product of the room's geometry, the +objects within the room, and their specific positions. A room's acoustic +properties can be characterized by its impulse response (RIR) between a source +and listener location, or roughly inferred from recordings of natural signals +present in the room. Variations in the positions of objects in a room can +effect measurable changes in the room's acoustic properties, as characterized +by the RIR. Existing datasets of RIRs either do not systematically vary +positions of objects in an environment, or they consist of only simulated RIRs. +We present SoundCam, the largest dataset of unique RIRs from in-the-wild rooms +publicly released to date. It includes 5,000 10-channel real-world measurements +of room impulse responses and 2,000 10-channel recordings of music in three +different rooms, including a controlled acoustic lab, an in-the-wild living +room, and a conference room, with different humans in positions throughout each +room. We show that these measurements can be used for interesting tasks, such +as detecting and identifying humans, and tracking their positions. + +
+
+ comment: In NeurIPS 2023 Datasets and Benchmarks Track. Project page: + https://masonlwang.com/soundcam/. Wang and Clarke contributed equally to this + work +
+
+
+
+
+ + ♻ ☆ Diff-Instruct: A Universal Approach for Transferring Knowledge From + Pre-trained Diffusion Models + + +
+ Due to the ease of training, ability to scale, and high sample quality, +diffusion models (DMs) have become the preferred option for generative +modeling, with numerous pre-trained models available for a wide variety of +datasets. Containing intricate information about data distributions, +pre-trained DMs are valuable assets for downstream applications. In this work, +we consider learning from pre-trained DMs and transferring their knowledge to +other generative models in a data-free fashion. Specifically, we propose a +general framework called Diff-Instruct to instruct the training of arbitrary +generative models as long as the generated samples are differentiable with +respect to the model parameters. Our proposed Diff-Instruct is built on a +rigorous mathematical foundation where the instruction process directly +corresponds to minimizing a novel divergence we call Integral Kullback-Leibler +(IKL) divergence. IKL is tailored for DMs by calculating the integral of the KL +divergence along a diffusion process, which we show to be more robust in +comparing distributions with misaligned supports. We also reveal non-trivial +connections of our method to existing works such as DreamFusion, and generative +adversarial training. To demonstrate the effectiveness and universality of +Diff-Instruct, we consider two scenarios: distilling pre-trained diffusion +models and refining existing GAN models. The experiments on distilling +pre-trained diffusion models show that Diff-Instruct results in +state-of-the-art single-step diffusion-based models. The experiments on +refining GAN models show that the Diff-Instruct can consistently improve the +pre-trained generators of GAN models across various settings. + +
+
+
+
+
+ + ♻ ☆ Progressive Energy-Based Cooperative Learning for Multi-Domain + Image-to-Image Translation + + +
+ This paper studies a novel energy-based cooperative learning framework for multi-domain image-to-image translation. The framework consists of four components: descriptor, translator, style encoder, and style generator. The descriptor is a multi-head energy-based model that represents a multi-domain image distribution. The translator, style encoder, and style generator together constitute a diversified image generator. Specifically, given an input image from a source domain, the translator turns it into a stylised output image of the target domain according to a style code, which can be inferred by the style encoder from a reference image or produced by the style generator from random noise. Since the style generator is represented as a domain-specific distribution of style codes, the translator can provide a one-to-many transformation (i.e., diversified generation) between the source domain and target domain. To train our framework, we propose a likelihood-based multi-domain cooperative learning algorithm to jointly train the multi-domain descriptor and the diversified image generator (including the translator, style encoder, and style generator modules) via multi-domain MCMC teaching, in which the descriptor guides the diversified image generator to shift its probability density toward the data distribution, while the diversified image generator uses its randomly translated images to initialize the descriptor's Langevin dynamics process for efficient sampling.
+
+
+
+
+ ♻ ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal Relationships
+ Integrating deep learning and causal discovery has increased the interpretability of Temporal Action Segmentation (TAS) tasks. However, frame-level causal relationships contain many complicated noise factors beyond the segment level, making it infeasible to directly express macro action semantics. Thus, we propose Causal Abstraction Segmentation Refiner (CASR), which can refine TAS results from various models by enhancing video causality through marginalizing frame-level causal relationships. Specifically, we define equivalent frame-level and segment-level causal models, so that the causal adjacency matrix constructed from marginalized frame-level causal relationships has the ability to represent segment-level causal relationships. CASR works by reducing the difference between the causal adjacency matrix we construct and that of the pre-segmentation results of backbone models. In addition, we propose a novel evaluation metric, Causal Edit Distance (CED), to evaluate causal interpretability. Extensive experimental results on mainstream datasets indicate that CASR significantly surpasses existing methods in action segmentation performance, as well as in causal explainability and generalization.
+
+
+
+
+ + ♻ ☆ Towards Real-World Aerial Vision Guidance with Categorical 6D Pose + Tracker + + +
+ Tracking an object's 6-DoF pose is crucial for various downstream robot tasks and real-world applications. In this paper, we investigate the real-world robot task of aerial vision guidance for aerial robotics manipulation, utilizing category-level 6-DoF pose tracking. Aerial conditions inevitably introduce special challenges, such as rapid viewpoint changes in pitch and roll and inter-frame differences. To address these challenges, we first introduce a robust category-level 6-DoF pose tracker (Robust6DoF). This tracker leverages shape and temporal prior knowledge to explore optimal inter-frame keypoint pairs, generated under a priori structural adaptive supervision in a coarse-to-fine manner. Notably, our Robust6DoF employs a Spatial-Temporal Augmentation module to deal with the problems of inter-frame differences and intra-class shape variations through both temporal dynamic filtering and shape-similarity filtering. We further present a Pose-Aware Discrete Servo strategy (PAD-Servo), serving as a decoupling approach to implement the final aerial vision guidance task. It contains two servo action policies to better accommodate the structural properties of aerial robotics manipulation. Exhaustive experiments on four well-known public benchmarks demonstrate the superiority of our Robust6DoF. Real-world tests directly verify that our Robust6DoF, along with PAD-Servo, can be readily used in real-world aerial robotic applications.
+
+
+
+
+ ♻ ☆ LEGO: Language Enhanced Multi-modal Grounding Model
+ Multi-modal large language models have demonstrated impressive performance +across various tasks in different modalities. However, existing multi-modal +models primarily emphasize capturing global information within each modality +while neglecting the importance of perceiving local information across +modalities. Consequently, these models lack the ability to effectively +understand the fine-grained details of input data, limiting their performance +in tasks that require a more nuanced understanding. To address this limitation, +there is a compelling need to develop models that enable fine-grained +understanding across multiple modalities, thereby enhancing their applicability +to a wide range of tasks. In this paper, we propose LEGO, a language enhanced +multi-modal grounding model. Beyond capturing global information like other +multi-modal models, our proposed model excels at tasks demanding a detailed +understanding of local information within the input. It demonstrates precise +identification and localization of specific regions in images or moments in +videos. To achieve this objective, we design a diversified dataset construction +pipeline, resulting in a multi-modal, multi-granularity dataset for model +training. The code, dataset, and demo of our model can be found at https: +//github.com/lzw-lzw/LEGO. + +
+
+
+
+
+ + ♻ ☆ ReSimAD: Zero-Shot 3D Domain Transfer for Autonomous Driving with Source + Reconstruction and Target Simulation + + +
+ Domain shifts such as sensor type changes and geographical situation variations are prevalent in Autonomous Driving (AD), which poses a challenge since an AD model relying on previous domain knowledge can hardly be deployed directly to a new domain without additional costs. In this paper, we provide a new perspective and approach for alleviating the domain shifts, by proposing a Reconstruction-Simulation-Perception (ReSimAD) scheme. Specifically, the implicit reconstruction process is based on the knowledge from the previous old domain, aiming to convert the domain-related knowledge into domain-invariant representations, e.g., 3D scene-level meshes. Besides, the point cloud simulation process of multiple new domains is conditioned on the above reconstructed 3D meshes, where the target-domain-like simulation samples can be obtained, thus reducing the cost of collecting and annotating new-domain data for the subsequent perception process. For experiments, we consider different cross-domain situations such as Waymo-to-KITTI, Waymo-to-nuScenes, Waymo-to-ONCE, etc., to verify the zero-shot target-domain perception using ReSimAD. Results demonstrate that our method is beneficial for boosting the domain generalization ability, even promising for 3D pre-training.
+
+ comment: Code and simulated points are available at + https://github.com/PJLab-ADG/3DTrans#resimad +
+
+
+
+
+ + ♻ ☆ LLaVA-Phi: Efficient Multi-Modal Assistant with Small Language Model + + +
+ In this paper, we introduce LLaVA-$\phi$ (LLaVA-Phi), an efficient multi-modal assistant that harnesses the power of the recently advanced small language model, Phi-2, to facilitate multi-modal dialogues. LLaVA-Phi marks a notable advancement in the realm of compact multi-modal models. It demonstrates that even smaller language models, with as few as 2.7B parameters, can effectively engage in intricate dialogues that integrate both textual and visual elements, provided they are trained with high-quality corpora. Our model delivers commendable performance on publicly available benchmarks that encompass visual comprehension, reasoning, and knowledge-based perception. Beyond its remarkable performance in multi-modal dialogue tasks, our model opens new avenues for applications in time-sensitive environments and systems that require real-time interaction, such as embodied agents. It highlights the potential of smaller language models to achieve sophisticated levels of understanding and interaction, while maintaining greater resource efficiency. The project is available at https://github.com/zhuyiche/llava-phi.
+
+ comment: technical report
+
+
+
+
+ + ♻ ☆ CLAPP: Contrastive Language-Audio Pre-training in Passive Underwater + Vessel Classification + + +
+ Existing research on audio classification faces challenges in recognizing attributes of passive underwater vessel scenarios and lacks well-annotated datasets due to data privacy concerns. In this study, we introduce CLAPP (Contrastive Language-Audio Pre-training in Passive Underwater Vessel Classification), a novel model. Our aim is to train a neural network using a wide range of vessel audio and vessel state text pairs obtained from an oceanship dataset. CLAPP is capable of directly learning from raw vessel audio data and, when available, from carefully curated labels, enabling improved recognition of vessel attributes in passive underwater vessel scenarios. The model's zero-shot capability allows predicting the most relevant vessel state description for a given vessel audio clip, without directly optimizing for the task. Our approach aims to solve two challenges: vessel audio-text classification and passive underwater vessel audio attribute recognition. The proposed method achieves new state-of-the-art results on both the Deepship and Shipsear public datasets, with a notable margin of about 7%-13% in accuracy compared to prior methods on the zero-shot task.
+
+
+
+
+ + ♻ ☆ Improved Dense Nested Attention Network Based on Transformer for + Infrared Small Target Detection + + +
+ Infrared small target detection based on deep learning offers unique +advantages in separating small targets from complex and dynamic backgrounds. +However, the features of infrared small targets gradually weaken as the depth +of convolutional neural network (CNN) increases. To address this issue, we +propose a novel method for detecting infrared small targets called improved +dense nested attention network (IDNANet), which is based on the transformer +architecture. We preserve the dense nested structure of dense nested attention +network (DNANet) and introduce the Swin-transformer during feature extraction +stage to enhance the continuity of features. Furthermore, we integrate the +ACmix attention structure into the dense nested structure to enhance the +features of intermediate layers. Additionally, we design a weighted dice binary +cross-entropy (WD-BCE) loss function to mitigate the negative impact of +foreground-background imbalance in the samples. Moreover, we develop a dataset +specifically for infrared small targets, called BIT-SIRST. The dataset +comprises a significant amount of real-world targets and manually annotated +labels, as well as synthetic data and corresponding labels. We have evaluated +the effectiveness of our method through experiments conducted on public +datasets. In comparison to other state-of-the-art methods, our approach +outperforms in terms of probability of detection ($P_d$), false-alarm rate +($F_a$), and mean intersection of union ($mIoU$). The $mIoU$ reaches 90.89\% on +the NUDT-SIRST dataset and 79.72\% on the SIRST dataset. + +
+
+
+
+
+ + ♻ ☆ Scalable Geometric Fracture Assembly via Co-creation Space among + Assemblers AAAI2024 + + +
+ Geometric fracture assembly presents a challenging practical task in +archaeology and 3D computer vision. Previous methods have focused solely on +assembling fragments based on semantic information, which has limited the +quantity of objects that can be effectively assembled. Therefore, there is a +need to develop a scalable framework for geometric fracture assembly without +relying on semantic information. To improve the effectiveness of assembling +geometric fractures without semantic information, we propose a co-creation +space comprising several assemblers capable of gradually and unambiguously +assembling fractures. Additionally, we introduce a novel loss function, i.e., +the geometric-based collision loss, to address collision issues during the +fracture assembly process and enhance the results. Our framework exhibits +better performance on both PartNet and Breaking Bad datasets compared to +existing state-of-the-art frameworks. Extensive experiments and quantitative +comparisons demonstrate the effectiveness of our proposed framework, which +features linear computational complexity, enhanced abstraction, and improved +generalization. Our code is publicly available at +https://github.com/Ruiyuan-Zhang/CCS. + +
+
+ comment: AAAI2024 +
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ The Chronicles of RAG: The Retriever, the Chunk and the Generator + + +
+ Retrieval Augmented Generation (RAG) has become one of the most popular paradigms for enabling LLMs to access external data, and also as a mechanism for grounding to mitigate against hallucinations. When implementing RAG, you can face several challenges, such as the effective integration of retrieval models, efficient representation learning, data diversity, computational efficiency optimization, evaluation, and quality of text generation. Given all these challenges, every day a new technique to improve RAG appears, making it infeasible to experiment with all combinations for your problem. In this context, this paper presents good practices to implement, optimize, and evaluate RAG for the Brazilian Portuguese language, focusing on the establishment of a simple pipeline for inference and experiments. We explored a diverse set of methods to answer questions about the first Harry Potter book. To generate the answers we used OpenAI's gpt-4, gpt-4-1106-preview, and gpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the retriever, our approach achieved an improvement of 35.4% in MRR@10 compared to the baseline. When optimizing the input size in the application, we observed that it is possible to further enhance it by 2.4%. Finally, we present the complete architecture of the RAG pipeline with our recommendations. As a result, we moved from a baseline of 57.88% to a maximum relative score of 98.61%.
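A minimal retrieve-then-generate sketch together with the MRR@10 metric mentioned above; `retrieve` and `generate` are placeholders for whatever retriever and LLM API are plugged into the pipeline, not the paper's exact components:

```python
from typing import Callable, Sequence

def mrr_at_10(ranked_ids: Sequence[Sequence[str]], gold_ids: Sequence[str]) -> float:
    """Mean Reciprocal Rank truncated at 10: 1/rank of the first relevant chunk."""
    total = 0.0
    for ranking, gold in zip(ranked_ids, gold_ids):
        rr = 0.0
        for rank, cid in enumerate(ranking[:10], start=1):
            if cid == gold:
                rr = 1.0 / rank
                break
        total += rr
    return total / len(gold_ids)

def answer(question: str, retrieve: Callable[[str, int], list],
           generate: Callable[[str], str], k: int = 5) -> str:
    """Minimal retrieve-then-generate loop: stuff the top-k chunks into the prompt."""
    chunks = retrieve(question, k)
    prompt = ("Answer using only the context below.\n\n"
              + "\n---\n".join(chunks)
              + f"\n\nQuestion: {question}\nAnswer:")
    return generate(prompt)
```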
+
+ comment: 16 pages, 15 figures, 9 tables +
+
+
+
+
+ + ☆ Deep Evolutional Instant Interest Network for CTR Prediction in + Trigger-Induced Recommendation + + +
+ Recommendation has been playing a key role in many industries, e.g., e-commerce, streaming media, social media, etc. Recently, a new recommendation scenario, called Trigger-Induced Recommendation (TIR), where users are able to explicitly express their instant interests via trigger items, is emerging as essential in many e-commerce platforms, e.g., Alibaba.com and Amazon. Without explicitly modeling the user's instant interest, traditional recommendation methods usually obtain sub-optimal results in TIR. Even though there are a few methods that consider the trigger and target items simultaneously to solve this problem, they still do not take into account the temporal information of user behaviors, the dynamic change of user instant interest as the user scrolls down, and the interactions between the trigger and target items. To tackle these problems, we propose a novel method -- Deep Evolutional Instant Interest Network (DEI2N) -- for click-through rate prediction in TIR scenarios. Specifically, we design a User Instant Interest Modeling Layer to predict the dynamic change of the intensity of instant interest when the user scrolls down. Temporal information is utilized in user behavior modeling. Moreover, an Interaction Layer is introduced to learn better interactions between the trigger and target items. We evaluate our method on several offline and real-world industrial datasets. Experimental results show that our proposed DEI2N outperforms state-of-the-art baselines. In addition, online A/B testing demonstrates its superiority over the existing baseline in real-world production environments.
+
+ comment: 7 pages, 3 figures, reviewing of the 17th ACM International + Conference on Web Search and Data Mining +
+
+
+
+
+ + ☆ CREAD: A Classification-Restoration Framework with Error Adaptive + Discretization for Watch Time Prediction in Video Recommender Systems + + +
+ The watch time is a significant indicator of user satisfaction in video recommender systems. However, the prediction of watch time as a target variable is often hindered by its highly imbalanced distribution, with a scarcity of observations for larger target values and over-populated samples for small values. State-of-the-art watch time prediction models discretize the continuous watch time into a set of buckets in order to consider the distribution of watch time. However, it remains largely uninvestigated how these discrete buckets should be created from the continuous watch time distribution, and existing discretization approaches suffer from either a large learning error or a large restoration error. To address this challenge, we propose a Classification-Restoration framework with Error-Adaptive-Discretization (CREAD) to accurately predict the watch time. The proposed framework contains a discretization module, a classification module, and a restoration module. It predicts the watch time through multiple classification problems. The discretization process is a key contribution of the CREAD framework. We theoretically analyze the impacts of the discretization on the learning error and the restoration error, and then propose the error-adaptive discretization (EAD) technique to better balance the two errors, which achieves better performance over traditional discretization approaches. We conduct detailed offline evaluations on a public dataset and an industrial dataset, both showing performance gains through the proposed approach. Moreover, we have fully launched our framework on Kwai App, an online video platform, which resulted in a significant increase in users' video watch time by 0.29% through A/B testing. These results highlight the effectiveness of the CREAD framework in watch time prediction in video recommender systems.
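One common classification-restoration recipe, sketched below under the assumption that the model predicts, for each bucket boundary, the probability that the watch time exceeds it; the boundaries and function names are illustrative, not the paper's EAD procedure:

```python
import numpy as np

def make_targets(watch_time: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
    """Binary targets: does the watch time exceed each bucket boundary?"""
    return (watch_time[:, None] > boundaries[None, :]).astype(np.float32)

def restore(probs: np.ndarray, boundaries: np.ndarray) -> np.ndarray:
    """Restore a continuous estimate from the per-boundary exceedance
    probabilities: sum of bucket widths weighted by P(t > boundary)."""
    widths = np.diff(np.concatenate(([0.0], boundaries)))
    return probs @ widths

# Toy check: with perfect probabilities the restoration is a step-wise
# approximation of the true watch time.
boundaries = np.array([5.0, 15.0, 60.0, 300.0])   # seconds, illustrative only
t = np.array([3.0, 40.0, 500.0])
print(restore(make_targets(t, boundaries), boundaries))
```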
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Model Editing at Scale leads to Gradual and Catastrophic Forgetting + + +
+ Editing knowledge in large language models is an attractive capability that allows us to correct facts learnt incorrectly during pre-training, as well as to update the model with an ever-growing list of new facts. While existing model editing techniques have shown promise, they are usually evaluated using metrics for reliability, specificity and generalization over one or a few edits. We argue that for model editing to have practical utility, we must be able to make multiple edits to the same model. With this in mind, we evaluate the current model editing methods at scale, focusing on two state-of-the-art methods: ROME and MEMIT. We find that as the model is edited sequentially with multiple facts, it continually forgets previously edited facts and the ability to perform downstream tasks. This forgetting happens in two phases -- an initial gradual but progressive forgetting phase followed by an abrupt or catastrophic forgetting phase. Both gradual and catastrophic forgetting limit the usefulness of model editing methods at scale -- the former makes model editing less effective as multiple edits are made to the model, while the latter caps the scalability of such model editing methods. Our analysis also highlights other key limitations of ROME and MEMIT at scale. With our work, we push for the development and evaluation of model editing methods keeping scalability in mind.
+
+
+
+
+ + ☆ GACE: Learning Graph-Based Cross-Page Ads Embedding For Click-Through + Rate Prediction + + +
+ Predicting click-through rate (CTR) is the core task of many online ads recommendation systems, which helps improve user experience and increase platform revenue. In this type of recommendation system, we often encounter two main problems: the joint usage of multi-page historical advertising data and the cold start of new ads. In this paper, we propose GACE, a graph-based cross-page ads embedding generation method. It can warm up and generate the representation embedding of cold-start and existing ads across various pages. Specifically, we carefully build linkages and a weighted undirected graph model considering semantic and page-type attributes to guide the direction of feature fusion and generation. We design a variational auto-encoding task as a pre-training module and generate embedding representations for new and old ads based on this task. The results evaluated on the public dataset AliEC from RecBole and the real-world industry dataset from Alipay show that our GACE method is significantly superior to the SOTA method. In the online A/B test, the click-through rate on three real-world pages from Alipay increased by 3.6%, 2.13%, and 3.02%, respectively. Especially in the cold-start task, the CTR increased by 9.96%, 7.51%, and 8.97%, respectively.
+
+ comment: 15 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ The Impact of Differential Privacy on Recommendation Accuracy and + Popularity Bias ECIR'24 + + +
+ Collaborative filtering-based recommender systems leverage vast amounts of +behavioral user data, which poses severe privacy risks. Thus, often, random +noise is added to the data to ensure Differential Privacy (DP). However, to +date, it is not well understood, in which ways this impacts personalized +recommendations. In this work, we study how DP impacts recommendation accuracy +and popularity bias, when applied to the training data of state-of-the-art +recommendation models. Our findings are three-fold: First, we find that nearly +all users' recommendations change when DP is applied. Second, recommendation +accuracy drops substantially while recommended item popularity experiences a +sharp increase, suggesting that popularity bias worsens. Third, we find that DP +exacerbates popularity bias more severely for users who prefer unpopular items +than for users that prefer popular items. + +
+
+ comment: Accepted at the IR4Good track at ECIR'24, 17 pages +
+
+
+
+
+ + ♻ ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon causes source bias in text retrieval for web search. +Specifically, neural retrieval models tend to rank generated texts higher than +human-written texts. In this paper, we extend the study of this bias to +cross-modal retrieval. Firstly, we successfully construct a suitable benchmark +to explore the existence of the bias. Subsequent extensive experiments on this +benchmark reveal that AI-generated images introduce an invisible relevance bias +to text-image retrieval models. Specifically, our experiments show that +text-image retrieval models tend to rank the AI-generated images higher than +the real images, even though the AI-generated images do not exhibit more +visually relevant features to the query than real images. This invisible +relevance bias is prevalent across retrieval models with varying training data +and architectures. Furthermore, our subsequent exploration reveals that the +inclusion of AI-generated images in the training data of the retrieval models +exacerbates the invisible relevance bias. The above phenomenon triggers a +vicious cycle, which makes the invisible relevance bias become more and more +serious. To elucidate the potential causes of invisible relevance and address +the aforementioned issues, we introduce an effective training method aimed at +alleviating the invisible relevance bias. Subsequently, we apply our proposed +debiasing method to retroactively identify the causes of invisible relevance, +revealing that the AI-generated images induce the image encoder to embed +additional information into their representation. This information exhibits a +certain consistency across generated images with different semantics and can +make the retriever estimate a higher relevance score. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ RecRanker: Instruction Tuning Large Language Model as Ranker for Top-k + Recommendation + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities and +have been extensively deployed across various domains, including recommender +systems. Numerous studies have employed specialized \textit{prompts} to harness +the in-context learning capabilities intrinsic to LLMs. For example, LLMs are +prompted to act as zero-shot rankers for listwise ranking, evaluating candidate +items generated by a retrieval model for recommendation. Recent research +further uses instruction tuning techniques to align LLM with human preference +for more promising recommendations. Despite its potential, current research +overlooks the integration of multiple ranking tasks to enhance model +performance. Moreover, the signal from the conventional recommendation model is +not integrated into the LLM, limiting the current system performance. + In this paper, we introduce RecRanker, tailored for instruction tuning LLM to +serve as the \textbf{Ranker} for top-\textit{k} \textbf{Rec}ommendations. +Specifically, we introduce importance-aware sampling, clustering-based +sampling, and penalty for repetitive sampling for sampling high-quality, +representative, and diverse training data. To enhance the prompt, we introduce +position shifting strategy to mitigate position bias and augment the prompt +with auxiliary information from conventional recommendation models, thereby +enriching the contextual understanding of the LLM. Subsequently, we utilize the +sampled data to assemble an instruction-tuning dataset with the augmented +prompt comprising three distinct ranking tasks: pointwise, pairwise, and +listwise rankings. We further propose a hybrid ranking method to enhance the +model performance by ensembling these ranking tasks. Our empirical evaluations +demonstrate the effectiveness of our proposed RecRanker in both direct and +sequential recommendation scenarios. + +
+
+
+
+
+
+
+
+ + Machine Learning 77 + +
+
+
+ + ☆ Contextual Bandits with Stage-wise Constraints + + +
+ We study contextual bandits in the presence of a stage-wise constraint (a constraint at each round), when the constraint must be satisfied both with high probability and in expectation. Obviously, the setting where the constraint is in expectation is a relaxation of the one with high probability. We start with the linear case where both the contextual bandit problem (reward function) and the stage-wise constraint (cost function) are linear. In each of the high probability and in expectation settings, we propose an upper-confidence bound algorithm for the problem and prove a $T$-round regret bound for it. Our algorithms balance exploration and constraint satisfaction using a novel idea that scales the radii of the reward and cost confidence sets with different scaling factors. We also prove a lower bound for this constrained problem, show how our algorithms and analyses can be extended to multiple constraints, and provide simulations to validate our theoretical results. In the high probability setting, we describe the minimum requirements for the action set in order for our algorithm to be tractable. In the setting where the constraint is in expectation, we further specialize our results to multi-armed bandits and propose a computationally efficient algorithm for this setting with regret analysis. Finally, we extend our results to the case where the reward and cost functions are both non-linear. We propose an algorithm for this case and prove a regret bound for it that characterizes the function class complexity by the eluder dimension.
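A generic sketch of a single round of a constrained LinUCB-style rule in the spirit described above, with separate confidence scalings for the reward and cost estimates; it is not the paper's algorithm and all names are illustrative:

```python
import numpy as np

def select_action(contexts, A_r, b_r, A_c, b_c, alpha_r, alpha_c, tau):
    """One round of a constrained LinUCB-style rule (illustrative sketch).
    contexts: (n_actions, d) feature vectors; (A_r, b_r)/(A_c, b_c): ridge
    statistics for reward/cost; alpha_r/alpha_c: confidence radius scalings;
    tau: per-round cost budget."""
    theta_r = np.linalg.solve(A_r, b_r)
    theta_c = np.linalg.solve(A_c, b_c)
    best, best_ucb = None, -np.inf
    for a, x in enumerate(contexts):
        r_ucb = x @ theta_r + alpha_r * np.sqrt(x @ np.linalg.solve(A_r, x))  # optimistic reward
        c_ucb = x @ theta_c + alpha_c * np.sqrt(x @ np.linalg.solve(A_c, x))  # pessimistic cost
        if c_ucb <= tau and r_ucb > best_ucb:   # keep only plausibly feasible actions
            best, best_ucb = a, r_ucb
    return best
```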
+
+ comment: 53 pages. arXiv admin note: text overlap with arXiv:2006.10185 +
+
+
+
+
+ + ☆ Discovery of Generalizable TBI Phenotypes Using Multivariate Time-Series + Clustering + + +
+ Traumatic Brain Injury (TBI) presents a broad spectrum of clinical presentations and outcomes due to its inherent heterogeneity, leading to diverse recovery trajectories and varied therapeutic responses. While many studies have delved into TBI phenotyping for distinct patient populations, identifying TBI phenotypes that consistently generalize across various settings and populations remains a critical research gap. Our research addresses this by employing multivariate time-series clustering to unveil TBI's dynamic intricacies. Utilizing a self-supervised learning-based approach to clustering multivariate time-series data with missing values (SLAC-Time), we analyzed both the research-centric TRACK-TBI and the real-world MIMIC-IV datasets. Remarkably, the optimal hyperparameters of SLAC-Time and the ideal number of clusters remained consistent across these datasets, underscoring SLAC-Time's stability across heterogeneous datasets. Our analysis revealed three generalizable TBI phenotypes ({\alpha}, {\beta}, and {\gamma}), each exhibiting distinct non-temporal features during emergency department visits, and temporal feature profiles throughout ICU stays. Specifically, phenotype {\alpha} represents mild TBI with a remarkably consistent clinical presentation. In contrast, phenotype {\beta} signifies severe TBI with diverse clinical manifestations, and phenotype {\gamma} represents a moderate TBI profile in terms of severity and clinical diversity. Age is a significant determinant of TBI outcomes, with older cohorts recording higher mortality rates. Importantly, while certain features varied by age, the core characteristics of TBI manifestations tied to each phenotype remain consistent across diverse populations.
+
+ comment: 25 pages, 10 figures, 4 tables, submitted to Computers in Biology and + Medicine +
+
+
+
+
+ + ☆ A Novel Approach for Automatic Program Repair using Round-Trip + Translation with Large Language Models + + +
+ Research shows that grammatical mistakes in a sentence can be corrected by +translating it to another language and back using neural machine translation +with language models. We investigate whether this correction capability of +Large Language Models (LLMs) extends to Automatic Program Repair (APR). Current +generative models for APR are pre-trained on source code and fine-tuned for +repair. This paper proposes bypassing the fine-tuning step and using Round-Trip +Translation (RTT): translation of code from one programming language to another +programming or natural language, and back. We hypothesize that RTT with LLMs +restores the most commonly seen patterns in code during pre-training, i.e., +performs a regression toward the mean, which removes bugs as they are a form of +noise w.r.t. the more frequent, natural, bug-free code in the training data. To +test this hypothesis, we employ eight recent LLMs pre-trained on code, +including the latest GPT versions, and four common program repair benchmarks in +Java. We find that RTT with English as an intermediate language repaired 101 of +164 bugs with GPT-4 on the HumanEval-Java dataset. Moreover, 46 of these are +unique bugs that are not repaired by other LLMs fine-tuned for APR. Our +findings highlight the viability of round-trip translation with LLMs as a +technique for automated program repair and its potential for research in +software engineering. + Keywords: automated program repair, large language model, machine translation + +
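A hedged sketch of the round-trip translation idea, assuming `llm` is any prompt-to-completion callable; the prompts and helper name are illustrative rather than the paper's exact setup:

```python
def round_trip_repair(buggy_code: str, llm, intermediate: str = "English") -> str:
    """Round-trip translation for program repair (sketch): translate the buggy
    function into a natural-language description and back into code, relying on
    the model's regression toward common, bug-free patterns. `llm` is any
    callable mapping a prompt string to a completion string."""
    description = llm(
        f"Describe, in {intermediate}, what the following Java method is "
        f"intended to do. Do not mention bugs.\n\n{buggy_code}"
    )
    repaired = llm(
        f"Write a correct Java method that does the following:\n\n{description}"
    )
    return repaired

# Usage (hypothetical): candidate = round_trip_repair(buggy, my_llm_call)
# then re-run the project's test suite on `candidate` to validate the patch.
```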
+
+
+
+
+ + ☆ Carrying over algorithm in transformers + + +
+ Addition is perhaps one of the simplest arithmetic tasks one can think of and +is usually performed using the carrying over algorithm. This algorithm consists +of two tasks: adding digits in the same position and carrying over a one +whenever necessary. We study how transformer models implement this algorithm +and how the two aforementioned tasks are allocated to different parts of the +network. We first focus on two-layer encoder-only models and show that the +carrying over algorithm is implemented in a modular fashion. The first layer is +mostly responsible for adding digits in the same position. The second layer +first decides, in the attention, which positions need a carried one or not, and +then performs the carrying of the one in the final MLP. We provide a simple way +of precisely identifying which neurons are responsible for that task. This +implementation of the carrying over algorithm occurs across a range of +hyperparameters for two as well as three-layer models. For small decoder-only +models, we observe the same implementation and provide suggestive evidence for +its existence in three 7B large language models. + +
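For reference, the two sub-tasks the abstract refers to can be made explicit in a few lines; this is the plain textbook algorithm, not the transformer circuit the paper analyzes:

```python
def carrying_over(a: str, b: str) -> str:
    """The two sub-tasks made explicit: (1) per-position digit sums,
    (2) deciding where a one is carried and applying it."""
    a, b = a.zfill(len(b)), b.zfill(len(a))
    sums = [int(x) + int(y) for x, y in zip(a, b)]   # task 1: positionwise sums
    digits, carry = [], 0
    for s in reversed(sums):                         # task 2: carry propagation
        s += carry
        digits.append(s % 10)
        carry = s // 10
    if carry:
        digits.append(carry)
    return "".join(map(str, reversed(digits)))

assert carrying_over("457", "168") == "625"
```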
+
+ comment: Comments welcome! +
+
+
+
+
+ + ☆ Robustness Against Adversarial Attacks via Learning Confined Adversarial + Polytopes ICASSP 2024 + + +
+ Deep neural networks (DNNs) could be deceived by generating +human-imperceptible perturbations of clean samples. Therefore, enhancing the +robustness of DNNs against adversarial attacks is a crucial task. In this +paper, we aim to train robust DNNs by limiting the set of outputs reachable via +a norm-bounded perturbation added to a clean sample. We refer to this set as +adversarial polytope, and each clean sample has a respective adversarial +polytope. Indeed, if the respective polytopes for all the samples are compact +such that they do not intersect the decision boundaries of the DNN, then the +DNN is robust against adversarial samples. Hence, the inner-working of our +algorithm is based on learning \textbf{c}onfined \textbf{a}dversarial +\textbf{p}olytopes (CAP). By conducting a thorough set of experiments, we +demonstrate the effectiveness of CAP over existing adversarial robustness +methods in improving the robustness of models against state-of-the-art attacks +including AutoAttack. + +
+
+ comment: The paper has been accepted in ICASSP 2024 +
+
+
+
+
+ + ☆ How does self-supervised pretraining improve robustness against noisy + labels across various medical image classification datasets? + + +
+ Noisy labels can significantly impact medical image classification, +particularly in deep learning, by corrupting learned features. Self-supervised +pretraining, which doesn't rely on labeled data, can enhance robustness against +noisy labels. However, this robustness varies based on factors like the number +of classes, dataset complexity, and training size. In medical images, subtle +inter-class differences and modality-specific characteristics add complexity. +Previous research hasn't comprehensively explored the interplay between +self-supervised learning and robustness against noisy labels in medical image +classification, considering all these factors. In this study, we address three +key questions: i) How does label noise impact various medical image +classification datasets? ii) Which types of medical image datasets are more +challenging to learn and more affected by label noise? iii) How do different +self-supervised pretraining methods enhance robustness across various medical +image datasets? Our results show that DermNet, among five datasets (Fetal +plane, DermNet, COVID-DU-Ex, MURA, NCT-CRC-HE-100K), is the most challenging +but exhibits greater robustness against noisy labels. Additionally, contrastive +learning stands out among the eight self-supervised methods as the most +effective approach to enhance robustness against noisy labels. + +
+
+
+
+
+ + ☆ Solution of the Probabilistic Lambert Problem: Connections with Optimal + Mass Transport, Schrödinger Bridge and Reaction-Diffusion PDEs + + +
+ Lambert's problem concerns transferring a spacecraft from a given initial to a given terminal position within a prescribed flight time via velocity control, subject to a gravitational force field. We consider a probabilistic variant of the Lambert problem where the knowledge of the endpoint constraints in position vectors is replaced by the knowledge of their respective joint probability density functions. We show that the Lambert problem with endpoint joint probability density constraints is a generalized optimal mass transport (OMT) problem, thereby connecting this classical astrodynamics problem with a burgeoning area of research in modern stochastic control and stochastic machine learning. This newfound connection allows us to rigorously establish the existence and uniqueness of the solution for the probabilistic Lambert problem. The same connection also helps to numerically solve the probabilistic Lambert problem via diffusion regularization, i.e., by leveraging a further connection of the OMT with the Schr\"odinger bridge problem (SBP). This also shows that the probabilistic Lambert problem with additive dynamic process noise is in fact a generalized SBP, and can be solved numerically using the so-called Schr\"odinger factors, as we do in this work. We explain how the resulting analysis leads to solving a boundary-coupled system of reaction-diffusion PDEs where the nonlinear gravitational potential appears as the reaction rate. We propose novel algorithms for the same, and present illustrative numerical results. Our analysis and the algorithmic framework are nonparametric, i.e., we make neither statistical (e.g., Gaussian, first few moments, mixture or exponential family, finite dimensionality of the sufficient statistic) nor dynamical (e.g., Taylor series) approximations.
+
+
+
+
+ + ☆ GD-CAF: Graph Dual-stream Convolutional Attention Fusion for + Precipitation Nowcasting + + +
+ Accurate precipitation nowcasting is essential for various purposes, +including flood prediction, disaster management, optimizing agricultural +activities, managing transportation routes and renewable energy. While several +studies have addressed this challenging task from a sequence-to-sequence +perspective, most of them have focused on a single area without considering the +existing correlation between multiple disjoint regions. In this paper, we +formulate precipitation nowcasting as a spatiotemporal graph sequence +nowcasting problem. In particular, we introduce Graph Dual-stream Convolutional +Attention Fusion (GD-CAF), a novel approach designed to learn from historical +spatiotemporal graph of precipitation maps and nowcast future time step ahead +precipitation at different spatial locations. GD-CAF consists of +spatio-temporal convolutional attention as well as gated fusion modules which +are equipped with depthwise-separable convolutional operations. This +enhancement enables the model to directly process the high-dimensional +spatiotemporal graph of precipitation maps and exploits higher-order +correlations between the data dimensions. We evaluate our model on seven years +of precipitation maps across Europe and its neighboring areas collected from +the ERA5 dataset, provided by Copernicus. The model receives a fully connected +graph in which each node represents historical observations from a specific +region on the map. Consequently, each node contains a 3D tensor with time, +height, and width dimensions. Experimental results demonstrate that the +proposed GD-CAF model outperforms the other examined models. Furthermore, the +averaged seasonal spatial and temporal attention scores over the test set are +visualized to provide additional insights about the strongest connections +between different regions or time steps. These visualizations shed light on the +decision-making process of our model. + +
+
+ comment: 13 pages, 13 figures +
+
+
+
+
+ + ☆ Machine Perceptual Quality: Evaluating the Impact of Severe Lossy + Compression on Audio and Image Models + + +
+ In the field of neural data compression, the prevailing focus has been on optimizing algorithms for either classical distortion metrics, such as PSNR or SSIM, or human perceptual quality. With increasing amounts of data consumed by machines rather than humans, a new paradigm of machine-oriented compression, which prioritizes the retention of features salient for machine perception over traditional human-centric criteria, has emerged, creating several new challenges to the development, evaluation, and deployment of systems utilizing lossy compression. In particular, it is unclear how different approaches to lossy compression will affect the performance of downstream machine perception tasks. To address this under-explored area, we evaluate various perception models, including image classification, image segmentation, speech recognition, and music source separation, under severe lossy compression. We utilize several popular codecs spanning conventional, neural, and generative compression architectures. Our results indicate three key findings: (1) using generative compression, it is feasible to leverage highly compressed data while incurring a negligible impact on machine perceptual quality; (2) machine perceptual quality correlates strongly with deep similarity metrics, indicating a crucial role of these metrics in the development of machine-oriented codecs; and (3) using lossy compressed datasets (e.g. ImageNet) for pre-training can lead to counter-intuitive scenarios where lossy compression increases machine perceptual quality rather than degrading it. To encourage engagement on this growing area of research, our code and experiments are available at: https://github.com/danjacobellis/MPQ.
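A small sketch of the kind of evaluation described above, using JPEG re-encoding as a stand-in codec; `model` and `dataset` are placeholders for whichever perception task is being measured:

```python
import io
from PIL import Image

def jpeg_roundtrip(img: Image.Image, quality: int) -> Image.Image:
    """Re-encode an image at a given JPEG quality to simulate severe lossy
    compression before feeding it to a perception model."""
    buf = io.BytesIO()
    img.convert("RGB").save(buf, format="JPEG", quality=quality)
    buf.seek(0)
    return Image.open(buf)

def accuracy_under_compression(model, dataset, qualities=(75, 25, 5)):
    """Compare a classifier's accuracy on original vs. heavily compressed inputs.
    `model(image) -> label` and `dataset` (iterable of (PIL image, label)) are
    placeholders for the perception task and data being evaluated."""
    results = {}
    for q in qualities:
        correct = total = 0
        for img, label in dataset:
            correct += int(model(jpeg_roundtrip(img, q)) == label)
            total += 1
        results[q] = correct / max(total, 1)
    return results
```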
+
+ comment: 10 pages; abridged version published in IEEE Data Compression + Conference 2024 +
+
+
+
+
+ + ☆ A Study on Large Language Models' Limitations in Multiple-Choice + Question Answering + + +
+ The widespread adoption of Large Language Models (LLMs) has become commonplace, particularly with the emergence of open-source models. More importantly, smaller models are well-suited for integration into consumer devices and are frequently employed either as standalone solutions or as subroutines in various AI tasks. Despite their ubiquitous use, there is no systematic analysis of their specific capabilities and limitations. In this study, we tackle one of the most widely used tasks - answering Multiple Choice Questions (MCQs). We analyze 26 small open-source models and find that 65% of the models do not understand the task, only 4 models properly select an answer from the given choices, and only 5 of these models are choice order independent. These results are rather alarming given the extensive use of MCQ tests with these models. We recommend exercising caution and testing task understanding before using MCQs to evaluate LLMs in any field whatsoever.
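A simple way to probe choice-order independence, sketched below; `llm` is a placeholder prompt-to-text callable and the prompt format is illustrative, not the paper's protocol:

```python
import itertools
import string

def is_choice_order_independent(llm, question: str, choices: list,
                                max_perms: int = 6) -> bool:
    """Check whether a model's MCQ answer survives shuffling of the options.
    The answer is read as the letter of the selected option and mapped back
    to its text before comparing across permutations."""
    picked = set()
    for perm in itertools.islice(itertools.permutations(choices), max_perms):
        letters = string.ascii_uppercase[: len(perm)]
        prompt = (question + "\n" +
                  "\n".join(f"{l}. {c}" for l, c in zip(letters, perm)) +
                  "\nAnswer with a single letter.")
        reply = llm(prompt).strip().upper()
        letter = next((ch for ch in reply if ch in letters), None)
        picked.add(perm[letters.index(letter)] if letter else None)
    return len(picked) == 1
```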
+
+
+
+
+ + ☆ Integrate Any Omics: Towards genome-wide data integration for patient + stratification + + +
+ High-throughput omics profiling advancements have greatly enhanced cancer +patient stratification. However, incomplete data in multi-omics integration +presents a significant challenge, as traditional methods like sample exclusion +or imputation often compromise biological diversity and dependencies. +Furthermore, the critical task of accurately classifying new patients with +partial omics data into existing subtypes is commonly overlooked. To address +these issues, we introduce IntegrAO (Integrate Any Omics), an unsupervised +framework for integrating incomplete multi-omics data and classifying new +samples. IntegrAO first combines partially overlapping patient graphs from +diverse omics sources and utilizes graph neural networks to produce unified +patient embeddings. Our systematic evaluation across five cancer cohorts +involving six omics modalities demonstrates IntegrAO's robustness to missing +data and its accuracy in classifying new samples with partial profiles. An +acute myeloid leukemia case study further validates its capability to uncover +biological and clinical heterogeneity in incomplete datasets. IntegrAO's +ability to handle heterogeneous and incomplete data makes it an essential tool +for precision oncology, offering a holistic approach to patient +characterization. + +
+
+
+
+
+ + ☆ A Globally Convergent Algorithm for Neural Network Parameter + Optimization Based on Difference-of-Convex Functions + + +
+ We propose an algorithm for optimizing the parameters of single hidden layer +neural networks. Specifically, we derive a blockwise difference-of-convex (DC) +functions representation of the objective function. Based on the latter, we +propose a block coordinate descent (BCD) approach that we combine with a +tailored difference-of-convex functions algorithm (DCA). We prove global +convergence of the proposed algorithm. Furthermore, we mathematically analyze +the convergence rate of parameters and the convergence rate in value (i.e., the +training loss). We give conditions under which our algorithm converges linearly +or even faster depending on the local shape of the loss function. We confirm +our theoretical derivations numerically and compare our algorithm against +state-of-the-art gradient-based solvers in terms of both training loss and test +loss. + +
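For intuition, a sketch of the plain DCA iteration on a toy one-dimensional DC objective; this illustrates the general principle only, not the paper's blockwise BCD-DCA for neural network training:

```python
import numpy as np

def dca(g_argmin, h_subgrad, x0, iters=50, tol=1e-9):
    """Plain difference-of-convex algorithm for f = g - h (both convex):
    linearize h at the current point and minimize the resulting convex
    surrogate. g_argmin(y) must return argmin_x g(x) - y*x."""
    x = x0
    for _ in range(iters):
        y = h_subgrad(x)          # pick a subgradient of the concave part
        x_new = g_argmin(y)       # solve the convex subproblem exactly
        if abs(x_new - x) < tol:
            break
        x = x_new
    return x

# Toy DC objective: f(x) = 0.5*x**2 - |x - 1|, i.e. g(x) = 0.5*x**2, h(x) = |x - 1|.
# argmin_x 0.5*x**2 - y*x = y, and a subgradient of h is sign(x - 1).
x_star = dca(g_argmin=lambda y: y, h_subgrad=lambda x: np.sign(x - 1.0), x0=0.0)
print(x_star)  # converges to -1, the global minimizer of this toy objective
```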
+
+ comment: accepted by TMLR +
+
+
+
+
+ + ☆ Vertical Federated Image Segmentation + + +
+ With the popularization of AI solutions for image based problems, there has been a growing concern for both data privacy and acquisition. In a large number of cases, information is located on separate data silos and it can be difficult for a developer to consolidate all of it in a fashion that is appropriate for machine learning model development. Alongside this, a portion of these localized data regions may not have access to a labelled ground truth. This indicates that they have the capacity to reach conclusions numerically, but are not able to assign classifications amid a lack of pertinent information. Such a determination is often negligible, especially when attempting to develop image based solutions that often necessitate this capability. With this being the case, we propose an innovative vertical federated learning (VFL) model architecture that can operate under this common set of conditions. This is the first (and currently the only) implementation of a system that can work under the constraints of a VFL environment and perform image segmentation while maintaining nominal accuracies. We achieved this by utilizing an FCN that boasts the ability to operate on federates that lack labelled data and to privately share the respective weights with a central server, which hosts the necessary features for classification. Tests were conducted on the CamVid dataset in order to determine the impact of the heavy feature compression required for the transfer of information between federates, as well as to reach nominal conclusions about the overall performance metrics when working under such constraints.
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ Can Large Language Models Explain Themselves? + + +
+ Instruction-tuned large language models (LLMs) excel at many tasks, and will even provide explanations for their behavior. Since these models are directly accessible to the public, there is a risk that convincing and wrong explanations can lead to unsupported confidence in LLMs. Therefore, interpretability-faithfulness of self-explanations is an important consideration for AI Safety. Assessing the interpretability-faithfulness of these explanations, termed self-explanations, is challenging as the models are too complex for humans to annotate what is a correct explanation. To address this, we propose employing self-consistency checks as a measure of faithfulness. For example, if an LLM says a set of words is important for making a prediction, then it should not be able to make the same prediction without these words. While self-consistency checks are a common approach to faithfulness, they have not previously been applied to LLMs' self-explanations. We apply self-consistency checks to three types of self-explanations: counterfactuals, importance measures, and redactions. Our work demonstrates that faithfulness is both task and model dependent, e.g., for sentiment classification, counterfactual explanations are more faithful for Llama2, importance measures for Mistral, and redaction for Falcon 40B. Finally, our findings are robust to prompt variations.
+
+
+
+
+ + ☆ Machine Learning Techniques to Identify Hand Gestures amidst Forearm + Muscle Signals + + +
+ This study investigated the use of forearm EMG data for distinguishing eight +hand gestures, employing the Neural Network and Random Forest algorithms on +data from ten participants. The Neural Network achieved 97 percent accuracy +with 1000-millisecond windows, while the Random Forest achieved 85 percent +accuracy with 200-millisecond windows. Larger window sizes improved gesture +classification due to increased temporal resolution. The Random Forest +exhibited faster processing at 92 milliseconds, compared to the Neural +Network's 124 milliseconds. In conclusion, the study identified a Neural +Network with a 1000-millisecond stream as the most accurate (97 percent), and a +Random Forest with a 200-millisecond stream as the most efficient (85 percent). +Future research should focus on increasing sample size, incorporating more hand +gestures, and exploring different feature extraction methods and modeling +algorithms to enhance system accuracy and efficiency. + +
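A compact sketch of the windowing-plus-Random-Forest pipeline described above, run on synthetic data (so it will not reproduce the reported accuracies); the window size and features are illustrative:

```python
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def window_features(emg: np.ndarray, labels: np.ndarray, win: int):
    """Slice a (samples, channels) EMG stream into non-overlapping windows and
    extract simple per-channel features (mean absolute value, RMS, variance)."""
    X, y = [], []
    for start in range(0, len(emg) - win + 1, win):
        seg = emg[start:start + win]
        feats = np.concatenate([np.mean(np.abs(seg), 0),
                                np.sqrt(np.mean(seg ** 2, 0)),
                                np.var(seg, 0)])
        X.append(feats)
        y.append(np.bincount(labels[start:start + win]).argmax())  # majority label
    return np.array(X), np.array(y)

# Synthetic stand-in for a multi-channel EMG recording with 8 gesture labels.
rng = np.random.default_rng(0)
emg = rng.standard_normal((20000, 8))
labels = rng.integers(0, 8, size=20000)
X, y = window_features(emg, labels, win=200)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_tr, y_tr)
print("accuracy:", clf.score(X_te, y_te))
```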
+
+ comment: 21 pages, 7 figures +
+
+
+
+
+ + ☆ Multifidelity domain decomposition-based physics-informed neural + networks for time-dependent problems + + +
+ Multiscale problems are challenging for neural network-based discretizations +of differential equations, such as physics-informed neural networks (PINNs). +This can be (partly) attributed to the so-called spectral bias of neural +networks. To improve the performance of PINNs for time-dependent problems, a +combination of multifidelity stacking PINNs and domain decomposition-based +finite basis PINNs are employed. In particular, to learn the high-fidelity part +of the multifidelity model, a domain decomposition in time is employed. The +performance is investigated for a pendulum and a two-frequency problem as well +as the Allen-Cahn equation. It can be observed that the domain decomposition +approach clearly improves the PINN and stacking PINN approaches. + +
+
+
+
+
+ + ☆ Learned Best-Effort LLM Serving + + +
+ Many applications must provide low-latency LLM service to users or risk unacceptable user experience. However, over-provisioning resources to serve fluctuating request patterns is often prohibitively expensive. In this work, we present a best-effort serving system that employs deep reinforcement learning to adjust service quality based on the task distribution and system load. Our best-effort system can maintain availability with over 10x higher client request rates, serves above 96% of peak performance 4.1x more often, and serves above 98% of peak performance 2.3x more often than static serving on unpredictable workloads. Our learned router is robust to shifts in both the arrival and task distributions. Compared to static serving, learned best-effort serving allows for cost-efficient serving through increased hardware utility. Additionally, we argue that learned best-effort LLM serving is applicable in a wide variety of settings and provides application developers with great flexibility to meet their specific needs.
+
+
+
+
+ + ☆ The Chronicles of RAG: The Retriever, the Chunk and the Generator + + +
+ Retrieval Augmented Generation (RAG) has become one of the most popular paradigms for enabling LLMs to access external data, and also as a mechanism for grounding to mitigate against hallucinations. When implementing RAG, you can face several challenges, such as the effective integration of retrieval models, efficient representation learning, data diversity, computational efficiency optimization, evaluation, and quality of text generation. Given all these challenges, every day a new technique to improve RAG appears, making it infeasible to experiment with all combinations for your problem. In this context, this paper presents good practices to implement, optimize, and evaluate RAG for the Brazilian Portuguese language, focusing on the establishment of a simple pipeline for inference and experiments. We explored a diverse set of methods to answer questions about the first Harry Potter book. To generate the answers we used OpenAI's gpt-4, gpt-4-1106-preview, and gpt-3.5-turbo-1106, and Google's Gemini Pro. Focusing on the quality of the retriever, our approach achieved an improvement of 35.4% in MRR@10 compared to the baseline. When optimizing the input size in the application, we observed that it is possible to further enhance it by 2.4%. Finally, we present the complete architecture of the RAG pipeline with our recommendations. As a result, we moved from a baseline of 57.88% to a maximum relative score of 98.61%.
+
+ comment: 16 pages, 15 figures, 9 tables +
+
+
+
+
+ + ☆ Do stable neural networks exist for classification problems? -- A new + view on stability in AI + + +
+ In deep learning (DL) the instability phenomenon is widespread and well documented, most commonly using the classical measure of stability, the Lipschitz constant. While a small Lipschitz constant is traditionally viewed as guaranteeing stability, it does not capture the instability phenomenon in DL for classification well. The reason is that a classification function -- which is the target function to be approximated -- is necessarily discontinuous, thus having an 'infinite' Lipschitz constant. As a result, the classical approach will deem every classification function unstable, yet basic classification functions a la 'is there a cat in the image?' will typically be locally very 'flat' -- and thus locally stable -- except at the decision boundary. The lack of an appropriate measure of stability hinders a rigorous theory for stability in DL, and consequently, there are no proper approximation theoretic results that can guarantee the existence of stable networks for classification functions. In this paper we introduce a novel stability measure $\mathscr{S}(f)$, for any classification function $f$, appropriate to study the stability of discontinuous functions and their approximations. We further prove two approximation theorems: First, for any $\epsilon > 0$ and any classification function $f$ on a \emph{compact set}, there is a neural network (NN) $\psi$, such that $\psi - f \neq 0$ only on a set of measure $< \epsilon$; moreover, $\mathscr{S}(\psi) \geq \mathscr{S}(f) - \epsilon$ (as accurate and stable as $f$ up to $\epsilon$). Second, for any classification function $f$ and $\epsilon > 0$, there exists a NN $\psi$ such that $\psi = f$ on the set of points that are at least $\epsilon$ away from the decision boundary.
+
+
+
+
+ + ☆ Explainable Predictive Maintenance: A Survey of Current Methods, + Challenges and Opportunities + + +
+ Predictive maintenance is a well-studied collection of techniques that aims to prolong the life of a mechanical system by using artificial intelligence and machine learning to predict the optimal time to perform maintenance. The methods allow maintainers of systems and hardware to reduce the financial and time costs of upkeep. As these methods are adopted for more serious and potentially life-threatening applications, the human operators need to trust the predictive system. This attracts the field of Explainable AI (XAI) to introduce explainability and interpretability into the predictive system. XAI brings methods to the field of predictive maintenance that can amplify trust in the users while maintaining well-performing systems. This survey on explainable predictive maintenance (XPM) discusses and presents the current methods of XAI as applied to predictive maintenance while following the Preferred Reporting Items for Systematic Reviews and Meta-Analyses (PRISMA) 2020 guidelines. We categorize the different XPM methods into groups that follow the XAI literature. Additionally, we include current challenges and a discussion on future research directions in XPM.
+
+
+
+
+ + ☆ Adaptive Neural-Operator Backstepping Control of a Benchmark Hyperbolic + PDE + + +
+ To stabilize PDEs, feedback controllers require gain kernel functions, which +are themselves governed by PDEs. Furthermore, these gain-kernel PDEs depend on +the PDE plants' functional coefficients. The functional coefficients in PDE +plants are often unknown. This requires an adaptive approach to PDE control, +i.e., an estimation of the plant coefficients conducted concurrently with +control, where a separate PDE for the gain kernel must be solved at each +timestep upon the update in the plant coefficient function estimate. Solving a +PDE at each timestep is computationally expensive and a barrier to the +implementation of real-time adaptive control of PDEs. Recently, results in +neural operator (NO) approximations of functional mappings have been introduced +into PDE control, for replacing the computation of the gain kernel with a +neural network that is trained, once offline, and reused in real-time for rapid +solution of the PDEs. In this paper, we present the first result on applying +NOs in adaptive PDE control, presented for a benchmark 1-D hyperbolic PDE with +recirculation. We establish global stabilization via Lyapunov analysis, in the +plant and parameter error states, and also present an alternative approach, via +passive identifiers, which avoids the strong assumptions on kernel +differentiability. We then present numerical simulations demonstrating +stability and observe speedups up to three orders of magnitude, highlighting +the real-time efficacy of neural operators in adaptive control. Our code +(Github) is made publicly available for future researchers. + +
+
+ comment: 16.5 pages, 3 figures +
+
+
+
+
+ + ☆ The ODE Method for Stochastic Approximation and Reinforcement Learning + with Markovian Noise + + +
+ Stochastic approximation is a class of algorithms that update a vector +iteratively, incrementally, and stochastically, including, e.g., stochastic +gradient descent and temporal difference learning. One fundamental challenge in +analyzing a stochastic approximation algorithm is to establish its stability, +i.e., to show that the stochastic vector iterates are bounded almost surely. In +this paper, we extend the celebrated Borkar-Meyn theorem for stability from the +Martingale difference noise setting to the Markovian noise setting, which +greatly improves its applicability in reinforcement learning, especially in +those off-policy reinforcement learning algorithms with linear function +approximation and eligibility traces. Central to our analysis is the +diminishing asymptotic rate of change of a few functions, which is implied by +both a form of strong law of large numbers and a commonly used V4 Lyapunov +drift condition and trivially holds if the Markov chain is finite and +irreducible. + +
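For concreteness, a classic member of this algorithm class is tabular TD(0) driven by Markovian noise; the sketch below uses a toy two-state chain and diminishing step sizes, purely as an illustration of the setting the paper analyzes, not its proof machinery:

```python
import numpy as np

rng = np.random.default_rng(0)

# A small irreducible Markov chain with per-state rewards; we estimate the
# value function by TD(0), a stochastic approximation driven by Markovian noise.
P = np.array([[0.9, 0.1],
              [0.2, 0.8]])
r = np.array([1.0, -1.0])
gamma = 0.9

v = np.zeros(2)
s = 0
for t in range(1, 200_001):
    s_next = rng.choice(2, p=P[s])
    alpha = 1.0 / (t ** 0.75)                            # diminishing step sizes
    v[s] += alpha * (r[s] + gamma * v[s_next] - v[s])    # TD(0) increment
    s = s_next

print("TD(0) estimate:", v)
print("true values   :", np.linalg.solve(np.eye(2) - gamma * P, r))
```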
+
+
+
+
+ + ☆ Two Types of AI Existential Risk: Decisive and Accumulative + + +
+ The conventional discourse on existential risks (x-risks) from AI typically +focuses on abrupt, dire events caused by advanced AI systems, particularly +those that might achieve or surpass human-level intelligence. These events have +severe consequences that either lead to human extinction or irreversibly +cripple human civilization to a point beyond recovery. This discourse, however, +often neglects the serious possibility of AI x-risks manifesting incrementally +through a series of smaller yet interconnected disruptions, gradually crossing +critical thresholds over time. This paper contrasts the conventional "decisive +AI x-risk hypothesis" with an "accumulative AI x-risk hypothesis." While the +former envisions an overt AI takeover pathway, characterized by scenarios like +uncontrollable superintelligence, the latter suggests a different causal +pathway to existential catastrophes. This involves a gradual accumulation of +critical AI-induced threats such as severe vulnerabilities and systemic erosion +of econopolitical structures. The accumulative hypothesis suggests a boiling +frog scenario where incremental AI risks slowly converge, undermining +resilience until a triggering event results in irreversible collapse. Through +systems analysis, this paper examines the distinct assumptions differentiating +these two hypotheses. It is then argued that the accumulative view reconciles +seemingly incompatible perspectives on AI risks. The implications of +differentiating between these causal pathways -- the decisive and the +accumulative -- for the governance of AI risks as well as long-term AI safety +are discussed. + +
+
+
+
+
+ + ☆ Optimal Data Splitting in Distributed Optimization for Machine Learning + + +
+ The distributed optimization problem has become increasingly relevant recently. It has many advantages, such as processing a large amount of data in less time compared to non-distributed methods. However, most distributed approaches suffer from a significant bottleneck: the cost of communications. Therefore, a large amount of research has recently been directed at solving this problem. One such approach uses local data similarity. In particular, there exists an algorithm provably optimally exploiting the similarity property. However, this result, as well as results from other works, addresses the communication bottleneck by focusing only on the fact that communication is significantly more expensive than local computing, and does not take into account the varying capacities of network devices or the different relationships between communication time and local computing expenses. We consider this setup, and the objective of this study is to achieve an optimal ratio of distributed data between the server and local machines for any costs of communications and local computations. The running times of the network are compared between uniform and optimal distributions. The superior theoretical performance of our solutions is experimentally validated.
+
+ comment: 17 pages, 2 figures, Doklady Rossijskoj akademii nauk: + https://journals.rcsi.science/2686-9543/article/view/247131 +
+
+
+
+
+ + ☆ Fusing Echocardiography Images and Medical Records for Continuous + Patient Stratification + + +
+ Deep learning now enables automatic and robust extraction of cardiac function +descriptors from echocardiographic sequences, such as ejection fraction or +strain. These descriptors provide fine-grained information that physicians +consider, in conjunction with more global variables from the clinical record, +to assess patients' condition. Drawing on novel transformer models applied to +tabular data (e.g., variables from electronic health records), we propose a +method that considers all descriptors extracted from medical records and +echocardiograms to learn the representation of a difficult-to-characterize +cardiovascular pathology, namely hypertension. Our method first projects each +variable into its own representation space using modality-specific approaches. +These standardized representations of multimodal data are then fed to a +transformer encoder, which learns to merge them into a comprehensive +representation of the patient through a pretext task of predicting a clinical +rating. This pretext task is formulated as an ordinal classification to enforce +a pathological continuum in the representation space. We observe the major +trends along this continuum for a cohort of 239 hypertensive patients to +describe, with unprecedented gradation, the effect of hypertension on a number +of cardiac function descriptors. Our analysis shows that i) pretrained weights +from a foundation model allow to reach good performance (83% accuracy) even +with limited data (less than 200 training samples), ii) trends across the +population are reproducible between trainings, and iii) for descriptors whose +interactions with hypertension are well documented, patterns are consistent +with prior physiological knowledge. + +
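As a hedged illustration of the ordinal-classification pretext task mentioned above (one common cumulative-threshold formulation; the paper may parameterize it differently, and the dimensions below are made up), an ordinal clinical rating can be encoded as cumulative binary targets on top of the fused patient representation:

```python
import torch
import torch.nn as nn

def ordinal_targets(y, num_levels):
    """Encode an ordinal label y in {0, ..., K-1} as K-1 cumulative binary targets."""
    thresholds = torch.arange(num_levels - 1)
    return (y.unsqueeze(-1) > thresholds).float()   # e.g. y=2, K=4 -> [1, 1, 0]

K = 4                                # hypothetical number of clinical rating levels
head = nn.Linear(256, K - 1)         # one logit per threshold
patient_repr = torch.randn(8, 256)   # stand-in for the transformer's fused representation
y = torch.randint(0, K, (8,))
loss = nn.functional.binary_cross_entropy_with_logits(
    head(patient_repr), ordinal_targets(y, K))
print(loss.item())
```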
+
+ comment: 10 pages, submitted to IEEE TMI +
+
+
+
+
+ + ☆ Activations and Gradients Compression for Model-Parallel Training + + +
+ Large neural networks require enormous computational clusters of machines.
+Model-parallel training, in which the model architecture is partitioned
+sequentially between workers, is a popular approach for training modern
+models. Information compression can be applied to decrease workers'
+communication time, as it is often a bottleneck in such systems. This work
+explores how simultaneous compression of activations and gradients in a
+model-parallel distributed training setup affects convergence. We analyze
+compression methods such as quantization and TopK compression, and also
+experiment with error compensation techniques. Moreover, we combine TopK with
+the AQ-SGD per-batch error feedback approach. We conduct experiments on image
+classification and language model fine-tuning tasks. Our findings demonstrate
+that gradients require milder compression rates than activations. We observe
+that $K=10\%$ is the lowest TopK compression level that does not severely
+harm model convergence. Experiments also show that models trained with TopK
+perform well only when compression is also applied during inference. We find
+that error feedback techniques do not improve model-parallel training
+compared to plain compression, but allow model inference without compression
+with almost no quality drop. Finally, when combined with the AQ-SGD approach,
+TopK compression stronger than $K=30\%$ worsens model performance
+significantly.
+
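The following sketch illustrates the flavour of TopK compression with per-step error feedback discussed in the abstract (a generic toy implementation, not the authors' code): only the largest-magnitude fraction of entries is transmitted, and the discarded residual is added back before the next compression.

```python
import numpy as np

def topk_compress(x, ratio=0.1):
    """Keep the top `ratio` fraction of entries by magnitude, zero out the rest."""
    k = max(1, int(ratio * x.size))
    flat = x.ravel()
    idx = np.argpartition(np.abs(flat), -k)[-k:]
    out = np.zeros_like(flat)
    out[idx] = flat[idx]
    return out.reshape(x.shape)

def compress_with_error_feedback(x, error, ratio=0.1):
    """Error feedback: reinject the residual lost in the previous step."""
    corrected = x + error
    compressed = topk_compress(corrected, ratio)
    return compressed, corrected - compressed

rng = np.random.default_rng(0)
error = np.zeros((4, 4))
for step in range(3):
    grad = rng.normal(size=(4, 4))           # stand-in for an activation/gradient tensor
    sent, error = compress_with_error_feedback(grad, error, ratio=0.1)
    print(f"step {step}: non-zero entries sent = {np.count_nonzero(sent)}")
```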
+
+ comment: 17 pages, 6 figures, 5 tables, Doklady Rossijskoj akademii nauk: + https://journals.rcsi.science/2686-9543/article/view/247111 +
+
+
+
+
+ + ☆ Improving OCR Quality in 19th Century Historical Documents Using a + Combined Machine Learning Based Approach + + +
+ This paper addresses a major challenge to historical research on the 19th
+century. Large quantities of sources have become digitally available for the
+first time, while extraction techniques are lagging behind. Therefore, we
+researched machine learning (ML) models to recognise and extract complex data
+structures in a high-value historical primary source, the Schematismus. It
+records every single person in the Habsburg civil service above a certain
+hierarchical level between 1702 and 1918 and documents the genesis of the
+central administration over two centuries. Its complex and intricate
+structure as well as its enormous size have so far made any more
+comprehensive analysis of the administrative and social structure of the
+later Habsburg Empire on the basis of this source impossible. We pursued two
+central objectives: primarily, the improvement of OCR quality, for which we
+considered improved structure recognition to be essential; in the further
+course of the work, it turned out that this also made the extraction of the
+data structure possible. We chose Faster R-CNN as the base for the ML
+architecture for structure recognition. In order to obtain the required
+amount of training data quickly and economically, we synthesised Hof- und
+Staatsschematismus-style data, which we used to train our model. The model
+was then fine-tuned with a smaller set of manually annotated historical
+source data. We then used Tesseract-OCR, which was further optimised for the
+style of our documents, to complete the combined structure extraction and OCR
+process. Results show a significant decrease in the two standard measures of
+OCR performance, WER and CER (where lower values are better). Combined
+structure detection and fine-tuned OCR improved CER and WER values by a
+remarkable 71.98 percent (CER) and 52.49 percent (WER), respectively.
+
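For readers unfamiliar with the reported metrics, WER and CER are edit-distance-based error rates; a minimal reference computation (standard definitions, unrelated to the paper's tooling, with toy strings) is:

```python
def edit_distance(ref, hyp):
    """Levenshtein distance between two sequences (characters or words)."""
    d = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, start=1):
        prev, d[0] = d[0], i
        for j, h in enumerate(hyp, start=1):
            cur = min(d[j] + 1,          # deletion
                      d[j - 1] + 1,      # insertion
                      prev + (r != h))   # substitution
            prev, d[j] = d[j], cur
    return d[-1]

def cer(ref, hyp):
    return edit_distance(list(ref), list(hyp)) / max(1, len(ref))

def wer(ref, hyp):
    return edit_distance(ref.split(), hyp.split()) / max(1, len(ref.split()))

print(cer("Schematismus", "Schernatismus"))                   # character error rate
print(wer("k. k. Hofrath in Wien", "k. k. Hofrat in Wien"))   # word error rate
```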
+
+ comment: 29 pages, 23 figures, 7 tables +
+
+
+
+
+ + ☆ Deep Evolutional Instant Interest Network for CTR Prediction in + Trigger-Induced Recommendation + + +
+ Recommendation has been playing a key role in many industries, e.g.,
+e-commerce, streaming media, and social media. Recently, a new recommendation
+scenario, called Trigger-Induced Recommendation (TIR), where users are able
+to explicitly express their instant interests via trigger items, has been
+emerging as an essential scenario on many e-commerce platforms, e.g.,
+Alibaba.com and Amazon. Without explicitly modeling the user's instant
+interest, traditional recommendation methods usually obtain sub-optimal
+results in TIR. Even though a few methods consider the trigger and target
+items simultaneously to solve this problem, they still do not take into
+account the temporal information of user behaviors, the dynamic change of the
+user's instant interest as the user scrolls down, or the interactions between
+the trigger and target items. To tackle these problems, we propose a novel
+method, the Deep Evolutional Instant Interest Network (DEI2N), for
+click-through rate prediction in TIR scenarios. Specifically, we design a
+User Instant Interest Modeling Layer to predict the dynamic change in the
+intensity of instant interest as the user scrolls down. Temporal information
+is utilized in user behavior modeling. Moreover, an Interaction Layer is
+introduced to learn better interactions between the trigger and target items.
+We evaluate our method on several offline and real-world industrial datasets.
+Experimental results show that our proposed DEI2N outperforms state-of-the-art
+baselines. In addition, online A/B testing demonstrates its superiority over
+the existing baseline in real-world production environments.
+
+
+ comment: 7 pages, 3 figures, reviewing of the 17th ACM International + Conference on Web Search and Data Mining +
+
+
+
+
+ + ☆ Joint Probability Selection and Power Allocation for Federated Learning + + +
+ In this paper, we study the performance of federated learning over wireless +networks, where devices with a limited energy budget train a machine learning +model. The federated learning performance depends on the selection of the +clients participating in the learning at each round. Most existing studies +suggest deterministic approaches for the client selection, resulting in +challenging optimization problems that are usually solved using heuristics, and +therefore without guarantees on the quality of the final solution. We formulate +a new probabilistic approach to jointly select clients and allocate power +optimally so that the expected number of participating clients is maximized. To +solve the problem, a new alternating algorithm is proposed, where at each step, +the closed-form solutions for user selection probabilities and power +allocations are obtained. Our numerical results show that the proposed approach +achieves a significant performance in terms of energy consumption, completion +time and accuracy as compared to the studied benchmarks. + +
+
+
+
+
+ + ☆ Combining Machine Learning and Ontology: A Systematic Literature Review + + +
+ Motivated by the desire to explore the process of combining inductive and +deductive reasoning, we conducted a systematic literature review of articles +that investigate the integration of machine learning and ontologies. The +objective was to identify diverse techniques that incorporate both inductive +reasoning (performed by machine learning) and deductive reasoning (performed by +ontologies) into artificial intelligence systems. Our review, which included +the analysis of 128 studies, allowed us to identify three main categories of +hybridization between machine learning and ontologies: learning-enhanced +ontologies, semantic data mining, and learning and reasoning systems. We +provide a comprehensive examination of all these categories, emphasizing the +various machine learning algorithms utilized in the studies. Furthermore, we +compared our classification with similar recent work in the field of hybrid AI +and neuro-symbolic approaches. + +
+
+
+
+
+ + ☆ Conformal Approach To Gaussian Process Surrogate Evaluation With + Coverage Guarantees + + +
+ Gaussian processes (GPs) are a Bayesian machine learning approach widely used +to construct surrogate models for the uncertainty quantification of computer +simulation codes in industrial applications. It provides both a mean predictor +and an estimate of the posterior prediction variance, the latter being used to +produce Bayesian credibility intervals. Interpreting these intervals relies on +the Gaussianity of the simulation model as well as the well-specification of +the priors which are not always appropriate. We propose to address this issue +with the help of conformal prediction. In the present work, a method for +building adaptive cross-conformal prediction intervals is proposed by weighting +the non-conformity score with the posterior standard deviation of the GP. The +resulting conformal prediction intervals exhibit a level of adaptivity akin to +Bayesian credibility sets and display a significant correlation with the +surrogate model local approximation error, while being free from the underlying +model assumptions and having frequentist coverage guarantees. These estimators +can thus be used for evaluating the quality of a GP surrogate model and can +assist a decision-maker in the choice of the best prior for the specific +application of the GP. The performance of the method is illustrated through a +panel of numerical examples based on various reference databases. Moreover, the +potential applicability of the method is demonstrated in the context of +surrogate modeling of an expensive-to-evaluate simulator of the clogging +phenomenon in steam generators of nuclear reactors. + +
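A minimal sketch of the core idea (a simplified split-conformal variant with made-up inputs, rather than the cross-conformal procedure of the paper): the nonconformity score is the absolute residual normalised by the GP posterior standard deviation, so the conformal quantile rescales the predictive band locally.

```python
import numpy as np

def gp_weighted_conformal(mu_cal, sigma_cal, y_cal, mu_test, sigma_test, alpha=0.1):
    """Split-conformal intervals with GP-posterior-std-weighted nonconformity scores."""
    scores = np.abs(y_cal - mu_cal) / sigma_cal            # normalised residuals
    n = len(scores)
    q = np.quantile(scores, min(1.0, np.ceil((n + 1) * (1 - alpha)) / n))
    return mu_test - q * sigma_test, mu_test + q * sigma_test

# Toy usage with synthetic GP outputs on a calibration set and one test point.
rng = np.random.default_rng(0)
mu_cal = rng.normal(size=100)
sigma_cal = np.full(100, 0.5)
y_cal = mu_cal + rng.normal(scale=0.5, size=100)
lo, hi = gp_weighted_conformal(mu_cal, sigma_cal, y_cal,
                               mu_test=np.array([0.2]), sigma_test=np.array([0.4]))
print(lo, hi)
```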
+
+
+
+
+ + ☆ Efficient Nonparametric Tensor Decomposition for Binary and Count Data AAAI-24 + + +
+ In numerous applications, binary reactions or event counts are observed and
+stored within high-order tensors. Tensor decompositions (TDs) serve as a
+powerful tool to handle such high-dimensional and sparse data. However, many
+traditional TDs are explicitly or implicitly designed based on the Gaussian
+distribution, which is unsuitable for discrete data. Moreover, most TDs rely
+on predefined multi-linear structures, such as CP and Tucker formats.
+Therefore, they may not be effective enough to handle complex real-world
+datasets. To address these issues, we propose ENTED, an \underline{E}fficient
+\underline{N}onparametric \underline{TE}nsor \underline{D}ecomposition for
+binary and count tensors. Specifically, we first employ a nonparametric
+Gaussian process (GP) to replace traditional multi-linear structures. Next, we
+utilize the P\'olya-Gamma augmentation, which provides a unified framework to
+establish conjugate models for binary and count distributions. Finally, to
+address the computational issue of GPs, we enhance the model by incorporating
+sparse orthogonal variational inference of inducing points, which offers a
+more effective covariance approximation within GPs and stochastic natural
+gradient updates for nonparametric models. We evaluate our model on several
+real-world tensor completion tasks, considering binary and count datasets. The
+results manifest both better performance and computational advantages of the
+proposed model.
+
+
+ comment: AAAI-24 +
+
+
+
+
+ + ☆ Go-Explore for Residential Energy Management + + +
+ Reinforcement learning is commonly applied in residential energy management, +particularly for optimizing energy costs. However, RL agents often face +challenges when dealing with deceptive and sparse rewards in the energy control +domain, especially with stochastic rewards. In such situations, thorough +exploration becomes crucial for learning an optimal policy. Unfortunately, the +exploration mechanism can be misled by deceptive reward signals, making +thorough exploration difficult. Go-Explore is a family of algorithms which +combines planning methods and reinforcement learning methods to achieve +efficient exploration. We use the Go-Explore algorithm to solve the cost-saving +task in residential energy management problems and achieve an improvement of up +to 19.84\% compared to the well-known reinforcement learning algorithms. + +
+
+
+
+
+ + ☆ Data vs. Model Machine Learning Fairness Testing: An Empirical Study + + +
+ Although several fairness definitions and bias mitigation techniques exist in +the literature, all existing solutions evaluate fairness of Machine Learning +(ML) systems after the training stage. In this paper, we take the first steps +towards evaluating a more holistic approach by testing for fairness both before +and after model training. We evaluate the effectiveness of the proposed +approach and position it within the ML development lifecycle, using an +empirical analysis of the relationship between model dependent and independent +fairness metrics. The study uses 2 fairness metrics, 4 ML algorithms, 5 +real-world datasets and 1600 fairness evaluation cycles. We find a linear +relationship between data and model fairness metrics when the distribution and +the size of the training data changes. Our results indicate that testing for +fairness prior to training can be a ``cheap'' and effective means of catching a +biased data collection process early; detecting data drifts in production +systems and minimising execution of full training cycles thus reducing +development time and costs. + +
+
+
+
+
+ + ☆ Stochastic optimization with arbitrary recurrent data sampling + + +
+ For obtaining optimal first-order convergence guarantee for stochastic +optimization, it is necessary to use a recurrent data sampling algorithm that +samples every data point with sufficient frequency. Most commonly used data +sampling algorithms (e.g., i.i.d., MCMC, random reshuffling) are indeed +recurrent under mild assumptions. In this work, we show that for a particular +class of stochastic optimization algorithms, we do not need any other property +(e.g., independence, exponential mixing, and reshuffling) than recurrence in +data sampling algorithms to guarantee the optimal rate of first-order +convergence. Namely, using regularized versions of Minimization by Incremental +Surrogate Optimization (MISO), we show that for non-convex and possibly +non-smooth objective functions, the expected optimality gap converges at an +optimal rate $O(n^{-1/2})$ under general recurrent sampling schemes. +Furthermore, the implied constant depends explicitly on the `speed of +recurrence', measured by the expected amount of time to visit a given data +point either averaged (`target time') or supremized (`hitting time') over the +current location. We demonstrate theoretically and empirically that convergence +can be accelerated by selecting sampling algorithms that cover the data set +most effectively. We discuss applications of our general framework to +decentralized optimization and distributed non-negative matrix factorization. + +
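As a rough empirical illustration of the "speed of recurrence" notion (my own toy experiment, not the paper's definitions verbatim), one can compare the gaps between consecutive visits to a data point under i.i.d. sampling and under random reshuffling:

```python
import numpy as np

def return_time_stats(sample_indices):
    """Mean and worst-case gap between consecutive visits to each data point."""
    sample_indices = np.asarray(sample_indices)
    n = sample_indices.max() + 1
    last_visit = -np.ones(n)
    gaps = []
    for t, i in enumerate(sample_indices):
        if last_visit[i] >= 0:
            gaps.append(t - last_visit[i])
        last_visit[i] = t
    gaps = np.array(gaps)
    return gaps.mean(), gaps.max()

rng = np.random.default_rng(0)
n, epochs = 50, 2000
iid = rng.integers(0, n, size=n * epochs)                          # i.i.d. sampling
reshuffled = np.concatenate([rng.permutation(n) for _ in range(epochs)])

print("i.i.d.      mean/max gap:", return_time_stats(iid))         # similar mean, long tail
print("reshuffling mean/max gap:", return_time_stats(reshuffled))  # max gap at most 2n - 1
```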
+
+ comment: 41 pages, 3 figures, 1 table +
+
+
+
+
+ + ☆ CLSA-CIM: A Cross-Layer Scheduling Approach for Computing-in-Memory + Architectures + + +
+ The demand for efficient machine learning (ML) accelerators is growing
+rapidly, driving the development of novel computing concepts such as
+resistive random access memory (RRAM)-based tiled computing-in-memory (CIM)
+architectures. CIM allows computation within the memory unit, resulting in
+faster data processing and reduced power consumption. Efficient compiler
+algorithms are essential to exploit the potential of tiled CIM architectures.
+While conventional ML compilers focus on code generation for CPUs, GPUs, and
+other von Neumann architectures, adaptations are needed to cover CIM
+architectures. Cross-layer scheduling is a promising approach, as it enhances
+the utilization of CIM cores, thereby accelerating computations. Although
+similar concepts are implicitly used in previous work, there is a lack of
+clear and quantifiable algorithmic definitions for cross-layer scheduling for
+tiled CIM architectures. To close this gap, we present CLSA-CIM, a
+cross-layer scheduling algorithm for tiled CIM architectures. We integrate
+CLSA-CIM with existing weight-mapping strategies and compare performance
+against state-of-the-art (SOTA) scheduling algorithms. CLSA-CIM improves
+utilization by up to 17.9x, resulting in an overall speedup of up to 29.2x
+compared to SOTA.
+
+
+
+
+
+ + ☆ Empirical Evidence for the Fragment level Understanding on Drug + Molecular Structure of LLMs AAAI 2024 + + +
+ AI for drug discovery has been a research hotspot in recent years, and
+SMILES-based language models have been increasingly applied in drug molecular
+design. However, no work has explored whether and how language models
+understand the chemical spatial structure from 1D sequences. In this work, we
+pre-train a transformer model on chemical language and fine-tune it toward
+drug design objectives, and investigate the correspondence between
+high-frequency SMILES substrings and molecular fragments. The results indicate
+that language models can understand chemical structures from the perspective
+of molecular fragments, and the structural knowledge learned through
+fine-tuning is reflected in the high-frequency SMILES substrings generated by
+the model.
+
+
+ comment: Accepted by AAAI 2024 workshop: Large Language Models for Biological + Discoveries (LLMs4Bio) +
+
+
+
+
+ + ☆ Learning Explainable and Better Performing Representations of POMDP + Strategies + + +
+ Strategies for partially observable Markov decision processes (POMDP) +typically require memory. One way to represent this memory is via automata. We +present a method to learn an automaton representation of a strategy using the +L*-algorithm. Compared to the tabular representation of a strategy, the +resulting automaton is dramatically smaller and thus also more explainable. +Moreover, in the learning process, our heuristics may even improve the +strategy's performance. In contrast to approaches that synthesize an automaton +directly from the POMDP thereby solving it, our approach is incomparably more +scalable. + +
+
+ comment: Technical report for the submission to TACAS 24 +
+
+
+
+
+ + ☆ MLAD: A Unified Model for Multi-system Log Anomaly Detection + + +
+ In spite of the rapid advancements in unsupervised log anomaly detection +techniques, the current mainstream models still necessitate specific training +for individual system datasets, resulting in costly procedures and limited +scalability due to dataset size, thereby leading to performance bottlenecks. +Furthermore, numerous models lack cognitive reasoning capabilities, posing +challenges in direct transferability to similar systems for effective anomaly +detection. Additionally, akin to reconstruction networks, these models often +encounter the "identical shortcut" predicament, wherein the majority of system +logs are classified as normal, erroneously predicting normal classes when +confronted with rare anomaly logs due to reconstruction errors. + To address the aforementioned issues, we propose MLAD, a novel anomaly +detection model that incorporates semantic relational reasoning across multiple +systems. Specifically, we employ Sentence-bert to capture the similarities +between log sequences and convert them into highly-dimensional learnable +semantic vectors. Subsequently, we revamp the formulas of the Attention layer +to discern the significance of each keyword in the sequence and model the +overall distribution of the multi-system dataset through appropriate vector +space diffusion. Lastly, we employ a Gaussian mixture model to highlight the +uncertainty of rare words pertaining to the "identical shortcut" problem, +optimizing the vector space of the samples using the maximum expectation model. +Experiments on three real-world datasets demonstrate the superiority of MLAD. + +
+
+
+
+
+ + ☆ Multifractal-spectral features enhance classification of anomalous + diffusion + + +
+ Anomalous diffusion processes pose a unique challenge in classification and +characterization. Previously (Mangalam et al., 2023, Physical Review Research +5, 023144), we established a framework for understanding anomalous diffusion +using multifractal formalism. The present study delves into the potential of +multifractal spectral features for effectively distinguishing anomalous +diffusion trajectories from five widely used models: fractional Brownian +motion, scaled Brownian motion, continuous time random walk, annealed transient +time motion, and L\'evy walk. To accomplish this, we generate extensive +datasets comprising $10^6$ trajectories from these five anomalous diffusion +models and extract multiple multifractal spectra from each trajectory. Our +investigation entails a thorough analysis of neural network performance, +encompassing features derived from varying numbers of spectra. Furthermore, we +explore the integration of multifractal spectra into traditional feature +datasets, enabling us to assess their impact comprehensively. To ensure a +statistically meaningful comparison, we categorize features into concept groups +and train neural networks using features from each designated group. Notably, +several feature groups demonstrate similar levels of accuracy, with the highest +performance observed in groups utilizing moving-window characteristics and +$p$-variation features. Multifractal spectral features, particularly those +derived from three spectra involving different timescales and cutoffs, closely +follow, highlighting their robust discriminatory potential. Remarkably, a +neural network exclusively trained on features from a single multifractal +spectrum exhibits commendable performance, surpassing other feature groups. Our +findings underscore the diverse and potent efficacy of multifractal spectral +features in enhancing classification of anomalous diffusion. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ ToolkenGPT: Augmenting Frozen Language Models with Massive Tools via + Tool Embeddings NeurIPS 2023 + + +
+ Augmenting large language models (LLMs) with external tools has emerged as a +promising approach to solving complex problems. However, traditional methods, +which finetune LLMs with tool demonstration data, can be both costly and +restricted to a predefined set of tools. Recent in-context learning paradigm +alleviates these issues, but the limited context length only allows for a few +shots of demonstrations, leading to suboptimal understandings of the tools. +Moreover, when there are numerous tools to choose from, in-context learning +could completely fail to work. In this paper, we propose an alternative +approach, $\textbf{ToolkenGPT}$, which combines the benefits of both sides. Our +approach represents each $\underline{tool}$ as a to$\underline{ken}$ +($\textit{toolken}$) and learns an embedding for it, enabling tool calls in the +same way as generating a regular word token. Once a toolken is triggered, the +LLM is prompted to complete arguments for the tool to execute. ToolkenGPT +offers the flexibility to plug in an arbitrary number of tools by expanding the +set of toolkens on the fly. In addition, it improves tool use by allowing +extensive demonstration data for learning the toolken embeddings. In diverse +domains, including numerical reasoning, knowledge-based question answering, and +embodied plan generation, our approach effectively augments LLMs with tools and +substantially outperforms various latest baselines. ToolkenGPT demonstrates the +promising ability to use relevant tools from a large tool set in complex +scenarios. + +
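A conceptual PyTorch sketch of the central mechanism described above (my simplification with made-up dimensions, not the released implementation): the frozen LM head is extended with a small set of learnable "toolken" rows, so predicting a tool call amounts to predicting one extra token id.

```python
import torch
import torch.nn as nn

class ToolkenHead(nn.Module):
    """Frozen LM output head extended with learnable toolken embeddings (sketch)."""
    def __init__(self, lm_head_weight: torch.Tensor, num_tools: int):
        super().__init__()
        hidden = lm_head_weight.shape[1]
        self.register_buffer("word_head", lm_head_weight)         # frozen vocabulary rows
        self.toolkens = nn.Parameter(torch.randn(num_tools, hidden) * 0.02)  # trainable

    def forward(self, hidden_states):
        word_logits = hidden_states @ self.word_head.T            # scores for normal tokens
        tool_logits = hidden_states @ self.toolkens.T             # scores for tool calls
        return torch.cat([word_logits, tool_logits], dim=-1)

# Toy usage with random tensors standing in for a real frozen LM.
head = ToolkenHead(lm_head_weight=torch.randn(32000, 512), num_tools=8)
logits = head(torch.randn(2, 16, 512))
print(logits.shape)   # (2, 16, 32008); ids >= 32000 would switch the LM into tool-call mode
```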
+
+ comment: NeurIPS 2023 (oral). Code: https://github.com/Ber666/ToolkenGPT +
+
+
+
+
+ + ♻ ☆ Tensor-on-Tensor Regression: Riemannian Optimization, + Over-parameterization, Statistical-computational Gap, and Their Interplay + + +
+ We study the tensor-on-tensor regression, where the goal is to connect tensor +responses to tensor covariates with a low Tucker rank parameter tensor/matrix +without the prior knowledge of its intrinsic rank. We propose the Riemannian +gradient descent (RGD) and Riemannian Gauss-Newton (RGN) methods and cope with +the challenge of unknown rank by studying the effect of rank +over-parameterization. We provide the first convergence guarantee for the +general tensor-on-tensor regression by showing that RGD and RGN respectively +converge linearly and quadratically to a statistically optimal estimate in both +rank correctly-parameterized and over-parameterized settings. Our theory +reveals an intriguing phenomenon: Riemannian optimization methods naturally +adapt to over-parameterization without modifications to their implementation. +We also prove the statistical-computational gap in scalar-on-tensor regression +by a direct low-degree polynomial argument. Our theory demonstrates a "blessing +of statistical-computational gap" phenomenon: in a wide range of scenarios in +tensor-on-tensor regression for tensors of order three or higher, the +computationally required sample size matches what is needed by moderate rank +over-parameterization when considering computationally feasible estimators, +while there are no such benefits in the matrix settings. This shows moderate +rank over-parameterization is essentially "cost-free" in terms of sample size +in tensor-on-tensor regression of order three or higher. Finally, we conduct +simulation studies to show the advantages of our proposed methods and to +corroborate our theoretical findings. + +
+
+
+
+
+ + ♻ ☆ Deep Learning Architecture for Network-Efficiency at the Edge + + +
+ The growing number of AI-driven applications in the mobile devices has led to +solutions that integrate deep learning models with the available edge-cloud +resources; due to multiple benefits such as reduction in on-device energy +consumption, improved latency, improved network usage, and certain privacy +improvements, split learning, where deep learning models are split away from +the mobile device and computed in a distributed manner, has become an +extensively explored topic. Combined with compression-aware methods where +learning adapts to compression of communicated data, the benefits of this +approach have further improved and could serve as an alternative to established +approaches like federated learning methods. In this work, we develop an +adaptive compression-aware split learning method ('deprune') to improve and +train deep learning models so that they are much more network-efficient (use +less network resources and are faster), which would make them ideal to deploy +in weaker devices with the help of edge-cloud resources. This method is also +extended ('prune') to very quickly train deep learning models, through a +transfer learning approach, that trades off little accuracy for much more +network-efficient inference abilities. We show that the 'deprune' method can +reduce network usage by 4x when compared with a split-learning approach (that +does not use our method) without loss of accuracy, while also improving +accuracy over compression-aware split-learning by 4 percent. Lastly, we show +that the 'prune' method can reduce the training time for certain models by up +to 6x without affecting the accuracy when compared against a compression-aware +split-learning approach. + +
+
+
+
+
+ + ♻ ☆ Learning to Unlearn: Instance-wise Unlearning for Pre-trained + Classifiers AAAI 2024 + + +
+ Since the recent advent of regulations for data protection (e.g., the General +Data Protection Regulation), there has been increasing demand in deleting +information learned from sensitive data in pre-trained models without +retraining from scratch. The inherent vulnerability of neural networks towards +adversarial attacks and unfairness also calls for a robust method to remove or +correct information in an instance-wise fashion, while retaining the predictive +performance across remaining data. To this end, we consider instance-wise +unlearning, of which the goal is to delete information on a set of instances +from a pre-trained model, by either misclassifying each instance away from its +original prediction or relabeling the instance to a different label. We also +propose two methods that reduce forgetting on the remaining data: 1) utilizing +adversarial examples to overcome forgetting at the representation-level and 2) +leveraging weight importance metrics to pinpoint network parameters guilty of +propagating unwanted information. Both methods only require the pre-trained +model and data instances to forget, allowing painless application to real-life +settings where the entire training set is unavailable. Through extensive +experimentation on various image classification benchmarks, we show that our +approach effectively preserves knowledge of remaining data while unlearning +given instances in both single-task and continual unlearning scenarios. + +
+
+ comment: AAAI 2024 camera ready version +
+
+
+
+
+ + ♻ ☆ Interventions Against Machine-Assisted Statistical Discrimination + + +
+ This article studies how to intervene against statistical discrimination, +when it is based on beliefs generated by machine learning, rather than by +humans. Unlike beliefs formed by a human mind, machine learning-generated +beliefs are verifiable. This allows interventions to move beyond simple, +belief-free designs like affirmative action, to more sophisticated ones, that +constrain decision makers in ways that depend on what they are thinking. Such +mind reading interventions can perform well where affirmative action does not, +even when the beliefs being conditioned on are possibly incorrect and biased. + +
+
+
+
+
+ + ♻ ☆ QuIP: 2-Bit Quantization of Large Language Models With Guarantees + + +
+ This work studies post-training parameter quantization in large language +models (LLMs). We introduce quantization with incoherence processing (QuIP), a +new method based on the insight that quantization benefits from +$\textit{incoherent}$ weight and Hessian matrices, i.e., from the weights being +even in magnitude and the directions in which it is important to round them +accurately being unaligned with the coordinate axes. QuIP consists of two +steps: (1) an adaptive rounding procedure minimizing a quadratic proxy +objective; (2) efficient pre- and post-processing that ensures weight and +Hessian incoherence via multiplication by random orthogonal matrices. We +complement QuIP with the first theoretical analysis for an LLM-scale +quantization algorithm, and show that our theory also applies to an existing +method, OPTQ. Empirically, we find that our incoherence preprocessing improves +several existing quantization algorithms and yields the first LLM quantization +methods that produce viable results using only two bits per weight. Our code +can be found at https://github.com/Cornell-RelaxML/QuIP. + +
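The sketch below illustrates only the incoherence-processing ingredient (random orthogonal rotations before and after rounding) with a deliberately naive uniform 2-bit grid; QuIP itself pairs this with an adaptive rounding procedure, so treat this as an assumption-laden toy rather than the method. On a matrix with a few outlier weights, the rotated version typically rounds more accurately.

```python
import numpy as np

def random_orthogonal(n, rng):
    q, r = np.linalg.qr(rng.normal(size=(n, n)))
    return q * np.sign(np.diag(r))             # sign fix so Q is uniformly distributed

def quantize_2bit(w):
    """Naive nearest-neighbour rounding onto 4 levels spanning the observed range."""
    grid = np.linspace(w.min(), w.max(), 4)
    return grid[np.abs(w[..., None] - grid).argmin(-1)]

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 64))
W[rng.integers(0, 64, 20), rng.integers(0, 64, 20)] += 8.0   # a few outlier weights

U, V = random_orthogonal(64, rng), random_orthogonal(64, rng)
W_rot = U @ W @ V.T                            # incoherence processing spreads out outliers
W_hat = U.T @ quantize_2bit(W_rot) @ V         # quantize in the rotated basis, rotate back

print("plain rounding error       :", np.linalg.norm(quantize_2bit(W) - W))
print("with incoherence processing:", np.linalg.norm(W_hat - W))
```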
+
+
+
+
+ + ♻ ☆ Morphological Profiling for Drug Discovery in the Era of Deep Learning + + +
+ Morphological profiling is a valuable tool in phenotypic drug discovery. The +advent of high-throughput automated imaging has enabled the capturing of a wide +range of morphological features of cells or organisms in response to +perturbations at the single-cell resolution. Concurrently, significant advances +in machine learning and deep learning, especially in computer vision, have led +to substantial improvements in analyzing large-scale high-content images at +high-throughput. These efforts have facilitated understanding of compound +mechanism-of-action (MOA), drug repurposing, characterization of cell +morphodynamics under perturbation, and ultimately contributing to the +development of novel therapeutics. In this review, we provide a comprehensive +overview of the recent advances in the field of morphological profiling. We +summarize the image profiling analysis workflow, survey a broad spectrum of +analysis strategies encompassing feature engineering- and deep learning-based +approaches, and introduce publicly available benchmark datasets. We place a +particular emphasis on the application of deep learning in this pipeline, +covering cell segmentation, image representation learning, and multimodal +learning. Additionally, we illuminate the application of morphological +profiling in phenotypic drug discovery and highlight potential challenges and +opportunities in this field. + +
+
+ comment: 44 pages, 5 figure, 5 tables +
+
+
+
+
+ + ♻ ☆ Learning to Transform for Generalizable Instance-wise Invariance ICCV 2023 + + +
+ Computer vision research has long aimed to build systems that are robust to +spatial transformations found in natural data. Traditionally, this is done +using data augmentation or hard-coding invariances into the architecture. +However, too much or too little invariance can hurt, and the correct amount is +unknown a priori and dependent on the instance. Ideally, the appropriate +invariance would be learned from data and inferred at test-time. + We treat invariance as a prediction problem. Given any image, we use a +normalizing flow to predict a distribution over transformations and average the +predictions over them. Since this distribution only depends on the instance, we +can align instances before classifying them and generalize invariance across +classes. The same distribution can also be used to adapt to out-of-distribution +poses. This normalizing flow is trained end-to-end and can learn a much larger +range of transformations than Augerino and InstaAug. When used as data +augmentation, our method shows accuracy and robustness gains on CIFAR 10, +CIFAR10-LT, and TinyImageNet. + +
+
+ comment: Accepted to ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Time-changed normalizing flows for accurate SDE modeling + + +
+ The generative paradigm has become increasingly important in machine learning +and deep learning models. Among popular generative models are normalizing +flows, which enable exact likelihood estimation by transforming a base +distribution through diffeomorphic transformations. Extending the normalizing +flow framework to handle time-indexed flows gave dynamic normalizing flows, a +powerful tool to model time series, stochastic processes, and neural stochastic +differential equations (SDEs). In this work, we propose a novel variant of +dynamic normalizing flows, a Time Changed Normalizing Flow (TCNF), based on +time deformation of a Brownian motion which constitutes a versatile and +extensive family of Gaussian processes. This approach enables us to effectively +model some SDEs, that cannot be modeled otherwise, including standard ones such +as the well-known Ornstein-Uhlenbeck process, and generalizes prior +methodologies, leading to improved results and better inference and prediction +capability. + +
+
+
+
+
+ + ♻ ☆ Act as You Learn: Adaptive Decision-Making in Non-Stationary Markov + Decision Processes AAMAS + + +
+ A fundamental (and largely open) challenge in sequential decision-making is +dealing with non-stationary environments, where exogenous environmental +conditions change over time. Such problems are traditionally modeled as +non-stationary Markov decision processes (NSMDP). However, existing approaches +for decision-making in NSMDPs have two major shortcomings: first, they assume +that the updated environmental dynamics at the current time are known (although +future dynamics can change); and second, planning is largely pessimistic, i.e., +the agent acts ``safely'' to account for the non-stationary evolution of the +environment. We argue that both these assumptions are invalid in practice -- +updated environmental conditions are rarely known, and as the agent interacts +with the environment, it can learn about the updated dynamics and avoid being +pessimistic, at least in states whose dynamics it is confident about. We +present a heuristic search algorithm called \textit{Adaptive Monte Carlo Tree +Search (ADA-MCTS)} that addresses these challenges. We show that the agent can +learn the updated dynamics of the environment over time and then act as it +learns, i.e., if the agent is in a region of the state space about which it has +updated knowledge, it can avoid being pessimistic. To quantify ``updated +knowledge,'' we disintegrate the aleatoric and epistemic uncertainty in the +agent's updated belief and show how the agent can use these estimates for +decision-making. We compare the proposed approach with the multiple +state-of-the-art approaches in decision-making across multiple well-established +open-source problems and empirically show that our approach is faster and +highly adaptive without sacrificing safety. + +
+
+ comment: Accepted for publication at the International Conference on + Autonomous Agents and MultiAgent Systems (AAMAS), 2024 +
+
+
+
+
+ + ♻ ☆ A PAC Learning Algorithm for LTL and Omega-regular Objectives in MDPs + + +
+ Linear temporal logic (LTL) and omega-regular objectives -- a superset of LTL +-- have seen recent use as a way to express non-Markovian objectives in +reinforcement learning. We introduce a model-based probably approximately +correct (PAC) learning algorithm for omega-regular objectives in Markov +decision processes (MDPs). As part of the development of our algorithm, we +introduce the epsilon-recurrence time: a measure of the speed at which a policy +converges to the satisfaction of the omega-regular objective in the limit. We +prove that our algorithm only requires a polynomial number of samples in the +relevant parameters, and perform experiments which confirm our theory. + +
+
+
+
+
+ + ♻ ☆ Fuzz4All: Universal Fuzzing with Large Language Models ICSE 2024 + + +
+ Fuzzing has achieved tremendous success in discovering bugs and
+vulnerabilities in various software systems. Systems under test (SUTs) that
+take programming or formal languages as inputs, e.g., compilers, runtime
+engines, constraint solvers, and software libraries with accessible APIs, are
+especially important as they are fundamental building blocks of software
+development. However, existing fuzzers for such systems often target a
+specific language, and thus cannot be easily applied to other languages or
+even other versions of the same language. Moreover, the inputs generated by
+existing fuzzers are often limited to specific features of the input
+language, and thus can hardly reveal bugs related to other or new features.
+This paper presents Fuzz4All, the first fuzzer that is universal in the sense
+that it can target many different input languages and many different features
+of these languages. The key idea behind Fuzz4All is to leverage large
+language models (LLMs) as an input generation and mutation engine, which
+enables the approach to produce diverse and realistic inputs for any
+practically relevant language. To realize this potential, we present a novel
+autoprompting technique, which creates LLM prompts that are well-suited for
+fuzzing, and a novel LLM-powered fuzzing loop, which iteratively updates the
+prompt to create new fuzzing inputs. We evaluate Fuzz4All on nine systems
+under test that take in six different languages (C, C++, Go, SMT2, Java and
+Python) as inputs. The evaluation shows, across all six languages, that
+universal fuzzing achieves higher coverage than existing, language-specific
+fuzzers. Furthermore, Fuzz4All has identified 98 bugs in widely used systems,
+such as GCC, Clang, Z3, CVC5, OpenJDK, and the Qiskit quantum computing
+platform, with 64 bugs already confirmed by developers as previously unknown.
+
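To sketch the shape of such an LLM-powered fuzzing loop (schematic pseudocode in Python; `llm` and `run_sut` are hypothetical stand-ins, and the prompt-update strategies are paraphrased from the abstract rather than copied from the tool):

```python
import random

def fuzzing_loop(llm, run_sut, doc_snippets, budget=100):
    """Schematic autoprompting plus iterative fuzzing loop (illustration only)."""
    # Autoprompting: distill user-provided documentation into a generation prompt.
    prompt = llm("Turn the following documentation into an instruction for "
                 "generating diverse test programs:\n" + "\n".join(doc_snippets))
    crashes = []
    for _ in range(budget):
        candidate = llm(prompt)                  # generate a new fuzzing input
        ok, log = run_sut(candidate)             # compile / run on the system under test
        if not ok:
            crashes.append((candidate, log))
        # Iteratively update the prompt, e.g. by attaching a previous example and
        # asking for a new, mutated, or semantically equivalent variant.
        strategy = random.choice(["generate a new input",
                                  "mutate the example",
                                  "produce a semantically equivalent variant"])
        prompt = f"{prompt}\n# next: {strategy}\n# example:\n{candidate}"
    return crashes

# Toy stand-ins so the sketch runs end-to-end.
found = fuzzing_loop(llm=lambda p: "int main() { return 0; }",
                     run_sut=lambda src: (True, ""),
                     doc_snippets=["(documentation excerpt)"], budget=3)
print(len(found))
```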
+
+ comment: Accepted at ICSE 2024 +
+
+
+
+
+ + ♻ ☆ To Stay or Not to Stay in the Pre-train Basin: Insights on Ensembling in + Transfer Learning NeurIPS 2023 + + +
+ Transfer learning and ensembling are two popular techniques for improving the +performance and robustness of neural networks. Due to the high cost of +pre-training, ensembles of models fine-tuned from a single pre-trained +checkpoint are often used in practice. Such models end up in the same basin of +the loss landscape, which we call the pre-train basin, and thus have limited +diversity. In this work, we show that ensembles trained from a single +pre-trained checkpoint may be improved by better exploring the pre-train basin, +however, leaving the basin results in losing the benefits of transfer learning +and in degradation of the ensemble quality. Based on the analysis of existing +exploration methods, we propose a more effective modification of the Snapshot +Ensembles (SSE) for transfer learning setup, StarSSE, which results in stronger +ensembles and uniform model soups. + +
+
+ comment: Published in NeurIPS 2023. First two authors contributed equally +
+
+
+
+
+ + ♻ ☆ Contrastive Active Inference NeurIPS 2021 + + +
+ Active inference is a unifying theory for perception and action resting upon +the idea that the brain maintains an internal model of the world by minimizing +free energy. From a behavioral perspective, active inference agents can be seen +as self-evidencing beings that act to fulfill their optimistic predictions, +namely preferred outcomes or goals. In contrast, reinforcement learning +requires human-designed rewards to accomplish any desired outcome. Although +active inference could provide a more natural self-supervised objective for +control, its applicability has been limited because of the shortcomings in +scaling the approach to complex environments. In this work, we propose a +contrastive objective for active inference that strongly reduces the +computational burden in learning the agent's generative model and planning +future actions. Our method performs notably better than likelihood-based active +inference in image-based tasks, while also being computationally cheaper and +easier to train. We compare to reinforcement learning agents that have access +to human-designed reward functions, showing that our approach closely matches +their performance. Finally, we also show that contrastive methods perform +significantly better in the case of distractors in the environment and that our +method is able to generalize goals to variations in the background. Website and +code: https://contrastive-aif.github.io/ + +
+
+ comment: Accepted as a conference paper at 35th Conference on Neural + Information Processing Systems (NeurIPS 2021) +
+
+
+
+
+ + ♻ ☆ Resource-Efficient Separation Transformer ICASSP 2024 + + +
+ Transformers have recently achieved state-of-the-art performance in speech +separation. These models, however, are computationally demanding and require a +lot of learnable parameters. This paper explores Transformer-based speech +separation with a reduced computational cost. Our main contribution is the +development of the Resource-Efficient Separation Transformer (RE-SepFormer), a +self-attention-based architecture that reduces the computational burden in two +ways. First, it uses non-overlapping blocks in the latent space. Second, it +operates on compact latent summaries calculated from each chunk. The +RE-SepFormer reaches a competitive performance on the popular WSJ0-2Mix and +WHAM! datasets in both causal and non-causal settings. Remarkably, it scales +significantly better than the previous Transformer-based architectures in terms +of memory and inference time, making it more suitable for processing long +mixtures. + +
+
+ comment: Accepted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ Gradient Descent with Linearly Correlated Noise: Theory and Applications + to Differential Privacy + + +
+ We study gradient descent under linearly correlated noise. Our work is +motivated by recent practical methods for optimization with differential +privacy (DP), such as DP-FTRL, which achieve strong performance in settings +where privacy amplification techniques are infeasible (such as in federated +learning). These methods inject privacy noise through a matrix factorization +mechanism, making the noise linearly correlated over iterations. We propose a +simplified setting that distills key facets of these methods and isolates the +impact of linearly correlated noise. We analyze the behavior of gradient +descent in this setting, for both convex and non-convex functions. Our analysis +is demonstrably tighter than prior work and recovers multiple important special +cases exactly (including anticorrelated perturbed gradient descent). We use our +results to develop new, effective matrix factorizations for differentially +private optimization, and highlight the benefits of these factorizations +theoretically and empirically. + +
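As a toy illustration of why the correlation structure matters (my own example on a quadratic; anticorrelated perturbed gradient descent is one of the special cases the analysis recovers): injecting differences of Gaussians instead of independent Gaussians tends to leave the iterate closer to the optimum in this setup.

```python
import numpy as np

def noisy_gd(correlated, steps=5000, lr=0.1, sigma=0.5, dim=10, seed=0):
    """GD on f(x) = 0.5 * ||x||^2 with injected Gaussian noise.

    correlated=False: independent noise z_t at every step.
    correlated=True : linearly correlated (anticorrelated) noise z_t - z_{t-1}.
    """
    rng = np.random.default_rng(seed)
    x = np.ones(dim)
    z_prev = np.zeros(dim)
    for _ in range(steps):
        z = rng.normal(scale=sigma, size=dim)
        noise = (z - z_prev) if correlated else z
        z_prev = z
        x = x - lr * (x + noise)          # gradient of 0.5 * ||x||^2 is x
    return 0.5 * x @ x

print("independent noise   :", noisy_gd(correlated=False))
print("anticorrelated noise:", noisy_gd(correlated=True))
```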
+
+
+
+
+ + ♻ ☆ Improved Information Theoretic Generalization Bounds for Distributed and + Federated Learning + + +
+ We consider information-theoretic bounds on expected generalization error for +statistical learning problems in a networked setting. In this setting, there +are $K$ nodes, each with its own independent dataset, and the models from each +node have to be aggregated into a final centralized model. We consider both +simple averaging of the models as well as more complicated multi-round +algorithms. We give upper bounds on the expected generalization error for a +variety of problems, such as those with Bregman divergence or Lipschitz +continuous losses, that demonstrate an improved dependence of $1/K$ on the +number of nodes. These "per node" bounds are in terms of the mutual information +between the training dataset and the trained weights at each node, and are +therefore useful in describing the generalization properties inherent to having +communication or privacy constraints at each node. + +
+
+ comment: This version of the paper adds an assumption that was missing from + Theorem 4 for loss functions of type (i). Thanks to Peyman Gholami for + spotting this bug +
+
+
+
+
+ + ♻ ☆ Strategic Classification under Unknown Personalized Manipulation + + +
+ We study the fundamental mistake bound and sample complexity in the strategic +classification, where agents can strategically manipulate their feature vector +up to an extent in order to be predicted as positive. For example, given a +classifier determining college admission, student candidates may try to take +easier classes to improve their GPA, retake SAT and change schools in an effort +to fool the classifier. Ball manipulations are a widely studied class of +manipulations in the literature, where agents can modify their feature vector +within a bounded radius ball. Unlike most prior work, our work considers +manipulations to be personalized, meaning that agents can have different levels +of manipulation abilities (e.g., varying radii for ball manipulations), and +unknown to the learner. + We formalize the learning problem in an interaction model where the learner +first deploys a classifier and the agent manipulates the feature vector within +their manipulation set to game the deployed classifier. We investigate various +scenarios in terms of the information available to the learner during the +interaction, such as observing the original feature vector before or after +deployment, observing the manipulated feature vector, or not seeing either the +original or the manipulated feature vector. We begin by providing online +mistake bounds and PAC sample complexity in these scenarios for ball +manipulations. We also explore non-ball manipulations and show that, even in +the simplest scenario where both the original and the manipulated feature +vectors are revealed, the mistake bounds and sample complexity are lower +bounded by $\Omega(|H|)$ when the target function belongs to a known class $H$. + +
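To make the ball-manipulation setting concrete, here is a small illustration (a generic example with hypothetical numbers, not from the paper) of an agent's best response to a deployed linear classifier under an L2 ball of personalized radius:

```python
import numpy as np

def best_response(x, w, b, radius):
    """Agent's best response under an L2 ball manipulation against w @ x + b >= 0."""
    margin = w @ x + b
    if margin >= 0:
        return x                                   # already predicted positive
    dist = -margin / np.linalg.norm(w)             # distance to the decision boundary
    if dist <= radius:
        return x + (dist + 1e-9) * w / np.linalg.norm(w)   # just cross the boundary
    return x                                       # boundary unreachable: do not bother

w, b = np.array([1.0, 1.0]), -2.0
x = np.array([0.4, 0.8])                           # truthfully classified negative
for r in (0.3, 1.0):                               # personalized manipulation radii
    x_manip = best_response(x, w, b, radius=r)
    print(r, x_manip, bool(w @ x_manip + b >= 0))
```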
+
+
+
+
+ + ♻ ☆ PhilEO Bench: Evaluating Geo-Spatial Foundation Models + + +
+ Massive amounts of unlabelled data are captured by Earth Observation (EO)
+satellites, with the Sentinel-2 constellation generating 1.6 TB of data
+daily. This makes Remote Sensing a data-rich domain well suited to Machine
+Learning (ML) solutions. However, a bottleneck in applying ML models to EO is
+the lack of annotated data, as annotation is a labour-intensive and costly
+process. As a result, research in this domain has focused on Self-Supervised
+Learning and Foundation Model approaches. This paper addresses the need to
+evaluate different Foundation Models on a fair and uniform benchmark by
+introducing the PhilEO Bench, a novel evaluation framework for EO Foundation
+Models. The framework comprises a testbed and a novel 400 GB Sentinel-2
+dataset containing labels for three downstream tasks: building density
+estimation, road segmentation, and land cover classification. We present
+experiments using our framework to evaluate different Foundation Models,
+including Prithvi and SatMAE, at multiple n-shots and convergence rates.
+
+
+ comment: 6 pages, 5 figures, Submitted to IGARSS 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Node Selection in Private Personalized Decentralized Learning + + +
+ Personalized decentralized learning is a promising paradigm for distributed +learning, enabling each node to train a local model on its own data and +collaborate with other nodes to improve without sharing any data. However, this +approach poses significant privacy risks, as nodes may inadvertently disclose +sensitive information about their data or preferences through their +collaboration choices. In this paper, we propose Private Personalized +Decentralized Learning (PPDL), a novel approach that combines secure +aggregation and correlated adversarial multi-armed bandit optimization to +protect node privacy while facilitating efficient node selection. By leveraging +dependencies between different arms, represented by potential collaborators, we +demonstrate that PPDL can effectively identify suitable collaborators solely +based on aggregated models. Additionally, we show that PPDL surpasses previous +non-private methods in model performance on standard benchmarks under label and +covariate shift scenarios. + +
+
+
+
+
+ + ♻ ☆ A Trade-off Analysis of Replacing Proprietary LLMs with Open Source SLMs + in Production + + +
+ Many companies rely on APIs of managed AI models such as OpenAI's GPT-4 to +create AI-enabled experiences in their products. Along with the benefits of +ease of use and shortened time to production, this reliance on proprietary APIs +has downsides in terms of model control, performance reliability, up-time +predictability, and cost. At the same time, there has been a flurry of open +source small language models (SLMs) that have been made available for +commercial use. However, their readiness to replace existing capabilities +remains unclear, and a systematic approach to test these models is not readily +available. In this paper, we present a systematic evaluation methodology for, +and characterization of, modern open source SLMs and their trade-offs when +replacing a proprietary LLM APIs for a real-world product feature. We have +designed SLaM, an automated analysis tool that enables the quantitative and +qualitative testing of product features utilizing arbitrary SLMs. Using SLaM, +we examine both the quality and the performance characteristics of modern SLMs +relative to an existing customer-facing OpenAI-based implementation. We find +that across 9 SLMs and 29 variants, we observe competitive quality-of-results +for our use case, significant performance consistency improvement, and a cost +reduction of 5x-29x when compared to OpenAI GPT-4. + +
+
+ comment: Updated title +
+
+
+
+
+ + ♻ ☆ Learned Interferometric Imaging for the SPIDER Instrument + + +
+ The Segmented Planar Imaging Detector for Electro-Optical Reconnaissance +(SPIDER) is an optical interferometric imaging device that aims to offer an +alternative to the large space telescope designs of today with reduced size, +weight and power consumption. This is achieved through interferometric imaging. +State-of-the-art methods for reconstructing images from interferometric +measurements adopt proximal optimization techniques, which are computationally +expensive and require handcrafted priors. In this work we present two +data-driven approaches for reconstructing images from measurements made by the +SPIDER instrument. These approaches use deep learning to learn prior +information from training data, increasing the reconstruction quality, and +significantly reducing the computation time required to recover images by +orders of magnitude. Reconstruction time is reduced to ${\sim} 10$ +milliseconds, opening up the possibility of real-time imaging with SPIDER for +the first time. Furthermore, we show that these methods can also be applied in +domains where training data is scarce, such as astronomical imaging, by +leveraging transfer learning from domains where plenty of training data are +available. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ A tree-based varying coefficient model + + +
+ The paper introduces a tree-based varying coefficient model (VCM) where the +varying coefficients are modelled using the cyclic gradient boosting machine +(CGBM) from Delong et al. (2023). Modelling the coefficient functions using a +CGBM allows for dimension-wise early stopping and feature importance scores. +The dimension-wise early stopping not only reduces the risk of +dimension-specific overfitting, but also reveals differences in model +complexity across dimensions. The use of feature importance scores allows for +simple feature selection and easy model interpretation. The model is evaluated +on the same simulated and real data examples as those used in Richman and +W\"uthrich (2023), and the results show that it produces results in terms of +out of sample loss that are comparable to those of their neural network-based +VCM called LocalGLMnet. + +
+
+ comment: 23 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Topological Learning in Multi-Class Data Sets + + +
+ We specialize techniques from topological data analysis to the problem of +characterizing the topological complexity (as defined in the body of the paper) +of a multi-class data set. As a by-product, a topological classifier is defined +that uses an open sub-covering of the data set. This sub-covering can be used +to construct a simplicial complex whose topological features (e.g., Betti +numbers) provide information about the classification problem. We use these +topological constructs to study the impact of topological complexity on +learning in feedforward deep neural networks (DNNs). We hypothesize that +topological complexity is negatively correlated with the ability of a fully +connected feedforward deep neural network to learn to classify data correctly. +We evaluate our topological classification algorithm on multiple constructed +and open source data sets. We also validate our hypothesis regarding the +relationship between topological complexity and learning in DNN's on multiple +data sets. + +
+
+ comment: 16 pages, 18 figures. This is a revision of v2 +
+
+
+
+
+ + ♻ ☆ Multi-task convolutional neural network for image aesthetic assessment + + +
+ As people's aesthetic preferences for images are far from understood, image +aesthetic assessment is a challenging artificial intelligence task. The range +of factors underlying this task is almost unlimited, but we know that some +aesthetic attributes affect those preferences. In this study, we present a +multi-task convolutional neural network that takes into account these +attributes. The proposed neural network jointly learns the attributes along +with the overall aesthetic scores of images. This multi-task learning framework +allows for effective generalization through the utilization of shared +representations. Our experiments demonstrate that the proposed method +outperforms the state-of-the-art approaches in predicting overall aesthetic +scores for images in one benchmark of image aesthetics. We achieve near-human +performance in terms of overall aesthetic scores when considering the +Spearman's rank correlations. Moreover, our model pioneers the application of +multi-tasking in another benchmark, serving as a new baseline for future +research. Notably, our approach achieves this performance while using fewer +parameters compared to existing multi-task neural networks in the literature, +and consequently makes our method more efficient in terms of computational +complexity. + +
+
+
+
+
+ + ♻ ☆ Deep Signature Algorithm for Multi-dimensional Path-Dependent Options + + +
+ In this work, we study deep signature algorithms for path-dependent options.
+We extend the backward scheme in [Hur\'e-Pham-Warin, Mathematics of
+Computation 89, no. 324 (2020)] for state-dependent FBSDEs with reflections to
+path-dependent FBSDEs with reflections, by adding a signature layer to the
+backward scheme. Our algorithm applies to both European- and American-type
+option pricing problems in which the payoff function depends on the whole path
+of the underlying forward stock process. We provide a convergence analysis of
+our numerical algorithm with explicit dependence on the truncation order of
+the signature and the neural network approximation errors. Numerical examples
+are provided, including an Amerasian option under the Black-Scholes model, an
+American option with a path-dependent geometric mean payoff function, and
+Shiryaev's optimal stopping problem.
+
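The central feature used by such methods is the truncated path signature. The snippet below, a small sketch unrelated to the authors' code, computes the level-1 and level-2 signature terms of a piecewise-linear path directly from its increments; in a deep signature scheme these features would feed the networks in the backward recursion.

import numpy as np

def signature_up_to_level2(path):
    # path: (n, d) array of points of a piecewise-linear path.
    # Level 1: total increment. Level 2: iterated integrals over the path,
    # accumulated segment by segment (exact for piecewise-linear paths).
    increments = np.diff(path, axis=0)
    level1 = increments.sum(axis=0)
    d = path.shape[1]
    level2 = np.zeros((d, d))
    running = np.zeros(d)
    for dx in increments:
        level2 += np.outer(running, dx) + 0.5 * np.outer(dx, dx)
        running += dx
    return level1, level2

# Toy 2-d path; the antisymmetric part of level 2 is the signed (Levy) area.
path = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
print(signature_up_to_level2(path))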
+
+ comment: 21 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ Bringing the Discussion of Minima Sharpness to the Audio Domain: a + Filter-Normalised Evaluation for Acoustic Scene Classification + + +
+ The correlation between the sharpness of loss minima and generalisation in
+the context of deep neural networks has been subject to discussion for a long
+time. While this aspect has mostly been investigated on selected benchmark
+data sets from computer vision, we explore it for the acoustic scene
+classification task of the DCASE2020 challenge data. Our analysis is based on
+two-dimensional filter-normalised visualisations and a derived sharpness
+measure. Our exploratory analysis shows that sharper minima tend to show
+better generalisation than flat minima (even more so for out-of-domain data
+recorded from previously unseen devices), thus adding to the dispute about
+better generalisation capabilities of flat minima. We further find that, in
+particular, the choice of optimisers is a main driver of the sharpness of
+minima, and we discuss resulting limitations with respect to comparability.
+Our code, trained model states and loss landscape visualisations are publicly
+available.
+
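For readers unfamiliar with filter-normalised loss landscapes, the sketch below draws one random direction whose filters are rescaled to the norms of the trained filters (in the spirit of Li et al., 2018) and evaluates the loss along it; the paper's two-dimensional plots and derived sharpness measure build on the same ingredients, but the exact recipe shown here is an assumption.

import torch

@torch.no_grad()
def filter_normalised_direction(model):
    # One random direction in weight space, rescaled filter-by-filter so each
    # filter matches the norm of the corresponding trained filter; bias and
    # BatchNorm parameters are left unperturbed (a common convention).
    direction = []
    for p in model.parameters():
        d = torch.randn_like(p)
        if p.dim() > 1:
            for d_f, p_f in zip(d, p):   # iterate over filters along dim 0
                d_f.mul_(p_f.norm() / (d_f.norm() + 1e-10))
        else:
            d.zero_()
        direction.append(d)
    return direction

@torch.no_grad()
def loss_slice(model, loss_fn, batch, direction, alphas):
    # Evaluate L(theta + alpha * d) for a range of step sizes alpha.
    x, y = batch
    theta0 = [p.detach().clone() for p in model.parameters()]
    losses = []
    for alpha in alphas:
        for p, p0, d in zip(model.parameters(), theta0, direction):
            p.copy_(p0 + alpha * d)
        losses.append(loss_fn(model(x), y).item())
    for p, p0 in zip(model.parameters(), theta0):
        p.copy_(p0)                      # restore the trained weights
    return losses

A sharper minimum shows up as a faster increase of such a slice around alpha = 0.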
+
+ comment: This work has been submitted to the IEEE for possible publication +
+
+
+
+
+ + ♻ ☆ Identifying Policy Gradient Subspaces + + +
+ Policy gradient methods hold great potential for solving complex continuous +control tasks. Still, their training efficiency can be improved by exploiting +structure within the optimization problem. Recent work indicates that +supervised learning can be accelerated by leveraging the fact that gradients +lie in a low-dimensional and slowly-changing subspace. In this paper, we +conduct a thorough evaluation of this phenomenon for two popular deep policy +gradient methods on various simulated benchmark tasks. Our results demonstrate +the existence of such gradient subspaces despite the continuously changing data +distribution inherent to reinforcement learning. These findings reveal +promising directions for future work on more efficient reinforcement learning, +e.g., through improving parameter-space exploration or enabling second-order +optimization. + +
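A simple way to probe for such a subspace (not necessarily the paper's exact protocol) is to stack flattened gradients collected during training and measure how much of their variance the top singular directions explain:

import numpy as np

def gradient_subspace_fraction(gradients, k):
    # gradients: (num_steps, num_params) matrix of flattened policy gradients.
    # Returns the fraction of gradient variance captured by the top-k singular
    # directions, a crude proxy for the existence of a low-dimensional subspace.
    G = np.asarray(gradients)
    _, s, _ = np.linalg.svd(G - G.mean(axis=0), full_matrices=False)
    return (s[:k] ** 2).sum() / (s ** 2).sum()

# Toy check: gradients drawn from a 3-dimensional subspace of a 100-d space.
basis = np.random.randn(3, 100)
grads = np.random.randn(500, 3) @ basis
print(gradient_subspace_fraction(grads, k=3))   # close to 1.0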
+
+ comment: 21 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Harmonizing Covariance and Expressiveness for Deep Hamiltonian + Regression in Crystalline Material Research: a Hybrid Cascaded Regression + Framework + + +
+ Deep learning for Hamiltonian regression of quantum systems in material
+research must satisfy covariance laws; among these, achieving
+SO(3)-equivariance without sacrificing network expressiveness remains an
+elusive challenge, because guaranteeing theoretical equivariance restricts the
+non-linear mappings a network can use. To alleviate this
+covariance-expressiveness dilemma, we propose a hybrid framework with two
+cascaded regression stages. The first stage, a neural network with
+theoretically guaranteed covariance that models the symmetry properties of 3D
+atomic systems, predicts baseline Hamiltonians and extracts theoretically
+covariant features, assisting the second stage in learning covariance.
+Meanwhile, the second stage, powered by a non-linear 3D graph Transformer
+network we propose for structural modeling of atomic systems, refines the
+first stage's output into a fine-grained Hamiltonian prediction with greater
+expressive capability. The combination of a theoretically covariant yet
+inevitably less expressive model with a highly expressive non-linear network
+enables precise, generalizable predictions while maintaining robust covariance
+under coordinate transformations. Our method achieves state-of-the-art
+performance in Hamiltonian prediction for electronic structure calculations,
+confirmed through experiments on six crystalline material databases. The codes
+and configuration scripts are available in the supplementary material.
+
+
+
+
+
+ + ♻ ☆ Single and Few-step Diffusion for Generative Speech Enhancement + + +
+ Diffusion models have shown promising results in speech enhancement, using a +task-adapted diffusion process for the conditional generation of clean speech +given a noisy mixture. However, at test time, the neural network used for score +estimation is called multiple times to solve the iterative reverse process. +This results in a slow inference process and causes discretization errors that +accumulate over the sampling trajectory. In this paper, we address these +limitations through a two-stage training approach. In the first stage, we train +the diffusion model the usual way using the generative denoising score matching +loss. In the second stage, we compute the enhanced signal by solving the +reverse process and compare the resulting estimate to the clean speech target +using a predictive loss. We show that using this second training stage enables +achieving the same performance as the baseline model using only 5 function +evaluations instead of 60 function evaluations. While the performance of usual +generative diffusion algorithms drops dramatically when lowering the number of +function evaluations (NFEs) to obtain single-step diffusion, we show that our +proposed method keeps a steady performance and therefore largely outperforms +the diffusion baseline in this setting and also generalizes better than its +predictive counterpart. + +
+
+ comment: copyright 2023 IEEE. Personal use of this material is permitted. + Permission from IEEE must be obtained for all other uses, in any current or + future media, including reprinting/republishing this material for advertising + or promotional purposes, creating new collective works, for resale or + redistribution to servers or lists, or reuse of any copyrighted component of + this work in other works +
+
+
+
+
+ + ♻ ☆ Learning to Taste: A Multimodal Wine Dataset NeurIPS 2023 + + +
+ We present WineSensed, a large multimodal wine dataset for studying the +relations between visual perception, language, and flavor. The dataset +encompasses 897k images of wine labels and 824k reviews of wines curated from +the Vivino platform. It has over 350k unique bottlings, annotated with year, +region, rating, alcohol percentage, price, and grape composition. We obtained +fine-grained flavor annotations on a subset by conducting a wine-tasting +experiment with 256 participants who were asked to rank wines based on their +similarity in flavor, resulting in more than 5k pairwise flavor distances. We +propose a low-dimensional concept embedding algorithm that combines human +experience with automatic machine similarity kernels. We demonstrate that this +shared concept embedding space improves upon separate embedding spaces for +coarse flavor classification (alcohol percentage, country, grape, price, +rating) and aligns with the intricate human perception of flavor. + +
+
+ comment: Accepted to NeurIPS 2023. See project page: + https://thoranna.github.io/learning_to_taste/ +
+
+
+
+
+ + ♻ ☆ Explore to Generalize in Zero-Shot RL + + +
+ We study zero-shot generalization in reinforcement learning: optimizing a
+policy on a set of training tasks such that it performs well on a similar but
+unseen test task. To mitigate overfitting, previous work explored different
+notions of invariance to the task. However, on problems such as the ProcGen
+Maze, an adequate solution that is invariant to the task visualization does
+not exist, and therefore invariance-based approaches fail. Our insight is that
+learning a policy that effectively $\textit{explores}$ the domain is harder to
+memorize than a policy that maximizes reward for a specific task, and
+therefore we expect such learned behavior to generalize well; we indeed
+demonstrate this empirically on several domains that are difficult for
+invariance-based approaches. Our $\textit{Explore to Generalize}$ algorithm
+(ExpGen) builds on this insight: we train an additional ensemble of agents
+that optimize reward. At test time, either the ensemble agrees on an action,
+and we generalize well, or we take exploratory actions, which generalize well
+and drive us to a novel part of the state space, where the ensemble may
+potentially agree again. We show that our approach achieves state-of-the-art
+performance on tasks of the ProcGen challenge that have thus far eluded
+effective generalization, yielding a success rate of $83\%$ on the Maze task
+and $74\%$ on Heist with $200$ training levels. ExpGen can also be combined
+with an invariance-based approach to gain the best of both worlds, setting new
+state-of-the-art results on ProcGen.
+
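The test-time decision rule described in the abstract can be sketched in a few lines; the agreement criterion, the number of ensemble members, and the agent interfaces below are illustrative assumptions rather than the paper's implementation.

import numpy as np

def expgen_act(obs, reward_agents, explore_agent, rng):
    # If the reward-maximizing ensemble agrees on a discrete action, take it;
    # otherwise fall back to the exploration policy, which should eventually
    # drive the agent to a state where the ensemble agrees again.
    actions = [agent(obs) for agent in reward_agents]
    if len(set(actions)) == 1:
        return actions[0]
    return explore_agent(obs, rng)

# Hypothetical agents, for illustration only.
reward_agents = [lambda o, i=i: int(o.sum() > 0.1 * i) for i in range(5)]
explore_agent = lambda o, rng: int(rng.integers(0, 4))
print(expgen_act(np.array([0.2, -0.1]), reward_agents, explore_agent,
                 np.random.default_rng(0)))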
+
+
+
+
+ + ♻ ☆ An Autoregressive Text-to-Graph Framework for Joint Entity and Relation + Extraction AAAI 2024 + + +
+ In this paper, we propose a novel method for joint entity and relation +extraction from unstructured text by framing it as a conditional sequence +generation problem. In contrast to conventional generative information +extraction models that are left-to-right token-level generators, our approach +is \textit{span-based}. It generates a linearized graph where nodes represent +text spans and edges represent relation triplets. Our method employs a +transformer encoder-decoder architecture with pointing mechanism on a dynamic +vocabulary of spans and relation types. Our model can capture the structural +characteristics and boundaries of entities and relations through span +representations while simultaneously grounding the generated output in the +original text thanks to the pointing mechanism. Evaluation on benchmark +datasets validates the effectiveness of our approach, demonstrating competitive +results. Code is available at https://github.com/urchade/ATG. + +
+
+ comment: AAAI 2024 (camera ready version) +
+
+
+
+
+ + ♻ ☆ Pgx: Hardware-Accelerated Parallel Game Simulators for Reinforcement + Learning + + +
+ We propose Pgx, a suite of board game reinforcement learning (RL) +environments written in JAX and optimized for GPU/TPU accelerators. By +leveraging JAX's auto-vectorization and parallelization over accelerators, Pgx +can efficiently scale to thousands of simultaneous simulations over +accelerators. In our experiments on a DGX-A100 workstation, we discovered that +Pgx can simulate RL environments 10-100x faster than existing implementations +available in Python. Pgx includes RL environments commonly used as benchmarks +in RL research, such as backgammon, chess, shogi, and Go. Additionally, Pgx +offers miniature game sets and baseline models to facilitate rapid research +cycles. We demonstrate the efficient training of the Gumbel AlphaZero algorithm +with Pgx environments. Overall, Pgx provides high-performance environment +simulators for researchers to accelerate their RL experiments. Pgx is available +at http://github.com/sotetsuk/pgx. + +
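In practice this means vectorising the environment's init and step functions with jax.vmap and JIT-compiling them, so that thousands of games advance in lockstep on the accelerator. The snippet below follows the usage pattern shown in the Pgx README; the environment id and attribute names (init, step, legal_action_mask, terminated) are taken from that documentation and should be treated as assumptions here, with a random legal policy standing in for a learned one.

import jax
import jax.numpy as jnp
import pgx

env = pgx.make("tic_tac_toe")              # any available Pgx environment id
init = jax.jit(jax.vmap(env.init))
step = jax.jit(jax.vmap(env.step))

batch_size = 1024
key = jax.random.PRNGKey(0)
state = init(jax.random.split(key, batch_size))

while not state.terminated.all():
    key, subkey = jax.random.split(key)
    # Sample one random legal action per parallel game.
    logits = jnp.where(state.legal_action_mask, 0.0, -jnp.inf)
    action = jax.random.categorical(subkey, logits, axis=-1)
    state = step(state, action)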
+
+
+
+
+ + ♻ ☆ Physics-Informed Neural Networks for High-Frequency and Multi-Scale + Problems using Transfer Learning + + +
+ The physics-informed neural network (PINN) is a data-driven solver for
+partial and ordinary differential equations (ODEs/PDEs). It provides a unified
+framework to address both forward and inverse problems. However, the
+complexity of the objective function often leads to training failures. This
+issue is particularly prominent when solving high-frequency and multi-scale
+problems. We propose using transfer learning to boost the robustness and
+convergence of PINN training, starting from low-frequency problems and
+gradually approaching high-frequency problems. Through two case studies, we
+show that transfer learning can effectively train a PINN to approximate
+solutions from low-frequency problems to high-frequency problems without
+increasing network parameters. Furthermore, it requires fewer data points and
+less training time. We describe our training strategy in detail, including
+optimizer selection, and suggest guidelines for using transfer learning to
+train neural networks for solving more complex problems.
+
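A minimal illustration of that curriculum on a toy ODE, u'(x) = omega*cos(omega*x) with u(0) = 0, is sketched below: the network trained on the low-frequency problem is reused as the starting point for the high-frequency one. The network size, optimiser settings, and frequency schedule are placeholders, not the paper's configuration.

import torch
import torch.nn as nn

net = nn.Sequential(nn.Linear(1, 64), nn.Tanh(), nn.Linear(64, 64), nn.Tanh(),
                    nn.Linear(64, 1))

def pinn_loss(net, omega, n_collocation=256):
    # Residual of the ODE u'(x) = omega * cos(omega * x) plus the boundary
    # term u(0) = 0; the exact solution is u(x) = sin(omega * x).
    x = torch.rand(n_collocation, 1, requires_grad=True)
    u = net(x)
    du = torch.autograd.grad(u, x, torch.ones_like(u), create_graph=True)[0]
    residual = du - omega * torch.cos(omega * x)
    boundary = net(torch.zeros(1, 1))
    return residual.pow(2).mean() + boundary.pow(2).mean()

# Transfer-learning schedule: solve the low-frequency problem first, then keep
# the trained weights and continue training on the high-frequency problem.
for omega in (1.0, 10.0):
    optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
    for _ in range(2000):
        optimizer.zero_grad()
        loss = pinn_loss(net, omega)
        loss.backward()
        optimizer.step()
    print(f"omega={omega}: final loss {loss.item():.3e}")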
+
+ comment: 18 pages +
+
+
+
+
+ + ♻ ☆ Necessary and Sufficient Conditions for Optimal Decision Trees using + Dynamic Programming + + +
+ Global optimization of decision trees has been shown to be promising in
+terms of accuracy, size, and consequently human comprehensibility. However,
+many of the methods used rely on general-purpose solvers for which scalability
+remains an issue. Dynamic programming methods have been shown to scale much
+better because they exploit the tree structure by solving subtrees as
+independent subproblems. However, this only works when an objective can be
+optimized separately for subtrees. We explore this relationship in detail,
+establish the necessary and sufficient conditions for such separability, and
+generalize previous dynamic programming approaches into a framework that can
+optimize any combination of separable objectives and constraints. Experiments
+on five application domains show the general applicability of this framework,
+while outperforming general-purpose solvers in scalability by a large margin.
+
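The separability condition can be made concrete with a toy dynamic program: when the objective (here, misclassification count) decomposes as a sum over leaves, a node's optimal cost depends only on its own data subset and the remaining depth, so the two children can be solved independently. This is an illustrative sketch without the caching and bounding that real optimal decision tree solvers rely on.

import numpy as np

def leaf_cost(y):
    # Separable objective: misclassification count of a majority-vote leaf.
    return 0 if len(y) == 0 else len(y) - np.bincount(y).max()

def best_tree_cost(X, y, depth):
    # DP recursion: cost(node) = min(stop here,
    #     min over splits of cost(left child) + cost(right child)).
    best = leaf_cost(y)
    if depth == 0 or len(y) == 0:
        return best
    for j in range(X.shape[1]):
        for threshold in np.unique(X[:, j])[:-1]:   # skip the trivial split
            mask = X[:, j] <= threshold
            split_cost = (best_tree_cost(X[mask], y[mask], depth - 1)
                          + best_tree_cost(X[~mask], y[~mask], depth - 1))
            best = min(best, split_cost)
    return best

X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])                  # XOR labels require depth 2
print(best_tree_cost(X, y, depth=2))        # -> 0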
+
+
+
+
+ + ♻ ☆ A Smooth Binary Mechanism for Efficient Private Continual Observation NeurIPS 2023 + + +
+ In privacy under continual observation we study how to release differentially +private estimates based on a dataset that evolves over time. The problem of +releasing private prefix sums of $x_1,x_2,x_3,\dots \in\{0,1\}$ (where the +value of each $x_i$ is to be private) is particularly well-studied, and a +generalized form is used in state-of-the-art methods for private stochastic +gradient descent (SGD). The seminal binary mechanism privately releases the +first $t$ prefix sums with noise of variance polylogarithmic in $t$. Recently, +Henzinger et al. and Denisov et al. showed that it is possible to improve on +the binary mechanism in two ways: The variance of the noise can be reduced by a +(large) constant factor, and also made more even across time steps. However, +their algorithms for generating the noise distribution are not as efficient as +one would like in terms of computation time and (in particular) space. We +address the efficiency problem by presenting a simple alternative to the binary +mechanism in which 1) generating the noise takes constant average time per +value, 2) the variance is reduced by a factor about 4 compared to the binary +mechanism, and 3) the noise distribution at each step is identical. +Empirically, a simple Python implementation of our approach outperforms the +running time of the approach of Henzinger et al., as well as an attempt to +improve their algorithm using high-performance algorithms for multiplication +with Toeplitz matrices. + +
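For context, the classical binary mechanism that this work improves on fits in a few lines: every aligned dyadic block of the 0/1 stream receives independent Laplace noise, and each prefix sum is assembled from O(log t) noisy blocks. The sketch below is that baseline construction with a standard epsilon-DP noise calibration, not the smooth mechanism proposed in the paper.

import numpy as np

def binary_mechanism_prefix_sums(x, eps, rng=None):
    # x: sequence of 0/1 values; returns eps-DP estimates of all prefix sums.
    rng = rng or np.random.default_rng()
    T = len(x)
    depth = T.bit_length()            # number of dyadic levels
    scale = depth / eps               # each x_i lies in at most `depth` blocks
    noisy_block = {}                  # (level, start) -> noisy dyadic block sum
    outputs = []
    for t in range(1, T + 1):
        total, start = 0.0, 1
        for level in reversed(range(depth)):   # greedy dyadic cover of [1, t]
            size = 1 << level
            if t - start + 1 >= size:
                key = (level, start)
                if key not in noisy_block:
                    block_sum = sum(x[start - 1:start - 1 + size])
                    noisy_block[key] = block_sum + rng.laplace(scale=scale)
                total += noisy_block[key]
                start += size
        outputs.append(total)
    return outputs

print(binary_mechanism_prefix_sums([1, 0, 1, 1, 0, 1, 1, 1], eps=1.0))

The number of noisy blocks, and hence the noise variance, varies with t; this is the unevenness across time steps that the proposed mechanism removes, since its noise distribution is identical at every step.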
+
+ comment: Appeared at NeurIPS 2023 +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Transformer-based Video Saliency Prediction with High Temporal Dimension + Decoding + + +
+ In recent years, finding an effective and efficient strategy for exploiting +spatial and temporal information has been a hot research topic in video +saliency prediction (VSP). With the emergence of spatio-temporal transformers, +the weakness of the prior strategies, e.g., 3D convolutional networks and +LSTM-based networks, for capturing long-range dependencies has been effectively +compensated. While VSP has drawn benefits from spatio-temporal transformers, +finding the most effective way for aggregating temporal features is still +challenging. To address this concern, we propose a transformer-based video +saliency prediction approach with high temporal dimension decoding network +(THTD-Net). This strategy accounts for the lack of complex hierarchical +interactions between features that are extracted from the transformer-based +spatio-temporal encoder: in particular, it does not require multiple decoders +and aims at gradually reducing temporal features' dimensions in the decoder. +This decoder-based architecture yields comparable performance to multi-branch +and over-complicated models on common benchmarks such as DHF1K, UCF-sports and +Hollywood-2. + +
+
+ comment: 8 pages, 2 figures, 3 tables +
+
+
+
+
+ + ☆ Startup Delay Aware Short Video Ordering: Problem, Model, and A + Reinforcement Learning based Algorithm + + +
+ Short video applications have attracted billions of users on the Internet
+and fill users' fragmented spare time with content-rich, short-duration
+videos. To achieve fast playback at the user side, existing short video
+systems typically burst-transmit the initial segment of each video when it is
+requested, improving the quality of user experience. However, such burst
+transmissions can cause unexpectedly large startup delays at the user side.
+This is because users may frequently switch videos when sequentially watching
+a list of short videos recommended by the server side, which can cause
+excessive burst transmissions of initial segments of different short videos
+and thus quickly deplete the network transmission capacity. In this paper, we
+adopt a token bucket to characterize the video transmission path between the
+video server and each user, and accordingly study how to effectively reduce
+the startup delay of short videos by arranging the viewing order of a video
+list at the server side. We formulate the optimal video ordering problem for
+minimizing the maximum video startup delay as a combinatorial optimization
+problem and prove its NP-hardness. We accordingly propose a Partially Shared
+Actor Critic reinforcement learning algorithm (PSAC) to learn an optimized
+video ordering strategy. Numerical results based on a real dataset provided by
+a large-scale short video service provider demonstrate that the proposed PSAC
+algorithm can significantly reduce the video startup delay compared to
+baseline algorithms.
+
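To make the token-bucket view concrete, the toy function below computes the startup delay of each video in a given viewing order: playback can start only once the bucket holds enough tokens for the video's initial segment. The model and all quantities (rate, bucket size, watch time, segment sizes) are illustrative simplifications, not the formulation used in the paper.

def startup_delays(order, initial_segment_bits, watch_time, rate, bucket_size):
    # order: list of video ids in viewing order; returns per-video startup
    # delays under a simple token-bucket model of the transmission path.
    tokens, delays = bucket_size, []
    for video in order:
        need = initial_segment_bits[video]
        delay = max(0.0, (need - tokens) / rate)   # wait for missing tokens
        delays.append(delay)
        # Consume the initial segment, accrue tokens while waiting and watching.
        tokens = min(bucket_size, tokens - need + (delay + watch_time) * rate)
    return delays

segments = {0: 8e6, 1: 2e6, 2: 6e6}               # initial segment sizes (bits)
print(startup_delays([2, 0, 1], segments, watch_time=5.0,
                     rate=1e6, bucket_size=4e6))
print(startup_delays([1, 2, 0], segments, watch_time=5.0,
                     rate=1e6, bucket_size=4e6))

Different orders shift which videos pay the delay, which is precisely the degree of freedom the proposed PSAC algorithm optimizes over.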
+
+
+
+
+ 
 + ♻ ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal
+ Relationships
 
 
+ Integrating deep learning and causal discovery has increased the
+interpretability of Temporal Action Segmentation (TAS) tasks. However,
+frame-level causal relationships contain much complicated noise beyond the
+segment level, making it infeasible to directly express macro action
+semantics. Thus, we propose the Causal Abstraction Segmentation Refiner
+(CASR), which can refine TAS results from various models by enhancing video
+causality through marginalizing frame-level causal relationships.
+Specifically, we define equivalent frame-level and segment-level causal
+models, so that the causal adjacency matrix constructed from marginalized
+frame-level causal relationships is able to represent the segment-level causal
+relationships. CASR works by reducing the difference between the causal
+adjacency matrix we construct and that of the pre-segmentation results of
+backbone models. In addition, we propose a novel evaluation metric, Causal
+Edit Distance (CED), to evaluate causal interpretability. Extensive
+experimental results on mainstream datasets indicate that CASR significantly
+surpasses various existing methods in action segmentation performance, as well
+as in causal explainability and generalization.
+
+
+
+
+
+ + ♻ ☆ MolCA: Molecular Graph-Language Modeling with Cross-Modal Projector and + Uni-Modal Adapter EMNLP + + +
+ Language Models (LMs) have demonstrated impressive molecule understanding +ability on various 1D text-related tasks. However, they inherently lack 2D +graph perception - a critical ability of human professionals in comprehending +molecules' topological structures. To bridge this gap, we propose MolCA: +Molecular Graph-Language Modeling with Cross-Modal Projector and Uni-Modal +Adapter. MolCA enables an LM (e.g., Galactica) to understand both text- and +graph-based molecular contents via the cross-modal projector. Specifically, the +cross-modal projector is implemented as a Q-Former to connect a graph encoder's +representation space and an LM's text space. Further, MolCA employs a uni-modal +adapter (i.e., LoRA) for the LM's efficient adaptation to downstream tasks. +Unlike previous studies that couple an LM with a graph encoder via cross-modal +contrastive learning, MolCA retains the LM's ability of open-ended text +generation and augments it with 2D graph information. To showcase its +effectiveness, we extensively benchmark MolCA on tasks of molecule captioning, +IUPAC name prediction, and molecule-text retrieval, on which MolCA +significantly outperforms the baselines. Our codes and checkpoints can be found +at https://github.com/acharkq/MolCA. + +
+
+ comment: EMNLP main conference. 9 pages +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 

diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all paper entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // 9 = TAB
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default focus change
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`